stringtriebuilder.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2010-2012,2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * file name: stringtriebuilder.h
  9. * encoding: US-ASCII
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2010dec24
  14. * created by: Markus W. Scherer
  15. */
  16. #ifndef __STRINGTRIEBUILDER_H__
  17. #define __STRINGTRIEBUILDER_H__
  18. #include "unicode/utypes.h"
  19. #include "unicode/uobject.h"
  20. /**
  21. * \file
  22. * \brief C++ API: Builder API for trie builders
  23. */
  24. // Forward declaration.
  25. struct UHashtable;
  26. typedef struct UHashtable UHashtable;
  27. /**
  28. * Build options for BytesTrieBuilder and CharsTrieBuilder.
  29. * @stable ICU 4.8
  30. */
  31. enum UStringTrieBuildOption {
  32. /**
  33. * Builds a trie quickly.
  34. * @stable ICU 4.8
  35. */
  36. USTRINGTRIE_BUILD_FAST,
  37. /**
  38. * Builds a trie more slowly, attempting to generate
  39. * a shorter but equivalent serialization.
  40. * This build option also uses more memory.
  41. *
  42. * This option can be effective when many integer values are the same
  43. * and string/byte sequence suffixes can be shared.
  44. * Runtime speed is not expected to improve.
  45. * @stable ICU 4.8
  46. */
  47. USTRINGTRIE_BUILD_SMALL
  48. };
  49. U_NAMESPACE_BEGIN
  50. /**
  51. * Base class for string trie builder classes.
  52. *
  53. * This class is not intended for public subclassing.
  54. * @stable ICU 4.8
  55. */
  56. class U_COMMON_API StringTrieBuilder : public UObject {
  57. public:
  58. #ifndef U_HIDE_INTERNAL_API
  59. /** @internal */
  60. static UBool hashNode(const void *node);
  61. /** @internal */
  62. static UBool equalNodes(const void *left, const void *right);
  63. #endif /* U_HIDE_INTERNAL_API */
  64. protected:
  65. // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
  66. // or else the compiler will create a public default constructor.
  67. /** @internal */
  68. StringTrieBuilder();
  69. /** @internal */
  70. virtual ~StringTrieBuilder();
  71. #ifndef U_HIDE_INTERNAL_API
  72. /** @internal */
  73. void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
  74. /** @internal */
  75. void deleteCompactBuilder();
  76. /** @internal */
  77. void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
  78. /** @internal */
  79. int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
  80. /** @internal */
  81. int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
  82. #endif /* U_HIDE_INTERNAL_API */
  83. class Node;
  84. #ifndef U_HIDE_INTERNAL_API
  85. /** @internal */
  86. Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
  87. /** @internal */
  88. Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
  89. int32_t length, UErrorCode &errorCode);
  90. #endif /* U_HIDE_INTERNAL_API */
  91. /** @internal */
  92. virtual int32_t getElementStringLength(int32_t i) const = 0;
  93. /** @internal */
  94. virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
  95. /** @internal */
  96. virtual int32_t getElementValue(int32_t i) const = 0;
  97. // Finds the first unit index after this one where
  98. // the first and last element have different units again.
  99. /** @internal */
  100. virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
  101. // Number of different units at unitIndex.
  102. /** @internal */
  103. virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
  104. /** @internal */
  105. virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
  106. /** @internal */
  107. virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
  108. /** @internal */
  109. virtual UBool matchNodesCanHaveValues() const = 0;
  110. /** @internal */
  111. virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
  112. /** @internal */
  113. virtual int32_t getMinLinearMatch() const = 0;
  114. /** @internal */
  115. virtual int32_t getMaxLinearMatchLength() const = 0;
  116. #ifndef U_HIDE_INTERNAL_API
  117. // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
  118. /** @internal */
  119. static const int32_t kMaxBranchLinearSubNodeLength=5;
  120. // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
  121. // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
  122. /** @internal */
  123. static const int32_t kMaxSplitBranchLevels=14;
  124. /**
  125. * Makes sure that there is only one unique node registered that is
  126. * equivalent to newNode.
  127. * @param newNode Input node. The builder takes ownership.
  128. * @param errorCode ICU in/out UErrorCode.
  129. Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
  130. * @return newNode if it is the first of its kind, or
  131. * an equivalent node if newNode is a duplicate.
  132. * @internal
  133. */
  134. Node *registerNode(Node *newNode, UErrorCode &errorCode);
  135. /**
  136. * Makes sure that there is only one unique FinalValueNode registered
  137. * with this value.
  138. * Avoids creating a node if the value is a duplicate.
  139. * @param value A final value.
  140. * @param errorCode ICU in/out UErrorCode.
  141. Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
  142. * @return A FinalValueNode with the given value.
  143. * @internal
  144. */
  145. Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
  146. #endif /* U_HIDE_INTERNAL_API */
  147. /*
  148. * C++ note:
  149. * registerNode() and registerFinalValue() take ownership of their input nodes,
  150. * and only return owned nodes.
  151. * If they see a failure UErrorCode, they will delete the input node.
  152. * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
  153. * If there is a failure, they return NULL.
  154. *
  155. * NULL Node pointers can be safely passed into other Nodes because
  156. * they call the static Node::hashCode() which checks for a NULL pointer first.
  157. *
  158. * Therefore, as long as builder functions register a new node,
  159. * they need to check for failures only before explicitly dereferencing
  160. * a Node pointer, or before setting a new UErrorCode.
  161. */
  162. // Hash set of nodes, maps from nodes to integer 1.
  163. /** @internal */
  164. UHashtable *nodes;
  165. // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
  166. // it is needed for layout of other objects.
  167. /** @internal */
  168. class Node : public UObject {
  169. public:
  170. Node(int32_t initialHash) : hash(initialHash), offset(0) {}
  171. inline int32_t hashCode() const { return hash; }
  172. // Handles node==NULL.
  173. static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
  174. // Base class operator==() compares the actual class types.
  175. virtual UBool operator==(const Node &other) const;
  176. inline UBool operator!=(const Node &other) const { return !operator==(other); }
  177. /**
  178. * Traverses the Node graph and numbers branch edges, with rightmost edges first.
  179. * This is to avoid writing a duplicate node twice.
  180. *
  181. * Branch nodes in this trie data structure are not symmetric.
  182. * Most branch edges "jump" to other nodes but the rightmost branch edges
  183. * just continue without a jump.
  184. * Therefore, write() must write the rightmost branch edge last
  185. * (trie units are written backwards), and must write it at that point even if
  186. * it is a duplicate of a node previously written elsewhere.
  187. *
  188. * This function visits and marks right branch edges first.
  189. * Edges are numbered with increasingly negative values because we share the
  190. * offset field which gets positive values when nodes are written.
  191. * A branch edge also remembers the first number for any of its edges.
  192. *
  193. * When a further-left branch edge has a number in the range of the rightmost
  194. * edge's numbers, then it will be written as part of the required right edge
  195. * and we can avoid writing it first.
  196. *
  197. * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
  198. * edge numbers.
  199. *
  200. * @param edgeNumber The first edge number for this node and its sub-nodes.
  201. * @return An edge number that is at least the maximum-negative
  202. * of the input edge number and the numbers of this node and all of its sub-nodes.
  203. */
  204. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  205. // write() must set the offset to a positive value.
  206. virtual void write(StringTrieBuilder &builder) = 0;
  207. // See markRightEdgesFirst.
  208. inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
  209. StringTrieBuilder &builder) {
  210. // Note: Edge numbers are negative, lastRight<=firstRight.
  211. // If offset>0 then this node and its sub-nodes have been written already
  212. // and we need not write them again.
  213. // If this node is part of the unwritten right branch edge,
  214. // then we wait until that is written.
  215. if(offset<0 && (offset<lastRight || firstRight<offset)) {
  216. write(builder);
  217. }
  218. }
  219. inline int32_t getOffset() const { return offset; }
  220. protected:
  221. int32_t hash;
  222. int32_t offset;
  223. };
  224. #ifndef U_HIDE_INTERNAL_API
  225. // This class should not be overridden because
  226. // registerFinalValue() compares a stack-allocated FinalValueNode
  227. // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
  228. // with the input node, and the
  229. // !Node::operator==(other) used inside FinalValueNode::operator==(other)
  230. // will be false if the typeid's are different.
  231. /** @internal */
  232. class FinalValueNode : public Node {
  233. public:
  234. FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
  235. virtual UBool operator==(const Node &other) const;
  236. virtual void write(StringTrieBuilder &builder);
  237. protected:
  238. int32_t value;
  239. };
  240. #endif /* U_HIDE_INTERNAL_API */
  241. // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
  242. // it is needed for layout of other objects.
  243. /**
  244. * @internal
  245. */
  246. class ValueNode : public Node {
  247. public:
  248. ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
  249. virtual UBool operator==(const Node &other) const;
  250. void setValue(int32_t v) {
  251. hasValue=TRUE;
  252. value=v;
  253. hash=hash*37+v;
  254. }
  255. protected:
  256. UBool hasValue;
  257. int32_t value;
  258. };
  259. #ifndef U_HIDE_INTERNAL_API
  260. /**
  261. * @internal
  262. */
  263. class IntermediateValueNode : public ValueNode {
  264. public:
  265. IntermediateValueNode(int32_t v, Node *nextNode)
  266. : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
  267. virtual UBool operator==(const Node &other) const;
  268. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  269. virtual void write(StringTrieBuilder &builder);
  270. protected:
  271. Node *next;
  272. };
  273. #endif /* U_HIDE_INTERNAL_API */
  274. // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
  275. // it is needed for layout of other objects.
  276. /**
  277. * @internal
  278. */
  279. class LinearMatchNode : public ValueNode {
  280. public:
  281. LinearMatchNode(int32_t len, Node *nextNode)
  282. : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
  283. length(len), next(nextNode) {}
  284. virtual UBool operator==(const Node &other) const;
  285. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  286. protected:
  287. int32_t length;
  288. Node *next;
  289. };
  290. #ifndef U_HIDE_INTERNAL_API
  291. /**
  292. * @internal
  293. */
  294. class BranchNode : public Node {
  295. public:
  296. BranchNode(int32_t initialHash) : Node(initialHash) {}
  297. protected:
  298. int32_t firstEdgeNumber;
  299. };
  300. /**
  301. * @internal
  302. */
  303. class ListBranchNode : public BranchNode {
  304. public:
  305. ListBranchNode() : BranchNode(0x444444), length(0) {}
  306. virtual UBool operator==(const Node &other) const;
  307. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  308. virtual void write(StringTrieBuilder &builder);
  309. // Adds a unit with a final value.
  310. void add(int32_t c, int32_t value) {
  311. units[length]=(UChar)c;
  312. equal[length]=NULL;
  313. values[length]=value;
  314. ++length;
  315. hash=(hash*37+c)*37+value;
  316. }
  317. // Adds a unit which leads to another match node.
  318. void add(int32_t c, Node *node) {
  319. units[length]=(UChar)c;
  320. equal[length]=node;
  321. values[length]=0;
  322. ++length;
  323. hash=(hash*37+c)*37+hashCode(node);
  324. }
  325. protected:
  326. Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
  327. int32_t length;
  328. int32_t values[kMaxBranchLinearSubNodeLength];
  329. UChar units[kMaxBranchLinearSubNodeLength];
  330. };
  331. /**
  332. * @internal
  333. */
  334. class SplitBranchNode : public BranchNode {
  335. public:
  336. SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
  337. : BranchNode(((0x555555*37+middleUnit)*37+
  338. hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
  339. unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
  340. virtual UBool operator==(const Node &other) const;
  341. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  342. virtual void write(StringTrieBuilder &builder);
  343. protected:
  344. UChar unit;
  345. Node *lessThan;
  346. Node *greaterOrEqual;
  347. };
  348. // Branch head node, for writing the actual node lead unit.
  349. /** @internal */
  350. class BranchHeadNode : public ValueNode {
  351. public:
  352. BranchHeadNode(int32_t len, Node *subNode)
  353. : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
  354. length(len), next(subNode) {}
  355. virtual UBool operator==(const Node &other) const;
  356. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  357. virtual void write(StringTrieBuilder &builder);
  358. protected:
  359. int32_t length;
  360. Node *next; // A branch sub-node.
  361. };
  362. #endif /* U_HIDE_INTERNAL_API */
  363. /** @internal */
  364. virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
  365. Node *nextNode) const = 0;
  366. /** @internal */
  367. virtual int32_t write(int32_t unit) = 0;
  368. /** @internal */
  369. virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
  370. /** @internal */
  371. virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
  372. /** @internal */
  373. virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
  374. /** @internal */
  375. virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
  376. };
  377. U_NAMESPACE_END
  378. #endif // __STRINGTRIEBUILDER_H__