coleitr.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. * Copyright (C) 1997-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. ******************************************************************************
  8. */
  9. /**
  10. * \file
  11. * \brief C++ API: Collation Element Iterator.
  12. */
  13. /**
  14. * File coleitr.h
  15. *
  16. * Created by: Helena Shih
  17. *
  18. * Modification History:
  19. *
  20. * Date Name Description
  21. *
  22. * 8/18/97 helena Added internal API documentation.
  23. * 08/03/98 erm Synched with 1.2 version CollationElementIterator.java
  24. * 12/10/99 aliu Ported Thai collation support from Java.
  25. * 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h)
  26. * 02/19/01 swquek Removed CollationElementsIterator() since it is
  27. * private constructor and no calls are made to it
  28. * 2012-2014 markus Rewritten in C++ again.
  29. */
  30. #ifndef COLEITR_H
  31. #define COLEITR_H
  32. #include "unicode/utypes.h"
  33. #if !UCONFIG_NO_COLLATION
  34. #include "unicode/unistr.h"
  35. #include "unicode/uobject.h"
  36. struct UCollationElements;
  37. struct UHashtable;
  38. U_NAMESPACE_BEGIN
  39. struct CollationData;
  40. class CollationIterator;
  41. class RuleBasedCollator;
  42. class UCollationPCE;
  43. class UVector32;
  44. /**
  45. * The CollationElementIterator class is used as an iterator to walk through
  46. * each character of an international string. Use the iterator to return the
  47. * ordering priority of the positioned character. The ordering priority of a
  48. * character, which we refer to as a key, defines how a character is collated in
  49. * the given collation object.
  50. * For example, consider the following in Slovak and in traditional Spanish collation:
  51. * <pre>
  52. * "ca" -> the first key is key('c') and second key is key('a').
  53. * "cha" -> the first key is key('ch') and second key is key('a').</pre>
  54. * And in German phonebook collation,
  55. * <pre> \htmlonly "&#x00E6;b"-> the first key is key('a'), the second key is key('e'), and
  56. * the third key is key('b'). \endhtmlonly </pre>
  57. * The key of a character, is an integer composed of primary order(short),
  58. * secondary order(char), and tertiary order(char). Java strictly defines the
  59. * size and signedness of its primitive data types. Therefore, the static
  60. * functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return
  61. * int32_t to ensure the correctness of the key value.
  62. * <p>Example of the iterator usage: (without error checking)
  63. * <pre>
  64. * \code
  65. * void CollationElementIterator_Example()
  66. * {
  67. * UnicodeString str = "This is a test";
  68. * UErrorCode success = U_ZERO_ERROR;
  69. * RuleBasedCollator* rbc =
  70. * (RuleBasedCollator*) RuleBasedCollator::createInstance(success);
  71. * CollationElementIterator* c =
  72. * rbc->createCollationElementIterator( str );
  73. * int32_t order = c->next(success);
  74. * c->reset();
  75. * order = c->previous(success);
  76. * delete c;
  77. * delete rbc;
  78. * }
  79. * \endcode
  80. * </pre>
  81. * <p>
  82. * The method next() returns the collation order of the next character based on
  83. * the comparison level of the collator. The method previous() returns the
  84. * collation order of the previous character based on the comparison level of
  85. * the collator. The Collation Element Iterator moves only in one direction
  86. * between calls to reset(), setOffset(), or setText(). That is, next()
  87. * and previous() can not be inter-used. Whenever previous() is to be called after
  88. * next() or vice versa, reset(), setOffset() or setText() has to be called first
  89. * to reset the status, shifting pointers to either the end or the start of
  90. * the string (reset() or setText()), or the specified position (setOffset()).
  91. * Hence at the next call of next() or previous(), the first or last collation order,
  92. * or collation order at the spefcifieid position will be returned. If a change of
  93. * direction is done without one of these calls, the result is undefined.
  94. * <p>
  95. * The result of a forward iterate (next()) and reversed result of the backward
  96. * iterate (previous()) on the same string are equivalent, if collation orders
  97. * with the value 0 are ignored.
  98. * Character based on the comparison level of the collator. A collation order
  99. * consists of primary order, secondary order and tertiary order. The data
  100. * type of the collation order is <strong>int32_t</strong>.
  101. *
  102. * Note, CollationElementIterator should not be subclassed.
  103. * @see Collator
  104. * @see RuleBasedCollator
  105. * @version 1.8 Jan 16 2001
  106. */
  107. class U_I18N_API CollationElementIterator U_FINAL : public UObject {
  108. public:
  109. // CollationElementIterator public data member ------------------------------
  110. enum {
  111. /**
  112. * NULLORDER indicates that an error has occured while processing
  113. * @stable ICU 2.0
  114. */
  115. NULLORDER = (int32_t)0xffffffff
  116. };
  117. // CollationElementIterator public constructor/destructor -------------------
  118. /**
  119. * Copy constructor.
  120. *
  121. * @param other the object to be copied from
  122. * @stable ICU 2.0
  123. */
  124. CollationElementIterator(const CollationElementIterator& other);
  125. /**
  126. * Destructor
  127. * @stable ICU 2.0
  128. */
  129. virtual ~CollationElementIterator();
  130. // CollationElementIterator public methods ----------------------------------
  131. /**
  132. * Returns true if "other" is the same as "this"
  133. *
  134. * @param other the object to be compared
  135. * @return true if "other" is the same as "this"
  136. * @stable ICU 2.0
  137. */
  138. UBool operator==(const CollationElementIterator& other) const;
  139. /**
  140. * Returns true if "other" is not the same as "this".
  141. *
  142. * @param other the object to be compared
  143. * @return true if "other" is not the same as "this"
  144. * @stable ICU 2.0
  145. */
  146. UBool operator!=(const CollationElementIterator& other) const;
  147. /**
  148. * Resets the cursor to the beginning of the string.
  149. * @stable ICU 2.0
  150. */
  151. void reset(void);
  152. /**
  153. * Gets the ordering priority of the next character in the string.
  154. * @param status the error code status.
  155. * @return the next character's ordering. otherwise returns NULLORDER if an
  156. * error has occured or if the end of string has been reached
  157. * @stable ICU 2.0
  158. */
  159. int32_t next(UErrorCode& status);
  160. /**
  161. * Get the ordering priority of the previous collation element in the string.
  162. * @param status the error code status.
  163. * @return the previous element's ordering. otherwise returns NULLORDER if an
  164. * error has occured or if the start of string has been reached
  165. * @stable ICU 2.0
  166. */
  167. int32_t previous(UErrorCode& status);
  168. /**
  169. * Gets the primary order of a collation order.
  170. * @param order the collation order
  171. * @return the primary order of a collation order.
  172. * @stable ICU 2.0
  173. */
  174. static inline int32_t primaryOrder(int32_t order);
  175. /**
  176. * Gets the secondary order of a collation order.
  177. * @param order the collation order
  178. * @return the secondary order of a collation order.
  179. * @stable ICU 2.0
  180. */
  181. static inline int32_t secondaryOrder(int32_t order);
  182. /**
  183. * Gets the tertiary order of a collation order.
  184. * @param order the collation order
  185. * @return the tertiary order of a collation order.
  186. * @stable ICU 2.0
  187. */
  188. static inline int32_t tertiaryOrder(int32_t order);
  189. /**
  190. * Return the maximum length of any expansion sequences that end with the
  191. * specified comparison order.
  192. * @param order a collation order returned by previous or next.
  193. * @return maximum size of the expansion sequences ending with the collation
  194. * element or 1 if collation element does not occur at the end of any
  195. * expansion sequence
  196. * @stable ICU 2.0
  197. */
  198. int32_t getMaxExpansion(int32_t order) const;
  199. /**
  200. * Gets the comparison order in the desired strength. Ignore the other
  201. * differences.
  202. * @param order The order value
  203. * @stable ICU 2.0
  204. */
  205. int32_t strengthOrder(int32_t order) const;
  206. /**
  207. * Sets the source string.
  208. * @param str the source string.
  209. * @param status the error code status.
  210. * @stable ICU 2.0
  211. */
  212. void setText(const UnicodeString& str, UErrorCode& status);
  213. /**
  214. * Sets the source string.
  215. * @param str the source character iterator.
  216. * @param status the error code status.
  217. * @stable ICU 2.0
  218. */
  219. void setText(CharacterIterator& str, UErrorCode& status);
  220. /**
  221. * Checks if a comparison order is ignorable.
  222. * @param order the collation order.
  223. * @return TRUE if a character is ignorable, FALSE otherwise.
  224. * @stable ICU 2.0
  225. */
  226. static inline UBool isIgnorable(int32_t order);
  227. /**
  228. * Gets the offset of the currently processed character in the source string.
  229. * @return the offset of the character.
  230. * @stable ICU 2.0
  231. */
  232. int32_t getOffset(void) const;
  233. /**
  234. * Sets the offset of the currently processed character in the source string.
  235. * @param newOffset the new offset.
  236. * @param status the error code status.
  237. * @return the offset of the character.
  238. * @stable ICU 2.0
  239. */
  240. void setOffset(int32_t newOffset, UErrorCode& status);
  241. /**
  242. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  243. *
  244. * @stable ICU 2.2
  245. */
  246. virtual UClassID getDynamicClassID() const;
  247. /**
  248. * ICU "poor man's RTTI", returns a UClassID for this class.
  249. *
  250. * @stable ICU 2.2
  251. */
  252. static UClassID U_EXPORT2 getStaticClassID();
  253. #ifndef U_HIDE_INTERNAL_API
  254. /** @internal */
  255. static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) {
  256. return reinterpret_cast<CollationElementIterator *>(uc);
  257. }
  258. /** @internal */
  259. static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) {
  260. return reinterpret_cast<const CollationElementIterator *>(uc);
  261. }
  262. /** @internal */
  263. inline UCollationElements *toUCollationElements() {
  264. return reinterpret_cast<UCollationElements *>(this);
  265. }
  266. /** @internal */
  267. inline const UCollationElements *toUCollationElements() const {
  268. return reinterpret_cast<const UCollationElements *>(this);
  269. }
  270. #endif // U_HIDE_INTERNAL_API
  271. private:
  272. friend class RuleBasedCollator;
  273. friend class UCollationPCE;
  274. /**
  275. * CollationElementIterator constructor. This takes the source string and the
  276. * collation object. The cursor will walk thru the source string based on the
  277. * predefined collation rules. If the source string is empty, NULLORDER will
  278. * be returned on the calls to next().
  279. * @param sourceText the source string.
  280. * @param order the collation object.
  281. * @param status the error code status.
  282. */
  283. CollationElementIterator(const UnicodeString& sourceText,
  284. const RuleBasedCollator* order, UErrorCode& status);
  285. // Note: The constructors should take settings & tailoring, not a collator,
  286. // to avoid circular dependencies.
  287. // However, for operator==() we would need to be able to compare tailoring data for equality
  288. // without making CollationData or CollationTailoring depend on TailoredSet.
  289. // (See the implementation of RuleBasedCollator::operator==().)
  290. // That might require creating an intermediate class that would be used
  291. // by both CollationElementIterator and RuleBasedCollator
  292. // but only contain the part of RBC== related to data and rules.
  293. /**
  294. * CollationElementIterator constructor. This takes the source string and the
  295. * collation object. The cursor will walk thru the source string based on the
  296. * predefined collation rules. If the source string is empty, NULLORDER will
  297. * be returned on the calls to next().
  298. * @param sourceText the source string.
  299. * @param order the collation object.
  300. * @param status the error code status.
  301. */
  302. CollationElementIterator(const CharacterIterator& sourceText,
  303. const RuleBasedCollator* order, UErrorCode& status);
  304. /**
  305. * Assignment operator
  306. *
  307. * @param other the object to be copied
  308. */
  309. const CollationElementIterator&
  310. operator=(const CollationElementIterator& other);
  311. CollationElementIterator(); // default constructor not implemented
  312. /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
  313. inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; }
  314. static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);
  315. static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);
  316. // CollationElementIterator private data members ----------------------------
  317. CollationIterator *iter_; // owned
  318. const RuleBasedCollator *rbc_; // aliased
  319. uint32_t otherHalf_;
  320. /**
  321. * <0: backwards; 0: just after reset() (previous() begins from end);
  322. * 1: just after setOffset(); >1: forward
  323. */
  324. int8_t dir_;
  325. /**
  326. * Stores offsets from expansions and from unsafe-backwards iteration,
  327. * so that getOffset() returns intermediate offsets for the CEs
  328. * that are consistent with forward iteration.
  329. */
  330. UVector32 *offsets_;
  331. UnicodeString string_;
  332. };
  333. // CollationElementIterator inline method definitions --------------------------
  334. inline int32_t CollationElementIterator::primaryOrder(int32_t order)
  335. {
  336. return (order >> 16) & 0xffff;
  337. }
  338. inline int32_t CollationElementIterator::secondaryOrder(int32_t order)
  339. {
  340. return (order >> 8) & 0xff;
  341. }
  342. inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)
  343. {
  344. return order & 0xff;
  345. }
  346. inline UBool CollationElementIterator::isIgnorable(int32_t order)
  347. {
  348. return (order & 0xffff0000) == 0;
  349. }
  350. U_NAMESPACE_END
  351. #endif /* #if !UCONFIG_NO_COLLATION */
  352. #endif