tblcoll.h 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. * Copyright (C) 1996-2016, International Business Machines Corporation and
  6. * others. All Rights Reserved.
  7. ******************************************************************************
  8. */
  9. /**
  10. * \file
  11. * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class.
  12. */
  13. /**
  14. * File tblcoll.h
  15. *
  16. * Created by: Helena Shih
  17. *
  18. * Modification History:
  19. *
  20. * Date Name Description
  21. * 2/5/97 aliu Added streamIn and streamOut methods. Added
  22. * constructor which reads RuleBasedCollator object from
  23. * a binary file. Added writeToFile method which streams
  24. * RuleBasedCollator out to a binary file. The streamIn
  25. * and streamOut methods use istream and ostream objects
  26. * in binary mode.
  27. * 2/12/97 aliu Modified to use TableCollationData sub-object to
  28. * hold invariant data.
  29. * 2/13/97 aliu Moved several methods into this class from Collation.
  30. * Added a private RuleBasedCollator(Locale&) constructor,
  31. * to be used by Collator::createDefault(). General
  32. * clean up.
  33. * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
  34. * constructor and getDynamicClassID.
  35. * 3/5/97 aliu Modified constructFromFile() to add parameter
  36. * specifying whether or not binary loading is to be
  37. * attempted. This is required for dynamic rule loading.
  38. * 05/07/97 helena Added memory allocation error detection.
  39. * 6/17/97 helena Added IDENTICAL strength for compare, changed getRules to
  40. * use MergeCollation::getPattern.
  41. * 6/20/97 helena Java class name change.
  42. * 8/18/97 helena Added internal API documentation.
  43. * 09/03/97 helena Added createCollationKeyValues().
  44. * 02/10/98 damiba Added compare with "length" parameter
  45. * 08/05/98 erm Synched with 1.2 version of RuleBasedCollator.java
  46. * 04/23/99 stephen Removed EDecompositionMode, merged with
  47. * Normalizer::EMode
  48. * 06/14/99 stephen Removed kResourceBundleSuffix
  49. * 11/02/99 helena Collator performance enhancements. Eliminates the
  50. * UnicodeString construction and special case for NO_OP.
  51. * 11/23/99 srl More performance enhancements. Updates to NormalizerIterator
  52. * internal state management.
  53. * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
  54. * to implementation file.
  55. * 01/29/01 synwee Modified into a C++ wrapper which calls C API
  56. * (ucol.h)
  57. * 2012-2014 markus Rewritten in C++ again.
  58. */
  59. #ifndef TBLCOLL_H
  60. #define TBLCOLL_H
  61. #include "unicode/utypes.h"
  62. #if !UCONFIG_NO_COLLATION
  63. #include "unicode/coll.h"
  64. #include "unicode/locid.h"
  65. #include "unicode/uiter.h"
  66. #include "unicode/ucol.h"
  67. U_NAMESPACE_BEGIN
  68. struct CollationCacheEntry;
  69. struct CollationData;
  70. struct CollationSettings;
  71. struct CollationTailoring;
  72. /**
  73. * @stable ICU 2.0
  74. */
  75. class StringSearch;
  76. /**
  77. * @stable ICU 2.0
  78. */
  79. class CollationElementIterator;
  80. class CollationKey;
  81. class SortKeyByteSink;
  82. class UnicodeSet;
  83. class UnicodeString;
  84. class UVector64;
  85. /**
  86. * The RuleBasedCollator class provides the implementation of
  87. * Collator, using data-driven tables. The user can create a customized
  88. * table-based collation.
  89. * <p>
  90. * For more information about the collation service see
  91. * <a href="http://userguide.icu-project.org/collation">the User Guide</a>.
  92. * <p>
  93. * Collation service provides correct sorting orders for most locales supported in ICU.
  94. * If specific data for a locale is not available, the order eventually falls back
  95. * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
  96. * <p>
  97. * Sort ordering may be customized by providing your own set of rules. For more on
  98. * this subject see the <a href="http://userguide.icu-project.org/collation/customization">
  99. * Collation Customization</a> section of the User Guide.
  100. * <p>
  101. * Note, RuleBasedCollator is not to be subclassed.
  102. * @see Collator
  103. */
  104. class U_I18N_API RuleBasedCollator : public Collator {
  105. public:
  106. /**
  107. * RuleBasedCollator constructor. This takes the table rules and builds a
  108. * collation table out of them. Please see RuleBasedCollator class
  109. * description for more details on the collation rule syntax.
  110. * @param rules the collation rules to build the collation table from.
  111. * @param status reporting a success or an error.
  112. * @stable ICU 2.0
  113. */
  114. RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
  115. /**
  116. * RuleBasedCollator constructor. This takes the table rules and builds a
  117. * collation table out of them. Please see RuleBasedCollator class
  118. * description for more details on the collation rule syntax.
  119. * @param rules the collation rules to build the collation table from.
  120. * @param collationStrength strength for comparison
  121. * @param status reporting a success or an error.
  122. * @stable ICU 2.0
  123. */
  124. RuleBasedCollator(const UnicodeString& rules,
  125. ECollationStrength collationStrength,
  126. UErrorCode& status);
  127. /**
  128. * RuleBasedCollator constructor. This takes the table rules and builds a
  129. * collation table out of them. Please see RuleBasedCollator class
  130. * description for more details on the collation rule syntax.
  131. * @param rules the collation rules to build the collation table from.
  132. * @param decompositionMode the normalisation mode
  133. * @param status reporting a success or an error.
  134. * @stable ICU 2.0
  135. */
  136. RuleBasedCollator(const UnicodeString& rules,
  137. UColAttributeValue decompositionMode,
  138. UErrorCode& status);
  139. /**
  140. * RuleBasedCollator constructor. This takes the table rules and builds a
  141. * collation table out of them. Please see RuleBasedCollator class
  142. * description for more details on the collation rule syntax.
  143. * @param rules the collation rules to build the collation table from.
  144. * @param collationStrength strength for comparison
  145. * @param decompositionMode the normalisation mode
  146. * @param status reporting a success or an error.
  147. * @stable ICU 2.0
  148. */
  149. RuleBasedCollator(const UnicodeString& rules,
  150. ECollationStrength collationStrength,
  151. UColAttributeValue decompositionMode,
  152. UErrorCode& status);
  153. #ifndef U_HIDE_INTERNAL_API
  154. /**
  155. * TODO: document & propose as public API
  156. * @internal
  157. */
  158. RuleBasedCollator(const UnicodeString &rules,
  159. UParseError &parseError, UnicodeString &reason,
  160. UErrorCode &errorCode);
  161. #endif /* U_HIDE_INTERNAL_API */
  162. /**
  163. * Copy constructor.
  164. * @param other the RuleBasedCollator object to be copied
  165. * @stable ICU 2.0
  166. */
  167. RuleBasedCollator(const RuleBasedCollator& other);
  168. /** Opens a collator from a collator binary image created using
  169. * cloneBinary. Binary image used in instantiation of the
  170. * collator remains owned by the user and should stay around for
  171. * the lifetime of the collator. The API also takes a base collator
  172. * which must be the root collator.
  173. * @param bin binary image owned by the user and required through the
  174. * lifetime of the collator
  175. * @param length size of the image. If negative, the API will try to
  176. * figure out the length of the image
  177. * @param base Base collator, for lookup of untailored characters.
  178. * Must be the root collator, must not be NULL.
  179. * The base is required to be present through the lifetime of the collator.
  180. * @param status for catching errors
  181. * @return newly created collator
  182. * @see cloneBinary
  183. * @stable ICU 3.4
  184. */
  185. RuleBasedCollator(const uint8_t *bin, int32_t length,
  186. const RuleBasedCollator *base,
  187. UErrorCode &status);
  188. /**
  189. * Destructor.
  190. * @stable ICU 2.0
  191. */
  192. virtual ~RuleBasedCollator();
  193. /**
  194. * Assignment operator.
  195. * @param other other RuleBasedCollator object to copy from.
  196. * @stable ICU 2.0
  197. */
  198. RuleBasedCollator& operator=(const RuleBasedCollator& other);
  199. /**
  200. * Returns true if argument is the same as this object.
  201. * @param other Collator object to be compared.
  202. * @return true if the argument is the same as this object.
  203. * @stable ICU 2.0
  204. */
  205. virtual UBool operator==(const Collator& other) const;
  206. /**
  207. * Makes a copy of this object.
  208. * @return a copy of this object, owned by the caller
  209. * @stable ICU 2.0
  210. */
  211. virtual Collator* clone(void) const;
  212. /**
  213. * Creates a collation element iterator for the source string. The caller of
  214. * this method is responsible for the memory management of the return
  215. * pointer.
  216. * @param source the string over which the CollationElementIterator will
  217. * iterate.
  218. * @return the collation element iterator of the source string using this as
  219. * the base Collator.
  220. * @stable ICU 2.2
  221. */
  222. virtual CollationElementIterator* createCollationElementIterator(
  223. const UnicodeString& source) const;
  224. /**
  225. * Creates a collation element iterator for the source. The caller of this
  226. * method is responsible for the memory management of the returned pointer.
  227. * @param source the CharacterIterator which produces the characters over
  228. * which the CollationElementIterator will iterate.
  229. * @return the collation element iterator of the source using this as the
  230. * base Collator.
  231. * @stable ICU 2.2
  232. */
  233. virtual CollationElementIterator* createCollationElementIterator(
  234. const CharacterIterator& source) const;
  235. // Make deprecated versions of Collator::compare() visible.
  236. using Collator::compare;
  237. /**
  238. * The comparison function compares the character data stored in two
  239. * different strings. Returns information about whether a string is less
  240. * than, greater than or equal to another string.
  241. * @param source the source string to be compared with.
  242. * @param target the string that is to be compared with the source string.
  243. * @param status possible error code
  244. * @return Returns an enum value. UCOL_GREATER if source is greater
  245. * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
  246. * than target
  247. * @stable ICU 2.6
  248. **/
  249. virtual UCollationResult compare(const UnicodeString& source,
  250. const UnicodeString& target,
  251. UErrorCode &status) const;
  252. /**
  253. * Does the same thing as compare but limits the comparison to a specified
  254. * length
  255. * @param source the source string to be compared with.
  256. * @param target the string that is to be compared with the source string.
  257. * @param length the length the comparison is limited to
  258. * @param status possible error code
  259. * @return Returns an enum value. UCOL_GREATER if source (up to the specified
  260. * length) is greater than target; UCOL_EQUAL if source (up to specified
  261. * length) is equal to target; UCOL_LESS if source (up to the specified
  262. * length) is less than target.
  263. * @stable ICU 2.6
  264. */
  265. virtual UCollationResult compare(const UnicodeString& source,
  266. const UnicodeString& target,
  267. int32_t length,
  268. UErrorCode &status) const;
  269. /**
  270. * The comparison function compares the character data stored in two
  271. * different string arrays. Returns information about whether a string array
  272. * is less than, greater than or equal to another string array.
  273. * @param source the source string array to be compared with.
  274. * @param sourceLength the length of the source string array. If this value
  275. * is equal to -1, the string array is null-terminated.
  276. * @param target the string that is to be compared with the source string.
  277. * @param targetLength the length of the target string array. If this value
  278. * is equal to -1, the string array is null-terminated.
  279. * @param status possible error code
  280. * @return Returns an enum value. UCOL_GREATER if source is greater
  281. * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
  282. * than target
  283. * @stable ICU 2.6
  284. */
  285. virtual UCollationResult compare(const UChar* source, int32_t sourceLength,
  286. const UChar* target, int32_t targetLength,
  287. UErrorCode &status) const;
  288. /**
  289. * Compares two strings using the Collator.
  290. * Returns whether the first one compares less than/equal to/greater than
  291. * the second one.
  292. * This version takes UCharIterator input.
  293. * @param sIter the first ("source") string iterator
  294. * @param tIter the second ("target") string iterator
  295. * @param status ICU status
  296. * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
  297. * @stable ICU 4.2
  298. */
  299. virtual UCollationResult compare(UCharIterator &sIter,
  300. UCharIterator &tIter,
  301. UErrorCode &status) const;
  302. /**
  303. * Compares two UTF-8 strings using the Collator.
  304. * Returns whether the first one compares less than/equal to/greater than
  305. * the second one.
  306. * This version takes UTF-8 input.
  307. * Note that a StringPiece can be implicitly constructed
  308. * from a std::string or a NUL-terminated const char * string.
  309. * @param source the first UTF-8 string
  310. * @param target the second UTF-8 string
  311. * @param status ICU status
  312. * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
  313. * @stable ICU 51
  314. */
  315. virtual UCollationResult compareUTF8(const StringPiece &source,
  316. const StringPiece &target,
  317. UErrorCode &status) const;
  318. /**
  319. * Transforms the string into a series of characters
  320. * that can be compared with CollationKey.compare().
  321. *
  322. * Note that sort keys are often less efficient than simply doing comparison.
  323. * For more details, see the ICU User Guide.
  324. *
  325. * @param source the source string.
  326. * @param key the transformed key of the source string.
  327. * @param status the error code status.
  328. * @return the transformed key.
  329. * @see CollationKey
  330. * @stable ICU 2.0
  331. */
  332. virtual CollationKey& getCollationKey(const UnicodeString& source,
  333. CollationKey& key,
  334. UErrorCode& status) const;
  335. /**
  336. * Transforms a specified region of the string into a series of characters
  337. * that can be compared with CollationKey.compare.
  338. *
  339. * Note that sort keys are often less efficient than simply doing comparison.
  340. * For more details, see the ICU User Guide.
  341. *
  342. * @param source the source string.
  343. * @param sourceLength the length of the source string.
  344. * @param key the transformed key of the source string.
  345. * @param status the error code status.
  346. * @return the transformed key.
  347. * @see CollationKey
  348. * @stable ICU 2.0
  349. */
  350. virtual CollationKey& getCollationKey(const UChar *source,
  351. int32_t sourceLength,
  352. CollationKey& key,
  353. UErrorCode& status) const;
  354. /**
  355. * Generates the hash code for the rule-based collation object.
  356. * @return the hash code.
  357. * @stable ICU 2.0
  358. */
  359. virtual int32_t hashCode() const;
  360. /**
  361. * Gets the locale of the Collator
  362. * @param type can be either requested, valid or actual locale. For more
  363. * information see the definition of ULocDataLocaleType in
  364. * uloc.h
  365. * @param status the error code status.
  366. * @return locale where the collation data lives. If the collator
  367. * was instantiated from rules, locale is empty.
  368. * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback
  369. */
  370. virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
  371. /**
  372. * Gets the tailoring rules for this collator.
  373. * @return the collation tailoring from which this collator was created
  374. * @stable ICU 2.0
  375. */
  376. const UnicodeString& getRules() const;
  377. /**
  378. * Gets the version information for a Collator.
  379. * @param info the version # information, the result will be filled in
  380. * @stable ICU 2.0
  381. */
  382. virtual void getVersion(UVersionInfo info) const;
  383. #ifndef U_HIDE_DEPRECATED_API
  384. /**
  385. * Returns the maximum length of any expansion sequences that end with the
  386. * specified comparison order.
  387. *
  388. * This is specific to the kind of collation element values and sequences
  389. * returned by the CollationElementIterator.
  390. * Call CollationElementIterator::getMaxExpansion() instead.
  391. *
  392. * @param order a collation order returned by CollationElementIterator::previous
  393. * or CollationElementIterator::next.
  394. * @return maximum size of the expansion sequences ending with the collation
  395. * element, or 1 if the collation element does not occur at the end of
  396. * any expansion sequence
  397. * @see CollationElementIterator#getMaxExpansion
  398. * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead.
  399. */
  400. int32_t getMaxExpansion(int32_t order) const;
  401. #endif /* U_HIDE_DEPRECATED_API */
  402. /**
  403. * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
  404. * method is to implement a simple version of RTTI, since not all C++
  405. * compilers support genuine RTTI. Polymorphic operator==() and clone()
  406. * methods call this method.
  407. * @return The class ID for this object. All objects of a given class have
  408. * the same class ID. Objects of other classes have different class
  409. * IDs.
  410. * @stable ICU 2.0
  411. */
  412. virtual UClassID getDynamicClassID(void) const;
  413. /**
  414. * Returns the class ID for this class. This is useful only for comparing to
  415. * a return value from getDynamicClassID(). For example:
  416. * <pre>
  417. * Base* polymorphic_pointer = createPolymorphicObject();
  418. * if (polymorphic_pointer->getDynamicClassID() ==
  419. * Derived::getStaticClassID()) ...
  420. * </pre>
  421. * @return The class ID for all objects of this class.
  422. * @stable ICU 2.0
  423. */
  424. static UClassID U_EXPORT2 getStaticClassID(void);
  425. #ifndef U_HIDE_DEPRECATED_API
  426. /**
  427. * Do not use this method: The caller and the ICU library might use different heaps.
  428. * Use cloneBinary() instead which writes to caller-provided memory.
  429. *
  430. * Returns a binary format of this collator.
  431. * @param length Returns the length of the data, in bytes
  432. * @param status the error code status.
  433. * @return memory, owned by the caller, of size 'length' bytes.
  434. * @deprecated ICU 52. Use cloneBinary() instead.
  435. */
  436. uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const;
  437. #endif /* U_HIDE_DEPRECATED_API */
  438. /** Creates a binary image of a collator. This binary image can be stored and
  439. * later used to instantiate a collator using ucol_openBinary.
  440. * This API supports preflighting.
  441. * @param buffer a fill-in buffer to receive the binary image
  442. * @param capacity capacity of the destination buffer
  443. * @param status for catching errors
  444. * @return size of the image
  445. * @see ucol_openBinary
  446. * @stable ICU 3.4
  447. */
  448. int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const;
  449. /**
  450. * Returns current rules. Delta defines whether full rules are returned or
  451. * just the tailoring.
  452. *
  453. * getRules(void) should normally be used instead.
  454. * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales
  455. * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
  456. * @param buffer UnicodeString to store the result rules
  457. * @stable ICU 2.2
  458. * @see UCOL_FULL_RULES
  459. */
  460. void getRules(UColRuleOption delta, UnicodeString &buffer) const;
  461. /**
  462. * Universal attribute setter
  463. * @param attr attribute type
  464. * @param value attribute value
  465. * @param status to indicate whether the operation went on smoothly or there were errors
  466. * @stable ICU 2.2
  467. */
  468. virtual void setAttribute(UColAttribute attr, UColAttributeValue value,
  469. UErrorCode &status);
  470. /**
  471. * Universal attribute getter.
  472. * @param attr attribute type
  473. * @param status to indicate whether the operation went on smoothly or there were errors
  474. * @return attribute value
  475. * @stable ICU 2.2
  476. */
  477. virtual UColAttributeValue getAttribute(UColAttribute attr,
  478. UErrorCode &status) const;
  479. /**
  480. * Sets the variable top to the top of the specified reordering group.
  481. * The variable top determines the highest-sorting character
  482. * which is affected by UCOL_ALTERNATE_HANDLING.
  483. * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
  484. * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
  485. * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
  486. * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
  487. * @param errorCode Standard ICU error code. Its input value must
  488. * pass the U_SUCCESS() test, or else the function returns
  489. * immediately. Check for U_FAILURE() on output or use with
  490. * function chaining. (See User Guide for details.)
  491. * @return *this
  492. * @see getMaxVariable
  493. * @stable ICU 53
  494. */
  495. virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode);
  496. /**
  497. * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
  498. * @return the maximum variable reordering group.
  499. * @see setMaxVariable
  500. * @stable ICU 53
  501. */
  502. virtual UColReorderCode getMaxVariable() const;
  503. /**
  504. * Sets the variable top to the primary weight of the specified string.
  505. *
  506. * Beginning with ICU 53, the variable top is pinned to
  507. * the top of one of the supported reordering groups,
  508. * and it must not be beyond the last of those groups.
  509. * See setMaxVariable().
  510. * @param varTop one or more (if contraction) UChars to which the variable top should be set
  511. * @param len length of variable top string. If -1 it is considered to be zero terminated.
  512. * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
  513. * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
  514. * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
  515. * the last reordering group supported by setMaxVariable()
  516. * @return variable top primary weight
  517. * @deprecated ICU 53 Call setMaxVariable() instead.
  518. */
  519. virtual uint32_t setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status);
  520. /**
  521. * Sets the variable top to the primary weight of the specified string.
  522. *
  523. * Beginning with ICU 53, the variable top is pinned to
  524. * the top of one of the supported reordering groups,
  525. * and it must not be beyond the last of those groups.
  526. * See setMaxVariable().
  527. * @param varTop a UnicodeString size 1 or more (if contraction) of UChars to which the variable top should be set
  528. * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
  529. * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
  530. * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
  531. * the last reordering group supported by setMaxVariable()
  532. * @return variable top primary weight
  533. * @deprecated ICU 53 Call setMaxVariable() instead.
  534. */
  535. virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status);
  536. /**
  537. * Sets the variable top to the specified primary weight.
  538. *
  539. * Beginning with ICU 53, the variable top is pinned to
  540. * the top of one of the supported reordering groups,
  541. * and it must not be beyond the last of those groups.
  542. * See setMaxVariable().
  543. * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop
  544. * @param status error code
  545. * @deprecated ICU 53 Call setMaxVariable() instead.
  546. */
  547. virtual void setVariableTop(uint32_t varTop, UErrorCode &status);
  548. /**
  549. * Gets the variable top value of a Collator.
  550. * @param status error code (not changed by function). If error code is set, the return value is undefined.
  551. * @return the variable top primary weight
  552. * @see getMaxVariable
  553. * @stable ICU 2.0
  554. */
  555. virtual uint32_t getVariableTop(UErrorCode &status) const;
  556. /**
  557. * Get a UnicodeSet that contains all the characters and sequences tailored in
  558. * this collator.
  559. * @param status error code of the operation
  560. * @return a pointer to a UnicodeSet object containing all the
  561. * code points and sequences that may sort differently than
  562. * in the root collator. The object must be disposed of by using delete
  563. * @stable ICU 2.4
  564. */
  565. virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
  566. /**
  567. * Get the sort key as an array of bytes from a UnicodeString.
  568. *
  569. * Note that sort keys are often less efficient than simply doing comparison.
  570. * For more details, see the ICU User Guide.
  571. *
  572. * @param source string to be processed.
  573. * @param result buffer to store result in. If NULL, number of bytes needed
  574. * will be returned.
  575. * @param resultLength length of the result buffer. If it is not large enough,
  576. * the buffer will be filled to capacity.
  577. * @return Number of bytes needed for storing the sort key
  578. * @stable ICU 2.0
  579. */
  580. virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result,
  581. int32_t resultLength) const;
  582. /**
  583. * Get the sort key as an array of bytes from a UChar buffer.
  584. *
  585. * Note that sort keys are often less efficient than simply doing comparison.
  586. * For more details, see the ICU User Guide.
  587. *
  588. * @param source string to be processed.
  589. * @param sourceLength length of string to be processed. If -1, the string
  590. * is 0 terminated and length will be decided by the function.
  591. * @param result buffer to store result in. If NULL, number of bytes needed
  592. * will be returned.
  593. * @param resultLength length of the result buffer. If it is not large enough,
  594. * the buffer will be filled to capacity.
  595. * @return Number of bytes needed for storing the sort key
  596. * @stable ICU 2.2
  597. */
  598. virtual int32_t getSortKey(const UChar *source, int32_t sourceLength,
  599. uint8_t *result, int32_t resultLength) const;
  600. /**
  601. * Retrieves the reordering codes for this collator.
  602. * @param dest The array to fill with the script ordering.
  603. * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
  604. * will only return the length of the result without writing any codes (pre-flighting).
  605. * @param status A reference to an error code value, which must not indicate
  606. * a failure before the function call.
  607. * @return The length of the script ordering array.
  608. * @see ucol_setReorderCodes
  609. * @see Collator#getEquivalentReorderCodes
  610. * @see Collator#setReorderCodes
  611. * @stable ICU 4.8
  612. */
  613. virtual int32_t getReorderCodes(int32_t *dest,
  614. int32_t destCapacity,
  615. UErrorCode& status) const;
  616. /**
  617. * Sets the ordering of scripts for this collator.
  618. * @param reorderCodes An array of script codes in the new order. This can be NULL if the
  619. * length is also set to 0. An empty array will clear any reordering codes on the collator.
  620. * @param reorderCodesLength The length of reorderCodes.
  621. * @param status error code
  622. * @see ucol_setReorderCodes
  623. * @see Collator#getReorderCodes
  624. * @see Collator#getEquivalentReorderCodes
  625. * @stable ICU 4.8
  626. */
  627. virtual void setReorderCodes(const int32_t* reorderCodes,
  628. int32_t reorderCodesLength,
  629. UErrorCode& status) ;
  630. /**
  631. * Implements ucol_strcollUTF8().
  * NOTE(review): the handling of negative lengths is not stated in this header;
  * confirm against ucol_strcollUTF8() before relying on it.
  * @param left the first string (UTF-8, going by the name of the C API this implements)
  * @param leftLength the length of left
  * @param right the second string
  * @param rightLength the length of right
  * @param errorCode ICU error code, passed by reference
  * @return the comparison result as a UCollationResult
  632. * @internal
  633. */
  634. virtual UCollationResult internalCompareUTF8(
  635. const char *left, int32_t leftLength,
  636. const char *right, int32_t rightLength,
  637. UErrorCode &errorCode) const;
  638. /** Gets the short definition string for a collator. This internal API harvests the collator's
  639. * locale and the attribute set and produces a string that can be used for opening
  640. * a collator with the same attributes using the ucol_openFromShortString API.
  641. * This string will be normalized.
  642. * The structure and the syntax of the string are defined in the "Naming collators"
  643. * section of the User Guide:
  644. * http://userguide.icu-project.org/collation/concepts#TOC-Collator-naming-scheme
  645. * This function supports preflighting.
  646. *
  647. * This is internal, and intended to be used with delegate converters.
  648. *
  649. * @param locale a locale that will appear as the collator's locale in the resulting
  650. * short string definition. If NULL, the locale will be harvested
  651. * from the collator.
  652. * @param buffer space to hold the resulting string
  653. * @param capacity capacity of the buffer
  654. * @param status for returning errors, including all preflighting errors
  655. * @return length of the resulting string
  656. * @see ucol_openFromShortString
  657. * @see ucol_normalizeShortDefinitionString
  658. * @see ucol_getShortDefinitionString
  659. * @internal
  660. */
  661. virtual int32_t internalGetShortDefinitionString(const char *locale,
  662. char *buffer,
  663. int32_t capacity,
  664. UErrorCode &status) const;
  665. /**
  666. * Implements ucol_nextSortKeyPart().
  * @param iter the UCharIterator handed through from the C API
  *             (presumably supplies the source text -- confirm)
  * @param state two-element uint32_t array; NOTE(review): presumably the iteration
  *             state carried between calls -- confirm against ucol_nextSortKeyPart()
  * @param dest writable byte buffer for the output
  * @param count NOTE(review): presumably the capacity of dest in bytes -- confirm
  * @param errorCode ICU error code, passed by reference
  * @return an int32_t; NOTE(review): presumably the number of bytes written to dest -- confirm
  667. * @internal
  668. */
  669. virtual int32_t internalNextSortKeyPart(
  670. UCharIterator *iter, uint32_t state[2],
  671. uint8_t *dest, int32_t count, UErrorCode &errorCode) const;
  672. // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API
  673. /**
  674. * Default constructor. Only for use in ucol_openRules().
  675. * @internal
  676. */
  677. RuleBasedCollator();
  678. #ifndef U_HIDE_INTERNAL_API
  679. /**
  680. * Implements ucol_getLocaleByType().
  681. * Needed because the lifetime of the locale ID string must match that of the collator.
  682. * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper.
  * @param type selects which locale to return (a ULocDataLocaleType value)
  * @param errorCode ICU error code, passed by reference
  * @return the locale ID string for the requested type
  683. * @internal
  684. */
  685. const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const;
  686. /**
  687. * Implements ucol_getContractionsAndExpansions().
  688. * Gets this collator's sets of contraction strings and/or
  689. * characters and strings that map to multiple collation elements (expansions).
  690. * If addPrefixes is TRUE, then contractions that are expressed as
  691. * prefix/pre-context rules are included.
  692. * @param contractions if not NULL, the set to hold the contractions
  693. * @param expansions if not NULL, the set to hold the expansions
  694. * @param addPrefixes if TRUE, include prefix contextual mappings
  695. * @param errorCode in/out ICU error code
  696. * @internal
  697. */
  698. void internalGetContractionsAndExpansions(
  699. UnicodeSet *contractions, UnicodeSet *expansions,
  700. UBool addPrefixes, UErrorCode &errorCode) const;
  701. /**
  702. * Adds the contractions that start with character c to the set.
  703. * Ignores prefixes. Used by AlphabeticIndex.
  * @param c the starting character of the contractions to add
  * @param set the set that receives the contractions
  * @param errorCode ICU error code, passed by reference
  704. * @internal
  705. */
  706. void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const;
  707. /**
  708. * Implements from-rule constructors, and ucol_openRules().
  * Not const: unlike most members declared nearby, this one may modify the collator.
  * @param rules the collation rule string
  * @param strength NOTE(review): presumably a collation strength value; the accepted
  *                 range and any "default" sentinel are not visible here -- confirm
  * @param decompositionMode a UColAttributeValue selecting the decomposition mode
  * @param outParseError NOTE(review): presumably receives parse error details;
  *                      whether NULL is accepted is not visible here -- confirm
  * @param outReason NOTE(review): presumably receives a description of the failure;
  *                  whether NULL is accepted is not visible here -- confirm
  * @param errorCode ICU error code, passed by reference
  709. * @internal
  710. */
  711. void internalBuildTailoring(
  712. const UnicodeString &rules,
  713. int32_t strength,
  714. UColAttributeValue decompositionMode,
  715. UParseError *outParseError, UnicodeString *outReason,
  716. UErrorCode &errorCode);
  717. /**
  * Converts a C API UCollator pointer to a RuleBasedCollator pointer:
  * passes uc through fromUCollator() and applies dynamic_cast to the result.
  * The dynamic_cast yields a null pointer when the object is not a RuleBasedCollator.
  * @param uc the C API collator pointer
  * @return the RuleBasedCollator, or a null pointer if the cast fails
  * @internal
  */
  718. static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) {
  719. return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc));
  720. }
  721. /**
  * Const overload: converts a const C API UCollator pointer to a
  * const RuleBasedCollator pointer by passing uc through fromUCollator()
  * and applying dynamic_cast to the result.
  * The dynamic_cast yields a null pointer when the object is not a RuleBasedCollator.
  * @param uc the C API collator pointer
  * @return the RuleBasedCollator, or a null pointer if the cast fails
  * @internal
  */
  722. static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) {
  723. return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc));
  724. }
  725. /**
  726. * Appends the CEs for the string to the vector.
  * @param str the input string
  * @param ces the vector that the CEs are appended to
  * @param errorCode ICU error code, passed by reference
  727. * @internal for tests & tools
  728. */
  729. void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const;
  730. #endif // U_HIDE_INTERNAL_API
  731. protected:
  732. /**
  733. * Used internally by registration to define the requested, valid, and actual locales.
  734. * @param requestedLocale the requested locale
  735. * @param validLocale the valid locale
  736. * @param actualLocale the actual locale
  737. * @internal
  738. */
  739. virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale);
  740. private:
  // Friend classes: granted access to this class's non-public members.
  741. friend class CollationElementIterator;
  742. friend class Collator;
  // Private constructor taking a collation cache entry.
  // NOTE(review): the cacheEntry data member is marked reference-counted; whether this
  // constructor adds a reference to entry is not visible in this header -- confirm.
  743. RuleBasedCollator(const CollationCacheEntry *entry);
  744. /**
  745. * Enumeration of attributes that are relevant for short definition strings
  746. * (e.g., ucol_getShortDefinitionString()).
  747. * Effectively extends UColAttribute.
  748. */
  749. enum Attributes {
  // Takes the value UCOL_ATTRIBUTE_COUNT, i.e. it continues numbering where UColAttribute stops.
  750. ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT,
  // One more than ATTR_VARIABLE_TOP; serves as the exclusive upper bound for attribute values.
  751. ATTR_LIMIT
  752. };
  // NOTE(review): the "adopt" name suggests that ownership of t passes to this collator;
  // that is not visible in this header -- confirm against the implementation.
  753. void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode);
  754. // Both lengths must be <0 or else both must be >=0.
  // Overload for UChar strings; returns the comparison result as a UCollationResult.
  755. UCollationResult doCompare(const UChar *left, int32_t leftLength,
  756. const UChar *right, int32_t rightLength,
  757. UErrorCode &errorCode) const;
  // Overload for uint8_t (byte) strings.
  // NOTE(review): presumably UTF-8 input, and presumably subject to the same
  // length rule as the UChar overload -- confirm against the implementation.
  758. UCollationResult doCompare(const uint8_t *left, int32_t leftLength,
  759. const uint8_t *right, int32_t rightLength,
  760. UErrorCode &errorCode) const;
  // Writes sort key output for the string s of the given length into sink.
  // NOTE(review): the meaning of a negative length is not visible here -- confirm.
  761. void writeSortKey(const UChar *s, int32_t length,
  762. SortKeyByteSink &sink, UErrorCode &errorCode) const;
  // Writes identical-level output for the range [s, limit) into sink.
  // NOTE(review): whether limit may be NULL is not visible here -- confirm.
  763. void writeIdenticalLevel(const UChar *s, const UChar *limit,
  764. SortKeyByteSink &sink, UErrorCode &errorCode) const;
  // Returns a const reference to the default CollationSettings.
  // NOTE(review): where those defaults come from is not visible here -- confirm.
  765. const CollationSettings &getDefaultSettings() const;
  766. void setAttributeDefault(int32_t attribute) {
  767. explicitlySetAttributes &= ~((uint32_t)1 << attribute);
  768. }
  769. void setAttributeExplicitly(int32_t attribute) {
  770. explicitlySetAttributes |= (uint32_t)1 << attribute;
  771. }
  772. UBool attributeHasBeenSetExplicitly(int32_t attribute) const {
  773. // assert(0 <= attribute < ATTR_LIMIT);
  774. return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0);
  775. }
  776. /**
  777. * Tests whether a character is "unsafe" for use as a collation starting point.
  778. *
  779. * @param c the code point or code unit to test
  780. * @return TRUE if c is unsafe, otherwise FALSE
  781. * @see CollationElementIterator#setOffset(int)
  782. */
  783. UBool isUnsafe(UChar32 c) const;
  // Static function declared with the U_CALLCONV calling convention.
  // NOTE(review): presumably passed as a callback that computes the maximum-expansions
  // data for tailoring t -- confirm against the implementation.
  784. static void U_CALLCONV computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode);
  // NOTE(review): presumably returns TRUE on success; callable on a const collator.
  785. UBool initMaxExpansions(UErrorCode &errorCode) const;
  // Takes a non-const CollationSettings reference, so it can modify ownedSettings.
  // NOTE(review): the exact options set are not visible here -- confirm.
  786. void setFastLatinOptions(CollationSettings &ownedSettings) const;
  // NOTE(review): presumably the collation data used by this collator; ownership is
  // not visible here -- confirm.
  787. const CollationData *data;
  788. const CollationSettings *settings; // reference-counted
  789. const CollationTailoring *tailoring; // alias of cacheEntry->tailoring
  790. const CollationCacheEntry *cacheEntry; // reference-counted
  // The valid locale (cf. the validLocale parameter of setLocales()).
  791. Locale validLocale;
  // Bit set indexed by attribute number: setAttributeExplicitly() sets a bit,
  // setAttributeDefault() clears it, attributeHasBeenSetExplicitly() tests it.
  792. uint32_t explicitlySetAttributes;
  // NOTE(review): presumably TRUE when the actual locale equals validLocale -- confirm.
  793. UBool actualLocaleIsSameAsValid;
  794. };
  795. U_NAMESPACE_END
  796. #endif // !UCONFIG_NO_COLLATION
  797. #endif // TBLCOLL_H