uset.h 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2014, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uset.h
  11. * encoding: US-ASCII
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002mar07
  16. * created by: Markus W. Scherer
  17. *
  18. * C version of UnicodeSet.
  19. */
  20. /**
  21. * \file
  22. * \brief C API: Unicode Set
  23. *
  24. * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
  25. */
  26. #ifndef __USET_H__
  27. #define __USET_H__
  28. #include "unicode/utypes.h"
  29. #include "unicode/uchar.h"
  30. #include "unicode/localpointer.h"
  31. #ifndef UCNV_H
  32. struct USet;
  33. /**
  34. * A UnicodeSet. Use the uset_* API to manipulate. Create with
  35. * uset_open*, and destroy with uset_close.
  36. * @stable ICU 2.4
  37. */
  38. typedef struct USet USet;
  39. #endif
  40. /**
  41. * Bitmask values to be passed to uset_openPatternOptions() or
  42. * uset_applyPattern() taking an option parameter. Each constant is a distinct bit (1, 2, 4).
  43. * @stable ICU 2.4
  44. */
  45. enum {
  46. /**
  47. * Ignore white space within patterns unless quoted or escaped.
  48. * @stable ICU 2.4
  49. */
  50. USET_IGNORE_SPACE = 1,
  51. /**
  52. * Enable case insensitive matching. E.g., "[ab]" with this flag
  53. * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
  54. * match all except 'a', 'A', 'b', and 'B'. This performs a full
  55. * closure over case mappings, e.g. U+017F for s.
  56. *
  57. * The resulting set is a superset of the input for the code points but
  58. * not for the strings.
  59. * It performs a case mapping closure of the code points and adds
  60. * full case folding strings for the code points, and reduces strings of
  61. * the original set to their full case folding equivalents.
  62. *
  63. * This is designed for case-insensitive matches, for example
  64. * in regular expressions. The full code point case closure allows checking of
  65. * an input character directly against the closure set.
  66. * Strings are matched by comparing the case-folded form from the closure
  67. * set with an incremental case folding of the string in question.
  68. *
  69. * The closure set will also contain single code points if the original
  70. * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
  71. * This is not necessary (that is, redundant) for the above matching method
  72. * but results in the same closure sets regardless of whether the original
  73. * set contained the code point or a string.
  74. *
  75. * @stable ICU 2.4
  76. */
  77. USET_CASE_INSENSITIVE = 2,
  78. /**
  79. * Enable case insensitive matching. E.g., "[ab]" with this flag
  80. * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
  81. * match all except 'a', 'A', 'b', and 'B'. Unlike USET_CASE_INSENSITIVE,
  82. * this adds the lower-, title-, and uppercase mappings as well as the
  83. * case folding of each existing element in the set.
  84. * @stable ICU 3.2
  85. */
  86. USET_ADD_CASE_MAPPINGS = 4
  87. };
  88. /**
  89. * Argument values for whether span() and similar functions continue while
  90. * the current character is contained vs. not contained in the set.
  91. *
  92. * The functionality is straightforward for sets with only single code points,
  93. * without strings (which is the common case):
  94. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
  95. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
  96. * - span() and spanBack() partition any string the same way when
  97. * alternating between span(USET_SPAN_NOT_CONTAINED) and
  98. * span(either "contained" condition).
  99. * - Using a complemented (inverted) set and the opposite span conditions
  100. * yields the same results.
  101. *
  102. * When a set contains multi-code point strings, then these statements may not
  103. * be true, depending on the strings in the set (for example, whether they
  104. * overlap with each other) and the string that is processed.
  105. * For a set with strings:
  106. * - The complement of the set contains the opposite set of code points,
  107. * but the same set of strings.
  108. * Therefore, complementing both the set and the span conditions
  109. * may yield different results.
  110. * - When starting spans at different positions in a string
  111. * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
  112. * because a set string may start before the later position.
  113. * - span(USET_SPAN_SIMPLE) may be shorter than
  114. * span(USET_SPAN_CONTAINED) because it will not recursively try
  115. * all possible paths.
  116. * For example, with a set which contains the three strings "xy", "xya" and "ax",
  117. * span("xyax", USET_SPAN_CONTAINED) will return 4 but
  118. * span("xyax", USET_SPAN_SIMPLE) will return 3.
  119. * span(USET_SPAN_SIMPLE) will never be longer than
  120. * span(USET_SPAN_CONTAINED).
  121. * - With either "contained" condition, span() and spanBack() may partition
  122. * a string in different ways.
  123. * For example, with a set which contains the two strings "ab" and "ba",
  124. * and when processing the string "aba",
  125. * span() will yield contained/not-contained boundaries of { 0, 2, 3 }
  126. * while spanBack() will yield boundaries of { 0, 1, 3 }.
  127. *
  128. * Note: If it is important to get the same boundaries whether iterating forward
  129. * or backward through a string, then either only span() should be used and
  130. * the boundaries cached for backward operation, or an ICU BreakIterator
  131. * could be used.
  132. *
  133. * Note: Unpaired surrogates are treated like surrogate code points.
  134. * Similarly, set strings match only on code point boundaries,
  135. * never in the middle of a surrogate pair.
  136. * Illegal UTF-8 sequences are treated like U+FFFD.
  137. * When processing UTF-8 strings, malformed set strings
  138. * (strings with unpaired surrogates which cannot be converted to UTF-8)
  139. * are ignored.
  140. *
  141. * @stable ICU 3.8
  142. */
  143. typedef enum USetSpanCondition {
  144. /**
  145. * Continues a span() while there is no set element at the current position.
  146. * Increments by one code point at a time.
  147. * Stops before the first set element (character or string).
  148. * (For code points only, this is like while contains(current)==FALSE).
  149. *
  150. * When span() returns, the substring between where it started and the position
  151. * it returned consists only of characters that are not in the set,
  152. * and none of its strings overlap with the span.
  153. *
  154. * @stable ICU 3.8
  155. */
  156. USET_SPAN_NOT_CONTAINED = 0,
  157. /**
  158. * Spans the longest substring that is a concatenation of set elements (characters or strings).
  159. * (For characters only, this is like while contains(current)==TRUE).
  160. *
  161. * When span() returns, the substring between where it started and the position
  162. * it returned consists only of set elements (characters or strings) that are in the set.
  163. *
  164. * If a set contains strings, then the span will be the longest substring for which there
  165. * exists at least one non-overlapping concatenation of set elements (characters or strings).
  166. * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
  167. * (Java/ICU/Perl regex stops at the first match of an OR.)
  168. *
  169. * @stable ICU 3.8
  170. */
  171. USET_SPAN_CONTAINED = 1,
  172. /**
  173. * Continues a span() while there is a set element at the current position.
  174. * Increments by the longest matching element at each position.
  175. * (For characters only, this is like while contains(current)==TRUE).
  176. *
  177. * When span() returns, the substring between where it started and the position
  178. * it returned consists only of set elements (characters or strings) that are in the set.
  179. *
  180. * If a set only contains single characters, then this is the same
  181. * as USET_SPAN_CONTAINED.
  182. *
  183. * If a set contains strings, then the span will be the longest substring
  184. * with a match at each position with the longest single set element (character or string).
  185. *
  186. * Use this span condition together with other longest-match algorithms,
  187. * such as ICU converters (ucnv_getUnicodeSet()).
  188. *
  189. * @stable ICU 3.8
  190. */
  191. USET_SPAN_SIMPLE = 2,
  192. #ifndef U_HIDE_DEPRECATED_API
  193. /**
  194. * One more than the last span condition. Only declared when U_HIDE_DEPRECATED_API is not defined.
  195. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
  196. */
  197. USET_SPAN_CONDITION_COUNT
  198. #endif // U_HIDE_DEPRECATED_API
  199. } USetSpanCondition;
  200. enum {
  201. /**
  202. * Capacity of USerializedSet::staticArray, counted in uint16_t elements.
  203. * Enough for any single-code point set.
  204. * Also provides padding for nice sizeof(USerializedSet).
  205. * @stable ICU 2.4
  206. */
  207. USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
  208. };
  209. /**
  210. * A serialized form of a Unicode set. Limited manipulations are
  211. * possible directly on a serialized set. See below.
  212. * @stable ICU 2.4
  213. */
  214. typedef struct USerializedSet {
  215. /**
  216. * The serialized Unicode Set, as a read-only array of uint16_t units.
  217. * @stable ICU 2.4
  218. */
  219. const uint16_t *array;
  220. /**
  221. * The length of the array that contains BMP characters.
  222. * @stable ICU 2.4
  223. */
  224. int32_t bmpLength;
  225. /**
  226. * The total length of the array.
  227. * @stable ICU 2.4
  228. */
  229. int32_t length;
  230. /**
  231. * A small buffer for the array to reduce memory allocations.
  232. * Holds USET_SERIALIZED_STATIC_ARRAY_CAPACITY uint16_t elements.
  233. * @stable ICU 2.4
  234. */
  235. uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
  236. } USerializedSet;
  236. /*********************************************************************
  237. * USet API
  238. *********************************************************************/
  239. /**
  240. * Create an empty USet object.
  241. * Equivalent to uset_open(1, 0).
  242. * @return a newly created USet. The caller must call uset_close() on
  243. * it when done.
  244. * @stable ICU 4.2
  245. */
  246. U_STABLE USet* U_EXPORT2
  247. uset_openEmpty(void);
  248. /**
  249. * Creates a USet object that contains the range of characters
  250. * start..end, inclusive. If <code>start > end</code>
  251. * then an empty set is created (same as using uset_openEmpty()).
  252. * @param start first character of the range, inclusive
  253. * @param end last character of the range, inclusive
  254. * @return a newly created USet. The caller must call uset_close() on
  255. * it when done.
  256. * @stable ICU 2.4
  257. */
  258. U_STABLE USet* U_EXPORT2
  259. uset_open(UChar32 start, UChar32 end);
  260. /**
  261. * Creates a set from the given pattern. See the UnicodeSet class
  262. * description for the syntax of the pattern language.
  263. * @param pattern a string specifying what characters are in the set
  264. * @param patternLength the length of the pattern, or -1 if null
  265. * terminated
  266. * @param ec the error code
  267. * @stable ICU 2.4
  268. */
  269. U_STABLE USet* U_EXPORT2
  270. uset_openPattern(const UChar* pattern, int32_t patternLength,
  271. UErrorCode* ec);
  272. /**
  273. * Creates a set from the given pattern. See the UnicodeSet class
  274. * description for the syntax of the pattern language.
  275. * @param pattern a string specifying what characters are in the set
  276. * @param patternLength the length of the pattern, or -1 if null
  277. * terminated
  278. * @param options bitmask for options to apply to the pattern.
  279. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
  280. * @param ec the error code
  281. * @stable ICU 2.4
  282. */
  283. U_STABLE USet* U_EXPORT2
  284. uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
  285. uint32_t options,
  286. UErrorCode* ec);
  287. /**
  288. * Disposes of the storage used by a USet object. This function should
  289. * be called exactly once for each object returned by a uset_open*() function.
  290. * @param set the object to dispose of
  291. * @stable ICU 2.4
  292. */
  293. U_STABLE void U_EXPORT2
  294. uset_close(USet* set);
  295. #if U_SHOW_CPLUSPLUS_API
  296. U_NAMESPACE_BEGIN
  297. /**
  298. * \class LocalUSetPointer
  299. * "Smart pointer" class, closes a USet via uset_close().
  300. * For most methods see the LocalPointerBase base class.
  301. *
  302. * @see LocalPointerBase
  303. * @see LocalPointer
  304. * @stable ICU 4.4
  305. */
  306. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close);
  307. U_NAMESPACE_END
  308. #endif
  309. /**
  310. * Returns a copy of this object.
  311. * If this set is frozen, then the clone will be frozen as well.
  312. * Use uset_cloneAsThawed() for a mutable clone of a frozen set.
  313. * @param set the original set
  314. * @return the newly allocated copy of the set
  315. * @see uset_cloneAsThawed
  316. * @stable ICU 3.8
  317. */
  318. U_STABLE USet * U_EXPORT2
  319. uset_clone(const USet *set);
  320. /**
  321. * Determines whether the set has been frozen (made immutable) or not.
  322. * See the ICU4J Freezable interface for details.
  323. * @param set the set
  324. * @return TRUE/FALSE for whether the set has been frozen
  325. * @see uset_freeze
  326. * @see uset_cloneAsThawed
  327. * @stable ICU 3.8
  328. */
  329. U_STABLE UBool U_EXPORT2
  330. uset_isFrozen(const USet *set);
  331. /**
  332. * Freeze the set (make it immutable).
  333. * Once frozen, it cannot be unfrozen and is therefore thread-safe
  334. * until it is deleted.
  335. * See the ICU4J Freezable interface for details.
  336. * Freezing the set may also make some operations faster, for example
  337. * uset_contains() and uset_span().
  338. * A frozen set will not be modified. (It remains frozen.)
  339. * @param set the set
  340. * The function itself returns nothing; the set passed in is frozen in place.
  341. * @see uset_isFrozen
  342. * @see uset_cloneAsThawed
  343. * @stable ICU 3.8
  344. */
  345. U_STABLE void U_EXPORT2
  346. uset_freeze(USet *set);
  347. /**
  348. * Clone the set and make the clone mutable.
  349. * See the ICU4J Freezable interface for details.
  350. * @param set the set
  351. * @return the mutable clone
  352. * @see uset_freeze
  353. * @see uset_isFrozen
  354. * @see uset_clone
  355. * @stable ICU 3.8
  356. */
  357. U_STABLE USet * U_EXPORT2
  358. uset_cloneAsThawed(const USet *set);
  359. /**
  360. * Causes the USet object to represent the range <code>start - end</code>.
  361. * If <code>start > end</code> then this USet is set to an empty range.
  362. * A frozen set will not be modified.
  363. * @param set the object to set to the given range
  364. * @param start first character in the set, inclusive
  365. * @param end last character in the set, inclusive
  366. * @stable ICU 3.2
  367. */
  368. U_STABLE void U_EXPORT2
  369. uset_set(USet* set,
  370. UChar32 start, UChar32 end);
  371. /**
  372. * Modifies the set to represent the set specified by the given
  373. * pattern. See the UnicodeSet class description for the syntax of
  374. * the pattern language. See also the User Guide chapter about UnicodeSet.
  375. * <em>Empties the set passed before applying the pattern.</em>
  376. * A frozen set will not be modified.
  377. * @param set The set to which the pattern is to be applied.
  378. * @param pattern A pointer to UChar string specifying what characters are in the set.
  379. * The character at pattern[0] must be a '['.
  380. * @param patternLength The length of the UChar string. -1 if NUL terminated.
  381. * @param options A bitmask for options to apply to the pattern.
  382. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
  383. * @param status Returns an error if the pattern cannot be parsed.
  384. * @return Upon successful parse, the value is
  385. * the index of the character after the closing ']'
  386. * of the parsed pattern.
  387. * If the status code indicates failure, then the return value
  388. * is the index of the error in the source.
  389. *
  390. * @stable ICU 2.8
  391. */
  392. U_STABLE int32_t U_EXPORT2
  393. uset_applyPattern(USet *set,
  394. const UChar *pattern, int32_t patternLength,
  395. uint32_t options,
  396. UErrorCode *status);
  397. /**
  398. * Modifies the set to contain those code points which have the given value
  399. * for the given binary or enumerated property, as returned by
  400. * u_getIntPropertyValue. Prior contents of this set are lost.
  401. * A frozen set will not be modified.
  402. *
  403. * @param set the object to contain the code points defined by the property
  404. *
  405. * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
  406. * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
  407. * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
  408. *
  409. * @param value a value in the range u_getIntPropertyMinValue(prop)..
  410. * u_getIntPropertyMaxValue(prop), with one exception. If prop is
  411. * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
  412. * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
  413. * categories such as [:L:] to be represented.
  414. *
  415. * @param ec error code input/output parameter
  416. *
  417. * @stable ICU 3.2
  418. */
  419. U_STABLE void U_EXPORT2
  420. uset_applyIntPropertyValue(USet* set,
  421. UProperty prop, int32_t value, UErrorCode* ec);
  422. /**
  423. * Modifies the set to contain those code points which have the
  424. * given value for the given property. Prior contents of this
  425. * set are lost.
  426. * A frozen set will not be modified.
  427. *
  428. * @param set the object to contain the code points defined by the given
  429. * property and value alias
  430. *
  431. * @param prop a string specifying a property alias, either short or long.
  432. * The name is matched loosely. See PropertyAliases.txt for names and a
  433. * description of loose matching. If the value string is empty, then this
  434. * string is interpreted as either a General_Category value alias, a Script
  435. * value alias, a binary property alias, or a special ID. Special IDs are
  436. * matched loosely and correspond to the following sets:
  437. *
  438. * "ANY" = [\\u0000-\\U0010FFFF],
  439. * "ASCII" = [\\u0000-\\u007F],
  440. * "Assigned" = [:^Cn:].
  441. *
  442. * @param propLength the length of the prop, or -1 if NUL-terminated
  443. *
  444. * @param value a string specifying a value alias, either short or long.
  445. * The name is matched loosely. See PropertyValueAliases.txt for names
  446. * and a description of loose matching. In addition to aliases listed,
  447. * numeric values and canonical combining classes may be expressed
  448. * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string
  449. * may also be empty.
  450. *
  451. * @param valueLength the length of the value, or -1 if NUL-terminated
  452. *
  453. * @param ec error code input/output parameter
  454. *
  455. * @stable ICU 3.2
  456. */
  457. U_STABLE void U_EXPORT2
  458. uset_applyPropertyAlias(USet* set,
  459. const UChar *prop, int32_t propLength,
  460. const UChar *value, int32_t valueLength,
  461. UErrorCode* ec);
  462. /**
  463. * Return true if the given position, in the given pattern, appears
  464. * to be the start of a UnicodeSet pattern.
  465. *
  466. * @param pattern a string specifying the pattern
  467. * @param patternLength the length of the pattern, or -1 if NUL-terminated
  468. * @param pos the given position
  469. * @stable ICU 3.2
  470. */
  471. U_STABLE UBool U_EXPORT2
  472. uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
  473. int32_t pos);
  474. /**
  475. * Returns a string representation of this set. If the result of
  476. * calling this function is passed to a uset_openPattern(), it
  477. * will produce another set that is equal to this one.
  478. * @param set the set
  479. * @param result the string to receive the rules, may be NULL
  480. * @param resultCapacity the capacity of result, may be 0 if result is NULL
  481. * @param escapeUnprintable if TRUE then convert unprintable
  482. * character to their hex escape representations, \\uxxxx or
  483. * \\Uxxxxxxxx. Unprintable characters are those other than
  484. * U+000A, U+0020..U+007E.
  485. * @param ec error code.
  486. * @return length of string, possibly larger than resultCapacity
  487. * @stable ICU 2.4
  488. */
  489. U_STABLE int32_t U_EXPORT2
  490. uset_toPattern(const USet* set,
  491. UChar* result, int32_t resultCapacity,
  492. UBool escapeUnprintable,
  493. UErrorCode* ec);
  494. /**
  495. * Adds the given character to the given USet. After this call,
  496. * uset_contains(set, c) will return TRUE.
  497. * A frozen set will not be modified.
  498. * @param set the object to which to add the character
  499. * @param c the character to add
  500. * @stable ICU 2.4
  501. */
  502. U_STABLE void U_EXPORT2
  503. uset_add(USet* set, UChar32 c);
  504. /**
  505. * Adds all of the elements in the specified set to this set if
  506. * they're not already present. This operation effectively
  507. * modifies this set so that its value is the <i>union</i> of the two
  508. * sets. The behavior of this operation is unspecified if the specified
  509. * collection is modified while the operation is in progress.
  510. * A frozen set will not be modified.
  511. *
  512. * @param set the object to which to add the set
  513. * @param additionalSet the source set whose elements are to be added to this set.
  514. * @stable ICU 2.6
  515. */
  516. U_STABLE void U_EXPORT2
  517. uset_addAll(USet* set, const USet *additionalSet);
  518. /**
  519. * Adds the given range of characters to the given USet. After this call,
  520. * uset_contains(set, c) will return TRUE for every character c in start..end.
  521. * A frozen set will not be modified.
  522. * @param set the object to which to add the range
  523. * @param start the first character of the range to add, inclusive
  524. * @param end the last character of the range to add, inclusive
  525. * @stable ICU 2.2
  526. */
  527. U_STABLE void U_EXPORT2
  528. uset_addRange(USet* set, UChar32 start, UChar32 end);
  529. /**
  530. * Adds the given string to the given USet. After this call,
  531. * uset_containsString(set, str, strLen) will return TRUE.
  532. * A frozen set will not be modified.
  533. * @param set the object to which to add the string
  534. * @param str the string to add
  535. * @param strLen the length of the string or -1 if null terminated.
  536. * @stable ICU 2.4
  537. */
  538. U_STABLE void U_EXPORT2
  539. uset_addString(USet* set, const UChar* str, int32_t strLen);
  540. /**
  541. * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
  542. * If this set already contains any particular character, it has no effect on that character.
  543. * A frozen set will not be modified.
  544. * @param set the object to which to add the character
  545. * @param str the source string
  546. * @param strLen the length of the string or -1 if null terminated.
  547. * @stable ICU 3.4
  548. */
  549. U_STABLE void U_EXPORT2
  550. uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
  551. /**
  552. * Removes the given character from the given USet. After this call,
  553. * uset_contains(set, c) will return FALSE.
  554. * A frozen set will not be modified.
  555. * @param set the object from which to remove the character
  556. * @param c the character to remove
  557. * @stable ICU 2.4
  558. */
  559. U_STABLE void U_EXPORT2
  560. uset_remove(USet* set, UChar32 c);
  561. /**
  562. * Removes the given range of characters from the given USet. After this call,
  563. * uset_contains(set, c) will return FALSE for every character c in start..end.
  564. * A frozen set will not be modified.
  565. * @param set the object from which to remove the range
  566. * @param start the first character of the range to remove, inclusive
  567. * @param end the last character of the range to remove, inclusive
  568. * @stable ICU 2.2
  569. */
  570. U_STABLE void U_EXPORT2
  571. uset_removeRange(USet* set, UChar32 start, UChar32 end);
  572. /**
  573. * Removes the given string from the given USet. After this call,
  574. * uset_containsString(set, str, strLen) will return FALSE.
  575. * A frozen set will not be modified.
  576. * @param set the object from which to remove the string
  577. * @param str the string to remove
  578. * @param strLen the length of the string or -1 if null terminated.
  579. * @stable ICU 2.4
  580. */
  581. U_STABLE void U_EXPORT2
  582. uset_removeString(USet* set, const UChar* str, int32_t strLen);
  583. /**
  584. * Removes from this set all of its elements that are contained in the
  585. * specified set. This operation effectively modifies this
  586. * set so that its value is the <i>asymmetric set difference</i> of
  587. * the two sets.
  588. * A frozen set will not be modified.
  589. * @param set the object from which the elements are to be removed
  590. * @param removeSet the object that defines which elements will be
  591. * removed from this set
  592. * @stable ICU 3.2
  593. */
  594. U_STABLE void U_EXPORT2
  595. uset_removeAll(USet* set, const USet* removeSet);
  596. /**
  597. * Retain only the elements in this set that are contained in the
  598. * specified range. If <code>start > end</code> then an empty range is
  599. * retained, leaving the set empty. This is equivalent to
  600. * a boolean logic AND, or a set INTERSECTION.
  601. * A frozen set will not be modified.
  602. *
  603. * @param set the object for which to retain only the specified range
  604. * @param start first character, inclusive, of range to be retained
  605. * to this set.
  606. * @param end last character, inclusive, of range to be retained
  607. * to this set.
  608. * @stable ICU 3.2
  609. */
  610. U_STABLE void U_EXPORT2
  611. uset_retain(USet* set, UChar32 start, UChar32 end);
  612. /**
  613. * Retains only the elements in this set that are contained in the
  614. * specified set. In other words, removes from this set all of
  615. * its elements that are not contained in the specified set. This
  616. * operation effectively modifies this set so that its value is
  617. * the <i>intersection</i> of the two sets.
  618. * A frozen set will not be modified.
  619. *
  620. * @param set the object on which to perform the retain
  621. * @param retain set that defines which elements this set will retain
  622. * @stable ICU 3.2
  623. */
  624. U_STABLE void U_EXPORT2
  625. uset_retainAll(USet* set, const USet* retain);
  626. /**
  627. * Reallocate this object's internal structures to take up the least
  628. * possible space, without changing this object's value.
  629. * A frozen set will not be modified.
  630. *
  631. * @param set the object on which to perform the compaction
  632. * @stable ICU 3.2
  633. */
  634. U_STABLE void U_EXPORT2
  635. uset_compact(USet* set);
  636. /**
  637. * Inverts this set. This operation modifies this set so that
  638. * its value is its complement. This operation does not affect
  639. * the multicharacter strings, if any.
  640. * A frozen set will not be modified.
  641. * @param set the set
  642. * @stable ICU 2.4
  643. */
  644. U_STABLE void U_EXPORT2
  645. uset_complement(USet* set);
  646. /**
  647. * Complements in this set all elements contained in the specified
  648. * set. Any character in the other set will be removed if it is
  649. * in this set, or will be added if it is not in this set.
  650. * A frozen set will not be modified.
  651. *
  652. * @param set the set with which to complement
  653. * @param complement set that defines which elements will be xor'ed
  654. * from this set.
  655. * @stable ICU 3.2
  656. */
  657. U_STABLE void U_EXPORT2
  658. uset_complementAll(USet* set, const USet* complement);
  659. /**
  660. * Removes all of the elements from this set. This set will be
  661. * empty after this call returns.
  662. * A frozen set will not be modified.
  663. * @param set the set
  664. * @stable ICU 2.4
  665. */
  666. U_STABLE void U_EXPORT2
  667. uset_clear(USet* set);
  668. /**
  669. * Close this set over the given attribute. For the attribute
  670. * USET_CASE, the result is to modify this set so that:
  671. *
  672. * 1. For each character or string 'a' in this set, all strings or
  673. * characters 'b' such that foldCase(a) == foldCase(b) are added
  674. * to this set.
  675. *
  676. * 2. For each string 'e' in the resulting set, if e !=
  677. * foldCase(e), 'e' will be removed.
  678. *
  679. * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
  680. *
  681. * (Here foldCase(x) refers to the operation u_strFoldCase, and a
  682. * == b denotes that the contents are the same, not pointer
  683. * comparison.)
  684. *
  685. * A frozen set will not be modified.
  686. *
  687. * @param set the set
  688. *
  689. * @param attributes bitmask for attributes to close over.
  690. * Currently only the USET_CASE bit is supported. Any undefined bits
  691. * are ignored.
  692. * @stable ICU 4.2
  693. */
  694. U_STABLE void U_EXPORT2
  695. uset_closeOver(USet* set, int32_t attributes);
  696. /**
  697. * Remove all strings from this set.
  698. *
  699. * @param set the set
  700. * @stable ICU 4.2
  701. */
  702. U_STABLE void U_EXPORT2
  703. uset_removeAllStrings(USet* set);
  704. /**
  705. * Returns TRUE if the given USet contains no characters and no
  706. * strings.
  707. * @param set the set
  708. * @return true if set is empty
  709. * @stable ICU 2.4
  710. */
  711. U_STABLE UBool U_EXPORT2
  712. uset_isEmpty(const USet* set);
  713. /**
  714. * Returns TRUE if the given USet contains the given character.
  715. * This function works faster with a frozen set.
  716. * @param set the set
  717. * @param c The codepoint to check for within the set
  718. * @return true if set contains c
  719. * @stable ICU 2.4
  720. */
  721. U_STABLE UBool U_EXPORT2
  722. uset_contains(const USet* set, UChar32 c);
  723. /**
  724. * Returns TRUE if the given USet contains all characters c
  725. * where start <= c && c <= end.
  726. * @param set the set
  727. * @param start the first character of the range to test, inclusive
  728. * @param end the last character of the range to test, inclusive
  729. * @return TRUE if set contains the range
  730. * @stable ICU 2.2
  731. */
  732. U_STABLE UBool U_EXPORT2
  733. uset_containsRange(const USet* set, UChar32 start, UChar32 end);
  734. /**
  735. * Returns TRUE if the given USet contains the given string.
  736. * @param set the set
  737. * @param str the string
  738. * @param strLen the length of the string or -1 if null terminated.
  739. * @return true if set contains str
  740. * @stable ICU 2.4
  741. */
  742. U_STABLE UBool U_EXPORT2
  743. uset_containsString(const USet* set, const UChar* str, int32_t strLen);
  744. /**
  745. * Returns the index of the given character within this set, where
  746. * the set is ordered by ascending code point. If the character
  747. * is not in this set, return -1. The inverse of this method is
  748. * <code>charAt()</code>.
  749. * @param set the set
  750. * @param c the character to obtain the index for
  751. * @return an index from 0..size()-1, or -1
  752. * @stable ICU 3.2
  753. */
  754. U_STABLE int32_t U_EXPORT2
  755. uset_indexOf(const USet* set, UChar32 c);
  756. /**
  757. * Returns the character at the given index within this set, where
  758. * the set is ordered by ascending code point. If the index is
  759. * out of range, return (UChar32)-1. The inverse of this method is
  760. * <code>indexOf()</code>.
  761. * @param set the set
  762. * @param charIndex an index from 0..size()-1 to obtain the char for
  763. * @return the character at the given index, or (UChar32)-1.
  764. * @stable ICU 3.2
  765. */
  766. U_STABLE UChar32 U_EXPORT2
  767. uset_charAt(const USet* set, int32_t charIndex);
  768. /**
  769. * Returns the number of characters and strings contained in the given
  770. * USet.
  771. * @param set the set
  772. * @return a non-negative integer counting the characters and strings
  773. * contained in set
  774. * @stable ICU 2.4
  775. */
  776. U_STABLE int32_t U_EXPORT2
  777. uset_size(const USet* set);
  778. /**
  779. * Returns the number of items in this set. An item is either a range
  780. * of characters or a single multicharacter string.
  781. * @param set the set
  782. * @return a non-negative integer counting the character ranges
  783. * and/or strings contained in set
  784. * @stable ICU 2.4
  785. */
  786. U_STABLE int32_t U_EXPORT2
  787. uset_getItemCount(const USet* set);
  788. /**
  789. * Returns an item of this set. An item is either a range of
  790. * characters or a single multicharacter string.
  791. * @param set the set
  792. * @param itemIndex a non-negative integer in the range 0..
  793. * uset_getItemCount(set)-1
  794. * @param start pointer to variable to receive first character
  795. * in range, inclusive
  796. * @param end pointer to variable to receive last character in range,
  797. * inclusive
  798. * @param str buffer to receive the string, may be NULL
  799. * @param strCapacity capacity of str, or 0 if str is NULL
  800. * @param ec error code
  801. * @return the length of the string (>= 2), or 0 if the item is a
  802. * range, in which case it is the range *start..*end, or -1 if
  803. * itemIndex is out of range
  804. * @stable ICU 2.4
  805. */
  806. U_STABLE int32_t U_EXPORT2
  807. uset_getItem(const USet* set, int32_t itemIndex,
  808. UChar32* start, UChar32* end,
  809. UChar* str, int32_t strCapacity,
  810. UErrorCode* ec);
  811. /**
  812. * Returns true if set1 contains all the characters and strings
  813. * of set2. It answers the question, 'Is set1 a superset of set2?'
  814. * @param set1 the set tested as the containing set (superset)
  815. * @param set2 the set whose characters and strings are looked for in set1
  816. * @return true if the test condition is met
  817. * @stable ICU 3.2
  818. */
  819. U_STABLE UBool U_EXPORT2
  820. uset_containsAll(const USet* set1, const USet* set2);
  821. /**
  822. * Returns true if this set contains all the characters
  823. * of the given string. This does not check containment of grapheme
  824. * clusters, like uset_containsString.
  825. * @param set set of characters to be checked for containment
  826. * @param str string containing codepoints to be checked for containment
  827. * @param strLen the length of the string or -1 if null terminated.
  828. * @return true if the test condition is met
  829. * @stable ICU 3.4
  830. */
  831. U_STABLE UBool U_EXPORT2
  832. uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);
  833. /**
  834. * Returns true if set1 contains none of the characters and strings
  835. * of set2. It answers the question, 'Are set1 and set2 disjoint?'
  836. * @param set1 set to be checked for containment
  837. * @param set2 set to be checked for containment
  838. * @return true if the test condition is met
  839. * @stable ICU 3.2
  840. */
  841. U_STABLE UBool U_EXPORT2
  842. uset_containsNone(const USet* set1, const USet* set2);
  843. /**
  844. * Returns true if set1 contains some of the characters and strings
  845. * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
  846. * @param set1 set to be checked for containment
  847. * @param set2 set to be checked for containment
  848. * @return true if the test condition is met
  849. * @stable ICU 3.2
  850. */
  851. U_STABLE UBool U_EXPORT2
  852. uset_containsSome(const USet* set1, const USet* set2);
  853. /**
  854. * Returns the length of the initial substring of the input string which
  855. * consists only of characters and strings that are contained in this set
  856. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  857. * or only of characters and strings that are not contained
  858. * in this set (USET_SPAN_NOT_CONTAINED).
  859. * See USetSpanCondition for details.
  860. * Similar to the strspn() C library function.
  861. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  862. * This function works faster with a frozen set and with a non-negative string length argument.
  863. * @param set the set
  864. * @param s start of the string
  865. * @param length the length of the string; can be -1 for NUL-terminated
  866. * @param spanCondition specifies the containment condition
  867. * @return the length of the initial substring according to the spanCondition;
  868. * 0 if the start of the string does not fit the spanCondition
  869. * @stable ICU 3.8
  870. * @see USetSpanCondition
  871. */
  872. U_STABLE int32_t U_EXPORT2
  873. uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  874. /**
  875. * Returns the start of the trailing substring of the input string which
  876. * consists only of characters and strings that are contained in this set
  877. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  878. * or only of characters and strings that are not contained
  879. * in this set (USET_SPAN_NOT_CONTAINED).
  880. * See USetSpanCondition for details.
  881. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  882. * This function works faster with a frozen set and with a non-negative string length argument.
  883. * @param set the set
  884. * @param s start of the string
  885. * @param length the length of the string; can be -1 for NUL-terminated
  886. * @param spanCondition specifies the containment condition
  887. * @return the start of the trailing substring according to the spanCondition;
  888. * the string length if the end of the string does not fit the spanCondition
  889. * @stable ICU 3.8
  890. * @see USetSpanCondition
  891. */
  892. U_STABLE int32_t U_EXPORT2
  893. uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  894. /**
  895. * Returns the length of the initial substring of the input string which
  896. * consists only of characters and strings that are contained in this set
  897. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  898. * or only of characters and strings that are not contained
  899. * in this set (USET_SPAN_NOT_CONTAINED).
  900. * See USetSpanCondition for details.
  901. * Similar to the strspn() C library function.
  902. * Malformed byte sequences are treated according to contains(0xfffd).
  903. * This function works faster with a frozen set and with a non-negative string length argument.
  904. * @param set the set
  905. * @param s start of the string (UTF-8)
  906. * @param length the length of the string; can be -1 for NUL-terminated
  907. * @param spanCondition specifies the containment condition
  908. * @return the length of the initial substring according to the spanCondition;
  909. * 0 if the start of the string does not fit the spanCondition
  910. * @stable ICU 3.8
  911. * @see USetSpanCondition
  912. */
  913. U_STABLE int32_t U_EXPORT2
  914. uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  915. /**
  916. * Returns the start of the trailing substring of the input string which
  917. * consists only of characters and strings that are contained in this set
  918. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  919. * or only of characters and strings that are not contained
  920. * in this set (USET_SPAN_NOT_CONTAINED).
  921. * See USetSpanCondition for details.
  922. * Malformed byte sequences are treated according to contains(0xfffd).
  923. * This function works faster with a frozen set and with a non-negative string length argument.
  924. * @param set the set
  925. * @param s start of the string (UTF-8)
  926. * @param length the length of the string; can be -1 for NUL-terminated
  927. * @param spanCondition specifies the containment condition
  928. * @return the start of the trailing substring according to the spanCondition;
  929. * the string length if the end of the string does not fit the spanCondition
  930. * @stable ICU 3.8
  931. * @see USetSpanCondition
  932. */
  933. U_STABLE int32_t U_EXPORT2
  934. uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  935. /**
  936. * Returns true if set1 contains all of the characters and strings
  937. * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
  938. * @param set1 set to be checked for containment
  939. * @param set2 set to be checked for containment
  940. * @return true if the test condition is met
  941. * @stable ICU 3.2
  942. */
  943. U_STABLE UBool U_EXPORT2
  944. uset_equals(const USet* set1, const USet* set2);
  945. /*********************************************************************
  946. * Serialized set API
  947. *********************************************************************/
  948. /**
  949. * Serializes this set into an array of 16-bit integers. Serialization
  950. * (currently) only records the characters in the set; multicharacter
  951. * strings are ignored.
  952. *
  953. * The array
  954. * has the following format (each line is one 16-bit integer):
  955. *
  956. * length = (n+2*m) | (m!=0?0x8000:0)
  957. * bmpLength = n; present if m!=0
  958. * bmp[0]
  959. * bmp[1]
  960. * ...
  961. * bmp[n-1]
  962. * supp-high[0]
  963. * supp-low[0]
  964. * supp-high[1]
  965. * supp-low[1]
  966. * ...
  967. * supp-high[m-1]
  968. * supp-low[m-1]
  969. *
  970. * The array starts with a header. After the header are n bmp
  971. * code points, then m supplementary code points. Either n or m
  972. * or both may be zero. n+2*m is always <= 0x7FFF.
  973. *
  974. * If there are no supplementary characters (if m==0) then the
  975. * header is one 16-bit integer, 'length', with value n.
  976. *
  977. * If there are supplementary characters (if m!=0) then the header
  978. * is two 16-bit integers. The first, 'length', has value
  979. * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
  980. *
  981. * After the header the code points are stored in ascending order.
  982. * Supplementary code points are stored as most significant 16
  983. * bits followed by least significant 16 bits.
  984. *
  985. * @param set the set
  986. * @param dest pointer to buffer of destCapacity 16-bit integers.
  987. * May be NULL only if destCapacity is zero.
  988. * @param destCapacity size of dest, or zero. Must not be negative.
  989. * @param pErrorCode pointer to the error code. Will be set to
  990. * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to
  991. * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
  992. * @return the total length of the serialized format, including
  993. * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
  994. * than U_BUFFER_OVERFLOW_ERROR.
  995. * @stable ICU 2.4
  996. */
  997. U_STABLE int32_t U_EXPORT2
  998. uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
  999. /**
  1000. * Given a serialized array, fill in the given serialized set object.
  1001. * @param fillSet pointer to result
  1002. * @param src pointer to start of array
  1003. * @param srcLength length of array
  1004. * @return true if the given array is valid, otherwise false
  1005. * @stable ICU 2.4
  1006. */
  1007. U_STABLE UBool U_EXPORT2
  1008. uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
  1009. /**
  1010. * Set the USerializedSet to contain the given character (and nothing
  1011. * else).
  1012. * @param fillSet pointer to result
  1013. * @param c The codepoint to set
  1014. * @stable ICU 2.4
  1015. */
  1016. U_STABLE void U_EXPORT2
  1017. uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
  1018. /**
  1019. * Returns TRUE if the given USerializedSet contains the given
  1020. * character.
  1021. * @param set the serialized set
  1022. * @param c The codepoint to check for within the set
  1023. * @return true if set contains c
  1024. * @stable ICU 2.4
  1025. */
  1026. U_STABLE UBool U_EXPORT2
  1027. uset_serializedContains(const USerializedSet* set, UChar32 c);
  1028. /**
  1029. * Returns the number of disjoint ranges of characters contained in
  1030. * the given serialized set. Ignores any strings contained in the
  1031. * set.
  1032. * @param set the serialized set
  1033. * @return a non-negative integer counting the character ranges
  1034. * contained in set
  1035. * @stable ICU 2.4
  1036. */
  1037. U_STABLE int32_t U_EXPORT2
  1038. uset_getSerializedRangeCount(const USerializedSet* set);
  1039. /**
  1040. * Returns a range of characters contained in the given serialized
  1041. * set.
  1042. * @param set the serialized set
  1043. * @param rangeIndex a non-negative integer in the range 0..
  1044. * uset_getSerializedRangeCount(set)-1
  1045. * @param pStart pointer to variable to receive first character
  1046. * in range, inclusive
  1047. * @param pEnd pointer to variable to receive last character in range,
  1048. * inclusive
  1049. * @return true if rangeIndex is valid, otherwise false
  1050. * @stable ICU 2.4
  1051. */
  1052. U_STABLE UBool U_EXPORT2
  1053. uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
  1054. UChar32* pStart, UChar32* pEnd);
  1055. #endif