ucasemap.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2005-2012, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: ucasemap.h
  11. * encoding: US-ASCII
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2005may06
  16. * created by: Markus W. Scherer
  17. *
  18. * Case mapping service object and functions using it.
  19. */
  20. #ifndef __UCASEMAP_H__
  21. #define __UCASEMAP_H__
  22. #include "unicode/utypes.h"
  23. #include "unicode/ustring.h"
  24. #include "unicode/localpointer.h"
  25. /**
  26. * \file
  27. * \brief C API: Unicode case mapping functions using a UCaseMap service object.
  28. *
  29. * The service object takes care of memory allocations, data loading, and setup
  30. * for the attributes, as usual.
  31. *
  32. * Currently, the functionality provided here does not overlap with uchar.h
  33. * and ustring.h, except for ucasemap_toTitle().
  34. *
  35. * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings.
  36. */
  37. /**
  38. * UCaseMap is an opaque service object for newer ICU case mapping functions.
  39. * Older functions did not use a service object. Create one with ucasemap_open() and release it with ucasemap_close().
  40. * @stable ICU 3.4
  41. */
  42. struct UCaseMap;
  43. typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */
  44. /**
  45. * Open a UCaseMap service object for a locale and a set of options.
  46. * The locale ID and options are preprocessed so that functions using the
  47. * service object need not process them in each call.
  48. *
  49. * @param locale ICU locale ID, used for language-dependent
  50. * upper-/lower-/title-casing according to the Unicode standard.
  51. * Usual semantics: ""=root, NULL=default locale, etc.
  52. * @param options Options bit set, used for case folding and string comparisons.
  53. * Same flags as for u_foldCase(), u_strFoldCase(),
  54. * u_strCaseCompare(), etc. Titlecasing option bits (U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT) may be set here as well.
  55. * Use 0 or U_FOLD_CASE_DEFAULT for default behavior.
  56. * @param pErrorCode Must be a valid pointer to an error code value,
  57. * which must not indicate a failure before the function call.
  58. * @return Pointer to a UCaseMap service object, if successful. Release it with ucasemap_close().
  59. *
  60. * @see U_FOLD_CASE_DEFAULT
  61. * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
  62. * @see U_TITLECASE_NO_LOWERCASE
  63. * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
  64. * @stable ICU 3.4
  65. */
  66. U_STABLE UCaseMap * U_EXPORT2
  67. ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);
  68. /**
  69. * Close a UCaseMap service object that was opened with ucasemap_open().
  70. * @param csm Object to be closed. A break iterator adopted via ucasemap_setBreakIterator() is released as well.
  71. * @stable ICU 3.4
  72. */
  73. U_STABLE void U_EXPORT2
  74. ucasemap_close(UCaseMap *csm);
  75. #if U_SHOW_CPLUSPLUS_API
  76. U_NAMESPACE_BEGIN
  77. /**
  78. * \class LocalUCaseMapPointer
  79. * "Smart pointer" class, closes a UCaseMap via ucasemap_close().
  80. * For most methods see the LocalPointerBase base class. Declared only when U_SHOW_CPLUSPLUS_API is enabled.
  81. *
  82. * @see LocalPointerBase
  83. * @see LocalPointer
  84. * @stable ICU 4.4
  85. */
  86. U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close);
  87. U_NAMESPACE_END
  88. #endif
  89. /**
  90. * Get the locale ID that is used for language-dependent case mappings.
  91. * @param csm UCaseMap service object.
  92. * @return locale ID (see ucasemap_open() and ucasemap_setLocale())
  93. * @stable ICU 3.4
  94. */
  95. U_STABLE const char * U_EXPORT2
  96. ucasemap_getLocale(const UCaseMap *csm);
  97. /**
  98. * Get the options bit set that is used for case folding and string comparisons.
  99. * @param csm UCaseMap service object.
  100. * @return options bit set (see ucasemap_open() and ucasemap_setOptions())
  101. * @stable ICU 3.4
  102. */
  103. U_STABLE uint32_t U_EXPORT2
  104. ucasemap_getOptions(const UCaseMap *csm);
  105. /**
  106. * Set the locale ID that is used for language-dependent case mappings.
  107. *
  108. * @param csm UCaseMap service object to modify.
  109. * @param locale Locale ID; same semantics as the locale parameter of ucasemap_open().
  110. * @param pErrorCode Must be a valid pointer to an error code value,
  111. * which must not indicate a failure before the function call.
  112. *
  113. * @see ucasemap_open
  114. * @stable ICU 3.4
  115. */
  116. U_STABLE void U_EXPORT2
  117. ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode);
  118. /**
  119. * Set the options bit set that is used for case folding and string comparisons.
  120. *
  121. * @param csm UCaseMap service object to modify.
  122. * @param options Options bit set, see ucasemap_open(). Titlecasing option bits such as U_TITLECASE_NO_LOWERCASE are set here as well.
  123. * @param pErrorCode Must be a valid pointer to an error code value,
  124. * which must not indicate a failure before the function call.
  125. *
  126. * @see ucasemap_open
  127. * @stable ICU 3.4
  128. */
  129. U_STABLE void U_EXPORT2
  130. ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
  131. /**
  132. * Do not lowercase non-initial parts of words when titlecasing.
  133. * Option bit (0x100) for titlecasing APIs that take an options bit set.
  134. *
  135. * By default, titlecasing will titlecase the first cased character
  136. * of a word and lowercase all other characters.
  137. * With this option, the other characters will not be modified.
  138. *
  139. * @see ucasemap_setOptions
  140. * @see ucasemap_toTitle
  141. * @see ucasemap_utf8ToTitle
  142. * @see UnicodeString::toTitle
  143. * @stable ICU 3.8
  144. */
  145. #define U_TITLECASE_NO_LOWERCASE 0x100
  146. /**
  147. * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
  148. * titlecase exactly the characters at breaks from the iterator.
  149. * Option bit (0x200) for titlecasing APIs that take an options bit set.
  150. *
  151. * By default, titlecasing will take each break iterator index,
  152. * adjust it by looking for the next cased character, and titlecase that one.
  153. * Other characters are lowercased.
  154. *
  155. * The default behavior follows Unicode 4 & 5 section 3.13 Default Case Operations:
  156. *
  157. * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
  158. * #29, "Text Boundaries." Between each pair of word boundaries, find the first
  159. * cased character F. If F exists, map F to default_title(F); then map each
  160. * subsequent character C to default_lower(C).
  161. *
  162. * @see ucasemap_setOptions
  163. * @see ucasemap_toTitle
  164. * @see ucasemap_utf8ToTitle
  165. * @see UnicodeString::toTitle
  166. * @see U_TITLECASE_NO_LOWERCASE
  167. * @stable ICU 3.8
  168. */
  169. #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
  170. #if !UCONFIG_NO_BREAK_ITERATION
  171. /**
  172. * Get the break iterator that is used for titlecasing.
  173. * Do not modify the returned break iterator.
  174. * @param csm UCaseMap service object.
  175. * @return titlecasing break iterator; it stays owned by csm (see ucasemap_setBreakIterator())
  176. * @stable ICU 3.8
  177. */
  178. U_STABLE const UBreakIterator * U_EXPORT2
  179. ucasemap_getBreakIterator(const UCaseMap *csm);
  180. /**
  181. * Set the break iterator that is used for titlecasing.
  182. * The UCaseMap service object releases a previously set break iterator
  183. * and "adopts" this new one, taking ownership of it.
  184. * It will be released in a subsequent call to ucasemap_setBreakIterator()
  185. * or ucasemap_close().
  186. *
  187. * Break iterator operations are not thread-safe. Therefore, titlecasing
  188. * functions use non-const UCaseMap objects. It is not possible to titlecase
  189. * strings concurrently using the same UCaseMap.
  190. *
  191. * @param csm UCaseMap service object.
  192. * @param iterToAdopt Break iterator to be adopted for titlecasing; csm takes ownership of it.
  193. * @param pErrorCode Must be a valid pointer to an error code value,
  194. * which must not indicate a failure before the function call.
  195. *
  196. * @see ucasemap_toTitle
  197. * @see ucasemap_utf8ToTitle
  198. * @stable ICU 3.8
  199. */
  200. U_STABLE void U_EXPORT2
  201. ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode);
  202. /**
  203. * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(),
  204. * except that it takes ucasemap_setOptions() into account and has performance
  205. * advantages from being able to use a UCaseMap object for multiple case mapping
  206. * operations, saving setup time.
  207. *
  208. * Casing is locale-dependent and context-sensitive.
  209. * Titlecasing uses a break iterator to find the first characters of words
  210. * that are to be titlecased. It titlecases those characters and lowercases
  211. * all others. (This can be modified with ucasemap_setOptions().)
  212. *
  213. * Note: This function takes a non-const UCaseMap pointer because it will
  214. * open a default break iterator if no break iterator was set yet,
  215. * and effectively call ucasemap_setBreakIterator();
  216. * also because the break iterator is stateful and will be modified during
  217. * the iteration.
  218. *
  219. * The titlecase break iterator can be provided to customize for arbitrary
  220. * styles, using rules and dictionaries beyond the standard iterators.
  221. * The standard titlecase iterator for the root locale implements the
  222. * algorithm of Unicode TR 21.
  223. *
  224. * This function uses only the setUText(), first(), next() and close() methods of the
  225. * provided break iterator.
  226. *
  227. * The result may be longer or shorter than the original.
  228. * The source string and the destination buffer must not overlap.
  229. *
  230. * @param csm UCaseMap service object. This pointer is non-const!
  231. * See the note above for details.
  232. * @param dest A buffer for the result string. The result will be NUL-terminated if
  233. * the buffer is large enough.
  234. * The contents are undefined in case of failure.
  235. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  236. * dest may be NULL and the function will only return the length of the result
  237. * without writing any of the result string.
  238. * @param src The original string.
  239. * @param srcLength The length of the original string (number of UChars). If -1, then src must be NUL-terminated.
  240. * @param pErrorCode Must be a valid pointer to an error code value,
  241. * which must not indicate a failure before the function call.
  242. * @return The length of the result string, if successful or in case of a buffer overflow;
  243. * in the overflow case it will be greater than destCapacity.
  244. *
  245. * @see u_strToTitle
  246. * @stable ICU 3.8
  247. */
  248. U_STABLE int32_t U_EXPORT2
  249. ucasemap_toTitle(UCaseMap *csm,
  250. UChar *dest, int32_t destCapacity,
  251. const UChar *src, int32_t srcLength,
  252. UErrorCode *pErrorCode);
  253. #endif
  254. /**
  255. * Lowercase the characters in a UTF-8 string.
  256. * Casing is locale-dependent and context-sensitive.
  257. * The result may be longer or shorter than the original.
  258. * The source string and the destination buffer must not overlap.
  259. *
  260. * @param csm UCaseMap service object.
  261. * @param dest A buffer for the result string. The result will be NUL-terminated if
  262. * the buffer is large enough.
  263. * The contents are undefined in case of failure.
  264. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
  265. * dest may be NULL and the function will only return the length of the result
  266. * without writing any of the result string.
  267. * @param src The original string.
  268. * @param srcLength The length of the original string (number of bytes). If -1, then src must be NUL-terminated.
  269. * @param pErrorCode Must be a valid pointer to an error code value,
  270. * which must not indicate a failure before the function call.
  271. * @return The length of the result string, if successful or in case of a buffer overflow;
  272. * in the overflow case it will be greater than destCapacity.
  273. *
  274. * @see u_strToLower
  275. * @stable ICU 3.4
  276. */
  277. U_STABLE int32_t U_EXPORT2
  278. ucasemap_utf8ToLower(const UCaseMap *csm,
  279. char *dest, int32_t destCapacity,
  280. const char *src, int32_t srcLength,
  281. UErrorCode *pErrorCode);
  282. /**
  283. * Uppercase the characters in a UTF-8 string.
  284. * Casing is locale-dependent and context-sensitive.
  285. * The result may be longer or shorter than the original.
  286. * The source string and the destination buffer must not overlap.
  287. *
  288. * @param csm UCaseMap service object.
  289. * @param dest A buffer for the result string. The result will be NUL-terminated if
  290. * the buffer is large enough.
  291. * The contents are undefined in case of failure.
  292. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
  293. * dest may be NULL and the function will only return the length of the result
  294. * without writing any of the result string.
  295. * @param src The original string.
  296. * @param srcLength The length of the original string (number of bytes). If -1, then src must be NUL-terminated.
  297. * @param pErrorCode Must be a valid pointer to an error code value,
  298. * which must not indicate a failure before the function call.
  299. * @return The length of the result string, if successful or in case of a buffer overflow;
  300. * in the overflow case it will be greater than destCapacity.
  301. *
  302. * @see u_strToUpper
  303. * @stable ICU 3.4
  304. */
  305. U_STABLE int32_t U_EXPORT2
  306. ucasemap_utf8ToUpper(const UCaseMap *csm,
  307. char *dest, int32_t destCapacity,
  308. const char *src, int32_t srcLength,
  309. UErrorCode *pErrorCode);
  310. #if !UCONFIG_NO_BREAK_ITERATION
  311. /**
  312. * Titlecase a UTF-8 string.
  313. * Casing is locale-dependent and context-sensitive.
  314. * Titlecasing uses a break iterator to find the first characters of words
  315. * that are to be titlecased. It titlecases those characters and lowercases
  316. * all others. (This can be modified with ucasemap_setOptions().)
  317. *
  318. * Note: This function takes a non-const UCaseMap pointer because it will
  319. * open a default break iterator if no break iterator was set yet,
  320. * and effectively call ucasemap_setBreakIterator();
  321. * also because the break iterator is stateful and will be modified during
  322. * the iteration.
  323. *
  324. * The titlecase break iterator can be provided to customize for arbitrary
  325. * styles, using rules and dictionaries beyond the standard iterators.
  326. * The standard titlecase iterator for the root locale implements the
  327. * algorithm of Unicode TR 21.
  328. *
  329. * This function uses only the setUText(), first(), next() and close() methods of the
  330. * provided break iterator.
  331. *
  332. * The result may be longer or shorter than the original.
  333. * The source string and the destination buffer must not overlap.
  334. *
  335. * @param csm UCaseMap service object. This pointer is non-const!
  336. * See the note above for details.
  337. * @param dest A buffer for the result string. The result will be NUL-terminated if
  338. * the buffer is large enough.
  339. * The contents are undefined in case of failure.
  340. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
  341. * dest may be NULL and the function will only return the length of the result
  342. * without writing any of the result string.
  343. * @param src The original string.
  344. * @param srcLength The length of the original string (number of bytes). If -1, then src must be NUL-terminated.
  345. * @param pErrorCode Must be a valid pointer to an error code value,
  346. * which must not indicate a failure before the function call.
  347. * @return The length of the result string, if successful or in case of a buffer overflow;
  348. * in the overflow case it will be greater than destCapacity.
  349. *
  350. * @see u_strToTitle
  351. * @see U_TITLECASE_NO_LOWERCASE
  352. * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
  353. * @stable ICU 3.8
  354. */
  355. U_STABLE int32_t U_EXPORT2
  356. ucasemap_utf8ToTitle(UCaseMap *csm,
  357. char *dest, int32_t destCapacity,
  358. const char *src, int32_t srcLength,
  359. UErrorCode *pErrorCode);
  360. #endif
  361. /**
  362. * Case-folds the characters in a UTF-8 string.
  363. *
  364. * Case-folding is locale-independent and not context-sensitive,
  365. * but there is an option for whether to include or exclude mappings for dotted I
  366. * and dotless i that are marked with 'T' in CaseFolding.txt.
  367. *
  368. * The result may be longer or shorter than the original.
  369. * The source string and the destination buffer must not overlap.
  370. *
  371. * @param csm UCaseMap service object.
  372. * @param dest A buffer for the result string. The result will be NUL-terminated if
  373. * the buffer is large enough.
  374. * The contents are undefined in case of failure.
  375. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
  376. * dest may be NULL and the function will only return the length of the result
  377. * without writing any of the result string.
  378. * @param src The original string.
  379. * @param srcLength The length of the original string (number of bytes). If -1, then src must be NUL-terminated.
  380. * @param pErrorCode Must be a valid pointer to an error code value,
  381. * which must not indicate a failure before the function call.
  382. * @return The length of the result string, if successful or in case of a buffer overflow;
  383. * in the overflow case it will be greater than destCapacity.
  384. *
  385. * @see u_strFoldCase
  386. * @see ucasemap_setOptions
  387. * @see U_FOLD_CASE_DEFAULT
  388. * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
  389. * @stable ICU 3.8
  390. */
  391. U_STABLE int32_t U_EXPORT2
  392. ucasemap_utf8FoldCase(const UCaseMap *csm,
  393. char *dest, int32_t destCapacity,
  394. const char *src, int32_t srcLength,
  395. UErrorCode *pErrorCode);
  396. #endif