ustring.h 73 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1998-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. *
  9. * File ustring.h
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 12/07/98 bertrand Creation.
  15. ******************************************************************************
  16. */
  17. #ifndef USTRING_H
  18. #define USTRING_H
  19. #include "unicode/utypes.h"
  20. #include "unicode/putil.h"
  21. #include "unicode/uiter.h"
  22. /**
  23. * \def UBRK_TYPEDEF_UBREAK_ITERATOR
  24. * @internal
  25. */
  26. #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
  27. # define UBRK_TYPEDEF_UBREAK_ITERATOR
  28. /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/
  29. typedef struct UBreakIterator UBreakIterator;
  30. #endif
  31. /**
  32. * \file
  33. * \brief C API: Unicode string handling functions
  34. *
  35. * These C API functions provide general Unicode string handling.
  36. *
  37. * Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
  38. * functions. (For example, they do not check for bad arguments like NULL string pointers.)
  39. * In some cases, only the thread-safe variant of such a function is implemented here
  40. * (see u_strtok_r()).
  41. *
  42. * Other functions provide more Unicode-specific functionality like locale-specific
  43. * upper/lower-casing and string comparison in code point order.
  44. *
  45. * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
  46. * UTF-16 encodes each Unicode code point with either one or two UChar code units.
  47. * (This is the default form of Unicode, and a forward-compatible extension of the original,
  48. * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
  49. * in 1996.)
  50. *
  51. * Some APIs accept a 32-bit UChar32 value for a single code point.
  52. *
  53. * ICU also handles 16-bit Unicode text with unpaired surrogates.
  54. * Such text is not well-formed UTF-16.
  55. * Code-point-related functions treat unpaired surrogates as surrogate code points,
  56. * i.e., as separate units.
  57. *
  58. * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
  59. * it is much more efficient even for random access because the code unit values
  60. * for single-unit characters vs. lead units vs. trail units are completely disjoint.
  61. * This means that it is easy to determine character (code point) boundaries from
  62. * random offsets in the string.
  63. *
  64. * Unicode (UTF-16) string processing is optimized for the single-unit case.
  65. * Although it is important to support supplementary characters
  66. * (which use pairs of lead/trail code units called "surrogates"),
  67. * their occurrence is rare. Almost all characters in modern use require only
  68. * a single UChar code unit (i.e., their code point values are <=0xffff).
  69. *
  70. * For more details see the User Guide Strings chapter (http://icu-project.org/userguide/strings.html).
  71. * For a discussion of the handling of unpaired surrogates see also
  72. * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
  73. */
  74. /**
  75. * \defgroup ustring_ustrlen String Length
  76. * \ingroup ustring_strlen
  77. */
  78. /*@{*/
  79. /**
  80. * Determine the length of an array of UChar.
  81. *
  82. * @param s The array of UChars, NUL (U+0000) terminated.
  83. * @return The number of UChars in <code>s</code>, minus the terminator.
  84. * @stable ICU 2.0
  85. */
  86. U_STABLE int32_t U_EXPORT2
  87. u_strlen(const UChar *s);
  88. /*@}*/
  89. /**
  90. * Count Unicode code points in the length UChar code units of the string.
  91. * A code point may occupy either one or two UChar code units.
  92. * Counting code points involves reading all code units.
  93. *
  94. * This function is basically the inverse of the U16_FWD_N() macro (see utf.h).
  95. *
  96. * @param s The input string.
  97. * @param length The number of UChar code units to be checked, or -1 to count all
  98. * code points before the first NUL (U+0000).
  99. * @return The number of code points in the specified code units.
  100. * @stable ICU 2.0
  101. */
  102. U_STABLE int32_t U_EXPORT2
  103. u_countChar32(const UChar *s, int32_t length);
  104. /**
  105. * Check if the string contains more Unicode code points than a certain number.
  106. * This is more efficient than counting all code points in the entire string
  107. * and comparing that number with a threshold.
  108. * This function may not need to scan the string at all if the length is known
  109. * (not -1 for NUL-termination) and falls within a certain range, and
  110. * never needs to count more than 'number+1' code points.
  111. * Logically equivalent to (u_countChar32(s, length)>number).
  112. * A Unicode code point may occupy either one or two UChar code units.
  113. *
  114. * @param s The input string.
  115. * @param length The length of the string, or -1 if it is NUL-terminated.
  116. * @param number The number of code points in the string is compared against
  117. * the 'number' parameter.
  118. * @return Boolean value for whether the string contains more Unicode code points
  119. * than 'number'. Same as (u_countChar32(s, length)>number).
  120. * @stable ICU 2.4
  121. */
  122. U_STABLE UBool U_EXPORT2
  123. u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number);
  124. /**
  125. * Concatenate two ustrings. Appends a copy of <code>src</code>,
  126. * including the null terminator, to <code>dst</code>. The initial copied
  127. * character from <code>src</code> overwrites the null terminator in <code>dst</code>.
  128. *
  129. * @param dst The destination string.
  130. * @param src The source string.
  131. * @return A pointer to <code>dst</code>.
  132. * @stable ICU 2.0
  133. */
  134. U_STABLE UChar* U_EXPORT2
  135. u_strcat(UChar *dst,
  136. const UChar *src);
  137. /**
  138. * Concatenate two ustrings.
  139. * Appends at most <code>n</code> characters from <code>src</code> to <code>dst</code>.
  140. * Adds a terminating NUL.
  141. * If src is too long, then only <code>n-1</code> characters will be copied
  142. * before the terminating NUL.
  143. * If <code>n&lt;=0</code> then dst is not modified.
  144. *
  145. * @param dst The destination string.
  146. * @param src The source string (can be NULL/invalid if n<=0).
  147. * @param n The maximum number of characters to append; no-op if <=0.
  148. * @return A pointer to <code>dst</code>.
  149. * @stable ICU 2.0
  150. */
  151. U_STABLE UChar* U_EXPORT2
  152. u_strncat(UChar *dst,
  153. const UChar *src,
  154. int32_t n);
  155. /**
  156. * Find the first occurrence of a substring in a string.
  157. * The substring is found at code point boundaries.
  158. * That means that if the substring begins with
  159. * a trail surrogate or ends with a lead surrogate,
  160. * then it is found only if these surrogates stand alone in the text.
  161. * Otherwise, the substring edge units would be matched against
  162. * halves of surrogate pairs.
  163. *
  164. * @param s The string to search (NUL-terminated).
  165. * @param substring The substring to find (NUL-terminated).
  166. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  167. * or <code>s</code> itself if the <code>substring</code> is empty,
  168. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  169. * @stable ICU 2.0
  170. *
  171. * @see u_strrstr
  172. * @see u_strFindFirst
  173. * @see u_strFindLast
  174. */
  175. U_STABLE UChar * U_EXPORT2
  176. u_strstr(const UChar *s, const UChar *substring);
  177. /**
  178. * Find the first occurrence of a substring in a string.
  179. * The substring is found at code point boundaries.
  180. * That means that if the substring begins with
  181. * a trail surrogate or ends with a lead surrogate,
  182. * then it is found only if these surrogates stand alone in the text.
  183. * Otherwise, the substring edge units would be matched against
  184. * halves of surrogate pairs.
  185. *
  186. * @param s The string to search.
  187. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  188. * @param substring The substring to find (NUL-terminated if subLength is -1).
  189. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  190. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  191. * or <code>s</code> itself if the <code>substring</code> is empty,
  192. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  193. * @stable ICU 2.4
  194. *
  195. * @see u_strstr
  196. * @see u_strFindLast
  197. */
  198. U_STABLE UChar * U_EXPORT2
  199. u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  200. /**
  201. * Find the first occurrence of a BMP code point in a string.
  202. * A surrogate code point is found only if its match in the text is not
  203. * part of a surrogate pair.
  204. * A NUL character is found at the string terminator.
  205. *
  206. * @param s The string to search (NUL-terminated).
  207. * @param c The BMP code point to find.
  208. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  209. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  210. * @stable ICU 2.0
  211. *
  212. * @see u_strchr32
  213. * @see u_memchr
  214. * @see u_strstr
  215. * @see u_strFindFirst
  216. */
  217. U_STABLE UChar * U_EXPORT2
  218. u_strchr(const UChar *s, UChar c);
  219. /**
  220. * Find the first occurrence of a code point in a string.
  221. * A surrogate code point is found only if its match in the text is not
  222. * part of a surrogate pair.
  223. * A NUL character is found at the string terminator.
  224. *
  225. * @param s The string to search (NUL-terminated).
  226. * @param c The code point to find.
  227. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  228. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  229. * @stable ICU 2.0
  230. *
  231. * @see u_strchr
  232. * @see u_memchr32
  233. * @see u_strstr
  234. * @see u_strFindFirst
  235. */
  236. U_STABLE UChar * U_EXPORT2
  237. u_strchr32(const UChar *s, UChar32 c);
  238. /**
  239. * Find the last occurrence of a substring in a string.
  240. * The substring is found at code point boundaries.
  241. * That means that if the substring begins with
  242. * a trail surrogate or ends with a lead surrogate,
  243. * then it is found only if these surrogates stand alone in the text.
  244. * Otherwise, the substring edge units would be matched against
  245. * halves of surrogate pairs.
  246. *
  247. * @param s The string to search (NUL-terminated).
  248. * @param substring The substring to find (NUL-terminated).
  249. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  250. * or <code>s</code> itself if the <code>substring</code> is empty,
  251. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  252. * @stable ICU 2.4
  253. *
  254. * @see u_strstr
  255. * @see u_strFindFirst
  256. * @see u_strFindLast
  257. */
  258. U_STABLE UChar * U_EXPORT2
  259. u_strrstr(const UChar *s, const UChar *substring);
  260. /**
  261. * Find the last occurrence of a substring in a string.
  262. * The substring is found at code point boundaries.
  263. * That means that if the substring begins with
  264. * a trail surrogate or ends with a lead surrogate,
  265. * then it is found only if these surrogates stand alone in the text.
  266. * Otherwise, the substring edge units would be matched against
  267. * halves of surrogate pairs.
  268. *
  269. * @param s The string to search.
  270. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  271. * @param substring The substring to find (NUL-terminated if subLength is -1).
  272. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  273. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  274. * or <code>s</code> itself if the <code>substring</code> is empty,
  275. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  276. * @stable ICU 2.4
  277. *
  278. * @see u_strstr
  279. * @see u_strFindFirst
  280. */
  281. U_STABLE UChar * U_EXPORT2
  282. u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  283. /**
  284. * Find the last occurrence of a BMP code point in a string.
  285. * A surrogate code point is found only if its match in the text is not
  286. * part of a surrogate pair.
  287. * A NUL character is found at the string terminator.
  288. *
  289. * @param s The string to search (NUL-terminated).
  290. * @param c The BMP code point to find.
  291. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  292. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  293. * @stable ICU 2.4
  294. *
  295. * @see u_strrchr32
  296. * @see u_memrchr
  297. * @see u_strrstr
  298. * @see u_strFindLast
  299. */
  300. U_STABLE UChar * U_EXPORT2
  301. u_strrchr(const UChar *s, UChar c);
  302. /**
  303. * Find the last occurrence of a code point in a string.
  304. * A surrogate code point is found only if its match in the text is not
  305. * part of a surrogate pair.
  306. * A NUL character is found at the string terminator.
  307. *
  308. * @param s The string to search (NUL-terminated).
  309. * @param c The code point to find.
  310. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  311. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  312. * @stable ICU 2.4
  313. *
  314. * @see u_strrchr
  315. * @see u_memrchr32
  316. * @see u_strrstr
  317. * @see u_strFindLast
  318. */
  319. U_STABLE UChar * U_EXPORT2
  320. u_strrchr32(const UChar *s, UChar32 c);
  321. /**
  322. * Locates the first occurrence in the string <code>string</code> of any of the characters
  323. * in the string <code>matchSet</code>.
  324. * Works just like C's strpbrk but with Unicode.
  325. *
  326. * @param string The string in which to search, NUL-terminated.
  327. * @param matchSet A NUL-terminated string defining a set of code points
  328. * for which to search in the text string.
  329. * @return A pointer to the character in <code>string</code> that matches one of the
  330. * characters in <code>matchSet</code>, or NULL if no such character is found.
  331. * @stable ICU 2.0
  332. */
  333. U_STABLE UChar * U_EXPORT2
  334. u_strpbrk(const UChar *string, const UChar *matchSet);
  335. /**
  336. * Returns the number of consecutive characters in <code>string</code>,
  337. * beginning with the first, that do not occur somewhere in <code>matchSet</code>.
  338. * Works just like C's strcspn but with Unicode.
  339. *
  340. * @param string The string in which to search, NUL-terminated.
  341. * @param matchSet A NUL-terminated string defining a set of code points
  342. * for which to search in the text string.
  343. * @return The number of initial characters in <code>string</code> that do not
  344. * occur in <code>matchSet</code>.
  345. * @see u_strspn
  346. * @stable ICU 2.0
  347. */
  348. U_STABLE int32_t U_EXPORT2
  349. u_strcspn(const UChar *string, const UChar *matchSet);
  350. /**
  351. * Returns the number of consecutive characters in <code>string</code>,
  352. * beginning with the first, that occur somewhere in <code>matchSet</code>.
  353. * Works just like C's strspn but with Unicode.
  354. *
  355. * @param string The string in which to search, NUL-terminated.
  356. * @param matchSet A NUL-terminated string defining a set of code points
  357. * for which to search in the text string.
  358. * @return The number of initial characters in <code>string</code> that do
  359. * occur in <code>matchSet</code>.
  360. * @see u_strcspn
  361. * @stable ICU 2.0
  362. */
  363. U_STABLE int32_t U_EXPORT2
  364. u_strspn(const UChar *string, const UChar *matchSet);
  365. /**
  366. * The string tokenizer API allows an application to break a string into
  367. * tokens. Unlike strtok(), the tokenizer state (the current pointer within the
  368. * original string) is maintained in saveState. In the first call, the
  369. * argument src is a pointer to the string. In subsequent calls to
  370. * return successive tokens of that string, src must be specified as
  371. * NULL. The value saveState is set by this function to maintain the
  372. * function's position within the string, and on each subsequent call
  373. * you must give this argument the same variable. This function does
  374. * handle surrogate pairs. This function is similar to strtok_r() from
  375. * the POSIX Threads Extension (1003.1c-1995).
  376. *
  377. * @param src String containing token(s). This string will be modified.
  378. * After the first call to u_strtok_r(), this argument must
  379. * be NULL to get to the next token.
  380. * @param delim Set of delimiter characters (Unicode code points).
  381. * @param saveState The current pointer within the original string,
  382. * which is set by this function. The saveState
  383. * parameter should be the address of a local variable of type
  384. * UChar *. (i.e. define "UChar *myLocalSaveState" and use
  385. * &myLocalSaveState for this parameter).
  386. * @return A pointer to the next token found in src, or NULL
  387. * when there are no more tokens.
  388. * @stable ICU 2.0
  389. */
  390. U_STABLE UChar * U_EXPORT2
  391. u_strtok_r(UChar *src,
  392. const UChar *delim,
  393. UChar **saveState);
  394. /**
  395. * Compare two Unicode strings for bitwise equality (code unit order).
  396. *
  397. * @param s1 A string to compare.
  398. * @param s2 A string to compare.
  399. * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative
  400. * value if <code>s1</code> is bitwise less than <code>s2</code>; a positive
  401. * value if <code>s1</code> is bitwise greater than <code>s2</code>.
  402. * @stable ICU 2.0
  403. */
  404. U_STABLE int32_t U_EXPORT2
  405. u_strcmp(const UChar *s1,
  406. const UChar *s2);
  407. /**
  408. * Compare two Unicode strings in code point order.
  409. * See u_strCompare for details.
  410. *
  411. * @param s1 A string to compare.
  412. * @param s2 A string to compare.
  413. * @return a negative/zero/positive integer corresponding to whether
  414. * the first string is less than/equal to/greater than the second one
  415. * in code point order
  416. * @stable ICU 2.0
  417. */
  418. U_STABLE int32_t U_EXPORT2
  419. u_strcmpCodePointOrder(const UChar *s1, const UChar *s2);
  420. /**
  421. * Compare two Unicode strings (binary order).
  422. *
  423. * The comparison can be done in code unit order or in code point order.
  424. * They differ only in UTF-16 when
  425. * comparing supplementary code points (U+10000..U+10ffff)
  426. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  427. * In code unit order, high BMP code points sort after supplementary code points
  428. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  429. *
  430. * This function works with strings of different explicitly specified lengths
  431. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  432. * NUL-terminated strings are possible with length arguments of -1.
  433. *
  434. * @param s1 First source string.
  435. * @param length1 Length of first source string, or -1 if NUL-terminated.
  436. *
  437. * @param s2 Second source string.
  438. * @param length2 Length of second source string, or -1 if NUL-terminated.
  439. *
  440. * @param codePointOrder Choose between code unit order (FALSE)
  441. * and code point order (TRUE).
  442. *
  443. * @return <0 or 0 or >0 as usual for string comparisons
  444. *
  445. * @stable ICU 2.2
  446. */
  447. U_STABLE int32_t U_EXPORT2
  448. u_strCompare(const UChar *s1, int32_t length1,
  449. const UChar *s2, int32_t length2,
  450. UBool codePointOrder);
  451. /**
  452. * Compare two Unicode strings (binary order)
  453. * as presented by UCharIterator objects.
  454. * Works otherwise just like u_strCompare().
  455. *
  456. * Both iterators are reset to their start positions.
  457. * When the function returns, it is undefined where the iterators
  458. * have stopped.
  459. *
  460. * @param iter1 First source string iterator.
  461. * @param iter2 Second source string iterator.
  462. * @param codePointOrder Choose between code unit order (FALSE)
  463. * and code point order (TRUE).
  464. *
  465. * @return <0 or 0 or >0 as usual for string comparisons
  466. *
  467. * @see u_strCompare
  468. *
  469. * @stable ICU 2.6
  470. */
  471. U_STABLE int32_t U_EXPORT2
  472. u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder);
  473. #ifndef U_COMPARE_CODE_POINT_ORDER
  474. /* see also unistr.h and unorm.h */
  475. /**
  476. * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc:
  477. * Compare strings in code point order instead of code unit order.
  478. * @stable ICU 2.2
  479. */
  480. #define U_COMPARE_CODE_POINT_ORDER 0x8000
  481. #endif
  482. /**
  483. * Compare two strings case-insensitively using full case folding.
  484. * This is equivalent to
  485. * u_strCompare(u_strFoldCase(s1, options),
  486. * u_strFoldCase(s2, options),
  487. * (options&U_COMPARE_CODE_POINT_ORDER)!=0).
  488. *
  489. * The comparison can be done in UTF-16 code unit order or in code point order.
  490. * They differ only when comparing supplementary code points (U+10000..U+10ffff)
  491. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  492. * In code unit order, high BMP code points sort after supplementary code points
  493. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  494. *
  495. * This function works with strings of different explicitly specified lengths
  496. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  497. * NUL-terminated strings are possible with length arguments of -1.
  498. *
  499. * @param s1 First source string.
  500. * @param length1 Length of first source string, or -1 if NUL-terminated.
  501. *
  502. * @param s2 Second source string.
  503. * @param length2 Length of second source string, or -1 if NUL-terminated.
  504. *
  505. * @param options A bit set of options:
  506. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  507. * Comparison in code unit order with default case folding.
  508. *
  509. * - U_COMPARE_CODE_POINT_ORDER
  510. * Set to choose code point order instead of code unit order
  511. * (see u_strCompare for details).
  512. *
  513. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  514. *
  515. * @param pErrorCode Must be a valid pointer to an error code value,
  516. * which must not indicate a failure before the function call.
  517. *
  518. * @return <0 or 0 or >0 as usual for string comparisons
  519. *
  520. * @stable ICU 2.2
  521. */
  522. U_STABLE int32_t U_EXPORT2
  523. u_strCaseCompare(const UChar *s1, int32_t length1,
  524. const UChar *s2, int32_t length2,
  525. uint32_t options,
  526. UErrorCode *pErrorCode);
  527. /**
  528. * Compare two ustrings for bitwise equality.
  529. * Compares at most <code>n</code> characters.
  530. *
  531. * @param ucs1 A string to compare (can be NULL/invalid if n<=0).
  532. * @param ucs2 A string to compare (can be NULL/invalid if n<=0).
  533. * @param n The maximum number of characters to compare; always returns 0 if n<=0.
  534. * @return 0 if <code>ucs1</code> and <code>ucs2</code> are bitwise equal; a negative
  535. * value if <code>ucs1</code> is bitwise less than <code>ucs2</code>; a positive
  536. * value if <code>ucs1</code> is bitwise greater than <code>ucs2</code>.
  537. * @stable ICU 2.0
  538. */
  539. U_STABLE int32_t U_EXPORT2
  540. u_strncmp(const UChar *ucs1,
  541. const UChar *ucs2,
  542. int32_t n);
  543. /**
  544. * Compare two Unicode strings in code point order.
  545. * This is different in UTF-16 from u_strncmp() if supplementary characters are present.
  546. * For details, see u_strCompare().
  547. *
  548. * @param s1 A string to compare.
  549. * @param s2 A string to compare.
  550. * @param n The maximum number of characters to compare.
  551. * @return a negative/zero/positive integer corresponding to whether
  552. * the first string is less than/equal to/greater than the second one
  553. * in code point order
  554. * @stable ICU 2.0
  555. */
  556. U_STABLE int32_t U_EXPORT2
  557. u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n);
  558. /**
  559. * Compare two strings case-insensitively using full case folding.
  560. * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)).
  561. *
  562. * @param s1 A string to compare.
  563. * @param s2 A string to compare.
  564. * @param options A bit set of options:
  565. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  566. * Comparison in code unit order with default case folding.
  567. *
  568. * - U_COMPARE_CODE_POINT_ORDER
  569. * Set to choose code point order instead of code unit order
  570. * (see u_strCompare for details).
  571. *
  572. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  573. *
  574. * @return A negative, zero, or positive integer indicating the comparison result.
  575. * @stable ICU 2.0
  576. */
  577. U_STABLE int32_t U_EXPORT2
  578. u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options);
  579. /**
  580. * Compare two strings case-insensitively using full case folding.
  581. * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options),
  582. * u_strFoldCase(s2, at most n, options)).
  583. *
  584. * @param s1 The first string to compare.
  585. * @param s2 The second string to compare.
  586. * @param n The maximum number of characters in each string to case-fold and then compare.
  587. * @param options A bit set of options:
  588. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  589. * Comparison in code unit order with default case folding.
  590. *
  591. * - U_COMPARE_CODE_POINT_ORDER
  592. * Set to choose code point order instead of code unit order
  593. * (see u_strCompare for details).
  594. *
  595. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  596. * Case folding option; see the options parameter of u_strFoldCase().
  597. * @return A negative, zero, or positive integer indicating the comparison result.
  598. * @stable ICU 2.0
  599. */
  600. U_STABLE int32_t U_EXPORT2
  601. u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options);
  602. /**
  603. * Compare two strings case-insensitively using full case folding.
  604. * This is equivalent to u_strcmp(u_strFoldCase(s1, length, options),
  605. * u_strFoldCase(s2, length, options)).
  606. *
  607. * @param s1 The first string to compare.
  608. * @param s2 The second string to compare.
  609. * @param length The number of characters in each string to case-fold and then compare.
  610. * @param options A bit set of options:
  611. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  612. * Comparison in code unit order with default case folding.
  613. *
  614. * - U_COMPARE_CODE_POINT_ORDER
  615. * Set to choose code point order instead of code unit order
  616. * (see u_strCompare for details).
  617. *
  618. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  619. * Case folding option; see the options parameter of u_strFoldCase().
  620. * @return A negative, zero, or positive integer indicating the comparison result.
  621. * @stable ICU 2.0
  622. */
  623. U_STABLE int32_t U_EXPORT2
  624. u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options);
  625. /**
  626. * Copy the ustring <code>src</code> to <code>dst</code>. Adds a null terminator.
  627. *
  628. * @param dst The destination string.
  629. * @param src The source string.
  630. * @return A pointer to <code>dst</code>.
  631. * @stable ICU 2.0
  632. */
  633. U_STABLE UChar* U_EXPORT2
  634. u_strcpy(UChar *dst,
  635. const UChar *src);
  636. /**
  637. * Copy a ustring.
  638. * Copies at most <code>n</code> characters. The result will be null-terminated
  639. * only if the length of <code>src</code> is less than <code>n</code>.
  640. *
  641. * @param dst The destination string.
  642. * @param src The source string (can be NULL/invalid if n<=0).
  643. * @param n The maximum number of characters to copy; no-op if <=0.
  644. * @return A pointer to <code>dst</code>.
  645. * @stable ICU 2.0
  646. */
  647. U_STABLE UChar* U_EXPORT2
  648. u_strncpy(UChar *dst,
  649. const UChar *src,
  650. int32_t n);
  651. #if !UCONFIG_NO_CONVERSION
  652. /**
  653. * Copy a byte string encoded in the default codepage to a ustring.
  654. * Adds a null terminator.
  655. * Performs a host byte to UChar conversion.
  656. *
  657. * @param dst The destination string.
  658. * @param src The source string.
  659. * @return A pointer to <code>dst</code>.
  660. * @stable ICU 2.0
  661. */
  662. U_STABLE UChar* U_EXPORT2 u_uastrcpy(UChar *dst,
  663. const char *src );
  664. /**
  665. * Copy a byte string encoded in the default codepage to a ustring.
  666. * Copies at most <code>n</code> characters. The result will be null-terminated
  667. * only if the length of <code>src</code> is less than <code>n</code>.
  668. * Performs a host byte to UChar conversion.
  669. *
  670. * @param dst The destination string.
  671. * @param src The source string.
  672. * @param n The maximum number of characters to copy.
  673. * @return A pointer to <code>dst</code>.
  674. * @stable ICU 2.0
  675. */
  676. U_STABLE UChar* U_EXPORT2 u_uastrncpy(UChar *dst,
  677. const char *src,
  678. int32_t n);
  679. /**
  680. * Copy a ustring to a byte string encoded in the default codepage.
  681. * Adds a null terminator.
  682. * Performs a UChar to host byte conversion.
  683. *
  684. * @param dst The destination string.
  685. * @param src The source string.
  686. * @return A pointer to <code>dst</code>.
  687. * @stable ICU 2.0
  688. */
  689. U_STABLE char* U_EXPORT2 u_austrcpy(char *dst,
  690. const UChar *src );
  691. /**
  692. * Copy a ustring to a byte string encoded in the default codepage.
  693. * Copies at most <code>n</code> characters. The result will be null-terminated
  694. * only if the length of <code>src</code> is less than <code>n</code>.
  695. * Performs a UChar to host byte conversion.
  696. *
  697. * @param dst The destination string.
  698. * @param src The source string.
  699. * @param n The maximum number of characters to copy.
  700. * @return A pointer to <code>dst</code>.
  701. * @stable ICU 2.0
  702. */
  703. U_STABLE char* U_EXPORT2 u_austrncpy(char *dst,
  704. const UChar *src,
  705. int32_t n );
  706. #endif /* !UCONFIG_NO_CONVERSION */
  707. /**
  708. * Synonym for memcpy(), but with UChars only.
  709. * @param dest The destination string.
  710. * @param src The source string (can be NULL/invalid if count<=0).
  711. * @param count The number of characters to copy; no-op if <=0.
  712. * @return A pointer to <code>dest</code>.
  713. * @stable ICU 2.0
  714. */
  715. U_STABLE UChar* U_EXPORT2
  716. u_memcpy(UChar *dest, const UChar *src, int32_t count);
  717. /**
  718. * Synonym for memmove(), but with UChars only.
  719. * @param dest The destination string.
  720. * @param src The source string (can be NULL/invalid if count<=0).
  721. * @param count The number of characters to move; no-op if <=0.
  722. * @return A pointer to <code>dest</code>.
  723. * @stable ICU 2.0
  724. */
  725. U_STABLE UChar* U_EXPORT2
  726. u_memmove(UChar *dest, const UChar *src, int32_t count);
  727. /**
  728. * Initialize <code>count</code> characters of <code>dest</code> to <code>c</code>.
  729. *
  730. * @param dest The destination string.
  731. * @param c The character to initialize the string with.
  732. * @param count The number of characters to set.
  733. * @return A pointer to <code>dest</code>.
  734. * @stable ICU 2.0
  735. */
  736. U_STABLE UChar* U_EXPORT2
  737. u_memset(UChar *dest, UChar c, int32_t count);
  738. /**
  739. * Compare the first <code>count</code> UChars of each buffer.
  740. *
  741. * @param buf1 The first string to compare.
  742. * @param buf2 The second string to compare.
  743. * @param count The maximum number of UChars to compare.
  744. * @return A negative number if buf1 < buf2,
  745. * 0 if buf1 == buf2,
  746. * a positive number if buf1 > buf2.
  747. * @stable ICU 2.0
  748. */
  749. U_STABLE int32_t U_EXPORT2
  750. u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count);
  751. /**
  752. * Compare two Unicode strings in code point order.
  753. * The result differs from that of u_memcmp() in UTF-16 if supplementary characters are present.
  754. * For details, see u_strCompare().
  755. *
  756. * @param s1 The first string to compare.
  757. * @param s2 The second string to compare.
  758. * @param count The maximum number of characters to compare.
  759. * @return a negative/zero/positive integer corresponding to whether
  760. * the first string is less than/equal to/greater than the second one
  761. * in code point order.
  762. * @stable ICU 2.0
  763. */
  764. U_STABLE int32_t U_EXPORT2
  765. u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
  766. /**
  767. * Find the first occurrence of a BMP code point in a string.
  768. * A surrogate code point is found only if its match in the text is not
  769. * part of a surrogate pair.
  770. * A NUL character is found at the string terminator. (NOTE(review): s is delimited by count here; confirm this sentence applies.)
  771. *
  772. * @param s The string to search (contains <code>count</code> UChars).
  773. * @param c The BMP code point to find.
  774. * @param count The length of the string.
  775. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  776. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  777. * @stable ICU 2.0
  778. *
  779. * @see u_strchr
  780. * @see u_memchr32
  781. * @see u_strFindFirst
  782. */
  783. U_STABLE UChar* U_EXPORT2
  784. u_memchr(const UChar *s, UChar c, int32_t count);
  785. /**
  786. * Find the first occurrence of a code point in a string.
  787. * A surrogate code point is found only if its match in the text is not
  788. * part of a surrogate pair.
  789. * A NUL character is found at the string terminator. (NOTE(review): s is delimited by count here; confirm this sentence applies.)
  790. *
  791. * @param s The string to search (contains <code>count</code> UChars).
  792. * @param c The code point to find.
  793. * @param count The length of the string.
  794. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  795. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  796. * @stable ICU 2.0
  797. *
  798. * @see u_strchr32
  799. * @see u_memchr
  800. * @see u_strFindFirst
  801. */
  802. U_STABLE UChar* U_EXPORT2
  803. u_memchr32(const UChar *s, UChar32 c, int32_t count);
  804. /**
  805. * Find the last occurrence of a BMP code point in a string.
  806. * A surrogate code point is found only if its match in the text is not
  807. * part of a surrogate pair.
  808. * A NUL character is found at the string terminator. (NOTE(review): s is delimited by count here; confirm this sentence applies.)
  809. *
  810. * @param s The string to search (contains <code>count</code> UChars).
  811. * @param c The BMP code point to find.
  812. * @param count The length of the string.
  813. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  814. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  815. * @stable ICU 2.4
  816. *
  817. * @see u_strrchr
  818. * @see u_memrchr32
  819. * @see u_strFindLast
  820. */
  821. U_STABLE UChar* U_EXPORT2
  822. u_memrchr(const UChar *s, UChar c, int32_t count);
  823. /**
  824. * Find the last occurrence of a code point in a string.
  825. * A surrogate code point is found only if its match in the text is not
  826. * part of a surrogate pair.
  827. * A NUL character is found at the string terminator. (NOTE(review): s is delimited by count here; confirm this sentence applies.)
  828. *
  829. * @param s The string to search (contains <code>count</code> UChars).
  830. * @param c The code point to find.
  831. * @param count The length of the string.
  832. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  833. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  834. * @stable ICU 2.4
  835. *
  836. * @see u_strrchr32
  837. * @see u_memrchr
  838. * @see u_strFindLast
  839. */
  840. U_STABLE UChar* U_EXPORT2
  841. u_memrchr32(const UChar *s, UChar32 c, int32_t count);
  842. /**
  843. * Unicode String literals in C.
  844. * We need one macro to declare a variable for the string
  845. * and to statically preinitialize it if possible,
  846. * and a second macro to dynamically initialize such a string variable if necessary.
  847. *
  848. * The macros are defined for maximum performance.
  849. * They work only for strings that contain "invariant characters", i.e.,
  850. * only Latin letters, digits, and some punctuation.
  851. * See utypes.h for details.
  852. *
  853. * A pair of macros for a single string must be used with the same
  854. * parameters.
  855. * The string parameter must be a C string literal.
  856. * The length of the string, not including the terminating
  857. * <code>NUL</code>, must be specified as a constant.
  858. * The U_STRING_DECL macro should be invoked exactly once for one
  859. * such string variable before it is used.
  860. *
  861. * Usage:
  862. * <pre>
  863. * U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
  864. * U_STRING_DECL(ustringVar2, "jumps 5%", 8);
  865. * static UBool didInit=FALSE;
  866. *
  867. * int32_t function() {
  868. * if(!didInit) {
  869. * U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
  870. * U_STRING_INIT(ustringVar2, "jumps 5%", 8);
  871. * didInit=TRUE;
  872. * }
  873. * return u_strcmp(ustringVar1, ustringVar2);
  874. * }
  875. * </pre>
  876. *
  877. * Note that the macros will NOT consistently work if their argument is another <code>#define</code>.
  878. * The following will not work on all platforms, so do not use it.
  879. *
  880. * <pre>
  881. * #define GLUCK "Mr. Gluck"
  882. * U_STRING_DECL(var, GLUCK, 9)
  883. * U_STRING_INIT(var, GLUCK, 9)
  884. * </pre>
  885. *
  886. * Instead, use the string literal "Mr. Gluck" as the argument to both macro
  887. * calls.
  888. *
  889. *
  890. * @stable ICU 2.0
  891. */
  892. #if defined(U_DECLARE_UTF16)
  893. # define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs)
  894. /**@stable ICU 2.0 */
  895. # define U_STRING_INIT(var, cs, length)
  896. #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16)))
  897. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs
  898. /**@stable ICU 2.0 */
  899. # define U_STRING_INIT(var, cs, length)
  900. #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
  901. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=cs
  902. /**@stable ICU 2.0 */
  903. # define U_STRING_INIT(var, cs, length)
  904. #else
  905. # define U_STRING_DECL(var, cs, length) static UChar var[(length)+1]
  906. /**@stable ICU 2.0 */
  907. # define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, (length)+1) /* (length) parenthesized to match the array size in U_STRING_DECL */
  908. #endif
  909. /**
  910. * Unescape a string of characters and write the resulting
  911. * Unicode characters to the destination buffer. The following escape
  912. * sequences are recognized:
  913. *
  914. * \\uhhhh 4 hex digits; h in [0-9A-Fa-f]
  915. * \\Uhhhhhhhh 8 hex digits
  916. * \\xhh 1-2 hex digits
  917. * \\x{h...} 1-8 hex digits
  918. * \\ooo 1-3 octal digits; o in [0-7]
  919. * \\cX control-X; X is masked with 0x1F
  920. *
  921. * as well as the standard ANSI C escapes:
  922. *
  923. * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
  924. * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
  925. * \\&quot; => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
  926. *
  927. * Anything else following a backslash is generically escaped. For
  928. * example, "[a\\-z]" returns "[a-z]".
  929. *
  930. * If an escape sequence is ill-formed, an empty string is stored in
  931. * dest (if possible). An example of an ill-formed sequence is "\\u" followed by
  932. * fewer than 4 hex digits.
  933. *
  934. * The above characters are recognized in the compiler's codepage,
  935. * that is, they are coded as 'u', '\\', etc. Characters that are
  936. * not parts of escape sequences are converted using u_charsToUChars().
  937. *
  938. * This function is similar to UnicodeString::unescape() but not
  939. * identical to it. The latter takes a source UnicodeString, so it
  940. * does escape recognition but no conversion.
  941. *
  942. * @param src a zero-terminated string of invariant characters
  943. * @param dest pointer to buffer to receive converted and unescaped
  944. * text and, if there is room, a zero terminator. May be NULL for
  945. * preflighting, in which case no UChars will be written, but the
  946. * return value will still be valid. On error, an empty string is
  947. * stored here (if possible).
  948. * @param destCapacity the number of UChars that may be written at
  949. * dest. Ignored if dest == NULL.
  950. * @return the length of the unescaped string.
  951. * @see u_unescapeAt
  952. * @see UnicodeString#unescape()
  953. * @see UnicodeString#unescapeAt()
  954. * @stable ICU 2.0
  955. */
  956. U_STABLE int32_t U_EXPORT2
  957. u_unescape(const char *src,
  958. UChar *dest, int32_t destCapacity);
  959. U_CDECL_BEGIN
  960. /**
  961. * Callback function for u_unescapeAt() that returns a character of
  962. * the source text given an offset and a context pointer. The context
  963. * pointer will be whatever is passed into u_unescapeAt().
  964. *
  965. * @param offset the offset into the source text of the character to return
  966. * @param context an opaque pointer passed directly into u_unescapeAt()
  967. * @return the character of the source text at
  968. * offset
  969. * @see u_unescapeAt
  970. * @stable ICU 2.0
  971. */
  972. typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context);
  973. U_CDECL_END
  974. /**
  975. * Unescape a single sequence. The character at offset-1 is assumed
  976. * (without checking) to be a backslash. This function takes a callback
  977. * pointer to a function that returns the UChar at a given offset. By
  978. * varying this callback, ICU functions are able to unescape char*
  979. * strings, UnicodeString objects, and UFILE pointers.
  980. *
  981. * If offset is out of range, or if the escape sequence is ill-formed,
  982. * (UChar32)0xFFFFFFFF is returned. See the documentation of u_unescape()
  983. * for a list of recognized sequences.
  984. *
  985. * @param charAt callback function that returns a UChar of the source
  986. * text given an offset and a context pointer.
  987. * @param offset pointer to the offset that will be passed to charAt.
  988. * The offset value will be updated upon return to point after the
  989. * last parsed character of the escape sequence. On error the offset
  990. * is unchanged.
  991. * @param length the number of characters in the source text. The
  992. * last character of the source text is considered to be at offset
  993. * length-1.
  994. * @param context an opaque pointer passed directly into charAt.
  995. * @return the character represented by the escape sequence at
  996. * offset, or (UChar32)0xFFFFFFFF on error.
  997. * @see u_unescape()
  998. * @see UnicodeString#unescape()
  999. * @see UnicodeString#unescapeAt()
  1000. * @stable ICU 2.0
  1001. */
  1002. U_STABLE UChar32 U_EXPORT2
  1003. u_unescapeAt(UNESCAPE_CHAR_AT charAt,
  1004. int32_t *offset,
  1005. int32_t length,
  1006. void *context);
  1007. /**
  1008. * Uppercase the characters in a string.
  1009. * Casing is locale-dependent and context-sensitive.
  1010. * The result may be longer or shorter than the original.
  1011. * The source string and the destination buffer are allowed to overlap.
  1012. *
  1013. * @param dest A buffer for the result string. The result will be zero-terminated if
  1014. * the buffer is large enough.
  1015. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1016. * dest may be NULL and the function will only return the length of the result
  1017. * without writing any of the result string.
  1018. * @param src The original string.
  1019. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1020. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1021. * @param pErrorCode Must be a valid pointer to an error code value,
  1022. * which must not indicate a failure before the function call.
  1023. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1024. * only some of the result was written to the destination buffer.
  1025. * @stable ICU 2.0
  1026. */
  1027. U_STABLE int32_t U_EXPORT2
  1028. u_strToUpper(UChar *dest, int32_t destCapacity,
  1029. const UChar *src, int32_t srcLength,
  1030. const char *locale,
  1031. UErrorCode *pErrorCode);
  1032. /**
  1033. * Lowercase the characters in a string.
  1034. * Casing is locale-dependent and context-sensitive.
  1035. * The result may be longer or shorter than the original.
  1036. * The source string and the destination buffer are allowed to overlap.
  1037. *
  1038. * @param dest A buffer for the result string. The result will be zero-terminated if
  1039. * the buffer is large enough.
  1040. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1041. * dest may be NULL and the function will only return the length of the result
  1042. * without writing any of the result string.
  1043. * @param src The original string.
  1044. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1045. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1046. * @param pErrorCode Must be a valid pointer to an error code value,
  1047. * which must not indicate a failure before the function call.
  1048. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1049. * only some of the result was written to the destination buffer.
  1050. * @stable ICU 2.0
  1051. */
  1052. U_STABLE int32_t U_EXPORT2
  1053. u_strToLower(UChar *dest, int32_t destCapacity,
  1054. const UChar *src, int32_t srcLength,
  1055. const char *locale,
  1056. UErrorCode *pErrorCode);
  1057. #if !UCONFIG_NO_BREAK_ITERATION
  1058. /**
  1059. * Titlecase a string.
  1060. * Casing is locale-dependent and context-sensitive.
  1061. * Titlecasing uses a break iterator to find the first characters of words
  1062. * that are to be titlecased. It titlecases those characters and lowercases
  1063. * all others.
  1064. *
  1065. * The titlecase break iterator can be provided to customize for arbitrary
  1066. * styles, using rules and dictionaries beyond the standard iterators.
  1067. * It may be more efficient to always provide an iterator to avoid
  1068. * opening and closing one for each string.
  1069. * The standard titlecase iterator for the root locale implements the
  1070. * algorithm of Unicode TR 21.
  1071. *
  1072. * This function uses only the setText(), first() and next() methods of the
  1073. * provided break iterator.
  1074. *
  1075. * The result may be longer or shorter than the original.
  1076. * The source string and the destination buffer are allowed to overlap.
  1077. *
  1078. * @param dest A buffer for the result string. The result will be zero-terminated if
  1079. * the buffer is large enough.
  1080. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1081. * dest may be NULL and the function will only return the length of the result
  1082. * without writing any of the result string.
  1083. * @param src The original string.
  1084. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1085. * @param titleIter A break iterator to find the first characters of words
  1086. * that are to be titlecased.
  1087. * If none is provided (NULL), then a standard titlecase
  1088. * break iterator is opened.
  1089. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1090. * @param pErrorCode Must be a valid pointer to an error code value,
  1091. * which must not indicate a failure before the function call.
  1092. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1093. * only some of the result was written to the destination buffer.
  1094. * @stable ICU 2.1
  1095. */
  1096. U_STABLE int32_t U_EXPORT2
  1097. u_strToTitle(UChar *dest, int32_t destCapacity,
  1098. const UChar *src, int32_t srcLength,
  1099. UBreakIterator *titleIter,
  1100. const char *locale,
  1101. UErrorCode *pErrorCode);
  1102. #endif /* !UCONFIG_NO_BREAK_ITERATION */
  1103. /**
  1104. * Case-fold the characters in a string.
  1105. *
  1106. * Case-folding is locale-independent and not context-sensitive,
  1107. * but there is an option for whether to include or exclude mappings for dotted I
  1108. * and dotless i that are marked with 'T' in CaseFolding.txt.
  1109. *
  1110. * The result may be longer or shorter than the original.
  1111. * The source string and the destination buffer are allowed to overlap.
  1112. *
  1113. * @param dest A buffer for the result string. The result will be zero-terminated if
  1114. * the buffer is large enough.
  1115. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1116. * dest may be NULL and the function will only return the length of the result
  1117. * without writing any of the result string.
  1118. * @param src The original string.
  1119. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1120. * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I.
  1121. * @param pErrorCode Must be a valid pointer to an error code value,
  1122. * which must not indicate a failure before the function call.
  1123. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1124. * only some of the result was written to the destination buffer.
  1125. * @stable ICU 2.0
  1126. */
  1127. U_STABLE int32_t U_EXPORT2
  1128. u_strFoldCase(UChar *dest, int32_t destCapacity,
  1129. const UChar *src, int32_t srcLength,
  1130. uint32_t options,
  1131. UErrorCode *pErrorCode);
  1132. #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
  1133. /**
  1134. * Convert a UTF-16 string to a wchar_t string.
  1135. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1136. * this function simply calls the fast, dedicated function for that.
  1137. * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
  1138. *
  1139. * @param dest A buffer for the result string. The result will be zero-terminated if
  1140. * the buffer is large enough.
  1141. * @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then
  1142. * dest may be NULL and the function will only return the length of the
  1143. * result without writing any of the result string (pre-flighting).
  1144. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1145. * pDestLength!=NULL then *pDestLength is always set to the
  1146. * number of output units corresponding to the transformation of
  1147. * all the input units, even in case of a buffer overflow.
  1148. * @param src The original source string.
  1149. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1150. * @param pErrorCode Must be a valid pointer to an error code value,
  1151. * which must not indicate a failure before the function call.
  1152. * @return The pointer to the destination buffer.
  1153. * @stable ICU 2.0
  1154. */
  1155. U_STABLE wchar_t* U_EXPORT2
  1156. u_strToWCS(wchar_t *dest,
  1157. int32_t destCapacity,
  1158. int32_t *pDestLength,
  1159. const UChar *src,
  1160. int32_t srcLength,
  1161. UErrorCode *pErrorCode);
  1162. /**
  1163. * Convert a wchar_t string to UTF-16.
  1164. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1165. * this function simply calls the fast, dedicated function for that.
  1166. * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
  1167. *
  1168. * @param dest A buffer for the result string. The result will be zero-terminated if
  1169. * the buffer is large enough.
  1170. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1171. * dest may be NULL and the function will only return the length of the
  1172. * result without writing any of the result string (pre-flighting).
  1173. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1174. * pDestLength!=NULL then *pDestLength is always set to the
  1175. * number of output units corresponding to the transformation of
  1176. * all the input units, even in case of a buffer overflow.
  1177. * @param src The original source string.
  1178. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1179. * @param pErrorCode Must be a valid pointer to an error code value,
  1180. * which must not indicate a failure before the function call.
  1181. * @return The pointer to the destination buffer.
  1182. * @stable ICU 2.0
  1183. */
  1184. U_STABLE UChar* U_EXPORT2
  1185. u_strFromWCS(UChar *dest,
  1186. int32_t destCapacity,
  1187. int32_t *pDestLength,
  1188. const wchar_t *src,
  1189. int32_t srcLength,
  1190. UErrorCode *pErrorCode);
  1191. #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */
  1192. /**
  1193. * Convert a UTF-16 string to UTF-8.
  1194. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1195. *
  1196. * @param dest A buffer for the result string. The result will be zero-terminated if
  1197. * the buffer is large enough.
  1198. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1199. * dest may be NULL and the function will only return the length of the
  1200. * result without writing any of the result string (pre-flighting).
  1201. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1202. * pDestLength!=NULL then *pDestLength is always set to the
  1203. * number of output units corresponding to the transformation of
  1204. * all the input units, even in case of a buffer overflow.
  1205. * @param src The original source string.
  1206. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1207. * @param pErrorCode Must be a valid pointer to an error code value,
  1208. * which must not indicate a failure before the function call.
  1209. * @return The pointer to the destination buffer.
  1210. * @stable ICU 2.0
  1211. * @see u_strToUTF8WithSub
  1212. * @see u_strFromUTF8
  1213. */
  1214. U_STABLE char* U_EXPORT2
  1215. u_strToUTF8(char *dest,
  1216. int32_t destCapacity,
  1217. int32_t *pDestLength,
  1218. const UChar *src,
  1219. int32_t srcLength,
  1220. UErrorCode *pErrorCode);
  1221. /**
  1222. * Convert a UTF-8 string to UTF-16.
  1223. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1224. *
  1225. * @param dest A buffer for the result string. The result will be zero-terminated if
  1226. * the buffer is large enough.
  1227. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1228. * dest may be NULL and the function will only return the length of the
  1229. * result without writing any of the result string (pre-flighting).
  1230. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1231. * pDestLength!=NULL then *pDestLength is always set to the
  1232. * number of output units corresponding to the transformation of
  1233. * all the input units, even in case of a buffer overflow.
  1234. * @param src The original source string.
  1235. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1236. * @param pErrorCode Must be a valid pointer to an error code value,
  1237. * which must not indicate a failure before the function call.
  1238. * @return The pointer to the destination buffer.
  1239. * @stable ICU 2.0
  1240. * @see u_strFromUTF8WithSub
  1241. * @see u_strFromUTF8Lenient
  1242. */
  1243. U_STABLE UChar* U_EXPORT2
  1244. u_strFromUTF8(UChar *dest,
  1245. int32_t destCapacity,
  1246. int32_t *pDestLength,
  1247. const char *src,
  1248. int32_t srcLength,
  1249. UErrorCode *pErrorCode);
  1250. /**
  1251. * Convert a UTF-16 string to UTF-8.
  1252. *
  1253. * Same as u_strToUTF8() except for the additional subchar which is output for
  1254. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1255. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
  1256. *
  1257. * @param dest A buffer for the result string. The result will be zero-terminated if
  1258. * the buffer is large enough.
  1259. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1260. * dest may be NULL and the function will only return the length of the
  1261. * result without writing any of the result string (pre-flighting).
  1262. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1263. * pDestLength!=NULL then *pDestLength is always set to the
  1264. * number of output units corresponding to the transformation of
  1265. * all the input units, even in case of a buffer overflow.
  1266. * @param src The original source string
  1267. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1268. * @param subchar The substitution character to use in place of an illegal input sequence,
  1269. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1270. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1271. * except for surrogate code points (U+D800..U+DFFF).
  1272. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1273. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1274. * Set to 0 if no substitutions occur or subchar<0.
  1275. * pNumSubstitutions can be NULL.
  1276. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1277. * pass the U_SUCCESS() test, or else the function returns
  1278. * immediately. Check for U_FAILURE() on output or use with
  1279. * function chaining. (See User Guide for details.)
  1280. * @return The pointer to destination buffer.
  1281. * @see u_strToUTF8
  1282. * @see u_strFromUTF8WithSub
  1283. * @stable ICU 3.6
  1284. */
  1285. U_STABLE char* U_EXPORT2
  1286. u_strToUTF8WithSub(char *dest,
  1287. int32_t destCapacity,
  1288. int32_t *pDestLength,
  1289. const UChar *src,
  1290. int32_t srcLength,
  1291. UChar32 subchar, int32_t *pNumSubstitutions,
  1292. UErrorCode *pErrorCode);
  1293. /**
  1294. * Convert a UTF-8 string to UTF-16.
  1295. *
  1296. * Same as u_strFromUTF8() except for the additional subchar which is output for
  1297. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1298. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
  1299. *
  1300. * @param dest A buffer for the result string. The result will be zero-terminated if
  1301. * the buffer is large enough.
  1302. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1303. * dest may be NULL and the function will only return the length of the
  1304. * result without writing any of the result string (pre-flighting).
  1305. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1306. * pDestLength!=NULL then *pDestLength is always set to the
  1307. * number of output units corresponding to the transformation of
  1308. * all the input units, even in case of a buffer overflow.
  1309. * @param src The original source string
  1310. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1311. * @param subchar The substitution character to use in place of an illegal input sequence,
  1312. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1313. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1314. * except for surrogate code points (U+D800..U+DFFF).
  1315. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1316. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1317. * Set to 0 if no substitutions occur or subchar<0.
  1318. * pNumSubstitutions can be NULL.
  1319. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1320. * pass the U_SUCCESS() test, or else the function returns
  1321. * immediately. Check for U_FAILURE() on output or use with
  1322. * function chaining. (See User Guide for details.)
  1323. * @return The pointer to destination buffer.
  1324. * @see u_strFromUTF8
  1325. * @see u_strFromUTF8Lenient
  1326. * @see u_strToUTF8WithSub
  1327. * @stable ICU 3.6
  1328. */
  1329. U_STABLE UChar* U_EXPORT2
  1330. u_strFromUTF8WithSub(UChar *dest,
  1331. int32_t destCapacity,
  1332. int32_t *pDestLength,
  1333. const char *src,
  1334. int32_t srcLength,
  1335. UChar32 subchar, int32_t *pNumSubstitutions,
  1336. UErrorCode *pErrorCode);
  1337. /**
  1338. * Convert a UTF-8 string to UTF-16.
  1339. *
  1340. * Same as u_strFromUTF8() except that this function is designed to be very fast,
  1341. * which it achieves by being lenient about malformed UTF-8 sequences.
  1342. * This function is intended for use in environments where UTF-8 text is
  1343. * expected to be well-formed.
  1344. *
  1345. * Its semantics are:
  1346. * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
  1347. * - The function will not read beyond the input string, nor write beyond
  1348. * the destCapacity.
  1349. * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not
  1350. * be well-formed UTF-16.
  1351. * The function will resynchronize to valid code point boundaries
  1352. * within a small number of code points after an illegal sequence.
  1353. * - Non-shortest forms are not detected and will result in "spoofing" output.
  1354. *
  1355. * For further performance improvement, if srcLength is given (>=0),
  1356. * then it must be destCapacity>=srcLength.
  1357. *
  1358. * There is no inverse u_strToUTF8Lenient() function because there is practically
  1359. * no performance gain from not checking that a UTF-16 string is well-formed.
  1360. *
  1361. * @param dest A buffer for the result string. The result will be zero-terminated if
  1362. * the buffer is large enough.
  1363. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1364. * dest may be NULL and the function will only return the length of the
  1365. * result without writing any of the result string (pre-flighting).
  1366. * Unlike for other ICU functions, if srcLength>=0 then it
  1367. * must be destCapacity>=srcLength.
  1368. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1369. * pDestLength!=NULL then *pDestLength is always set to the
  1370. * number of output units corresponding to the transformation of
  1371. * all the input units, even in case of a buffer overflow.
  1372. * Unlike for other ICU functions, if srcLength>=0 but
  1373. * destCapacity<srcLength, then *pDestLength will be set to srcLength
  1374. * (and U_BUFFER_OVERFLOW_ERROR will be set)
  1375. * regardless of the actual result length.
  1376. * @param src The original source string
  1377. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1378. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1379. * pass the U_SUCCESS() test, or else the function returns
  1380. * immediately. Check for U_FAILURE() on output or use with
  1381. * function chaining. (See User Guide for details.)
  1382. * @return The pointer to destination buffer.
  1383. * @see u_strFromUTF8
  1384. * @see u_strFromUTF8WithSub
  1385. * @see u_strToUTF8WithSub
  1386. * @stable ICU 3.6
  1387. */
  1388. U_STABLE UChar * U_EXPORT2
  1389. u_strFromUTF8Lenient(UChar *dest,
  1390. int32_t destCapacity,
  1391. int32_t *pDestLength,
  1392. const char *src,
  1393. int32_t srcLength,
  1394. UErrorCode *pErrorCode);
  1395. /**
  1396. * Convert a UTF-16 string to UTF-32.
  1397. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1398. *
  1399. * @param dest A buffer for the result string. The result will be zero-terminated if
  1400. * the buffer is large enough.
  1401. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1402. * dest may be NULL and the function will only return the length of the
  1403. * result without writing any of the result string (pre-flighting).
  1404. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1405. * pDestLength!=NULL then *pDestLength is always set to the
  1406. * number of output units corresponding to the transformation of
  1407. * all the input units, even in case of a buffer overflow.
  1408. * @param src The original source string
  1409. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1410. * @param pErrorCode Must be a valid pointer to an error code value,
  1411. * which must not indicate a failure before the function call.
  1412. * @return The pointer to destination buffer.
  1413. * @see u_strToUTF32WithSub
  1414. * @see u_strFromUTF32
  1415. * @stable ICU 2.0
  1416. */
  1417. U_STABLE UChar32* U_EXPORT2
  1418. u_strToUTF32(UChar32 *dest,
  1419. int32_t destCapacity,
  1420. int32_t *pDestLength,
  1421. const UChar *src,
  1422. int32_t srcLength,
  1423. UErrorCode *pErrorCode);
  1424. /**
  1425. * Convert a UTF-32 string to UTF-16.
  1426. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1427. *
  1428. * @param dest A buffer for the result string. The result will be zero-terminated if
  1429. * the buffer is large enough.
  1430. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1431. * dest may be NULL and the function will only return the length of the
  1432. * result without writing any of the result string (pre-flighting).
  1433. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1434. * pDestLength!=NULL then *pDestLength is always set to the
  1435. * number of output units corresponding to the transformation of
  1436. * all the input units, even in case of a buffer overflow.
  1437. * @param src The original source string
  1438. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1439. * @param pErrorCode Must be a valid pointer to an error code value,
  1440. * which must not indicate a failure before the function call.
  1441. * @return The pointer to destination buffer.
  1442. * @see u_strFromUTF32WithSub
  1443. * @see u_strToUTF32
  1444. * @stable ICU 2.0
  1445. */
  1446. U_STABLE UChar* U_EXPORT2
  1447. u_strFromUTF32(UChar *dest,
  1448. int32_t destCapacity,
  1449. int32_t *pDestLength,
  1450. const UChar32 *src,
  1451. int32_t srcLength,
  1452. UErrorCode *pErrorCode);
  1453. /**
  1454. * Convert a UTF-16 string to UTF-32.
  1455. *
  1456. * Same as u_strToUTF32() except for the additional subchar which is output for
  1457. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1458. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
  1459. *
  1460. * @param dest A buffer for the result string. The result will be zero-terminated if
  1461. * the buffer is large enough.
  1462. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1463. * dest may be NULL and the function will only return the length of the
  1464. * result without writing any of the result string (pre-flighting).
  1465. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1466. * pDestLength!=NULL then *pDestLength is always set to the
  1467. * number of output units corresponding to the transformation of
  1468. * all the input units, even in case of a buffer overflow.
  1469. * @param src The original source string
  1470. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1471. * @param subchar The substitution character to use in place of an illegal input sequence,
  1472. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1473. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1474. * except for surrogate code points (U+D800..U+DFFF).
  1475. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1476. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1477. * Set to 0 if no substitutions occur or subchar<0.
  1478. * pNumSubstitutions can be NULL.
  1479. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1480. * pass the U_SUCCESS() test, or else the function returns
  1481. * immediately. Check for U_FAILURE() on output or use with
  1482. * function chaining. (See User Guide for details.)
  1483. * @return The pointer to destination buffer.
  1484. * @see u_strToUTF32
  1485. * @see u_strFromUTF32WithSub
  1486. * @stable ICU 4.2
  1487. */
  1488. U_STABLE UChar32* U_EXPORT2
  1489. u_strToUTF32WithSub(UChar32 *dest,
  1490. int32_t destCapacity,
  1491. int32_t *pDestLength,
  1492. const UChar *src,
  1493. int32_t srcLength,
  1494. UChar32 subchar, int32_t *pNumSubstitutions,
  1495. UErrorCode *pErrorCode);
  1496. /**
  1497. * Convert a UTF-32 string to UTF-16.
  1498. *
  1499. * Same as u_strFromUTF32() except for the additional subchar which is output for
  1500. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1501. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
  1502. *
  1503. * @param dest A buffer for the result string. The result will be zero-terminated if
  1504. * the buffer is large enough.
  1505. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1506. * dest may be NULL and the function will only return the length of the
  1507. * result without writing any of the result string (pre-flighting).
  1508. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1509. * pDestLength!=NULL then *pDestLength is always set to the
  1510. * number of output units corresponding to the transformation of
  1511. * all the input units, even in case of a buffer overflow.
  1512. * @param src The original source string
  1513. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1514. * @param subchar The substitution character to use in place of an illegal input sequence,
  1515. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1516. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1517. * except for surrogate code points (U+D800..U+DFFF).
  1518. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1519. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1520. * Set to 0 if no substitutions occur or subchar<0.
  1521. * pNumSubstitutions can be NULL.
  1522. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1523. * pass the U_SUCCESS() test, or else the function returns
  1524. * immediately. Check for U_FAILURE() on output or use with
  1525. * function chaining. (See User Guide for details.)
  1526. * @return The pointer to destination buffer.
  1527. * @see u_strFromUTF32
  1528. * @see u_strToUTF32WithSub
  1529. * @stable ICU 4.2
  1530. */
  1531. U_STABLE UChar* U_EXPORT2
  1532. u_strFromUTF32WithSub(UChar *dest,
  1533. int32_t destCapacity,
  1534. int32_t *pDestLength,
  1535. const UChar32 *src,
  1536. int32_t srcLength,
  1537. UChar32 subchar, int32_t *pNumSubstitutions,
  1538. UErrorCode *pErrorCode);
  1539. /**
  1540. * Convert a 16-bit Unicode string to Java Modified UTF-8.
  1541. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
  1542. *
  1543. * This function behaves according to the documentation for Java DataOutput.writeUTF()
  1544. * except that it does not encode the output length in the destination buffer
  1545. * and does not have an output length restriction.
  1546. * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)
  1547. *
  1548. * The input string need not be well-formed UTF-16.
  1549. * (Therefore there is no subchar parameter.)
  1550. *
  1551. * @param dest A buffer for the result string. The result will be zero-terminated if
  1552. * the buffer is large enough.
  1553. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1554. * dest may be NULL and the function will only return the length of the
  1555. * result without writing any of the result string (pre-flighting).
  1556. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1557. * pDestLength!=NULL then *pDestLength is always set to the
  1558. * number of output units corresponding to the transformation of
  1559. * all the input units, even in case of a buffer overflow.
  1560. * @param src The original source string
  1561. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1562. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1563. * pass the U_SUCCESS() test, or else the function returns
  1564. * immediately. Check for U_FAILURE() on output or use with
  1565. * function chaining. (See User Guide for details.)
  1566. * @return The pointer to destination buffer.
  1567. * @stable ICU 4.4
  1568. * @see u_strToUTF8WithSub
  1569. * @see u_strFromJavaModifiedUTF8WithSub
  1570. */
  1571. U_STABLE char* U_EXPORT2
  1572. u_strToJavaModifiedUTF8(
  1573. char *dest,
  1574. int32_t destCapacity,
  1575. int32_t *pDestLength,
  1576. const UChar *src,
  1577. int32_t srcLength,
  1578. UErrorCode *pErrorCode);
  1579. /**
  1580. * Convert a Java Modified UTF-8 string to a 16-bit Unicode string.
  1581. * If the input string is not well-formed and no substitution char is specified,
  1582. * then the U_INVALID_CHAR_FOUND error code is set.
  1583. *
  1584. * This function behaves according to the documentation for Java DataInput.readUTF()
  1585. * except that it takes a length parameter rather than
  1586. * interpreting the first two input bytes as the length.
  1587. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()
  1588. *
  1589. * The output string may not be well-formed UTF-16.
  1590. *
  1591. * @param dest A buffer for the result string. The result will be zero-terminated if
  1592. * the buffer is large enough.
  1593. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1594. * dest may be NULL and the function will only return the length of the
  1595. * result without writing any of the result string (pre-flighting).
  1596. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1597. * pDestLength!=NULL then *pDestLength is always set to the
  1598. * number of output units corresponding to the transformation of
  1599. * all the input units, even in case of a buffer overflow.
  1600. * @param src The original source string
  1601. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1602. * @param subchar The substitution character to use in place of an illegal input sequence,
  1603. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1604. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1605. * except for surrogate code points (U+D800..U+DFFF).
  1606. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1607. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1608. * Set to 0 if no substitutions occur or subchar<0.
  1609. * pNumSubstitutions can be NULL.
  1610. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1611. * pass the U_SUCCESS() test, or else the function returns
  1612. * immediately. Check for U_FAILURE() on output or use with
  1613. * function chaining. (See User Guide for details.)
  1614. * @return The pointer to destination buffer.
  1615. * @see u_strFromUTF8WithSub
  1616. * @see u_strFromUTF8Lenient
  1617. * @see u_strToJavaModifiedUTF8
  1618. * @stable ICU 4.4
  1619. */
  1620. U_STABLE UChar* U_EXPORT2
  1621. u_strFromJavaModifiedUTF8WithSub(
  1622. UChar *dest,
  1623. int32_t destCapacity,
  1624. int32_t *pDestLength,
  1625. const char *src,
  1626. int32_t srcLength,
  1627. UChar32 subchar, int32_t *pNumSubstitutions,
  1628. UErrorCode *pErrorCode);
  1629. #endif