utf8.h 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1999-2015, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: utf8.h
  11. * encoding: US-ASCII
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 1999sep13
  16. * created by: Markus W. Scherer
  17. */
  18. /**
  19. * \file
  20. * \brief C API: 8-bit Unicode handling macros
  21. *
  22. * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
  23. *
  24. * For more information see utf.h and the ICU User Guide Strings chapter
  25. * (http://userguide.icu-project.org/strings).
  26. *
  27. * <em>Usage:</em>
  28. * ICU coding guidelines for if() statements should be followed when using these macros.
  29. * Compound statements (curly braces {}) must be used for if-else-while...
  30. * bodies and all macro statements should be terminated with semicolon.
  31. */
  32. #ifndef __UTF8_H__
  33. #define __UTF8_H__
  34. #include "unicode/umachine.h"
  35. #ifndef __UTF_H__
  36. # include "unicode/utf.h"
  37. #endif
  38. /* internal definitions ----------------------------------------------------- */
/**
 * \var utf8_countTrailBytes
 * Internal array with numbers of trail bytes for any given byte used in
 * lead byte position. It has 256 entries, one for each possible byte value.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is declared in this public header and thus must remain stable,
 * and should not be hidden when other internal functions are hidden (otherwise
 * public macros that refer to it would fail to compile).
 *
 * The linkage specifier in front of the declaration is selected by the
 * preprocessor: U_EXPORT when U_UTF8_IMPL is defined, plain U_CFUNC when
 * U_STATIC_IMPLEMENTATION or U_COMMON_IMPLEMENTATION is defined, and
 * U_CFUNC U_IMPORT in every other case.
 * @internal
 */
#ifdef U_UTF8_IMPL
U_EXPORT const uint8_t
#elif defined(U_STATIC_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION)
U_CFUNC const uint8_t
#else
U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/
#endif
utf8_countTrailBytes[256];
  58. /**
  59. * Counts the trail bytes for a UTF-8 lead byte.
  60. * Returns 0 for 0..0xbf as well as for 0xfe and 0xff.
  61. *
  62. * This is internal since it is not meant to be called directly by external clients;
  63. * however it is called by public macros in this file and thus must remain stable.
  64. *
  65. * Note: Beginning with ICU 50, the implementation uses a multi-condition expression
  66. * which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
  67. * leadByte is evaluated multiple times.
  68. *
  69. * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
  70. * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
  71. * leadByte was evaluated exactly once.
  72. *
  73. * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
  74. * @internal
  75. */
  76. #define U8_COUNT_TRAIL_BYTES(leadByte) \
  77. ((uint8_t)(leadByte)<0xf0 ? \
  78. ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
  79. (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)
  80. /**
  81. * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
  82. * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
  83. * leadByte might be evaluated multiple times.
  84. *
  85. * This is internal since it is not meant to be called directly by external clients;
  86. * however it is called by public macros in this file and thus must remain stable.
  87. *
  88. * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
  89. * @internal
  90. */
  91. #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
  92. (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0))
  93. /**
  94. * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
  95. *
  96. * This is internal since it is not meant to be called directly by external clients;
  97. * however it is called by public macros in this file and thus must remain stable.
  98. * @internal
  99. */
  100. #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
/**
 * Function for handling "next code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * U8_NEXT and U8_NEXT_OR_FFFD call this for every non-ASCII byte that their
 * inline fast paths do not handle. They pass the string, the address of the
 * offset (already advanced past the first byte), the length, the byte that
 * was just read as c, and strict=-1 (U8_NEXT) or strict=-3 (U8_NEXT_OR_FFFD);
 * the return value is stored in the macro's output variable c.
 * NOTE(review): the meaning of other strict values is not visible in this
 * header - confirm against the implementation.
 * @internal
 */
U_STABLE UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
/**
 * Function for handling "append code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * U8_APPEND calls this for every code point that its inline branches do not
 * write themselves. It passes the buffer, the current offset, the capacity,
 * the code point and the address of the caller's isError variable, and
 * stores the return value back into the offset.
 * @internal
 */
U_STABLE int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
/**
 * Function for handling "previous code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * U8_PREV and U8_PREV_OR_FFFD call this whenever the byte before the offset
 * is not ASCII. They pass the string, the start offset, the address of the
 * (already decremented) offset, the byte that was just read as c, and
 * strict=-1 (U8_PREV) or strict=-3 (U8_PREV_OR_FFFD); the return value is
 * stored in the macro's output variable c.
 * NOTE(review): the meaning of other strict values is not visible in this
 * header - confirm against the implementation.
 * @internal
 */
U_STABLE UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
/**
 * Function for handling "skip backward one code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * U8_SET_CP_START and U8_BACK_1 call this when the byte at the offset is a
 * trail byte. They pass the string, the start offset and the offset of that
 * trail byte, and store the return value back into the offset.
 * @internal
 */
U_STABLE int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  145. /* single-code point definitions -------------------------------------------- */
  146. /**
  147. * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
  148. * @param c 8-bit code unit (byte)
  149. * @return TRUE or FALSE
  150. * @stable ICU 2.4
  151. */
  152. #define U8_IS_SINGLE(c) (((c)&0x80)==0)
  153. /**
  154. * Is this code unit (byte) a UTF-8 lead byte?
  155. * @param c 8-bit code unit (byte)
  156. * @return TRUE or FALSE
  157. * @stable ICU 2.4
  158. */
  159. #define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
  160. /**
  161. * Is this code unit (byte) a UTF-8 trail byte?
  162. * @param c 8-bit code unit (byte)
  163. * @return TRUE or FALSE
  164. * @stable ICU 2.4
  165. */
  166. #define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
  167. /**
  168. * How many code units (bytes) are used for the UTF-8 encoding
  169. * of this Unicode code point?
  170. * @param c 32-bit code point
  171. * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
  172. * @stable ICU 2.4
  173. */
  174. #define U8_LENGTH(c) \
  175. ((uint32_t)(c)<=0x7f ? 1 : \
  176. ((uint32_t)(c)<=0x7ff ? 2 : \
  177. ((uint32_t)(c)<=0xd7ff ? 3 : \
  178. ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
  179. ((uint32_t)(c)<=0xffff ? 3 : 4)\
  180. ) \
  181. ) \
  182. ) \
  183. )
/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * This is the largest value that U8_LENGTH() can return.
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4
  190. /**
  191. * Get a code point from a string at a random-access offset,
  192. * without changing the offset.
  193. * The offset may point to either the lead byte or one of the trail bytes
  194. * for a code point, in which case the macro will read all of the bytes
  195. * for the code point.
  196. * The result is undefined if the offset points to an illegal UTF-8
  197. * byte sequence.
  198. * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
  199. *
  200. * @param s const uint8_t * string
  201. * @param i string offset
  202. * @param c output UChar32 variable
  203. * @see U8_GET
  204. * @stable ICU 2.4
  205. */
  206. #define U8_GET_UNSAFE(s, i, c) { \
  207. int32_t _u8_get_unsafe_index=(int32_t)(i); \
  208. U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
  209. U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
  210. }
  211. /**
  212. * Get a code point from a string at a random-access offset,
  213. * without changing the offset.
  214. * The offset may point to either the lead byte or one of the trail bytes
  215. * for a code point, in which case the macro will read all of the bytes
  216. * for the code point.
  217. *
  218. * The length can be negative for a NUL-terminated string.
  219. *
  220. * If the offset points to an illegal UTF-8 byte sequence, then
  221. * c is set to a negative value.
  222. * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
  223. *
  224. * @param s const uint8_t * string
  225. * @param start int32_t starting string offset
  226. * @param i int32_t string offset, must be start<=i<length
  227. * @param length int32_t string length
  228. * @param c output UChar32 variable, set to <0 in case of an error
  229. * @see U8_GET_UNSAFE
  230. * @stable ICU 2.4
  231. */
  232. #define U8_GET(s, start, i, length, c) { \
  233. int32_t _u8_get_index=(i); \
  234. U8_SET_CP_START(s, start, _u8_get_index); \
  235. U8_NEXT(s, _u8_get_index, length, c); \
  236. }
  237. /**
  238. * Get a code point from a string at a random-access offset,
  239. * without changing the offset.
  240. * The offset may point to either the lead byte or one of the trail bytes
  241. * for a code point, in which case the macro will read all of the bytes
  242. * for the code point.
  243. *
  244. * The length can be negative for a NUL-terminated string.
  245. *
  246. * If the offset points to an illegal UTF-8 byte sequence, then
  247. * c is set to U+FFFD.
  248. * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
  249. *
  250. * This macro does not distinguish between a real U+FFFD in the text
  251. * and U+FFFD returned for an ill-formed sequence.
  252. * Use U8_GET() if that distinction is important.
  253. *
  254. * @param s const uint8_t * string
  255. * @param start int32_t starting string offset
  256. * @param i int32_t string offset, must be start<=i<length
  257. * @param length int32_t string length
  258. * @param c output UChar32 variable, set to U+FFFD in case of an error
  259. * @see U8_GET
  260. * @stable ICU 51
  261. */
  262. #define U8_GET_OR_FFFD(s, start, i, length, c) { \
  263. int32_t _u8_get_index=(i); \
  264. U8_SET_CP_START(s, start, _u8_get_index); \
  265. U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
  266. }
  267. /* definitions with forward iteration --------------------------------------- */
  268. /**
  269. * Get a code point from a string at a code point boundary offset,
  270. * and advance the offset to the next code point boundary.
  271. * (Post-incrementing forward iteration.)
  272. * "Unsafe" macro, assumes well-formed UTF-8.
  273. *
  274. * The offset may point to the lead byte of a multi-byte sequence,
  275. * in which case the macro will read the whole sequence.
  276. * The result is undefined if the offset points to a trail byte
  277. * or an illegal UTF-8 sequence.
  278. *
  279. * @param s const uint8_t * string
  280. * @param i string offset
  281. * @param c output UChar32 variable
  282. * @see U8_NEXT
  283. * @stable ICU 2.4
  284. */
  285. #define U8_NEXT_UNSAFE(s, i, c) { \
  286. (c)=(uint8_t)(s)[(i)++]; \
  287. if((c)>=0x80) { \
  288. if((c)<0xe0) { \
  289. (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
  290. } else if((c)<0xf0) { \
  291. /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
  292. (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
  293. (i)+=2; \
  294. } else { \
  295. (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
  296. (i)+=3; \
  297. } \
  298. } \
  299. }
  300. /**
  301. * Get a code point from a string at a code point boundary offset,
  302. * and advance the offset to the next code point boundary.
  303. * (Post-incrementing forward iteration.)
  304. * "Safe" macro, checks for illegal sequences and for string boundaries.
  305. *
  306. * The length can be negative for a NUL-terminated string.
  307. *
  308. * The offset may point to the lead byte of a multi-byte sequence,
  309. * in which case the macro will read the whole sequence.
  310. * If the offset points to a trail byte or an illegal UTF-8 sequence, then
  311. * c is set to a negative value.
  312. *
  313. * @param s const uint8_t * string
  314. * @param i int32_t string offset, must be i<length
  315. * @param length int32_t string length
  316. * @param c output UChar32 variable, set to <0 in case of an error
  317. * @see U8_NEXT_UNSAFE
  318. * @stable ICU 2.4
  319. */
  320. #define U8_NEXT(s, i, length, c) { \
  321. (c)=(uint8_t)(s)[(i)++]; \
  322. if((c)>=0x80) { \
  323. uint8_t __t1, __t2; \
  324. if( /* handle U+1000..U+CFFF inline */ \
  325. (0xe0<(c) && (c)<=0xec) && \
  326. (((i)+1)<(length) || (length)<0) && \
  327. (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
  328. (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
  329. ) { \
  330. /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
  331. (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
  332. (i)+=2; \
  333. } else if( /* handle U+0080..U+07FF inline */ \
  334. ((c)<0xe0 && (c)>=0xc2) && \
  335. ((i)!=(length)) && \
  336. (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
  337. ) { \
  338. (c)=(((c)&0x1f)<<6)|__t1; \
  339. ++(i); \
  340. } else { \
  341. /* function call for "complicated" and error cases */ \
  342. (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \
  343. } \
  344. } \
  345. }
  346. /**
  347. * Get a code point from a string at a code point boundary offset,
  348. * and advance the offset to the next code point boundary.
  349. * (Post-incrementing forward iteration.)
  350. * "Safe" macro, checks for illegal sequences and for string boundaries.
  351. *
  352. * The length can be negative for a NUL-terminated string.
  353. *
  354. * The offset may point to the lead byte of a multi-byte sequence,
  355. * in which case the macro will read the whole sequence.
  356. * If the offset points to a trail byte or an illegal UTF-8 sequence, then
  357. * c is set to U+FFFD.
  358. *
  359. * This macro does not distinguish between a real U+FFFD in the text
  360. * and U+FFFD returned for an ill-formed sequence.
  361. * Use U8_NEXT() if that distinction is important.
  362. *
  363. * @param s const uint8_t * string
  364. * @param i int32_t string offset, must be i<length
  365. * @param length int32_t string length
  366. * @param c output UChar32 variable, set to U+FFFD in case of an error
  367. * @see U8_NEXT
  368. * @stable ICU 51
  369. */
  370. #define U8_NEXT_OR_FFFD(s, i, length, c) { \
  371. (c)=(uint8_t)(s)[(i)++]; \
  372. if((c)>=0x80) { \
  373. uint8_t __t1, __t2; \
  374. if( /* handle U+1000..U+CFFF inline */ \
  375. (0xe0<(c) && (c)<=0xec) && \
  376. (((i)+1)<(length) || (length)<0) && \
  377. (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
  378. (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
  379. ) { \
  380. /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
  381. (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
  382. (i)+=2; \
  383. } else if( /* handle U+0080..U+07FF inline */ \
  384. ((c)<0xe0 && (c)>=0xc2) && \
  385. ((i)!=(length)) && \
  386. (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
  387. ) { \
  388. (c)=(((c)&0x1f)<<6)|__t1; \
  389. ++(i); \
  390. } else { \
  391. /* function call for "complicated" and error cases */ \
  392. (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \
  393. } \
  394. } \
  395. }
  396. /**
  397. * Append a code point to a string, overwriting 1 to 4 bytes.
  398. * The offset points to the current end of the string contents
  399. * and is advanced (post-increment).
  400. * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
  401. * Otherwise, the result is undefined.
  402. *
  403. * @param s const uint8_t * string buffer
  404. * @param i string offset
  405. * @param c code point to append
  406. * @see U8_APPEND
  407. * @stable ICU 2.4
  408. */
  409. #define U8_APPEND_UNSAFE(s, i, c) { \
  410. if((uint32_t)(c)<=0x7f) { \
  411. (s)[(i)++]=(uint8_t)(c); \
  412. } else { \
  413. if((uint32_t)(c)<=0x7ff) { \
  414. (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
  415. } else { \
  416. if((uint32_t)(c)<=0xffff) { \
  417. (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
  418. } else { \
  419. (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
  420. (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
  421. } \
  422. (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
  423. } \
  424. (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
  425. } \
  426. }
  427. /**
  428. * Append a code point to a string, overwriting 1 to 4 bytes.
  429. * The offset points to the current end of the string contents
  430. * and is advanced (post-increment).
  431. * "Safe" macro, checks for a valid code point.
  432. * If a non-ASCII code point is written, checks for sufficient space in the string.
  433. * If the code point is not valid or trail bytes do not fit,
  434. * then isError is set to TRUE.
  435. *
  436. * @param s const uint8_t * string buffer
  437. * @param i int32_t string offset, must be i<capacity
  438. * @param capacity int32_t size of the string buffer
  439. * @param c UChar32 code point to append
  440. * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
  441. * @see U8_APPEND_UNSAFE
  442. * @stable ICU 2.4
  443. */
  444. #define U8_APPEND(s, i, capacity, c, isError) { \
  445. if((uint32_t)(c)<=0x7f) { \
  446. (s)[(i)++]=(uint8_t)(c); \
  447. } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \
  448. (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
  449. (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
  450. } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \
  451. (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
  452. (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
  453. (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
  454. } else { \
  455. (i)=utf8_appendCharSafeBody(s, (i), (capacity), c, &(isError)); \
  456. } \
  457. }
  458. /**
  459. * Advance the string offset from one code point boundary to the next.
  460. * (Post-incrementing iteration.)
  461. * "Unsafe" macro, assumes well-formed UTF-8.
  462. *
  463. * @param s const uint8_t * string
  464. * @param i string offset
  465. * @see U8_FWD_1
  466. * @stable ICU 2.4
  467. */
  468. #define U8_FWD_1_UNSAFE(s, i) { \
  469. (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
  470. }
  471. /**
  472. * Advance the string offset from one code point boundary to the next.
  473. * (Post-incrementing iteration.)
  474. * "Safe" macro, checks for illegal sequences and for string boundaries.
  475. *
  476. * The length can be negative for a NUL-terminated string.
  477. *
  478. * @param s const uint8_t * string
  479. * @param i int32_t string offset, must be i<length
  480. * @param length int32_t string length
  481. * @see U8_FWD_1_UNSAFE
  482. * @stable ICU 2.4
  483. */
  484. #define U8_FWD_1(s, i, length) { \
  485. uint8_t __b=(uint8_t)(s)[(i)++]; \
  486. if(U8_IS_LEAD(__b)) { \
  487. uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
  488. if((i)+__count>(length) && (length)>=0) { \
  489. __count=(uint8_t)((length)-(i)); \
  490. } \
  491. while(__count>0 && U8_IS_TRAIL((s)[i])) { \
  492. ++(i); \
  493. --__count; \
  494. } \
  495. } \
  496. }
  497. /**
  498. * Advance the string offset from one code point boundary to the n-th next one,
  499. * i.e., move forward by n code points.
  500. * (Post-incrementing iteration.)
  501. * "Unsafe" macro, assumes well-formed UTF-8.
  502. *
  503. * @param s const uint8_t * string
  504. * @param i string offset
  505. * @param n number of code points to skip
  506. * @see U8_FWD_N
  507. * @stable ICU 2.4
  508. */
  509. #define U8_FWD_N_UNSAFE(s, i, n) { \
  510. int32_t __N=(n); \
  511. while(__N>0) { \
  512. U8_FWD_1_UNSAFE(s, i); \
  513. --__N; \
  514. } \
  515. }
  516. /**
  517. * Advance the string offset from one code point boundary to the n-th next one,
  518. * i.e., move forward by n code points.
  519. * (Post-incrementing iteration.)
  520. * "Safe" macro, checks for illegal sequences and for string boundaries.
  521. *
  522. * The length can be negative for a NUL-terminated string.
  523. *
  524. * @param s const uint8_t * string
  525. * @param i int32_t string offset, must be i<length
  526. * @param length int32_t string length
  527. * @param n number of code points to skip
  528. * @see U8_FWD_N_UNSAFE
  529. * @stable ICU 2.4
  530. */
  531. #define U8_FWD_N(s, i, length, n) { \
  532. int32_t __N=(n); \
  533. while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
  534. U8_FWD_1(s, i, length); \
  535. --__N; \
  536. } \
  537. }
  538. /**
  539. * Adjust a random-access offset to a code point boundary
  540. * at the start of a code point.
  541. * If the offset points to a UTF-8 trail byte,
  542. * then the offset is moved backward to the corresponding lead byte.
  543. * Otherwise, it is not modified.
  544. * "Unsafe" macro, assumes well-formed UTF-8.
  545. *
  546. * @param s const uint8_t * string
  547. * @param i string offset
  548. * @see U8_SET_CP_START
  549. * @stable ICU 2.4
  550. */
  551. #define U8_SET_CP_START_UNSAFE(s, i) { \
  552. while(U8_IS_TRAIL((s)[i])) { --(i); } \
  553. }
  554. /**
  555. * Adjust a random-access offset to a code point boundary
  556. * at the start of a code point.
  557. * If the offset points to a UTF-8 trail byte,
  558. * then the offset is moved backward to the corresponding lead byte.
  559. * Otherwise, it is not modified.
  560. * "Safe" macro, checks for illegal sequences and for string boundaries.
  561. *
  562. * @param s const uint8_t * string
  563. * @param start int32_t starting string offset (usually 0)
  564. * @param i int32_t string offset, must be start<=i
  565. * @see U8_SET_CP_START_UNSAFE
  566. * @stable ICU 2.4
  567. */
  568. #define U8_SET_CP_START(s, start, i) { \
  569. if(U8_IS_TRAIL((s)[(i)])) { \
  570. (i)=utf8_back1SafeBody(s, start, (i)); \
  571. } \
  572. }
  573. /* definitions with backward iteration -------------------------------------- */
  574. /**
  575. * Move the string offset from one code point boundary to the previous one
  576. * and get the code point between them.
  577. * (Pre-decrementing backward iteration.)
  578. * "Unsafe" macro, assumes well-formed UTF-8.
  579. *
  580. * The input offset may be the same as the string length.
  581. * If the offset is behind a multi-byte sequence, then the macro will read
  582. * the whole sequence.
  583. * If the offset is behind a lead byte, then that itself
  584. * will be returned as the code point.
  585. * The result is undefined if the offset is behind an illegal UTF-8 sequence.
  586. *
  587. * @param s const uint8_t * string
  588. * @param i string offset
  589. * @param c output UChar32 variable
  590. * @see U8_PREV
  591. * @stable ICU 2.4
  592. */
  593. #define U8_PREV_UNSAFE(s, i, c) { \
  594. (c)=(uint8_t)(s)[--(i)]; \
  595. if(U8_IS_TRAIL(c)) { \
  596. uint8_t __b, __count=1, __shift=6; \
  597. \
  598. /* c is a trail byte */ \
  599. (c)&=0x3f; \
  600. for(;;) { \
  601. __b=(uint8_t)(s)[--(i)]; \
  602. if(__b>=0xc0) { \
  603. U8_MASK_LEAD_BYTE(__b, __count); \
  604. (c)|=(UChar32)__b<<__shift; \
  605. break; \
  606. } else { \
  607. (c)|=(UChar32)(__b&0x3f)<<__shift; \
  608. ++__count; \
  609. __shift+=6; \
  610. } \
  611. } \
  612. } \
  613. }
  614. /**
  615. * Move the string offset from one code point boundary to the previous one
  616. * and get the code point between them.
  617. * (Pre-decrementing backward iteration.)
  618. * "Safe" macro, checks for illegal sequences and for string boundaries.
  619. *
  620. * The input offset may be the same as the string length.
  621. * If the offset is behind a multi-byte sequence, then the macro will read
  622. * the whole sequence.
  623. * If the offset is behind a lead byte, then that itself
  624. * will be returned as the code point.
  625. * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
  626. *
  627. * @param s const uint8_t * string
  628. * @param start int32_t starting string offset (usually 0)
  629. * @param i int32_t string offset, must be start<i
  630. * @param c output UChar32 variable, set to <0 in case of an error
  631. * @see U8_PREV_UNSAFE
  632. * @stable ICU 2.4
  633. */
  634. #define U8_PREV(s, start, i, c) { \
  635. (c)=(uint8_t)(s)[--(i)]; \
  636. if((c)>=0x80) { \
  637. (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
  638. } \
  639. }
  640. /**
  641. * Move the string offset from one code point boundary to the previous one
  642. * and get the code point between them.
  643. * (Pre-decrementing backward iteration.)
  644. * "Safe" macro, checks for illegal sequences and for string boundaries.
  645. *
  646. * The input offset may be the same as the string length.
  647. * If the offset is behind a multi-byte sequence, then the macro will read
  648. * the whole sequence.
  649. * If the offset is behind a lead byte, then that itself
  650. * will be returned as the code point.
  651. * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
  652. *
  653. * This macro does not distinguish between a real U+FFFD in the text
  654. * and U+FFFD returned for an ill-formed sequence.
  655. * Use U8_PREV() if that distinction is important.
  656. *
  657. * @param s const uint8_t * string
  658. * @param start int32_t starting string offset (usually 0)
  659. * @param i int32_t string offset, must be start<i
  660. * @param c output UChar32 variable, set to U+FFFD in case of an error
  661. * @see U8_PREV
  662. * @stable ICU 51
  663. */
  664. #define U8_PREV_OR_FFFD(s, start, i, c) { \
  665. (c)=(uint8_t)(s)[--(i)]; \
  666. if((c)>=0x80) { \
  667. (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
  668. } \
  669. }
  670. /**
  671. * Move the string offset from one code point boundary to the previous one.
  672. * (Pre-decrementing backward iteration.)
  673. * The input offset may be the same as the string length.
  674. * "Unsafe" macro, assumes well-formed UTF-8.
  675. *
  676. * @param s const uint8_t * string
  677. * @param i string offset
  678. * @see U8_BACK_1
  679. * @stable ICU 2.4
  680. */
  681. #define U8_BACK_1_UNSAFE(s, i) { \
  682. while(U8_IS_TRAIL((s)[--(i)])) {} \
  683. }
  684. /**
  685. * Move the string offset from one code point boundary to the previous one.
  686. * (Pre-decrementing backward iteration.)
  687. * The input offset may be the same as the string length.
  688. * "Safe" macro, checks for illegal sequences and for string boundaries.
  689. *
  690. * @param s const uint8_t * string
  691. * @param start int32_t starting string offset (usually 0)
  692. * @param i int32_t string offset, must be start<i
  693. * @see U8_BACK_1_UNSAFE
  694. * @stable ICU 2.4
  695. */
  696. #define U8_BACK_1(s, start, i) { \
  697. if(U8_IS_TRAIL((s)[--(i)])) { \
  698. (i)=utf8_back1SafeBody(s, start, (i)); \
  699. } \
  700. }
  701. /**
  702. * Move the string offset from one code point boundary to the n-th one before it,
  703. * i.e., move backward by n code points.
  704. * (Pre-decrementing backward iteration.)
  705. * The input offset may be the same as the string length.
  706. * "Unsafe" macro, assumes well-formed UTF-8.
  707. *
  708. * @param s const uint8_t * string
  709. * @param i string offset
  710. * @param n number of code points to skip
  711. * @see U8_BACK_N
  712. * @stable ICU 2.4
  713. */
  714. #define U8_BACK_N_UNSAFE(s, i, n) { \
  715. int32_t __N=(n); \
  716. while(__N>0) { \
  717. U8_BACK_1_UNSAFE(s, i); \
  718. --__N; \
  719. } \
  720. }
  721. /**
  722. * Move the string offset from one code point boundary to the n-th one before it,
  723. * i.e., move backward by n code points.
  724. * (Pre-decrementing backward iteration.)
  725. * The input offset may be the same as the string length.
  726. * "Safe" macro, checks for illegal sequences and for string boundaries.
  727. *
  728. * @param s const uint8_t * string
  729. * @param start int32_t index of the start of the string
  730. * @param i int32_t string offset, must be start<i
  731. * @param n number of code points to skip
  732. * @see U8_BACK_N_UNSAFE
  733. * @stable ICU 2.4
  734. */
  735. #define U8_BACK_N(s, start, i, n) { \
  736. int32_t __N=(n); \
  737. while(__N>0 && (i)>(start)) { \
  738. U8_BACK_1(s, start, i); \
  739. --__N; \
  740. } \
  741. }
  742. /**
  743. * Adjust a random-access offset to a code point boundary after a code point.
  744. * If the offset is behind a partial multi-byte sequence,
  745. * then the offset is incremented to behind the whole sequence.
  746. * Otherwise, it is not modified.
  747. * The input offset may be the same as the string length.
  748. * "Unsafe" macro, assumes well-formed UTF-8.
  749. *
  750. * @param s const uint8_t * string
  751. * @param i string offset
  752. * @see U8_SET_CP_LIMIT
  753. * @stable ICU 2.4
  754. */
  755. #define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
  756. U8_BACK_1_UNSAFE(s, i); \
  757. U8_FWD_1_UNSAFE(s, i); \
  758. }
  759. /**
  760. * Adjust a random-access offset to a code point boundary after a code point.
  761. * If the offset is behind a partial multi-byte sequence,
  762. * then the offset is incremented to behind the whole sequence.
  763. * Otherwise, it is not modified.
  764. * The input offset may be the same as the string length.
  765. * "Safe" macro, checks for illegal sequences and for string boundaries.
  766. *
  767. * The length can be negative for a NUL-terminated string.
  768. *
  769. * @param s const uint8_t * string
  770. * @param start int32_t starting string offset (usually 0)
  771. * @param i int32_t string offset, must be start<=i<=length
  772. * @param length int32_t string length
  773. * @see U8_SET_CP_LIMIT_UNSAFE
  774. * @stable ICU 2.4
  775. */
  776. #define U8_SET_CP_LIMIT(s, start, i, length) { \
  777. if((start)<(i) && ((i)<(length) || (length)<0)) { \
  778. U8_BACK_1(s, start, i); \
  779. U8_FWD_1(s, i, length); \
  780. } \
  781. }
  782. #endif