// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2004-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  uregex.h
*   encoding:   US-ASCII
*   indentation:4
*
*   created on: 2004mar09
*   created by: Andy Heninger
*
*   ICU Regular Expressions, API for C
*/
/**
 * \file
 * \brief C API: Regular Expressions
 *
 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p>
 */
  23. #ifndef UREGEX_H
  24. #define UREGEX_H
  25. #include "unicode/utext.h"
  26. #include "unicode/utypes.h"
  27. #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  28. #include "unicode/localpointer.h"
  29. #include "unicode/parseerr.h"
  30. struct URegularExpression;
  31. /**
  32. * Structure representing a compiled regular expression, plus the results
  33. * of a match operation.
  34. * @stable ICU 3.0
  35. */
  36. typedef struct URegularExpression URegularExpression;
  37. /**
  38. * Constants for Regular Expression Match Modes.
  39. * @stable ICU 2.4
  40. */
  41. typedef enum URegexpFlag{
  42. #ifndef U_HIDE_DRAFT_API
  43. /** Forces normalization of pattern and strings.
  44. Not implemented yet, just a placeholder, hence draft.
  45. @draft ICU 2.4 */
  46. UREGEX_CANON_EQ = 128,
  47. #endif /* U_HIDE_DRAFT_API */
  48. /** Enable case insensitive matching. @stable ICU 2.4 */
  49. UREGEX_CASE_INSENSITIVE = 2,
  50. /** Allow white space and comments within patterns @stable ICU 2.4 */
  51. UREGEX_COMMENTS = 4,
  52. /** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
  53. * @stable ICU 2.4 */
  54. UREGEX_DOTALL = 32,
  55. /** If set, treat the entire pattern as a literal string.
  56. * Metacharacters or escape sequences in the input sequence will be given
  57. * no special meaning.
  58. *
  59. * The flag UREGEX_CASE_INSENSITIVE retains its impact
  60. * on matching when used in conjunction with this flag.
  61. * The other flags become superfluous.
  62. *
  63. * @stable ICU 4.0
  64. */
  65. UREGEX_LITERAL = 16,
  66. /** Control behavior of "$" and "^"
  67. * If set, recognize line terminators within string,
  68. * otherwise, match only at start and end of input string.
  69. * @stable ICU 2.4 */
  70. UREGEX_MULTILINE = 8,
  71. /** Unix-only line endings.
  72. * When this mode is enabled, only \\u000a is recognized as a line ending
  73. * in the behavior of ., ^, and $.
  74. * @stable ICU 4.0
  75. */
  76. UREGEX_UNIX_LINES = 1,
  77. /** Unicode word boundaries.
  78. * If set, \b uses the Unicode TR 29 definition of word boundaries.
  79. * Warning: Unicode word boundaries are quite different from
  80. * traditional regular expression word boundaries. See
  81. * http://unicode.org/reports/tr29/#Word_Boundaries
  82. * @stable ICU 2.8
  83. */
  84. UREGEX_UWORD = 256,
  85. /** Error on Unrecognized backslash escapes.
  86. * If set, fail with an error on patterns that contain
  87. * backslash-escaped ASCII letters without a known special
  88. * meaning. If this flag is not set, these
  89. * escaped letters represent themselves.
  90. * @stable ICU 4.0
  91. */
  92. UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
  93. } URegexpFlag;
  94. /**
  95. * Open (compile) an ICU regular expression. Compiles the regular expression in
  96. * string form into an internal representation using the specified match mode flags.
  97. * The resulting regular expression handle can then be used to perform various
  98. * matching operations.
  99. *
  100. *
  101. * @param pattern The Regular Expression pattern to be compiled.
  102. * @param patternLength The length of the pattern, or -1 if the pattern is
  103. * NUL terminated.
  104. * @param flags Flags that alter the default matching behavior for
  105. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  106. * example. For default behavior, set this parameter to zero.
  107. * See <code>enum URegexpFlag</code>. All desired flags
  108. * are bitwise-ORed together.
  109. * @param pe Receives the position (line and column numbers) of any syntax
  110. * error within the source regular expression string. If this
  111. * information is not wanted, pass NULL for this parameter.
  112. * @param status Receives error detected by this function.
  113. * @stable ICU 3.0
  114. *
  115. */
  116. U_STABLE URegularExpression * U_EXPORT2
  117. uregex_open( const UChar *pattern,
  118. int32_t patternLength,
  119. uint32_t flags,
  120. UParseError *pe,
  121. UErrorCode *status);
  122. /**
  123. * Open (compile) an ICU regular expression. Compiles the regular expression in
  124. * string form into an internal representation using the specified match mode flags.
  125. * The resulting regular expression handle can then be used to perform various
  126. * matching operations.
  127. * <p>
  128. * The contents of the pattern UText will be extracted and saved. Ownership of the
  129. * UText struct itself remains with the caller. This is to match the behavior of
  130. * uregex_open().
  131. *
  132. * @param pattern The Regular Expression pattern to be compiled.
  133. * @param flags Flags that alter the default matching behavior for
  134. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  135. * example. For default behavior, set this parameter to zero.
  136. * See <code>enum URegexpFlag</code>. All desired flags
  137. * are bitwise-ORed together.
  138. * @param pe Receives the position (line and column numbers) of any syntax
  139. * error within the source regular expression string. If this
  140. * information is not wanted, pass NULL for this parameter.
  141. * @param status Receives error detected by this function.
  142. *
  143. * @stable ICU 4.6
  144. */
  145. U_STABLE URegularExpression * U_EXPORT2
  146. uregex_openUText(UText *pattern,
  147. uint32_t flags,
  148. UParseError *pe,
  149. UErrorCode *status);
  150. /**
  151. * Open (compile) an ICU regular expression. The resulting regular expression
  152. * handle can then be used to perform various matching operations.
  153. * <p>
  154. * This function is the same as uregex_open, except that the pattern
  155. * is supplied as an 8 bit char * string in the default code page.
  156. *
  157. * @param pattern The Regular Expression pattern to be compiled,
  158. * NUL terminated.
  159. * @param flags Flags that alter the default matching behavior for
  160. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  161. * example. For default behavior, set this parameter to zero.
  162. * See <code>enum URegexpFlag</code>. All desired flags
  163. * are bitwise-ORed together.
  164. * @param pe Receives the position (line and column numbers) of any syntax
  165. * error within the source regular expression string. If this
  166. * information is not wanted, pass NULL for this parameter.
  167. * @param status Receives errors detected by this function.
  168. * @return The URegularExpression object representing the compiled
  169. * pattern.
  170. *
  171. * @stable ICU 3.0
  172. */
  173. #if !UCONFIG_NO_CONVERSION
  174. U_STABLE URegularExpression * U_EXPORT2
  175. uregex_openC( const char *pattern,
  176. uint32_t flags,
  177. UParseError *pe,
  178. UErrorCode *status);
  179. #endif
  180. /**
  181. * Close the regular expression, recovering all resources (memory) it
  182. * was holding.
  183. *
  184. * @param regexp The regular expression to be closed.
  185. * @stable ICU 3.0
  186. */
  187. U_STABLE void U_EXPORT2
  188. uregex_close(URegularExpression *regexp);
  189. #if U_SHOW_CPLUSPLUS_API
  190. U_NAMESPACE_BEGIN
  191. /**
  192. * \class LocalURegularExpressionPointer
  193. * "Smart pointer" class, closes a URegularExpression via uregex_close().
  194. * For most methods see the LocalPointerBase base class.
  195. *
  196. * @see LocalPointerBase
  197. * @see LocalPointer
  198. * @stable ICU 4.4
  199. */
  200. U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close);
  201. U_NAMESPACE_END
  202. #endif
  203. /**
  204. * Make a copy of a compiled regular expression. Cloning a regular
  205. * expression is faster than opening a second instance from the source
  206. * form of the expression, and requires less memory.
  207. * <p>
  208. * Note that the current input string and the position of any matched text
  209. * within it are not cloned; only the pattern itself and the
  210. * match mode flags are copied.
  211. * <p>
  212. * Cloning can be particularly useful to threaded applications that perform
  213. * multiple match operations in parallel. Each concurrent RE
  214. * operation requires its own instance of a URegularExpression.
  215. *
  216. * @param regexp The compiled regular expression to be cloned.
  217. * @param status Receives indication of any errors encountered
  218. * @return the cloned copy of the compiled regular expression.
  219. * @stable ICU 3.0
  220. */
  221. U_STABLE URegularExpression * U_EXPORT2
  222. uregex_clone(const URegularExpression *regexp, UErrorCode *status);
  223. /**
  224. * Returns a pointer to the source form of the pattern for this regular expression.
  225. * This function will work even if the pattern was originally specified as a UText.
  226. *
  227. * @param regexp The compiled regular expression.
  228. * @param patLength This output parameter will be set to the length of the
  229. * pattern string. A NULL pointer may be used here if the
  230. * pattern length is not needed, as would be the case if
  231. * the pattern is known in advance to be a NUL terminated
  232. * string.
  233. * @param status Receives errors detected by this function.
  234. * @return a pointer to the pattern string. The storage for the string is
  235. * owned by the regular expression object, and must not be
  236. * altered or deleted by the application. The returned string
  237. * will remain valid until the regular expression is closed.
  238. * @stable ICU 3.0
  239. */
  240. U_STABLE const UChar * U_EXPORT2
  241. uregex_pattern(const URegularExpression *regexp,
  242. int32_t *patLength,
  243. UErrorCode *status);
  244. /**
  245. * Returns the source text of the pattern for this regular expression.
  246. * This function will work even if the pattern was originally specified as a UChar string.
  247. *
  248. * @param regexp The compiled regular expression.
  249. * @param status Receives errors detected by this function.
  250. * @return the pattern text. The storage for the text is owned by the regular expression
  251. * object, and must not be altered or deleted.
  252. *
  253. * @stable ICU 4.6
  254. */
  255. U_STABLE UText * U_EXPORT2
  256. uregex_patternUText(const URegularExpression *regexp,
  257. UErrorCode *status);
  258. /**
  259. * Get the match mode flags that were specified when compiling this regular expression.
  260. * @param status Receives errors detected by this function.
  261. * @param regexp The compiled regular expression.
  262. * @return The match mode flags
  263. * @see URegexpFlag
  264. * @stable ICU 3.0
  265. */
  266. U_STABLE int32_t U_EXPORT2
  267. uregex_flags(const URegularExpression *regexp,
  268. UErrorCode *status);
  269. /**
  270. * Set the subject text string upon which the regular expression will look for matches.
  271. * This function may be called any number of times, allowing the regular
  272. * expression pattern to be applied to different strings.
  273. * <p>
  274. * Regular expression matching operations work directly on the application's
  275. * string data. No copy is made. The subject string data must not be
  276. * altered after calling this function until after all regular expression
  277. * operations involving this string data are completed.
  278. * <p>
  279. * Zero length strings are permitted. In this case, no subsequent match
  280. * operation will dereference the text string pointer.
  281. *
  282. * @param regexp The compiled regular expression.
  283. * @param text The subject text string.
  284. * @param textLength The length of the subject text, or -1 if the string
  285. * is NUL terminated.
  286. * @param status Receives errors detected by this function.
  287. * @stable ICU 3.0
  288. */
  289. U_STABLE void U_EXPORT2
  290. uregex_setText(URegularExpression *regexp,
  291. const UChar *text,
  292. int32_t textLength,
  293. UErrorCode *status);
  294. /**
  295. * Set the subject text string upon which the regular expression will look for matches.
  296. * This function may be called any number of times, allowing the regular
  297. * expression pattern to be applied to different strings.
  298. * <p>
  299. * Regular expression matching operations work directly on the application's
  300. * string data; only a shallow clone is made. The subject string data must not be
  301. * altered after calling this function until after all regular expression
  302. * operations involving this string data are completed.
  303. *
  304. * @param regexp The compiled regular expression.
  305. * @param text The subject text string.
  306. * @param status Receives errors detected by this function.
  307. *
  308. * @stable ICU 4.6
  309. */
  310. U_STABLE void U_EXPORT2
  311. uregex_setUText(URegularExpression *regexp,
  312. UText *text,
  313. UErrorCode *status);
  314. /**
  315. * Get the subject text that is currently associated with this
  316. * regular expression object. If the input was supplied using uregex_setText(),
  317. * that pointer will be returned. Otherwise, the characters in the input will
  318. * be extracted to a buffer and returned. In either case, ownership remains
  319. * with the regular expression object.
  320. *
  321. * This function will work even if the input was originally specified as a UText.
  322. *
  323. * @param regexp The compiled regular expression.
  324. * @param textLength The length of the string is returned in this output parameter.
  325. * A NULL pointer may be used here if the
  326. * text length is not needed, as would be the case if
  327. * the text is known in advance to be a NUL terminated
  328. * string.
  329. * @param status Receives errors detected by this function.
  330. * @return Pointer to the subject text string currently associated with
  331. * this regular expression.
  332. * @stable ICU 3.0
  333. */
  334. U_STABLE const UChar * U_EXPORT2
  335. uregex_getText(URegularExpression *regexp,
  336. int32_t *textLength,
  337. UErrorCode *status);
  338. /**
  339. * Get the subject text that is currently associated with this
  340. * regular expression object.
  341. *
  342. * This function will work even if the input was originally specified as a UChar string.
  343. *
  344. * @param regexp The compiled regular expression.
  345. * @param dest A mutable UText in which to store the current input.
  346. * If NULL, a new UText will be created as an immutable shallow clone
  347. * of the actual input string.
  348. * @param status Receives errors detected by this function.
  349. * @return The subject text currently associated with this regular expression.
  350. * If a pre-allocated UText was provided, it will always be used and returned.
  351. *
  352. * @stable ICU 4.6
  353. */
  354. U_STABLE UText * U_EXPORT2
  355. uregex_getUText(URegularExpression *regexp,
  356. UText *dest,
  357. UErrorCode *status);
  358. /**
  359. * Set the subject text string upon which the regular expression is looking for matches
  360. * without changing any other aspect of the matching state.
  361. * The new and previous text strings must have the same content.
  362. *
  363. * This function is intended for use in environments where ICU is operating on
  364. * strings that may move around in memory. It provides a mechanism for notifying
  365. * ICU that the string has been relocated, and providing a new UText to access the
  366. * string in its new position.
  367. *
  368. * Note that the regular expression implementation never copies the underlying text
  369. * of a string being matched, but always operates directly on the original text
  370. * provided by the user. Refreshing simply drops the references to the old text
  371. * and replaces them with references to the new.
  372. *
  373. * Caution: this function is normally used only by very specialized
  374. * system-level code. One example use case is with garbage collection
  375. * that moves the text in memory.
  376. *
  377. * @param regexp The compiled regular expression.
  378. * @param text The new (moved) text string.
  379. * @param status Receives errors detected by this function.
  380. *
  381. * @stable ICU 4.8
  382. */
  383. U_STABLE void U_EXPORT2
  384. uregex_refreshUText(URegularExpression *regexp,
  385. UText *text,
  386. UErrorCode *status);
  387. /**
  388. * Attempts to match the input string against the pattern.
  389. * To succeed, the match must extend to the end of the string,
  390. * or cover the complete match region.
  391. *
  392. * If startIndex >= zero the match operation starts at the specified
  393. * index and must extend to the end of the input string. Any region
  394. * that has been specified is reset.
  395. *
  396. * If startIndex == -1 the match must cover the input region, or the entire
  397. * input string if no region has been set. This directly corresponds to
  398. * Matcher.matches() in Java
  399. *
  400. * @param regexp The compiled regular expression.
  401. * @param startIndex The input string (native) index at which to begin matching, or -1
  402. * to match the input Region.
  403. * @param status Receives errors detected by this function.
  404. * @return TRUE if there is a match
  405. * @stable ICU 3.0
  406. */
  407. U_STABLE UBool U_EXPORT2
  408. uregex_matches(URegularExpression *regexp,
  409. int32_t startIndex,
  410. UErrorCode *status);
  411. /**
  412. * 64bit version of uregex_matches.
  413. * Attempts to match the input string against the pattern.
  414. * To succeed, the match must extend to the end of the string,
  415. * or cover the complete match region.
  416. *
  417. * If startIndex >= zero the match operation starts at the specified
  418. * index and must extend to the end of the input string. Any region
  419. * that has been specified is reset.
  420. *
  421. * If startIndex == -1 the match must cover the input region, or the entire
  422. * input string if no region has been set. This directly corresponds to
  423. * Matcher.matches() in Java
  424. *
  425. * @param regexp The compiled regular expression.
  426. * @param startIndex The input string (native) index at which to begin matching, or -1
  427. * to match the input Region.
  428. * @param status Receives errors detected by this function.
  429. * @return TRUE if there is a match
  430. * @stable ICU 4.6
  431. */
  432. U_STABLE UBool U_EXPORT2
  433. uregex_matches64(URegularExpression *regexp,
  434. int64_t startIndex,
  435. UErrorCode *status);
  436. /**
  437. * Attempts to match the input string, starting from the specified index, against the pattern.
  438. * The match may be of any length, and is not required to extend to the end
  439. * of the input string. Contrast with uregex_matches().
  440. *
  441. * <p>If startIndex is >= 0 any input region that was set for this
  442. * URegularExpression is reset before the operation begins.
  443. *
  444. * <p>If the specified starting index == -1 the match begins at the start of the input
  445. * region, or at the start of the full string if no region has been specified.
  446. * This corresponds directly with Matcher.lookingAt() in Java.
  447. *
  448. * <p>If the match succeeds then more information can be obtained via the
  449. * <code>uregexp_start()</code>, <code>uregexp_end()</code>,
  450. * and <code>uregex_group()</code> functions.</p>
  451. *
  452. * @param regexp The compiled regular expression.
  453. * @param startIndex The input string (native) index at which to begin matching, or
  454. * -1 to match the Input Region
  455. * @param status A reference to a UErrorCode to receive any errors.
  456. * @return TRUE if there is a match.
  457. * @stable ICU 3.0
  458. */
  459. U_STABLE UBool U_EXPORT2
  460. uregex_lookingAt(URegularExpression *regexp,
  461. int32_t startIndex,
  462. UErrorCode *status);
  463. /**
  464. * 64bit version of uregex_lookingAt.
  465. * Attempts to match the input string, starting from the specified index, against the pattern.
  466. * The match may be of any length, and is not required to extend to the end
  467. * of the input string. Contrast with uregex_matches().
  468. *
  469. * <p>If startIndex is >= 0 any input region that was set for this
  470. * URegularExpression is reset before the operation begins.
  471. *
  472. * <p>If the specified starting index == -1 the match begins at the start of the input
  473. * region, or at the start of the full string if no region has been specified.
  474. * This corresponds directly with Matcher.lookingAt() in Java.
  475. *
  476. * <p>If the match succeeds then more information can be obtained via the
  477. * <code>uregexp_start()</code>, <code>uregexp_end()</code>,
  478. * and <code>uregex_group()</code> functions.</p>
  479. *
  480. * @param regexp The compiled regular expression.
  481. * @param startIndex The input string (native) index at which to begin matching, or
  482. * -1 to match the Input Region
  483. * @param status A reference to a UErrorCode to receive any errors.
  484. * @return TRUE if there is a match.
  485. * @stable ICU 4.6
  486. */
  487. U_STABLE UBool U_EXPORT2
  488. uregex_lookingAt64(URegularExpression *regexp,
  489. int64_t startIndex,
  490. UErrorCode *status);
  491. /**
  492. * Find the first matching substring of the input string that matches the pattern.
  493. * If startIndex is >= zero the search for a match begins at the specified index,
  494. * and any match region is reset. This corresponds directly with
  495. * Matcher.find(startIndex) in Java.
  496. *
  497. * If startIndex == -1 the search begins at the start of the input region,
  498. * or at the start of the full string if no region has been specified.
  499. *
  500. * If a match is found, <code>uregex_start(), uregex_end()</code>, and
  501. * <code>uregex_group()</code> will provide more information regarding the match.
  502. *
  503. * @param regexp The compiled regular expression.
  504. * @param startIndex The position (native) in the input string to begin the search, or
  505. * -1 to search within the Input Region.
  506. * @param status A reference to a UErrorCode to receive any errors.
  507. * @return TRUE if a match is found.
  508. * @stable ICU 3.0
  509. */
  510. U_STABLE UBool U_EXPORT2
  511. uregex_find(URegularExpression *regexp,
  512. int32_t startIndex,
  513. UErrorCode *status);
  514. /**
  515. * 64bit version of uregex_find.
  516. * Find the first matching substring of the input string that matches the pattern.
  517. * If startIndex is >= zero the search for a match begins at the specified index,
  518. * and any match region is reset. This corresponds directly with
  519. * Matcher.find(startIndex) in Java.
  520. *
  521. * If startIndex == -1 the search begins at the start of the input region,
  522. * or at the start of the full string if no region has been specified.
  523. *
  524. * If a match is found, <code>uregex_start(), uregex_end()</code>, and
  525. * <code>uregex_group()</code> will provide more information regarding the match.
  526. *
  527. * @param regexp The compiled regular expression.
  528. * @param startIndex The position (native) in the input string to begin the search, or
  529. * -1 to search within the Input Region.
  530. * @param status A reference to a UErrorCode to receive any errors.
  531. * @return TRUE if a match is found.
  532. * @stable ICU 4.6
  533. */
  534. U_STABLE UBool U_EXPORT2
  535. uregex_find64(URegularExpression *regexp,
  536. int64_t startIndex,
  537. UErrorCode *status);
  538. /**
  539. * Find the next pattern match in the input string. Begin searching
  540. * the input at the location following the end of he previous match,
  541. * or at the start of the string (or region) if there is no
  542. * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and
  543. * <code>uregex_group()</code> will provide more information regarding the match.
  544. *
  545. * @param regexp The compiled regular expression.
  546. * @param status A reference to a UErrorCode to receive any errors.
  547. * @return TRUE if a match is found.
  548. * @see uregex_reset
  549. * @stable ICU 3.0
  550. */
  551. U_STABLE UBool U_EXPORT2
  552. uregex_findNext(URegularExpression *regexp,
  553. UErrorCode *status);
  554. /**
  555. * Get the number of capturing groups in this regular expression's pattern.
  556. * @param regexp The compiled regular expression.
  557. * @param status A reference to a UErrorCode to receive any errors.
  558. * @return the number of capture groups
  559. * @stable ICU 3.0
  560. */
  561. U_STABLE int32_t U_EXPORT2
  562. uregex_groupCount(URegularExpression *regexp,
  563. UErrorCode *status);
  564. /**
  565. * Get the group number corresponding to a named capture group.
  566. * The returned number can be used with any function that access
  567. * capture groups by number.
  568. *
  569. * The function returns an error status if the specified name does not
  570. * appear in the pattern.
  571. *
  572. * @param regexp The compiled regular expression.
  573. * @param groupName The capture group name.
  574. * @param nameLength The length of the name, or -1 if the name is a
  575. * nul-terminated string.
  576. * @param status A pointer to a UErrorCode to receive any errors.
  577. *
  578. * @stable ICU 55
  579. */
  580. U_STABLE int32_t U_EXPORT2
  581. uregex_groupNumberFromName(URegularExpression *regexp,
  582. const UChar *groupName,
  583. int32_t nameLength,
  584. UErrorCode *status);
  585. /**
  586. * Get the group number corresponding to a named capture group.
  587. * The returned number can be used with any function that access
  588. * capture groups by number.
  589. *
  590. * The function returns an error status if the specified name does not
  591. * appear in the pattern.
  592. *
  593. * @param regexp The compiled regular expression.
  594. * @param groupName The capture group name,
  595. * platform invariant characters only.
  596. * @param nameLength The length of the name, or -1 if the name is
  597. * nul-terminated.
  598. * @param status A pointer to a UErrorCode to receive any errors.
  599. * @return The group number corresponding to the named capture group.
  600. * @stable ICU 55
  601. */
  602. U_STABLE int32_t U_EXPORT2
  603. uregex_groupNumberFromCName(URegularExpression *regexp,
  604. const char *groupName,
  605. int32_t nameLength,
  606. UErrorCode *status);
  607. /** Extract the string for the specified matching expression or subexpression.
  608. * Group #0 is the complete string of matched text.
  609. * Group #1 is the text matched by the first set of capturing parentheses.
  610. *
  611. * @param regexp The compiled regular expression.
  612. * @param groupNum The capture group to extract. Group 0 is the complete
  613. * match. The value of this parameter must be
  614. * less than or equal to the number of capture groups in
  615. * the pattern.
  616. * @param dest Buffer to receive the matching string data
  617. * @param destCapacity Capacity of the dest buffer.
  618. * @param status A reference to a UErrorCode to receive any errors.
  619. * @return Length of matching data,
  620. * or -1 if no applicable match.
  621. * @stable ICU 3.0
  622. */
  623. U_STABLE int32_t U_EXPORT2
  624. uregex_group(URegularExpression *regexp,
  625. int32_t groupNum,
  626. UChar *dest,
  627. int32_t destCapacity,
  628. UErrorCode *status);
  629. /** Returns a shallow immutable clone of the entire input string with the current index set
  630. * to the beginning of the requested capture group. The capture group length is also
  631. * returned via groupLength.
  632. * Group #0 is the complete string of matched text.
  633. * Group #1 is the text matched by the first set of capturing parentheses.
  634. *
  635. * @param regexp The compiled regular expression.
  636. * @param groupNum The capture group to extract. Group 0 is the complete
  637. * match. The value of this parameter must be
  638. * less than or equal to the number of capture groups in
  639. * the pattern.
  640. * @param dest A mutable UText in which to store the current input.
  641. * If NULL, a new UText will be created as an immutable shallow clone
  642. * of the entire input string.
  643. * @param groupLength The group length of the desired capture group. Output parameter.
  644. * @param status A reference to a UErrorCode to receive any errors.
  645. * @return The subject text currently associated with this regular expression.
  646. * If a pre-allocated UText was provided, it will always be used and returned.
  647. *
  648. * @stable ICU 4.6
  649. */
  650. U_STABLE UText * U_EXPORT2
  651. uregex_groupUText(URegularExpression *regexp,
  652. int32_t groupNum,
  653. UText *dest,
  654. int64_t *groupLength,
  655. UErrorCode *status);
  656. /**
  657. * Returns the index in the input string of the start of the text matched by the
  658. * specified capture group during the previous match operation. Return -1 if
  659. * the capture group was not part of the last match.
  660. * Group #0 refers to the complete range of matched text.
  661. * Group #1 refers to the text matched by the first set of capturing parentheses.
  662. *
  663. * @param regexp The compiled regular expression.
  664. * @param groupNum The capture group number
  665. * @param status A reference to a UErrorCode to receive any errors.
  666. * @return the starting (native) position in the input of the text matched
  667. * by the specified group.
  668. * @stable ICU 3.0
  669. */
  670. U_STABLE int32_t U_EXPORT2
  671. uregex_start(URegularExpression *regexp,
  672. int32_t groupNum,
  673. UErrorCode *status);
  674. /**
  675. * 64bit version of uregex_start.
  676. * Returns the index in the input string of the start of the text matched by the
  677. * specified capture group during the previous match operation. Return -1 if
  678. * the capture group was not part of the last match.
  679. * Group #0 refers to the complete range of matched text.
  680. * Group #1 refers to the text matched by the first set of capturing parentheses.
  681. *
  682. * @param regexp The compiled regular expression.
  683. * @param groupNum The capture group number
  684. * @param status A reference to a UErrorCode to receive any errors.
  685. * @return the starting (native) position in the input of the text matched
  686. * by the specified group.
  687. * @stable ICU 4.6
  688. */
  689. U_STABLE int64_t U_EXPORT2
  690. uregex_start64(URegularExpression *regexp,
  691. int32_t groupNum,
  692. UErrorCode *status);
  693. /**
  694. * Returns the index in the input string of the position following the end
  695. * of the text matched by the specified capture group.
  696. * Return -1 if the capture group was not part of the last match.
  697. * Group #0 refers to the complete range of matched text.
  698. * Group #1 refers to the text matched by the first set of capturing parentheses.
  699. *
  700. * @param regexp The compiled regular expression.
  701. * @param groupNum The capture group number
  702. * @param status A reference to a UErrorCode to receive any errors.
  703. * @return the (native) index of the position following the last matched character.
  704. * @stable ICU 3.0
  705. */
  706. U_STABLE int32_t U_EXPORT2
  707. uregex_end(URegularExpression *regexp,
  708. int32_t groupNum,
  709. UErrorCode *status);
  710. /**
  711. * 64bit version of uregex_end.
  712. * Returns the index in the input string of the position following the end
  713. * of the text matched by the specified capture group.
  714. * Return -1 if the capture group was not part of the last match.
  715. * Group #0 refers to the complete range of matched text.
  716. * Group #1 refers to the text matched by the first set of capturing parentheses.
  717. *
  718. * @param regexp The compiled regular expression.
  719. * @param groupNum The capture group number
  720. * @param status A reference to a UErrorCode to receive any errors.
  721. * @return the (native) index of the position following the last matched character.
  722. * @stable ICU 4.6
  723. */
  724. U_STABLE int64_t U_EXPORT2
  725. uregex_end64(URegularExpression *regexp,
  726. int32_t groupNum,
  727. UErrorCode *status);
  728. /**
  729. * Reset any saved state from the previous match. Has the effect of
  730. * causing uregex_findNext to begin at the specified index, and causing
  731. * uregex_start(), uregex_end() and uregex_group() to return an error
  732. * indicating that there is no match information available. Clears any
  733. * match region that may have been set.
  734. *
  735. * @param regexp The compiled regular expression.
  736. * @param index The position (native) in the text at which a
  737. * uregex_findNext() should begin searching.
  738. * @param status A reference to a UErrorCode to receive any errors.
  739. * @stable ICU 3.0
  740. */
  741. U_STABLE void U_EXPORT2
  742. uregex_reset(URegularExpression *regexp,
  743. int32_t index,
  744. UErrorCode *status);
  745. /**
  746. * 64bit version of uregex_reset.
  747. * Reset any saved state from the previous match. Has the effect of
  748. * causing uregex_findNext to begin at the specified index, and causing
  749. * uregex_start(), uregex_end() and uregex_group() to return an error
  750. * indicating that there is no match information available. Clears any
  751. * match region that may have been set.
  752. *
  753. * @param regexp The compiled regular expression.
  754. * @param index The position (native) in the text at which a
  755. * uregex_findNext() should begin searching.
  756. * @param status A reference to a UErrorCode to receive any errors.
  757. * @stable ICU 4.6
  758. */
  759. U_STABLE void U_EXPORT2
  760. uregex_reset64(URegularExpression *regexp,
  761. int64_t index,
  762. UErrorCode *status);
  763. /**
  764. * Sets the limits of the matching region for this URegularExpression.
  765. * The region is the part of the input string that will be considered when matching.
  766. * Invoking this method resets any saved state from the previous match,
  767. * then sets the region to start at the index specified by the start parameter
  768. * and end at the index specified by the end parameter.
  769. *
  770. * Depending on the transparency and anchoring being used (see useTransparentBounds
  771. * and useAnchoringBounds), certain constructs such as anchors may behave differently
  772. * at or around the boundaries of the region.
  773. *
  774. * The function will fail if start is greater than limit, or if either index
  775. * is less than zero or greater than the length of the string being matched.
  776. *
  777. * @param regexp The compiled regular expression.
  778. * @param regionStart The (native) index to begin searches at.
  779. * @param regionLimit The (native) index to end searches at (exclusive).
  780. * @param status A pointer to a UErrorCode to receive any errors.
  781. * @stable ICU 4.0
  782. */
  783. U_STABLE void U_EXPORT2
  784. uregex_setRegion(URegularExpression *regexp,
  785. int32_t regionStart,
  786. int32_t regionLimit,
  787. UErrorCode *status);
  788. /**
  789. * 64bit version of uregex_setRegion.
  790. * Sets the limits of the matching region for this URegularExpression.
  791. * The region is the part of the input string that will be considered when matching.
  792. * Invoking this method resets any saved state from the previous match,
  793. * then sets the region to start at the index specified by the start parameter
  794. * and end at the index specified by the end parameter.
  795. *
  796. * Depending on the transparency and anchoring being used (see useTransparentBounds
  797. * and useAnchoringBounds), certain constructs such as anchors may behave differently
  798. * at or around the boundaries of the region.
  799. *
  800. * The function will fail if start is greater than limit, or if either index
  801. * is less than zero or greater than the length of the string being matched.
  802. *
  803. * @param regexp The compiled regular expression.
  804. * @param regionStart The (native) index to begin searches at.
  805. * @param regionLimit The (native) index to end searches at (exclusive).
  806. * @param status A pointer to a UErrorCode to receive any errors.
  807. * @stable ICU 4.6
  808. */
  809. U_STABLE void U_EXPORT2
  810. uregex_setRegion64(URegularExpression *regexp,
  811. int64_t regionStart,
  812. int64_t regionLimit,
  813. UErrorCode *status);
  814. /**
  815. * Set the matching region and the starting index for subsequent matches
  816. * in a single operation.
  817. * This is useful because the usual function for setting the starting
  818. * index, uregex_reset(), also resets any region limits.
  819. *
  820. * @param regexp The compiled regular expression.
  821. * @param regionStart The (native) index to begin searches at.
  822. * @param regionLimit The (native) index to end searches at (exclusive).
  823. * @param startIndex The index in the input text at which the next
  824. * match operation should begin.
  825. * @param status A pointer to a UErrorCode to receive any errors.
  826. * @stable ICU 4.6
  827. */
  828. U_STABLE void U_EXPORT2
  829. uregex_setRegionAndStart(URegularExpression *regexp,
  830. int64_t regionStart,
  831. int64_t regionLimit,
  832. int64_t startIndex,
  833. UErrorCode *status);
  834. /**
  835. * Reports the start index of the matching region. Any matches found are limited
  836. * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
  837. *
  838. * @param regexp The compiled regular expression.
  839. * @param status A pointer to a UErrorCode to receive any errors.
  840. * @return The starting (native) index of this matcher's region.
  841. * @stable ICU 4.0
  842. */
  843. U_STABLE int32_t U_EXPORT2
  844. uregex_regionStart(const URegularExpression *regexp,
  845. UErrorCode *status);
  846. /**
  847. * 64bit version of uregex_regionStart.
  848. * Reports the start index of the matching region. Any matches found are limited
  849. * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
  850. *
  851. * @param regexp The compiled regular expression.
  852. * @param status A pointer to a UErrorCode to receive any errors.
  853. * @return The starting (native) index of this matcher's region.
  854. * @stable ICU 4.6
  855. */
  856. U_STABLE int64_t U_EXPORT2
  857. uregex_regionStart64(const URegularExpression *regexp,
  858. UErrorCode *status);
  859. /**
  860. * Reports the end index (exclusive) of the matching region for this URegularExpression.
  861. * Any matches found are limited to the region bounded by regionStart (inclusive)
  862. * and regionEnd (exclusive).
  863. *
  864. * @param regexp The compiled regular expression.
  865. * @param status A pointer to a UErrorCode to receive any errors.
  866. * @return The ending point (native) of this matcher's region.
  867. * @stable ICU 4.0
  868. */
  869. U_STABLE int32_t U_EXPORT2
  870. uregex_regionEnd(const URegularExpression *regexp,
  871. UErrorCode *status);
  872. /**
  873. * 64bit version of uregex_regionEnd.
  874. * Reports the end index (exclusive) of the matching region for this URegularExpression.
  875. * Any matches found are limited to the region bounded by regionStart (inclusive)
  876. * and regionEnd (exclusive).
  877. *
  878. * @param regexp The compiled regular expression.
  879. * @param status A pointer to a UErrorCode to receive any errors.
  880. * @return The ending point (native) of this matcher's region.
  881. * @stable ICU 4.6
  882. */
  883. U_STABLE int64_t U_EXPORT2
  884. uregex_regionEnd64(const URegularExpression *regexp,
  885. UErrorCode *status);
  886. /**
  887. * Queries the transparency of region bounds for this URegularExpression.
  888. * See useTransparentBounds for a description of transparent and opaque bounds.
  889. * By default, matching boundaries are opaque.
  890. *
  891. * @param regexp The compiled regular expression.
  892. * @param status A pointer to a UErrorCode to receive any errors.
  893. * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
  894. * @stable ICU 4.0
  895. */
  896. U_STABLE UBool U_EXPORT2
  897. uregex_hasTransparentBounds(const URegularExpression *regexp,
  898. UErrorCode *status);
  899. /**
  900. * Sets the transparency of region bounds for this URegularExpression.
  901. * Invoking this function with an argument of TRUE will set matches to use transparent bounds.
  902. * If the boolean argument is FALSE, then opaque bounds will be used.
  903. *
  904. * Using transparent bounds, the boundaries of the matching region are transparent
  905. * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
  906. * see text beyond the boundaries of the region while checking for a match.
  907. *
  908. * With opaque bounds, no text outside of the matching region is visible to lookahead,
  909. * lookbehind, and boundary matching constructs.
  910. *
  911. * By default, opaque bounds are used.
  912. *
  913. * @param regexp The compiled regular expression.
  914. * @param b TRUE for transparent bounds; FALSE for opaque bounds
  915. * @param status A pointer to a UErrorCode to receive any errors.
  916. * @stable ICU 4.0
  917. **/
  918. U_STABLE void U_EXPORT2
  919. uregex_useTransparentBounds(URegularExpression *regexp,
  920. UBool b,
  921. UErrorCode *status);
  922. /**
  923. * Return true if this URegularExpression is using anchoring bounds.
  924. * By default, anchoring region bounds are used.
  925. *
  926. * @param regexp The compiled regular expression.
  927. * @param status A pointer to a UErrorCode to receive any errors.
  928. * @return TRUE if this matcher is using anchoring bounds.
  929. * @stable ICU 4.0
  930. */
  931. U_STABLE UBool U_EXPORT2
  932. uregex_hasAnchoringBounds(const URegularExpression *regexp,
  933. UErrorCode *status);
  934. /**
  935. * Set whether this URegularExpression is using Anchoring Bounds for its region.
  936. * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
  937. * and end of the region. Without Anchoring Bounds, anchors will only match at
  938. * the positions they would in the complete text.
  939. *
  940. * Anchoring Bounds are the default for regions.
  941. *
  942. * @param regexp The compiled regular expression.
  943. * @param b TRUE to enable anchoring bounds; FALSE to disable them.
  944. * @param status A pointer to a UErrorCode to receive any errors.
  945. * @stable ICU 4.0
  946. */
  947. U_STABLE void U_EXPORT2
  948. uregex_useAnchoringBounds(URegularExpression *regexp,
  949. UBool b,
  950. UErrorCode *status);
  951. /**
  952. * Return TRUE if the most recent matching operation touched the
  953. * end of the text being processed. In this case, additional input text could
  954. * change the results of that match.
  955. *
  956. * @param regexp The compiled regular expression.
  957. * @param status A pointer to a UErrorCode to receive any errors.
  958. * @return TRUE if the most recent match hit the end of input
  959. * @stable ICU 4.0
  960. */
  961. U_STABLE UBool U_EXPORT2
  962. uregex_hitEnd(const URegularExpression *regexp,
  963. UErrorCode *status);
  964. /**
  965. * Return TRUE if the most recent match succeeded and additional input could cause
  966. * it to fail. If this function returns false and a match was found, then more input
  967. * might change the match but the match won't be lost. If a match was not found,
  968. * then requireEnd has no meaning.
  969. *
  970. * @param regexp The compiled regular expression.
  971. * @param status A pointer to a UErrorCode to receive any errors.
  972. * @return TRUE if more input could cause the most recent match to no longer match.
  973. * @stable ICU 4.0
  974. */
  975. U_STABLE UBool U_EXPORT2
  976. uregex_requireEnd(const URegularExpression *regexp,
  977. UErrorCode *status);
  978. /**
  979. * Replaces every substring of the input that matches the pattern
  980. * with the given replacement string. This is a convenience function that
  981. * provides a complete find-and-replace-all operation.
  982. *
  983. * This method scans the input string looking for matches of the pattern.
  984. * Input that is not part of any match is copied unchanged to the
  985. * destination buffer. Matched regions are replaced in the output
  986. * buffer by the replacement string. The replacement string may contain
  987. * references to capture groups; these take the form of $1, $2, etc.
  988. *
  989. * @param regexp The compiled regular expression.
  990. * @param replacementText A string containing the replacement text.
  991. * @param replacementLength The length of the replacement string, or
  992. * -1 if it is NUL terminated.
  993. * @param destBuf A (UChar *) buffer that will receive the result.
  994. * @param destCapacity The capacity of the destination buffer.
  995. * @param status A reference to a UErrorCode to receive any errors.
  996. * @return The length of the string resulting from the find
  997. * and replace operation. In the event that the
  998. * destination capacity is inadequate, the return value
  999. * is still the full length of the untruncated string.
  1000. * @stable ICU 3.0
  1001. */
  1002. U_STABLE int32_t U_EXPORT2
  1003. uregex_replaceAll(URegularExpression *regexp,
  1004. const UChar *replacementText,
  1005. int32_t replacementLength,
  1006. UChar *destBuf,
  1007. int32_t destCapacity,
  1008. UErrorCode *status);
  1009. /**
  1010. * Replaces every substring of the input that matches the pattern
  1011. * with the given replacement string. This is a convenience function that
  1012. * provides a complete find-and-replace-all operation.
  1013. *
  1014. * This method scans the input string looking for matches of the pattern.
  1015. * Input that is not part of any match is copied unchanged to the
  1016. * destination buffer. Matched regions are replaced in the output
  1017. * buffer by the replacement string. The replacement string may contain
  1018. * references to capture groups; these take the form of $1, $2, etc.
  1019. *
  1020. * @param regexp The compiled regular expression.
  1021. * @param replacement A string containing the replacement text.
  1022. * @param dest A mutable UText that will receive the result.
  1023. * If NULL, a new UText will be created (which may not be mutable).
  1024. * @param status A reference to a UErrorCode to receive any errors.
  1025. * @return A UText containing the results of the find and replace.
  1026. * If a pre-allocated UText was provided, it will always be used and returned.
  1027. *
  1028. * @stable ICU 4.6
  1029. */
  1030. U_STABLE UText * U_EXPORT2
  1031. uregex_replaceAllUText(URegularExpression *regexp,
  1032. UText *replacement,
  1033. UText *dest,
  1034. UErrorCode *status);
  1035. /**
  1036. * Replaces the first substring of the input that matches the pattern
  1037. * with the given replacement string. This is a convenience function that
  1038. * provides a complete find-and-replace operation.
  1039. *
  1040. * This method scans the input string looking for a match of the pattern.
  1041. * All input that is not part of the match is copied unchanged to the
  1042. * destination buffer. The matched region is replaced in the output
  1043. * buffer by the replacement string. The replacement string may contain
  1044. * references to capture groups; these take the form of $1, $2, etc.
  1045. *
  1046. * @param regexp The compiled regular expression.
  1047. * @param replacementText A string containing the replacement text.
  1048. * @param replacementLength The length of the replacement string, or
  1049. * -1 if it is NUL terminated.
  1050. * @param destBuf A (UChar *) buffer that will receive the result.
  1051. * @param destCapacity The capacity of the destination buffer.
  1052. * @param status A reference to a UErrorCode to receive any errors.
  1053. * @return The length of the string resulting from the find
  1054. * and replace operation. In the event that the
  1055. * destination capacity is inadequate, the return value
  1056. * is still the full length of the untruncated string.
  1057. * @stable ICU 3.0
  1058. */
  1059. U_STABLE int32_t U_EXPORT2
  1060. uregex_replaceFirst(URegularExpression *regexp,
  1061. const UChar *replacementText,
  1062. int32_t replacementLength,
  1063. UChar *destBuf,
  1064. int32_t destCapacity,
  1065. UErrorCode *status);
  1066. /**
  1067. * Replaces the first substring of the input that matches the pattern
  1068. * with the given replacement string. This is a convenience function that
  1069. * provides a complete find-and-replace operation.
  1070. *
  1071. * This method scans the input string looking for a match of the pattern.
  1072. * All input that is not part of the match is copied unchanged to the
  1073. * destination buffer. The matched region is replaced in the output
  1074. * buffer by the replacement string. The replacement string may contain
  1075. * references to capture groups; these take the form of $1, $2, etc.
  1076. *
  1077. * @param regexp The compiled regular expression.
  1078. * @param replacement A string containing the replacement text.
  1079. * @param dest A mutable UText that will receive the result.
  1080. * If NULL, a new UText will be created (which may not be mutable).
  1081. * @param status A reference to a UErrorCode to receive any errors.
  1082. * @return A UText containing the results of the find and replace.
  1083. * If a pre-allocated UText was provided, it will always be used and returned.
  1084. *
  1085. * @stable ICU 4.6
  1086. */
  1087. U_STABLE UText * U_EXPORT2
  1088. uregex_replaceFirstUText(URegularExpression *regexp,
  1089. UText *replacement,
  1090. UText *dest,
  1091. UErrorCode *status);
  1092. /**
  1093. * Implements a replace operation intended to be used as part of an
  1094. * incremental find-and-replace.
  1095. *
  1096. * <p>The input string, starting from the end of the previous match and ending at
  1097. * the start of the current match, is appended to the destination string. Then the
  1098. * replacement string is appended to the output string,
  1099. * including handling any substitutions of captured text.</p>
  1100. *
  1101. * <p>A note on preflight computation of buffer size and error handling:
  1102. * Calls to uregex_appendReplacement() and uregex_appendTail() are
  1103. * designed to be chained, one after another, with the destination
  1104. * buffer pointer and buffer capacity updated after each in preparation
  1105. * for the next. If the destination buffer is exhausted partway through such a
  1106. * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal
  1107. * ICU conventions are for a function to perform no action if it is
  1108. * called with an error status, but for this one case, uregex_appendReplacement()
  1109. * will operate normally so that buffer size computations will complete
  1110. * correctly.</p>
  1111. *
  1112. * <p>For simple, prepackaged, non-incremental find-and-replace
  1113. * operations, see replaceFirst() or replaceAll().</p>
  1114. *
  1115. * @param regexp The regular expression object.
  1116. * @param replacementText The string that will replace the matched portion of the
  1117. * input string as it is copied to the destination buffer.
  1118. * The replacement text may contain references ($1, for
  1119. * example) to capture groups from the match.
  1120. * @param replacementLength The length of the replacement text string,
  1121. * or -1 if the string is NUL terminated.
  1122. * @param destBuf The buffer into which the results of the
  1123. * find-and-replace are placed. On return, this pointer
  1124. * will be updated to refer to the beginning of the
  1125. * unused portion of buffer, leaving it in position for
  1126. * a subsequent call to this function.
  1127. * @param destCapacity The size of the output buffer. On return, this
  1128. * parameter will be updated to reflect the space remaining
  1129. * unused in the output buffer.
  1130. * @param status A reference to a UErrorCode to receive any errors.
  1131. * @return The length of the result string. In the event that
  1132. * destCapacity is inadequate, the full length of the
  1133. * untruncated output string is returned.
  1134. *
  1135. * @stable ICU 3.0
  1136. *
  1137. */
  1138. U_STABLE int32_t U_EXPORT2
  1139. uregex_appendReplacement(URegularExpression *regexp,
  1140. const UChar *replacementText,
  1141. int32_t replacementLength,
  1142. UChar **destBuf,
  1143. int32_t *destCapacity,
  1144. UErrorCode *status);
  1145. /**
  1146. * Implements a replace operation intended to be used as part of an
  1147. * incremental find-and-replace.
  1148. *
  1149. * <p>The input string, starting from the end of the previous match and ending at
  1150. * the start of the current match, is appended to the destination string. Then the
  1151. * replacement string is appended to the output string,
  1152. * including handling any substitutions of captured text.</p>
  1153. *
  1154. * <p>For simple, prepackaged, non-incremental find-and-replace
  1155. * operations, see replaceFirst() or replaceAll().</p>
  1156. *
  1157. * @param regexp The regular expression object.
  1158. * @param replacementText The string that will replace the matched portion of the
  1159. * input string as it is copied to the destination buffer.
  1160. * The replacement text may contain references ($1, for
  1161. * example) to capture groups from the match.
  1162. * @param dest A mutable UText that will receive the result. Must not be NULL.
  1163. * @param status A reference to a UErrorCode to receive any errors.
  1164. *
  1165. * @stable ICU 4.6
  1166. */
  1167. U_STABLE void U_EXPORT2
  1168. uregex_appendReplacementUText(URegularExpression *regexp,
  1169. UText *replacementText,
  1170. UText *dest,
  1171. UErrorCode *status);
  1172. /**
  1173. * As the final step in a find-and-replace operation, append the remainder
  1174. * of the input string, starting at the position following the last match,
  1175. * to the destination string. <code>uregex_appendTail()</code> is intended
  1176. * to be invoked after one or more invocations of the
  1177. * <code>uregex_appendReplacement()</code> function.
  1178. *
  1179. * @param regexp The regular expression object. This is needed to
  1180. * obtain the input string along with the position
  1181. * of the last match within it.
  1182. * @param destBuf The buffer in which the results of the
  1183. * find-and-replace are placed. On return, the pointer
  1184. * will be updated to refer to the beginning of the
  1185. * unused portion of buffer.
  1186. * @param destCapacity The size of the output buffer. On return, this
  1187. * value will be updated to reflect the space remaining
  1188. * unused in the output buffer.
  1189. * @param status A reference to a UErrorCode to receive any errors.
  1190. * @return The length of the result string. In the event that
  1191. * destCapacity is inadequate, the full length of the
  1192. * untruncated output string is returned.
  1193. *
  1194. * @stable ICU 3.0
  1195. */
  1196. U_STABLE int32_t U_EXPORT2
  1197. uregex_appendTail(URegularExpression *regexp,
  1198. UChar **destBuf,
  1199. int32_t *destCapacity,
  1200. UErrorCode *status);
  1201. /**
  1202. * As the final step in a find-and-replace operation, append the remainder
  1203. * of the input string, starting at the position following the last match,
  1204. * to the destination string. <code>uregex_appendTailUText()</code> is intended
  1205. * to be invoked after one or more invocations of the
  1206. * <code>uregex_appendReplacementUText()</code> function.
  1207. *
  1208. * @param regexp The regular expression object. This is needed to
  1209. * obtain the input string along with the position
  1210. * of the last match within it.
  1211. * @param dest A mutable UText that will receive the result. Must not be NULL.
  1212. *
  1213. * @param status Error code
  1214. *
  1215. * @return The destination UText.
  1216. *
  1217. * @stable ICU 4.6
  1218. */
  1219. U_STABLE UText * U_EXPORT2
  1220. uregex_appendTailUText(URegularExpression *regexp,
  1221. UText *dest,
  1222. UErrorCode *status);
  1223. /**
  1224. * Split a string into fields. Somewhat like split() from Perl.
  1225. * The pattern matches identify delimiters that separate the input
  1226. * into fields. The input data between the matches becomes the
  1227. * fields themselves.
  1228. *
  1229. * Each of the fields is copied from the input string to the destination
  1230. * buffer, and NUL-terminated. The position of each field within
  1231. * the destination buffer is returned in the destFields array.
  1232. *
  1233. * If the delimiter pattern includes capture groups, the captured text will
  1234. * also appear in the destination array of output strings, interspersed
  1235. * with the fields. This is similar to Perl, but differs from Java,
  1236. * which ignores the presence of capture groups in the pattern.
  1237. *
  1238. * Trailing empty fields will always be returned, assuming sufficient
  1239. * destination capacity. This differs from the default behavior for Java
  1240. * and Perl, where trailing empty fields are not returned.
  1241. *
  1242. * The number of strings produced by the split operation is returned.
  1243. * This count includes the strings from capture groups in the delimiter pattern.
  1244. * This behavior differs from Java, which ignores capture groups.
  1245. *
  1246. * @param regexp The compiled regular expression.
  1247. * @param destBuf A (UChar *) buffer, supplied by the caller, to receive
  1248. * the fields that are extracted from the input string.
  1249. * The field pointers stored in destFields will refer to
  1250. * positions within this destination buffer. Any
  1251. * extra positions within the destFields array will be
  1252. * set to NULL.
  1253. * @param destCapacity The capacity of the destBuf.
  1254. * @param requiredCapacity The actual capacity required of the destBuf.
  1255. * If destCapacity is too small, requiredCapacity will return
  1256. * the total capacity required to hold all of the output, and
  1257. * a U_BUFFER_OVERFLOW_ERROR will be returned.
  1258. * @param destFields An array to be filled with the position of each
  1259. * of the extracted fields within destBuf.
  1260. * @param destFieldsCapacity The number of elements in the destFields array.
  1261. * If the number of fields found is less than destFieldsCapacity,
  1262. * the extra destFields elements are set to NULL.
  1263. * If destFieldsCapacity is too small, the trailing part of the
  1264. * input, including any field delimiters, is treated as if it
  1265. * were the last field - it is copied to the destBuf, and
  1266. * its position in the destBuf is stored in the last element
  1267. * of destFields. This behavior mimics that of Perl. It is not
  1268. * an error condition, and no error status is returned when all destFields
  1269. * positions are used.
  1270. * @param status A reference to a UErrorCode to receive any errors.
  1271. * @return The number of fields into which the input string was split, including capture group strings.
  1272. * @stable ICU 3.0
  1273. */
  1274. U_STABLE int32_t U_EXPORT2
  1275. uregex_split( URegularExpression *regexp,
  1276. UChar *destBuf,
  1277. int32_t destCapacity,
  1278. int32_t *requiredCapacity,
  1279. UChar *destFields[],
  1280. int32_t destFieldsCapacity,
  1281. UErrorCode *status);
  1282. /**
  1283. * Split a string into fields. Somewhat like split() from Perl.
  1284. * The pattern matches identify delimiters that separate the input
  1285. * into fields. The input data between the matches becomes the
  1286. * fields themselves.
  1287. * <p>
  1288. * The behavior of this function is not very closely aligned with uregex_split();
  1289. * instead, it is based on (and implemented directly on top of) the C++ split method.
  1290. *
  1291. * @param regexp The compiled regular expression.
  1292. * @param destFields An array of mutable UText structs to receive the results of the split.
  1293. * If a field is NULL, a new UText is allocated to contain the results for
  1294. * that field. This new UText is not guaranteed to be mutable.
  1295. * @param destFieldsCapacity The number of elements in the destFields array.
  1296. * If the number of fields found is less than destFieldsCapacity, the
  1297. * extra strings in the destination array are not altered.
  1298. * If the number of destination strings is less than the number
  1299. * of fields, the trailing part of the input string, including any
  1300. * field delimiters, is placed in the last destination string.
  1301. * This behavior mimics that of Perl. It is not an error condition, and no
  1302. * error status is returned when all destFields positions are used.
  1303. * @param status A reference to a UErrorCode to receive any errors.
  1304. * @return The number of fields into which the input string was split.
  1305. *
  1306. * @stable ICU 4.6
  1307. */
  1308. U_STABLE int32_t U_EXPORT2
  1309. uregex_splitUText(URegularExpression *regexp,
  1310. UText *destFields[],
  1311. int32_t destFieldsCapacity,
  1312. UErrorCode *status);
  1313. /**
  1314. * Set a processing time limit for match operations with this URegularExpression.
  1315. *
  1316. * Some patterns, when matching certain strings, can run in exponential time.
  1317. * For practical purposes, the match operation may appear to be in an
  1318. * infinite loop.
  1319. * When a limit is set, a match operation will fail with an error if the
  1320. * limit is exceeded.
  1321. * <p>
  1322. * The units of the limit are steps of the match engine.
  1323. * Correspondence with actual processor time will depend on the speed
  1324. * of the processor and the details of the specific pattern, but will
  1325. * typically be on the order of milliseconds.
  1326. * <p>
  1327. * By default, the matching time is not limited.
  1328. *
  1329. *
  1330. * @param regexp The compiled regular expression.
  1331. * @param limit The limit value, in steps of the match engine, or 0 for no limit.
  1332. * @param status A reference to a UErrorCode to receive any errors.
  1333. * @stable ICU 4.0
  1334. */
  1335. U_STABLE void U_EXPORT2
  1336. uregex_setTimeLimit(URegularExpression *regexp,
  1337. int32_t limit,
  1338. UErrorCode *status);
  1339. /**
  1340. * Get the time limit for matches with this URegularExpression.
  1341. * A return value of zero indicates that there is no limit.
  1342. *
  1343. * @param regexp The compiled regular expression.
  1344. * @param status A reference to a UErrorCode to receive any errors.
  1345. * @return the maximum allowed time for a match, in units of processing steps.
  1346. * @stable ICU 4.0
  1347. */
  1348. U_STABLE int32_t U_EXPORT2
  1349. uregex_getTimeLimit(const URegularExpression *regexp,
  1350. UErrorCode *status);
  1351. /**
  1352. * Set the amount of heap storage available for use by the match backtracking stack.
  1353. * <p>
  1354. * ICU uses a backtracking regular expression engine, with the backtrack stack
  1355. * maintained on the heap. This function sets the limit to the amount of memory
  1356. * that can be used for this purpose. A backtracking stack overflow will
  1357. * result in an error from the match operation that caused it.
  1358. * <p>
  1359. * A limit is desirable because a malicious or poorly designed pattern can use
  1360. * excessive memory, potentially crashing the process. A limit is enabled
  1361. * by default.
  1362. *
  1363. * @param regexp The compiled regular expression.
  1364. * @param limit The maximum size, in bytes, of the matching backtrack stack.
  1365. * A value of zero means no limit.
  1366. * The limit must be greater than or equal to zero.
  1367. * @param status A reference to a UErrorCode to receive any errors.
  1368. *
  1369. * @stable ICU 4.0
  1370. */
  1371. U_STABLE void U_EXPORT2
  1372. uregex_setStackLimit(URegularExpression *regexp,
  1373. int32_t limit,
  1374. UErrorCode *status);
  1375. /**
  1376. * Get the size of the heap storage available for use by the backtracking stack.
  1377. * @param regexp The compiled regular expression.
  1378. * @param status A reference to a UErrorCode to receive any errors.
  1379. * @return the maximum backtracking stack size, in bytes, or zero if the stack size is unlimited.
  1380. * @stable ICU 4.0
  1381. */
  1382. U_STABLE int32_t U_EXPORT2
  1383. uregex_getStackLimit(const URegularExpression *regexp,
  1384. UErrorCode *status);
  1385. /**
  1386. * Function pointer for a regular expression matching callback function.
  1387. * When set, a callback function will be called periodically during matching
  1388. * operations. If the callback function returns FALSE, the matching
  1389. * operation will be terminated early.
  1390. *
  1391. * Note: the callback function must not call other functions on this
  1392. * URegularExpression.
  1393. *
  1394. * @param context context pointer. The callback function will be invoked
  1395. * with the context specified at the time that
  1396. * uregex_setMatchCallback() is called.
  1397. * @param steps the accumulated processing time, in match steps,
  1398. * for this matching operation.
  1399. * @return TRUE to continue the matching operation.
  1400. * FALSE to terminate the matching operation.
  1401. * @stable ICU 4.0
  1402. */
  1403. U_CDECL_BEGIN
  1404. typedef UBool U_CALLCONV URegexMatchCallback (
  1405. const void *context,
  1406. int32_t steps);
  1407. U_CDECL_END
  1408. /**
  1409. * Set a callback function for this URegularExpression.
  1410. * During matching operations the function will be called periodically,
  1411. * giving the application the opportunity to terminate a long-running
  1412. * match (see URegexMatchCallback for the callback's return-value contract).
  1413. *
  1414. * @param regexp The compiled regular expression.
  1415. * @param callback A pointer to the user-supplied callback function.
  1416. * @param context User context pointer. The value supplied at the
  1417. * time the callback function is set will be saved
  1418. * and passed to the callback each time that it is called.
  1419. * @param status A reference to a UErrorCode to receive any errors.
  1420. * @stable ICU 4.0
  1421. */
  1422. U_STABLE void U_EXPORT2
  1423. uregex_setMatchCallback(URegularExpression *regexp,
  1424. URegexMatchCallback *callback,
  1425. const void *context,
  1426. UErrorCode *status);
  1427. /**
  1428. * Get the callback function and user context pointer for this URegularExpression.
  1429. *
  1430. * @param regexp The compiled regular expression.
  1431. * @param callback Out parameter, receives a pointer to the user-supplied
  1432. * callback function.
  1433. * @param context Out parameter, receives the user context pointer that
  1434. * was set when uregex_setMatchCallback() was called.
  1435. * @param status A reference to a UErrorCode to receive any errors.
  1436. * @stable ICU 4.0
  1437. */
  1438. U_STABLE void U_EXPORT2
  1439. uregex_getMatchCallback(const URegularExpression *regexp,
  1440. URegexMatchCallback **callback,
  1441. const void **context,
  1442. UErrorCode *status);
  1443. /**
  1444. * Function pointer for a regular expression find callback function.
  1445. *
  1446. * When set, a callback function will be called during a find operation
  1447. * and for operations that depend on find, such as findNext, split and some replace
  1448. * operations like replaceFirst.
  1449. * The callback will usually be called after each attempt at a match, but this is not a
  1450. * guarantee that the callback will be invoked at each character. For finds where the
  1451. * match engine is invoked at each character, this may be close to true, but less likely
  1452. * for more optimized loops where the pattern is known to only start, and the match
  1453. * engine invoked, at certain characters.
  1454. * When invoked, this callback will specify the index at which a match operation is about
  1455. * to be attempted, giving the application the opportunity to terminate a long-running
  1456. * find operation.
  1457. *
  1458. * If the callback function returns FALSE, the find operation will be terminated early.
  1459. *
  1460. * Note: the callback function must not call other functions on this
  1461. * URegularExpression.
  1462. *
  1463. * @param context context pointer. The callback function will be invoked
  1464. * with the context specified at the time that
  1465. * uregex_setFindProgressCallback() is called.
  1466. * @param matchIndex the next index at which a match will be attempted for this
  1467. * find operation. If this callback interrupts the search, this is the
  1468. * index at which a find/findNext operation may be re-initiated.
  1469. * @return TRUE to continue the matching operation.
  1470. * FALSE to terminate the matching operation.
  1471. * @stable ICU 4.6
  1472. */
  1473. U_CDECL_BEGIN
  1474. typedef UBool U_CALLCONV URegexFindProgressCallback (
  1475. const void *context,
  1476. int64_t matchIndex);
  1477. U_CDECL_END
  1478. /**
  1479. * Set the find progress callback function for this URegularExpression.
  1480. * The function is called during find operations; see URegexFindProgressCallback.
  1481. * @param regexp The compiled regular expression.
  1482. * @param callback A pointer to the user-supplied callback function.
  1483. * @param context User context pointer. The value supplied at the
  1484. * time the callback function is set will be saved
  1485. * and passed to the callback each time that it is called.
  1486. * @param status A reference to a UErrorCode to receive any errors.
  1487. * @stable ICU 4.6
  1488. */
  1489. U_STABLE void U_EXPORT2
  1490. uregex_setFindProgressCallback(URegularExpression *regexp,
  1491. URegexFindProgressCallback *callback,
  1492. const void *context,
  1493. UErrorCode *status);
  1494. /**
  1495. * Get the find progress callback function and user context pointer for this URegularExpression.
  1496. *
  1497. * @param regexp The compiled regular expression.
  1498. * @param callback Out parameter, receives a pointer to the user-supplied
  1499. * callback function.
  1500. * @param context Out parameter, receives the user context pointer that
  1501. * was set when uregex_setFindProgressCallback() was called.
  1502. * @param status A reference to a UErrorCode to receive any errors.
  1503. * @stable ICU 4.6
  1504. */
  1505. U_STABLE void U_EXPORT2
  1506. uregex_getFindProgressCallback(const URegularExpression *regexp,
  1507. URegexFindProgressCallback **callback,
  1508. const void **context,
  1509. UErrorCode *status);
  1510. #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
  1511. #endif /* UREGEX_H */