regex.h 85 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2002-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: regex.h
  9. * encoding: US-ASCII
  10. * indentation:4
  11. *
  12. * created on: 2002oct22
  13. * created by: Andy Heninger
  14. *
  15. * ICU Regular Expressions, API for C++
  16. */
  17. #ifndef REGEX_H
  18. #define REGEX_H
  19. //#define REGEX_DEBUG
  20. /**
  21. * \file
  22. * \brief C++ API: Regular Expressions
  23. *
  24. * <h2>Regular Expression API</h2>
  25. *
  26. * <p>The ICU API for processing regular expressions consists of two classes,
  27. * <code>RegexPattern</code> and <code>RegexMatcher</code>.
  28. * <code>RegexPattern</code> objects represent a pre-processed, or compiled
  29. * regular expression. They are created from a regular expression pattern string,
  30. * and can be used to create <code>RegexMatcher</code> objects for the pattern.</p>
  31. *
  32. * <p>Class <code>RegexMatcher</code> bundles together a regular expression
  33. * pattern and a target string to which the search pattern will be applied.
  34. * <code>RegexMatcher</code> includes API for doing plain find or search
  35. * operations, for search and replace operations, and for obtaining detailed
  36. * information about bounds of a match. </p>
  37. *
  38. * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
  39. * expression pattern strings application code can be simplified and the explicit
  40. * need for <code>RegexPattern</code> objects can usually be eliminated.
  41. * </p>
  42. */
  43. #include "unicode/utypes.h"
  44. #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  45. #include "unicode/uobject.h"
  46. #include "unicode/unistr.h"
  47. #include "unicode/utext.h"
  48. #include "unicode/parseerr.h"
  49. #include "unicode/uregex.h"
  50. // Forward Declarations
  51. struct UHashtable;
  52. U_NAMESPACE_BEGIN
  53. struct Regex8BitSet;
  54. class RegexCImpl;
  55. class RegexMatcher;
  56. class RegexPattern;
  57. struct REStackFrame;
  58. class RuleBasedBreakIterator;
  59. class UnicodeSet;
  60. class UVector;
  61. class UVector32;
  62. class UVector64;
  63. /**
  64. * Class <code>RegexPattern</code> represents a compiled regular expression. It includes
  65. * factory methods for creating a RegexPattern object from the source (string) form
  66. * of a regular expression, methods for creating RegexMatchers that allow the pattern
  67. * to be applied to input text, and a few convenience methods for simple common
  68. * uses of regular expressions.
  69. *
  70. * <p>Class RegexPattern is not intended to be subclassed.</p>
  71. *
  72. * @stable ICU 2.4
  73. */
  74. class U_I18N_API RegexPattern U_FINAL : public UObject {
  75. public:
  76. /**
  77. * default constructor. Create a RegexPattern object that refers to no actual
  78. * pattern. Not normally needed; RegexPattern objects are usually
  79. * created using the factory method <code>compile()</code>.
  80. *
  81. * @stable ICU 2.4
  82. */
  83. RegexPattern();
  84. /**
  85. * Copy Constructor. Create a new RegexPattern object that is equivalent
  86. * to the source object.
  87. * @param source the pattern object to be copied.
  88. * @stable ICU 2.4
  89. */
  90. RegexPattern(const RegexPattern &source);
  91. /**
  92. * Destructor. Note that a RegexPattern object must persist so long as any
  93. * RegexMatcher objects that were created from the RegexPattern are active.
  94. * @stable ICU 2.4
  95. */
  96. virtual ~RegexPattern();
  97. /**
  98. * Comparison operator. Two RegexPattern objects are considered equal if they
  99. * were constructed from identical source patterns using the same match flag
  100. * settings.
  101. * @param that a RegexPattern object to compare with "this".
  102. * @return TRUE if the objects are equivalent.
  103. * @stable ICU 2.4
  104. */
  105. UBool operator==(const RegexPattern& that) const;
  106. /**
  107. * Inequality operator. Yields the logical negation of operator==(), so two
  108. * RegexPattern objects are different exactly when operator==() does not
  109. * consider them equal.
  110. * @param that a RegexPattern object to compare with "this".
  111. * @return TRUE if the objects are different.
  112. * @stable ICU 2.4
  113. */
  114. inline UBool operator!=(const RegexPattern& that) const { return !this->operator==(that); }
  115. /**
  116. * Assignment operator. After assignment, this RegexPattern will behave identically
  117. * to the source object.
  118. * @stable ICU 2.4
  119. */
  120. RegexPattern &operator =(const RegexPattern &source);
  121. /**
  122. * Create an exact copy of this RegexPattern object. Since RegexPattern is not
  123. * intended to be subclassed, <code>clone()</code> and the copy construction are
  124. * equivalent operations.
  125. * @return the copy of this RegexPattern
  126. * @stable ICU 2.4
  127. */
  128. virtual RegexPattern *clone() const;
  129. /**
  130. * Compiles the regular expression in string form into a RegexPattern
  131. * object. These compile methods, rather than the constructors, are the usual
  132. * way that RegexPattern objects are created.
  133. *
  134. * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
  135. * objects created from the pattern are active. RegexMatchers keep a pointer
  136. * back to their pattern, so premature deletion of the pattern is a
  137. * catastrophic error.</p>
  138. *
  139. * <p>All pattern match mode flags are set to their default values.</p>
  140. *
  141. * <p>Note that it is often more convenient to construct a RegexMatcher directly
  142. * from a pattern string rather than separately compiling the pattern and
  143. * then creating a RegexMatcher object from the pattern.</p>
  144. *
  145. * @param regex The regular expression to be compiled.
  146. * @param pe Receives the position (line and column numbers) of any error
  147. * within the regular expression.
  148. * @param status A reference to a UErrorCode to receive any errors.
  149. * @return A regexPattern object for the compiled pattern.
  150. *
  151. * @stable ICU 2.4
  152. */
  153. static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
  154. UParseError &pe,
  155. UErrorCode &status);
  156. /**
  157. * Compiles the regular expression in string form into a RegexPattern
  158. * object. These compile methods, rather than the constructors, are the usual
  159. * way that RegexPattern objects are created.
  160. *
  161. * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
  162. * objects created from the pattern are active. RegexMatchers keep a pointer
  163. * back to their pattern, so premature deletion of the pattern is a
  164. * catastrophic error.</p>
  165. *
  166. * <p>All pattern match mode flags are set to their default values.</p>
  167. *
  168. * <p>Note that it is often more convenient to construct a RegexMatcher directly
  169. * from a pattern string rather than separately compiling the pattern and
  170. * then creating a RegexMatcher object from the pattern.</p>
  171. *
  172. * @param regex The regular expression to be compiled. Note, the text referred
  173. * to by this UText must not be deleted during the lifetime of the
  174. * RegexPattern object or any RegexMatcher object created from it.
  175. * @param pe Receives the position (line and column numbers) of any error
  176. * within the regular expression.
  177. * @param status A reference to a UErrorCode to receive any errors.
  178. * @return A regexPattern object for the compiled pattern.
  179. *
  180. * @stable ICU 4.6
  181. */
  182. static RegexPattern * U_EXPORT2 compile( UText *regex,
  183. UParseError &pe,
  184. UErrorCode &status);
  185. /**
  186. * Compiles the regular expression in string form into a RegexPattern
  187. * object using the specified match mode flags. These compile methods,
  188. * rather than the constructors, are the usual way that RegexPattern objects
  189. * are created.
  190. *
  191. * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
  192. * objects created from the pattern are active. RegexMatchers keep a pointer
  193. * back to their pattern, so premature deletion of the pattern is a
  194. * catastrophic error.</p>
  195. *
  196. * <p>Note that it is often more convenient to construct a RegexMatcher directly
  197. * from a pattern string rather than separately compiling the pattern and
  198. * then creating a RegexMatcher object from the pattern.</p>
  199. *
  200. * @param regex The regular expression to be compiled.
  201. * @param flags The match mode flags to be used.
  202. * @param pe Receives the position (line and column numbers) of any error
  203. * within the regular expression.
  204. * @param status A reference to a UErrorCode to receive any errors.
  205. * @return A regexPattern object for the compiled pattern.
  206. *
  207. * @stable ICU 2.4
  208. */
  209. static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
  210. uint32_t flags,
  211. UParseError &pe,
  212. UErrorCode &status);
  213. /**
  214. * Compiles the regular expression in string form into a RegexPattern
  215. * object using the specified match mode flags. These compile methods,
  216. * rather than the constructors, are the usual way that RegexPattern objects
  217. * are created.
  218. *
  219. * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
  220. * objects created from the pattern are active. RegexMatchers keep a pointer
  221. * back to their pattern, so premature deletion of the pattern is a
  222. * catastrophic error.</p>
  223. *
  224. * <p>Note that it is often more convenient to construct a RegexMatcher directly
  225. * from a pattern string rather than separately compiling the pattern and
  226. * then creating a RegexMatcher object from the pattern.</p>
  227. *
  228. * @param regex The regular expression to be compiled. Note, the text referred
  229. * to by this UText must not be deleted during the lifetime of the
  230. * RegexPattern object or any RegexMatcher object created from it.
  231. * @param flags The match mode flags to be used.
  232. * @param pe Receives the position (line and column numbers) of any error
  233. * within the regular expression.
  234. * @param status A reference to a UErrorCode to receive any errors.
  235. * @return A regexPattern object for the compiled pattern.
  236. *
  237. * @stable ICU 4.6
  238. */
  239. static RegexPattern * U_EXPORT2 compile( UText *regex,
  240. uint32_t flags,
  241. UParseError &pe,
  242. UErrorCode &status);
  243. /**
  244. * Compiles the regular expression in string form into a RegexPattern
  245. * object using the specified match mode flags. These compile methods,
  246. * rather than the constructors, are the usual way that RegexPattern objects
  247. * are created.
  248. *
  249. * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
  250. * objects created from the pattern are active. RegexMatchers keep a pointer
  251. * back to their pattern, so premature deletion of the pattern is a
  252. * catastrophic error.</p>
  253. *
  254. * <p>Note that it is often more convenient to construct a RegexMatcher directly
  255. * from a pattern string rather than separately compiling the pattern and
  256. * then creating a RegexMatcher object from the pattern.</p>
  257. *
  258. * @param regex The regular expression to be compiled.
  259. * @param flags The match mode flags to be used.
  260. * @param status A reference to a UErrorCode to receive any errors.
  261. * @return A regexPattern object for the compiled pattern.
  262. *
  263. * @stable ICU 2.6
  264. */
  265. static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
  266. uint32_t flags,
  267. UErrorCode &status);
  268. /**
  269. * Compiles the regular expression in string form into a RegexPattern
  270. * object using the specified match mode flags. These compile methods,
  271. * rather than the constructors, are the usual way that RegexPattern objects
  272. * are created.
  273. *
  274. * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
  275. * objects created from the pattern are active. RegexMatchers keep a pointer
  276. * back to their pattern, so premature deletion of the pattern is a
  277. * catastrophic error.</p>
  278. *
  279. * <p>Note that it is often more convenient to construct a RegexMatcher directly
  280. * from a pattern string rather than separately compiling the pattern and
  281. * then creating a RegexMatcher object from the pattern.</p>
  282. *
  283. * @param regex The regular expression to be compiled. Note, the text referred
  284. * to by this UText must not be deleted during the lifetime of the
  285. * RegexPattern object or any RegexMatcher object created from it.
  286. * @param flags The match mode flags to be used.
  287. * @param status A reference to a UErrorCode to receive any errors.
  288. * @return A regexPattern object for the compiled pattern.
  289. *
  290. * @stable ICU 4.6
  291. */
  292. static RegexPattern * U_EXPORT2 compile( UText *regex,
  293. uint32_t flags,
  294. UErrorCode &status);
  295. /**
  296. * Get the match mode flags that were used when compiling this pattern.
  297. * @return the match mode flags
  298. * @stable ICU 2.4
  299. */
  300. virtual uint32_t flags() const;
  301. /**
  302. * Creates a RegexMatcher that will match the given input against this pattern. The
  303. * RegexMatcher can then be used to perform match, find or replace operations
  304. * on the input. Note that a RegexPattern object must not be deleted while
  305. * RegexMatchers created from it still exist and might possibly be used again.
  306. * <p>
  307. * The matcher will retain a reference to the supplied input string, and all regexp
  308. * pattern matching operations happen directly on this original string. It is
  309. * critical that the string not be altered or deleted before use by the regular
  310. * expression operations is complete.
  311. *
  312. * @param input The input string to which the regular expression will be applied.
  313. * @param status A reference to a UErrorCode to receive any errors.
  314. * @return A RegexMatcher object for this pattern and input.
  315. *
  316. * @stable ICU 2.4
  317. */
  318. virtual RegexMatcher *matcher(const UnicodeString &input,
  319. UErrorCode &status) const;
  320. private:
  321. /**
  322. * Cause a compilation error if an application accidentally attempts to
  323. * create a matcher with a (UChar *) string as input rather than
  324. * a UnicodeString. Avoids a dangling reference to a temporary string.
  325. * <p>
  326. * To efficiently work with UChar *strings, wrap the data in a UnicodeString
  327. * using one of the aliasing constructors, such as
  328. * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
  329. * or in a UText, using
  330. * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
  331. *
  332. */
  333. RegexMatcher *matcher(const UChar *input,
  334. UErrorCode &status) const;
  335. public:
  336. /**
  337. * Creates a RegexMatcher that will match against this pattern. The
  338. * RegexMatcher can be used to perform match, find or replace operations.
  339. * Note that a RegexPattern object must not be deleted while
  340. * RegexMatchers created from it still exist and might possibly be used again.
  341. *
  342. * @param status A reference to a UErrorCode to receive any errors.
  343. * @return A RegexMatcher object for this pattern.
  344. *
  345. * @stable ICU 2.6
  346. */
  347. virtual RegexMatcher *matcher(UErrorCode &status) const;
  348. /**
  349. * Test whether a string matches a regular expression. This convenience function
  350. * both compiles the regular expression and applies it in a single operation.
  351. * Note that if the same pattern needs to be applied repeatedly, this method will be
  352. * less efficient than creating and reusing a RegexMatcher object.
  353. *
  354. * @param regex The regular expression
  355. * @param input The string data to be matched
  356. * @param pe Receives the position of any syntax errors within the regular expression
  357. * @param status A reference to a UErrorCode to receive any errors.
  358. * @return True if the regular expression exactly matches the full input string.
  359. *
  360. * @stable ICU 2.4
  361. */
  362. static UBool U_EXPORT2 matches(const UnicodeString &regex,
  363. const UnicodeString &input,
  364. UParseError &pe,
  365. UErrorCode &status);
  366. /**
  367. * Test whether a string matches a regular expression. This convenience function
  368. * both compiles the regular expression and applies it in a single operation.
  369. * Note that if the same pattern needs to be applied repeatedly, this method will be
  370. * less efficient than creating and reusing a RegexMatcher object.
  371. *
  372. * @param regex The regular expression
  373. * @param input The string data to be matched
  374. * @param pe Receives the position of any syntax errors within the regular expression
  375. * @param status A reference to a UErrorCode to receive any errors.
  376. * @return True if the regular expression exactly matches the full input string.
  377. *
  378. * @stable ICU 4.6
  379. */
  380. static UBool U_EXPORT2 matches(UText *regex,
  381. UText *input,
  382. UParseError &pe,
  383. UErrorCode &status);
  384. /**
  385. * Returns the regular expression from which this pattern was compiled. This method will work
  386. * even if the pattern was compiled from a UText.
  387. *
  388. * Note: If the pattern was originally compiled from a UText, and that UText was modified,
  389. * the returned string may no longer reflect the RegexPattern object.
  390. * @stable ICU 2.4
  391. */
  392. virtual UnicodeString pattern() const;
  393. /**
  394. * Returns the regular expression from which this pattern was compiled. This method will work
  395. * even if the pattern was compiled from a UnicodeString.
  396. *
  397. * Note: This is the original input, not a clone. If the pattern was originally compiled from a
  398. * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
  399. * object.
  400. *
  401. * @stable ICU 4.6
  402. */
  403. virtual UText *patternText(UErrorCode &status) const;
  404. /**
  405. * Get the group number corresponding to a named capture group.
  406. * The returned number can be used with any function that accesses
  407. * capture groups by number.
  408. *
  409. * The function returns an error status if the specified name does not
  410. * appear in the pattern.
  411. *
  412. * @param groupName The capture group name.
  413. * @param status A UErrorCode to receive any errors.
  414. *
  415. * @stable ICU 55
  416. */
  417. virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
  418. /**
  419. * Get the group number corresponding to a named capture group.
  420. * The returned number can be used with any function that accesses
  421. * capture groups by number.
  422. *
  423. * The function returns an error status if the specified name does not
  424. * appear in the pattern.
  425. *
  426. * @param groupName The capture group name,
  427. * platform invariant characters only.
  428. * @param nameLength The length of the name, or -1 if the name is
  429. * nul-terminated.
  430. * @param status A UErrorCode to receive any errors.
  431. *
  432. * @stable ICU 55
  433. */
  434. virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
  435. /**
  436. * Split a string into fields. Somewhat like split() from Perl or Java.
  437. * Pattern matches identify delimiters that separate the input
  438. * into fields. The input data between the delimiters becomes the
  439. * fields themselves.
  440. *
  441. * If the delimiter pattern includes capture groups, the captured text will
  442. * also appear in the destination array of output strings, interspersed
  443. * with the fields. This is similar to Perl, but differs from Java,
  444. * which ignores the presence of capture groups in the pattern.
  445. *
  446. * Trailing empty fields will always be returned, assuming sufficient
  447. * destination capacity. This differs from the default behavior for Java
  448. * and Perl where trailing empty fields are not returned.
  449. *
  450. * The number of strings produced by the split operation is returned.
  451. * This count includes the strings from capture groups in the delimiter pattern.
  452. * This behavior differs from Java, which ignores capture groups.
  453. *
  454. * For the best performance on split() operations,
  455. * <code>RegexMatcher::split</code> is preferable to this function.
  456. *
  457. * @param input The string to be split into fields. The field delimiters
  458. * match the pattern (in the "this" object)
  459. * @param dest An array of UnicodeStrings to receive the results of the split.
  460. * This is an array of actual UnicodeString objects, not an
  461. * array of pointers to strings. Local (stack based) arrays can
  462. * work well here.
  463. * @param destCapacity The number of elements in the destination array.
  464. * If the number of fields found is less than destCapacity, the
  465. * extra strings in the destination array are not altered.
  466. * If the number of destination strings is less than the number
  467. * of fields, the trailing part of the input string, including any
  468. * field delimiters, is placed in the last destination string.
  469. * @param status A reference to a UErrorCode to receive any errors.
  470. * @return The number of fields into which the input string was split.
  471. * @stable ICU 2.4
  472. */
  473. virtual int32_t split(const UnicodeString &input,
  474. UnicodeString dest[],
  475. int32_t destCapacity,
  476. UErrorCode &status) const;
  477. /**
  478. * Split a string into fields. Somewhat like split() from Perl or Java.
  479. * Pattern matches identify delimiters that separate the input
  480. * into fields. The input data between the delimiters becomes the
  481. * fields themselves.
  482. *
  483. * If the delimiter pattern includes capture groups, the captured text will
  484. * also appear in the destination array of output strings, interspersed
  485. * with the fields. This is similar to Perl, but differs from Java,
  486. * which ignores the presence of capture groups in the pattern.
  487. *
  488. * Trailing empty fields will always be returned, assuming sufficient
  489. * destination capacity. This differs from the default behavior for Java
  490. * and Perl where trailing empty fields are not returned.
  491. *
  492. * The number of strings produced by the split operation is returned.
  493. * This count includes the strings from capture groups in the delimiter pattern.
  494. * This behavior differs from Java, which ignores capture groups.
  495. *
  496. * For the best performance on split() operations,
  497. * <code>RegexMatcher::split</code> is preferable to this function.
  498. *
  499. * @param input The string to be split into fields. The field delimiters
  500. * match the pattern (in the "this" object)
  501. * @param dest An array of mutable UText structs to receive the results of the split.
  502. * If a field is NULL, a new UText is allocated to contain the results for
  503. * that field. This new UText is not guaranteed to be mutable.
  504. * @param destCapacity The number of elements in the destination array.
  505. * If the number of fields found is less than destCapacity, the
  506. * extra strings in the destination array are not altered.
  507. * If the number of destination strings is less than the number
  508. * of fields, the trailing part of the input string, including any
  509. * field delimiters, is placed in the last destination string.
  510. * @param status A reference to a UErrorCode to receive any errors.
  511. * @return The number of destination strings used.
  512. *
  513. * @stable ICU 4.6
  514. */
  515. virtual int32_t split(UText *input,
  516. UText *dest[],
  517. int32_t destCapacity,
  518. UErrorCode &status) const;
  519. /**
  520. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  521. *
  522. * @stable ICU 2.4
  523. */
  524. virtual UClassID getDynamicClassID() const;
  525. /**
  526. * ICU "poor man's RTTI", returns a UClassID for this class.
  527. *
  528. * @stable ICU 2.4
  529. */
  530. static UClassID U_EXPORT2 getStaticClassID();
  531. private:
  532. //
  533. // Implementation Data
  534. //
  535. UText *fPattern; // The original pattern string.
  536. UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
  537. uint32_t fFlags; // The flags used when compiling the pattern.
  538. //
  539. UVector64 *fCompiledPat; // The compiled pattern p-code.
  540. UnicodeString fLiteralText; // Any literal string data from the pattern,
  541. // after un-escaping, for use during the match.
  542. UVector *fSets; // Any UnicodeSets referenced from the pattern.
  543. Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
  544. UErrorCode fDeferredStatus; // status if some prior error has left this
  545. // RegexPattern in an unusable state.
  546. int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
  547. // >= this value. For some patterns, this calculated
  548. // value may be less than the true shortest
  549. // possible match.
  550. int32_t fFrameSize; // Size of a state stack frame in the
  551. // execution engine.
  552. int32_t fDataSize; // The size of the data needed by the pattern that
  553. // does not go on the state stack, but has just
  554. // a single copy per matcher.
  555. UVector32 *fGroupMap; // Map from capture group number to position of
  556. // the group's variables in the matcher stack frame.
  557. UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
  558. // regex character classes, e.g. Word.
  559. Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
  560. // sets for predefined regex classes.
  561. int32_t fStartType; // Info on how a match must start.
  562. int32_t fInitialStringIdx; //
  563. int32_t fInitialStringLen;
  564. UnicodeSet *fInitialChars;
  565. UChar32 fInitialChar;
  566. Regex8BitSet *fInitialChars8;
  567. UBool fNeedsAltInput;
  568. UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
  569. friend class RegexCompile;
  570. friend class RegexMatcher;
  571. friend class RegexCImpl;
  572. //
  573. // Implementation Methods
  574. //
  575. void init(); // Common initialization, for use by constructors.
  576. void zap(); // Common cleanup
  577. void dumpOp(int32_t index) const;
  578. public:
  579. #ifndef U_HIDE_INTERNAL_API
  580. /**
  581. * Dump a compiled pattern. Internal debug function.
  582. * @internal
  583. */
  584. void dumpPattern() const;
  585. #endif /* U_HIDE_INTERNAL_API */
  586. };
  587. /**
  588. * class RegexMatcher bundles together a regular expression pattern and
  589. * input text to which the expression can be applied. It includes methods
  590. * for testing for matches, and for find and replace operations.
  591. *
  592. * <p>Class RegexMatcher is not intended to be subclassed.</p>
  593. *
  594. * @stable ICU 2.4
  595. */
  596. class U_I18N_API RegexMatcher U_FINAL : public UObject {
  597. public:
  598. /**
  599. * Construct a RegexMatcher for a regular expression.
  600. * This is a convenience method that avoids the need to explicitly create
  601. * a RegexPattern object. Note that if several RegexMatchers need to be
  602. * created for the same expression, it will be more efficient to
  603. * separately create and cache a RegexPattern object, and use
  604. * its matcher() method to create the RegexMatcher objects.
  605. *
  606. * @param regexp The Regular Expression to be compiled.
  607. * @param flags Regular expression options, such as case insensitive matching.
  608. * @see UREGEX_CASE_INSENSITIVE
  609. * @param status Any errors are reported by setting this UErrorCode variable.
  610. * @stable ICU 2.6
  611. */
  612. RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
  613. /**
  614. * Construct a RegexMatcher for a regular expression.
  615. * This is a convenience method that avoids the need to explicitly create
  616. * a RegexPattern object. Note that if several RegexMatchers need to be
  617. * created for the same expression, it will be more efficient to
  618. * separately create and cache a RegexPattern object, and use
  619. * its matcher() method to create the RegexMatcher objects.
  620. *
  621. * @param regexp The regular expression to be compiled.
  622. * @param flags Regular expression options, such as case insensitive matching.
  623. * @see UREGEX_CASE_INSENSITIVE
  624. * @param status Any errors are reported by setting this UErrorCode variable.
  625. *
  626. * @stable ICU 4.6
  627. */
  628. RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
  629. /**
  630. * Construct a RegexMatcher for a regular expression.
  631. * This is a convenience method that avoids the need to explicitly create
  632. * a RegexPattern object. Note that if several RegexMatchers need to be
  633. * created for the same expression, it will be more efficient to
  634. * separately create and cache a RegexPattern object, and use
  635. * its matcher() method to create the RegexMatcher objects.
  636. * <p>
  637. * The matcher will retain a reference to the supplied input string, and all regexp
  638. * pattern matching operations happen directly on the original string. It is
  639. * critical that the string not be altered or deleted before use by the regular
  640. * expression operations is complete.
  641. *
  642. * @param regexp The Regular Expression to be compiled.
  643. * @param input The string to match. The matcher retains a reference to the
  644. * caller's string; no copy is made.
  645. * @param flags Regular expression options, such as case insensitive matching.
  646. * @see UREGEX_CASE_INSENSITIVE
  647. * @param status Any errors are reported by setting this UErrorCode variable.
  648. * @stable ICU 2.6
  649. */
  650. RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
  651. uint32_t flags, UErrorCode &status);
  652. /**
  653. * Construct a RegexMatcher for a regular expression.
  654. * This is a convenience method that avoids the need to explicitly create
  655. * a RegexPattern object. Note that if several RegexMatchers need to be
  656. * created for the same expression, it will be more efficient to
  657. * separately create and cache a RegexPattern object, and use
  658. * its matcher() method to create the RegexMatcher objects.
  659. * <p>
  660. * The matcher will make a shallow clone of the supplied input text, and all regexp
  661. * pattern matching operations happen on this clone. While read-only operations on
  662. * the supplied text are permitted, it is critical that the underlying string not be
  663. * altered or deleted before use by the regular expression operations is complete.
  664. *
  665. * @param regexp The Regular Expression to be compiled.
  666. * @param input The string to match. The matcher retains a shallow clone of the text.
  667. * @param flags Regular expression options, such as case insensitive matching.
  668. * @see UREGEX_CASE_INSENSITIVE
  669. * @param status Any errors are reported by setting this UErrorCode variable.
  670. *
  671. * @stable ICU 4.6
  672. */
  673. RegexMatcher(UText *regexp, UText *input,
  674. uint32_t flags, UErrorCode &status);
  675. private:
  676. /**
  677. * Cause a compilation error if an application accidentally attempts to
  678. * create a matcher with a (UChar *) string as input rather than
  679. * a UnicodeString. Avoids a dangling reference to a temporary string.
  680. * <p>
  681. * To efficiently work with UChar *strings, wrap the data in a UnicodeString
  682. * using one of the aliasing constructors, such as
  683. * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
  684. * or in a UText, using
  685. * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
  686. *
  687. */
  688. RegexMatcher(const UnicodeString &regexp, const UChar *input,
  689. uint32_t flags, UErrorCode &status);
  690. public:
  691. /**
  692. * Destructor.
  693. *
  694. * @stable ICU 2.4
  695. */
  696. virtual ~RegexMatcher();
  697. /**
  698. * Attempts to match the entire input region against the pattern.
  699. * @param status A reference to a UErrorCode to receive any errors.
  700. * @return TRUE if there is a match
  701. * @stable ICU 2.4
  702. */
  703. virtual UBool matches(UErrorCode &status);
  704. /**
  705. * Resets the matcher, then attempts to match the input beginning
  706. * at the specified startIndex, and extending to the end of the input.
  707. * The input region is reset to include the entire input string.
  708. * A successful match must extend to the end of the input.
  709. * @param startIndex The input string (native) index at which to begin matching.
  710. * @param status A reference to a UErrorCode to receive any errors.
  711. * @return TRUE if there is a match
  712. * @stable ICU 2.8
  713. */
  714. virtual UBool matches(int64_t startIndex, UErrorCode &status);
  715. /**
  716. * Attempts to match the input string, starting from the beginning of the region,
  717. * against the pattern. Like the matches() method, this function
  718. * always starts at the beginning of the input region;
  719. * unlike that function, it does not require that the entire region be matched.
  720. *
  721. * <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
  722. * <code>end()</code>, and <code>group()</code> functions.</p>
  723. *
  724. * @param status A reference to a UErrorCode to receive any errors.
  725. * @return TRUE if there is a match at the start of the input string.
  726. * @stable ICU 2.4
  727. */
  728. virtual UBool lookingAt(UErrorCode &status);
  729. /**
  730. * Attempts to match the input string, starting from the specified index, against the pattern.
  731. * The match may be of any length, and is not required to extend to the end
  732. * of the input string. Contrast with matches().
  733. *
  734. * <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
  735. * <code>end()</code>, and <code>group()</code> functions.</p>
  736. *
  737. * @param startIndex The input string (native) index at which to begin matching.
  738. * @param status A reference to a UErrorCode to receive any errors.
  739. * @return TRUE if there is a match.
  740. * @stable ICU 2.8
  741. */
  742. virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
  743. /**
  744. * Find the next pattern match in the input string.
  745. * The find begins searching the input at the location following the end of
  746. * the previous match, or at the start of the string if there is no previous match.
  747. * If a match is found, <code>start(), end()</code> and <code>group()</code>
  748. * will provide more information regarding the match.
  749. * <p>Note that if the input string is changed by the application,
  750. * use find(startPos, status) instead of find(), because the saved starting
  751. * position may not be valid with the altered input string.</p>
  752. * @return TRUE if a match is found.
  753. * @stable ICU 2.4
  754. */
  755. virtual UBool find();
  756. /**
  757. * Find the next pattern match in the input string.
  758. * The find begins searching the input at the location following the end of
  759. * the previous match, or at the start of the string if there is no previous match.
  760. * If a match is found, <code>start(), end()</code> and <code>group()</code>
  761. * will provide more information regarding the match.
  762. * <p>Note that if the input string is changed by the application,
  763. * use find(startPos, status) instead of find(), because the saved starting
  764. * position may not be valid with the altered input string.</p>
  765. * @param status A reference to a UErrorCode to receive any errors.
  766. * @return TRUE if a match is found.
  767. * @stable ICU 55
  768. */
  769. virtual UBool find(UErrorCode &status);
  770. /**
  771. * Resets this RegexMatcher and then attempts to find the next substring of the
  772. * input string that matches the pattern, starting at the specified index.
  773. *
  774. * @param start The (native) index in the input string to begin the search.
  775. * @param status A reference to a UErrorCode to receive any errors.
  776. * @return TRUE if a match is found.
  777. * @stable ICU 2.4
  778. */
  779. virtual UBool find(int64_t start, UErrorCode &status);
  780. /**
  781. * Returns a string containing the text matched by the previous match.
  782. * If the pattern can match an empty string, an empty string may be returned.
  783. * @param status A reference to a UErrorCode to receive any errors.
  784. * Possible errors are U_REGEX_INVALID_STATE if no match
  785. * has been attempted or the last match failed.
  786. * @return a string containing the matched input text.
  787. * @stable ICU 2.4
  788. */
  789. virtual UnicodeString group(UErrorCode &status) const;
  790. /**
  791. * Returns a string containing the text captured by the given group
  792. * during the previous match operation. Group(0) is the entire match.
  793. *
  794. * A zero length string is returned both for capture groups that did not
  795. * participate in the match and for actual zero length matches.
  796. * To distinguish between these two cases use the function start(),
  797. * which returns -1 for non-participating groups.
  798. *
  799. * @param groupNum the capture group number
  800. * @param status A reference to a UErrorCode to receive any errors.
  801. * Possible errors are U_REGEX_INVALID_STATE if no match
  802. * has been attempted or the last match failed and
  803. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
  804. * @return the captured text
  805. * @stable ICU 2.4
  806. */
  807. virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
  808. /**
  809. * Returns the number of capturing groups in this matcher's pattern.
  810. * @return the number of capture groups
  811. * @stable ICU 2.4
  812. */
  813. virtual int32_t groupCount() const;
  814. /**
  815. * Returns a shallow clone of the entire live input string with the UText current native index
  816. * set to the beginning of the requested group.
  817. *
  818. * @param dest The UText into which the input should be cloned, or NULL to create a new UText
  819. * @param group_len A reference to receive the length of the desired capture group
  820. * @param status A reference to a UErrorCode to receive any errors.
  821. * Possible errors are U_REGEX_INVALID_STATE if no match
  822. * has been attempted or the last match failed and
  823. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
  824. * @return dest if non-NULL, a shallow copy of the input text otherwise
  825. *
  826. * @stable ICU 4.6
  827. */
  828. virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
  829. /**
  830. * Returns a shallow clone of the entire live input string with the UText current native index
  831. * set to the beginning of the requested group.
  832. *
  833. * A group length of zero is returned both for capture groups that did not
  834. * participate in the match and for actual zero length matches.
  835. * To distinguish between these two cases use the function start(),
  836. * which returns -1 for non-participating groups.
  837. *
  838. * @param groupNum The capture group number.
  839. * @param dest The UText into which the input should be cloned, or NULL to create a new UText.
  840. * @param group_len A reference to receive the length of the desired capture group
  841. * @param status A reference to a UErrorCode to receive any errors.
  842. * Possible errors are U_REGEX_INVALID_STATE if no match
  843. * has been attempted or the last match failed and
  844. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
  845. * @return dest if non-NULL, a shallow copy of the input text otherwise
  846. *
  847. * @stable ICU 4.6
  848. */
  849. virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
  850. /**
  851. * Returns the index in the input string of the start of the text matched
  852. * during the previous match operation.
  853. * @param status a reference to a UErrorCode to receive any errors.
  854. * @return The (native) position in the input string of the start of the last match.
  855. * @stable ICU 2.4
  856. */
  857. virtual int32_t start(UErrorCode &status) const;
  858. /**
  859. * Returns the index in the input string of the start of the text matched
  860. * during the previous match operation.
  861. * @param status a reference to a UErrorCode to receive any errors.
  862. * @return The (native) position in the input string of the start of the last match.
  863. * @stable ICU 4.6
  864. */
  865. virtual int64_t start64(UErrorCode &status) const;
  866. /**
  867. * Returns the index in the input string of the start of the text matched by the
  868. * specified capture group during the previous match operation. Return -1 if
  869. * the capture group exists in the pattern, but was not part of the last match.
  870. *
  871. * @param group the capture group number
  872. * @param status A reference to a UErrorCode to receive any errors. Possible
  873. * errors are U_REGEX_INVALID_STATE if no match has been
  874. * attempted or the last match failed, and
  875. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
  876. * @return the (native) start position of substring matched by the specified group.
  877. * @stable ICU 2.4
  878. */
  879. virtual int32_t start(int32_t group, UErrorCode &status) const;
  880. /**
  881. * Returns the index in the input string of the start of the text matched by the
  882. * specified capture group during the previous match operation. Return -1 if
  883. * the capture group exists in the pattern, but was not part of the last match.
  884. *
  885. * @param group the capture group number.
  886. * @param status A reference to a UErrorCode to receive any errors. Possible
  887. * errors are U_REGEX_INVALID_STATE if no match has been
  888. * attempted or the last match failed, and
  889. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
  890. * @return the (native) start position of substring matched by the specified group.
  891. * @stable ICU 4.6
  892. */
  893. virtual int64_t start64(int32_t group, UErrorCode &status) const;
  894. /**
  895. * Returns the index in the input string of the first character following the
  896. * text matched during the previous match operation.
  897. *
  898. * @param status A reference to a UErrorCode to receive any errors. Possible
  899. * errors are U_REGEX_INVALID_STATE if no match has been
  900. * attempted or the last match failed.
  901. * @return the index of the last character matched, plus one.
  902. * The index value returned is a native index, corresponding to
  903. * code units for the underlying encoding type, for example,
  904. * a byte index for UTF-8.
  905. * @stable ICU 2.4
  906. */
  907. virtual int32_t end(UErrorCode &status) const;
  908. /**
  909. * Returns the index in the input string of the first character following the
  910. * text matched during the previous match operation.
  911. *
  912. * @param status A reference to a UErrorCode to receive any errors. Possible
  913. * errors are U_REGEX_INVALID_STATE if no match has been
  914. * attempted or the last match failed.
  915. * @return the index of the last character matched, plus one.
  916. * The index value returned is a native index, corresponding to
  917. * code units for the underlying encoding type, for example,
  918. * a byte index for UTF-8.
  919. * @stable ICU 4.6
  920. */
  921. virtual int64_t end64(UErrorCode &status) const;
  922. /**
  923. * Returns the index in the input string of the character following the
  924. * text matched by the specified capture group during the previous match operation.
  925. *
  926. * @param group the capture group number
  927. * @param status A reference to a UErrorCode to receive any errors. Possible
  928. * errors are U_REGEX_INVALID_STATE if no match has been
  929. * attempted or the last match failed and
  930. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
  931. * @return the index of the first character following the text
  932. * captured by the specified group during the previous match operation.
  933. * Return -1 if the capture group exists in the pattern but was not part of the match.
  934. * The index value returned is a native index, corresponding to
  935. * code units for the underlying encoding type, for example,
  936. * a byte index for UTF-8.
  937. * @stable ICU 2.4
  938. */
  939. virtual int32_t end(int32_t group, UErrorCode &status) const;
  940. /**
  941. * Returns the index in the input string of the character following the
  942. * text matched by the specified capture group during the previous match operation.
  943. *
  944. * @param group the capture group number
  945. * @param status A reference to a UErrorCode to receive any errors. Possible
  946. * errors are U_REGEX_INVALID_STATE if no match has been
  947. * attempted or the last match failed and
  948. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
  949. * @return the index of the first character following the text
  950. * captured by the specified group during the previous match operation.
  951. * Return -1 if the capture group exists in the pattern but was not part of the match.
  952. * The index value returned is a native index, corresponding to
  953. * code units for the underlying encoding type, for example,
  954. * a byte index for UTF-8.
  955. * @stable ICU 4.6
  956. */
  957. virtual int64_t end64(int32_t group, UErrorCode &status) const;
  958. /**
  959. * Resets this matcher. The effect is to remove any memory of previous matches,
  960. * and to cause subsequent find() operations to begin at the beginning of
  961. * the input string.
  962. *
  963. * @return this RegexMatcher.
  964. * @stable ICU 2.4
  965. */
  966. virtual RegexMatcher &reset();
  967. /**
  968. * Resets this matcher, and set the current input position.
  969. * The effect is to remove any memory of previous matches,
  970. * and to cause subsequent find() operations to begin at
  971. * the specified (native) position in the input string.
  972. * <p>
  973. * The matcher's region is reset to its default, which is the entire
  974. * input string.
  975. * <p>
  976. * An alternative to this function is to set a match region
  977. * beginning at the desired index.
  978. *
  979. * @return this RegexMatcher.
  980. * @stable ICU 2.8
  981. */
  982. virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
  983. /**
  984. * Resets this matcher with a new input string. This allows instances of RegexMatcher
  985. * to be reused, which is more efficient than creating a new RegexMatcher for
  986. * each input string to be processed.
  987. * @param input The new string on which subsequent pattern matches will operate.
  988. * The matcher retains a reference to the caller's string, and operates
  989. * directly on that. Ownership of the string remains with the caller.
  990. * Because no copy of the string is made, it is essential that the
  991. * caller not delete the string until after regexp operations on it
  992. * are done.
  993. * Note that while a reset on the matcher with an input string that is then
  994. * modified across/during matcher operations may be supported currently for UnicodeString,
  995. * this was not originally intended behavior, and support for this is not guaranteed
  996. * in upcoming versions of ICU.
  997. * @return this RegexMatcher.
  998. * @stable ICU 2.4
  999. */
  1000. virtual RegexMatcher &reset(const UnicodeString &input);
  1001. /**
  1002. * Resets this matcher with a new input string. This allows instances of RegexMatcher
  1003. * to be reused, which is more efficient than creating a new RegexMatcher for
  1004. * each input string to be processed.
  1005. * @param input The new string on which subsequent pattern matches will operate.
  1006. * The matcher makes a shallow clone of the given text; ownership of the
  1007. * original string remains with the caller. Because no deep copy of the
  1008. * text is made, it is essential that the caller not modify the string
  1009. * until after regexp operations on it are done.
  1010. * @return this RegexMatcher.
  1011. *
  1012. * @stable ICU 4.6
  1013. */
  1014. virtual RegexMatcher &reset(UText *input);
  1015. /**
  1016. * Set the subject text string upon which the regular expression is looking for matches
  1017. * without changing any other aspect of the matching state.
  1018. * The new and previous text strings must have the same content.
  1019. *
  1020. * This function is intended for use in environments where ICU is operating on
  1021. * strings that may move around in memory. It provides a mechanism for notifying
  1022. * ICU that the string has been relocated, and providing a new UText to access the
  1023. * string in its new position.
  1024. *
  1025. * Note that the regular expression implementation never copies the underlying text
  1026. * of a string being matched, but always operates directly on the original text
  1027. * provided by the user. Refreshing simply drops the references to the old text
  1028. * and replaces them with references to the new.
  1029. *
  1030. * Caution: this function is normally used only by very specialized,
  1031. * system-level code. One example use case is with garbage collection that moves
  1032. * the text in memory.
  1033. *
  1034. * @param input The new (moved) text string.
  1035. * @param status Receives errors detected by this function.
  1036. *
  1037. * @stable ICU 4.8
  1038. */
  1039. virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
  1040. private:
  1041. /**
  1042. * Cause a compilation error if an application accidentally attempts to
  1043. * reset a matcher with a (UChar *) string as input rather than
  1044. * a UnicodeString. Avoids a dangling reference to a temporary string.
  1045. * <p>
  1046. * To efficiently work with UChar *strings, wrap the data in a UnicodeString
  1047. * using one of the aliasing constructors, such as
  1048. * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
  1049. * or in a UText, using
  1050. * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
  1051. *
  1052. */
  1053. RegexMatcher &reset(const UChar *input);
  1054. public:
  1055. /**
  1056. * Returns the input string being matched. Ownership of the string belongs to
  1057. * the matcher; it should not be altered or deleted. This method will work even if the input
  1058. * was originally supplied as a UText.
  1059. * @return the input string
  1060. * @stable ICU 2.4
  1061. */
  1062. virtual const UnicodeString &input() const;
  1063. /**
  1064. * Returns the input string being matched. This is the live input text; it should not be
  1065. * altered or deleted. This method will work even if the input was originally supplied as
  1066. * a UnicodeString.
  1067. * @return the input text
  1068. *
  1069. * @stable ICU 4.6
  1070. */
  1071. virtual UText *inputText() const;
  1072. /**
  1073. * Returns the input string being matched, either by copying it into the provided
  1074. * UText parameter or by returning a shallow clone of the live input. Note that copying
  1075. * the entire input may cause significant performance and memory issues.
  1076. * @param dest The UText into which the input should be copied, or NULL to create a new UText
  1077. * @param status error code
  1078. * @return dest if non-NULL, a shallow copy of the input text otherwise
  1079. *
  1080. * @stable ICU 4.6
  1081. */
  1082. virtual UText *getInput(UText *dest, UErrorCode &status) const;
  1083. /** Sets the limits of this matcher's region.
  1084. * The region is the part of the input string that will be searched to find a match.
  1085. * Invoking this method resets the matcher, and then sets the region to start
  1086. * at the index specified by the start parameter and end at the index specified
  1087. * by the end parameter.
  1088. *
  1089. * Depending on the transparency and anchoring being used (see useTransparentBounds
  1090. * and useAnchoringBounds), certain constructs such as anchors may behave differently
  1091. * at or around the boundaries of the region.
  1092. *
  1093. * The function will fail if start is greater than limit, or if either index
  1094. * is less than zero or greater than the length of the string being matched.
  1095. *
  1096. * @param start The (native) index to begin searches at.
  1097. * @param limit The index to end searches at (exclusive).
  1098. * @param status A reference to a UErrorCode to receive any errors.
  1099. * @stable ICU 4.0
  1100. */
  1101. virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
  1102. /**
  1103. * Identical to region(start, limit, status) but also allows a start position without
  1104. * resetting the region state.
  1105. * @param regionStart The region start
  1106. * @param regionLimit the limit of the region
  1107. * @param startIndex The (native) index within the region bounds at which to begin searches.
  1108. * @param status A reference to a UErrorCode to receive any errors.
  1109. * If startIndex is not within the specified region bounds,
  1110. * U_INDEX_OUTOFBOUNDS_ERROR is returned.
  1111. * @stable ICU 4.6
  1112. */
  1113. virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
  1114. /**
  1115. * Reports the start index of this matcher's region. The searches this matcher
  1116. * conducts are limited to finding matches within regionStart (inclusive) and
  1117. * regionEnd (exclusive).
  1118. *
  1119. * @return The starting (native) index of this matcher's region.
  1120. * @stable ICU 4.0
  1121. */
  1122. virtual int32_t regionStart() const;
  1123. /**
  1124. * Reports the start index of this matcher's region. The searches this matcher
  1125. * conducts are limited to finding matches within regionStart (inclusive) and
  1126. * regionEnd (exclusive).
  1127. *
  1128. * @return The starting (native) index of this matcher's region.
  1129. * @stable ICU 4.6
  1130. */
  1131. virtual int64_t regionStart64() const;
  1132. /**
  1133. * Reports the end (limit) index (exclusive) of this matcher's region. The searches
  1134. * this matcher conducts are limited to finding matches within regionStart
  1135. * (inclusive) and regionEnd (exclusive).
  1136. *
  1137. * @return The ending point (native) of this matcher's region.
  1138. * @stable ICU 4.0
  1139. */
  1140. virtual int32_t regionEnd() const;
  1141. /**
  1142. * Reports the end (limit) index (exclusive) of this matcher's region. The searches
  1143. * this matcher conducts are limited to finding matches within regionStart
  1144. * (inclusive) and regionEnd (exclusive).
  1145. *
  1146. * @return The ending point (native) of this matcher's region.
  1147. * @stable ICU 4.6
  1148. */
  1149. virtual int64_t regionEnd64() const;
  1150. /**
  1151. * Queries the transparency of region bounds for this matcher.
  1152. * See useTransparentBounds for a description of transparent and opaque bounds.
  1153. * By default, a matcher uses opaque region boundaries.
  1154. *
  1155. * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
  1156. * @stable ICU 4.0
  1157. */
  1158. virtual UBool hasTransparentBounds() const;
  1159. /**
  1160. * Sets the transparency of region bounds for this matcher.
  1161. * Invoking this function with an argument of true will set this matcher to use transparent bounds.
  1162. * If the boolean argument is false, then opaque bounds will be used.
  1163. *
  1164. * Using transparent bounds, the boundaries of this matcher's region are transparent
  1165. * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
  1166. * see text beyond the boundaries of the region while checking for a match.
  1167. *
  1168. * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
  1169. * lookbehind, and boundary matching constructs.
  1170. *
  1171. * By default, a matcher uses opaque bounds.
  1172. *
  1173. * @param b TRUE for transparent bounds; FALSE for opaque bounds
  1174. * @return This Matcher
  1175. * @stable ICU 4.0
  1176. **/
  1177. virtual RegexMatcher &useTransparentBounds(UBool b);
  1178. /**
  1179. * Return true if this matcher is using anchoring bounds.
  1180. * By default, matchers use anchoring region bounds.
  1181. *
  1182. * @return TRUE if this matcher is using anchoring bounds.
  1183. * @stable ICU 4.0
  1184. */
  1185. virtual UBool hasAnchoringBounds() const;
  1186. /**
  1187. * Set whether this matcher is using Anchoring Bounds for its region.
  1188. * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
  1189. * and end of the region. Without Anchoring Bounds, anchors will only match at
  1190. * the positions they would in the complete text.
  1191. *
  1192. * Anchoring Bounds are the default for regions.
  1193. *
  1194. * @param b TRUE to enable anchoring bounds; FALSE to disable them.
  1195. * @return This Matcher
  1196. * @stable ICU 4.0
  1197. */
  1198. virtual RegexMatcher &useAnchoringBounds(UBool b);
  1199. /**
  1200. * Return TRUE if the most recent matching operation attempted to access
  1201. * additional input beyond the available input text.
  1202. * In this case, additional input text could change the results of the match.
  1203. *
  1204. * hitEnd() is defined for both successful and unsuccessful matches.
  1205. * In either case hitEnd() will return TRUE if the end of the text was
  1206. * reached at any point during the matching process.
  1207. *
  1208. * @return TRUE if the most recent match hit the end of input
  1209. * @stable ICU 4.0
  1210. */
  1211. virtual UBool hitEnd() const;
  1212. /**
  1213. * Return TRUE if the most recent match succeeded and additional input could cause
  1214. * it to fail. If this method returns false and a match was found, then more input
  1215. * might change the match but the match won't be lost. If a match was not found,
  1216. * then requireEnd has no meaning.
  1217. *
  1218. * @return TRUE if more input could cause the most recent match to no longer match.
  1219. * @stable ICU 4.0
  1220. */
  1221. virtual UBool requireEnd() const;
  1222. /**
  1223. * Returns the pattern that is interpreted by this matcher.
  1224. * @return the RegexPattern for this RegexMatcher
  1225. * @stable ICU 2.4
  1226. */
  1227. virtual const RegexPattern &pattern() const;
  1228. /**
  1229. * Replaces every substring of the input that matches the pattern
  1230. * with the given replacement string. This is a convenience function that
  1231. * provides a complete find-and-replace-all operation.
  1232. *
  1233. * This method first resets this matcher. It then scans the input string
  1234. * looking for matches of the pattern. Input that is not part of any
  1235. * match is left unchanged; each match is replaced in the result by the
  1236. * replacement string. The replacement string may contain references to
  1237. * capture groups.
  1238. *
  1239. * @param replacement a string containing the replacement text.
  1240. * @param status a reference to a UErrorCode to receive any errors.
  1241. * @return a string containing the results of the find and replace.
  1242. * @stable ICU 2.4
  1243. */
  1244. virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
  1245. /**
  1246. * Replaces every substring of the input that matches the pattern
  1247. * with the given replacement string. This is a convenience function that
  1248. * provides a complete find-and-replace-all operation.
  1249. *
  1250. * This method first resets this matcher. It then scans the input string
  1251. * looking for matches of the pattern. Input that is not part of any
  1252. * match is left unchanged; each match is replaced in the result by the
  1253. * replacement string. The replacement string may contain references to
  1254. * capture groups.
  1255. *
  1256. * @param replacement a string containing the replacement text.
  1257. * @param dest a mutable UText in which the results are placed.
  1258. * If NULL, a new UText will be created (which may not be mutable).
  1259. * @param status a reference to a UErrorCode to receive any errors.
  1260. * @return a UText containing the results of the find and replace.
  1261. * If a pre-allocated UText was provided, it will always be used and returned.
  1262. *
  1263. * @stable ICU 4.6
  1264. */
  1265. virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
  1266. /**
  1267. * Replaces the first substring of the input that matches
  1268. * the pattern with the replacement string. This is a convenience
  1269. * function that provides a complete find-and-replace operation.
  1270. *
  1271. * <p>This function first resets this RegexMatcher. It then scans the input string
  1272. * looking for a match of the pattern. Input that is not part
  1273. * of the match is appended directly to the result string; the match is replaced
  1274. * in the result by the replacement string. The replacement string may contain
  1275. * references to captured groups.</p>
  1276. *
  1277. * <p>The state of the matcher (the position at which a subsequent find()
  1278. * would begin) after completing a replaceFirst() is not specified. The
  1279. * RegexMatcher should be reset before doing additional find() operations.</p>
  1280. *
  1281. * @param replacement a string containing the replacement text.
  1282. * @param status a reference to a UErrorCode to receive any errors.
  1283. * @return a string containing the results of the find and replace.
  1284. * @stable ICU 2.4
  1285. */
  1286. virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
  1287. /**
  1288. * Replaces the first substring of the input that matches
  1289. * the pattern with the replacement string. This is a convenience
  1290. * function that provides a complete find-and-replace operation.
  1291. *
  1292. * <p>This function first resets this RegexMatcher. It then scans the input string
  1293. * looking for a match of the pattern. Input that is not part
  1294. * of the match is appended directly to the result string; the match is replaced
  1295. * in the result by the replacement string. The replacement string may contain
  1296. * references to captured groups.</p>
  1297. *
  1298. * <p>The state of the matcher (the position at which a subsequent find()
  1299. * would begin) after completing a replaceFirst() is not specified. The
  1300. * RegexMatcher should be reset before doing additional find() operations.</p>
  1301. *
  1302. * @param replacement a string containing the replacement text.
  1303. * @param dest a mutable UText in which the results are placed.
  1304. * If NULL, a new UText will be created (which may not be mutable).
  1305. * @param status a reference to a UErrorCode to receive any errors.
  1306. * @return a UText containing the results of the find and replace.
  1307. * If a pre-allocated UText was provided, it will always be used and returned.
  1308. *
  1309. * @stable ICU 4.6
  1310. */
  1311. virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
  1312. /**
  1313. * Implements a replace operation intended to be used as part of an
  1314. * incremental find-and-replace.
  1315. *
  1316. * <p>The input string, starting from the end of the previous replacement and ending at
  1317. * the start of the current match, is appended to the destination string. Then the
  1318. * replacement string is appended to the output string,
  1319. * including handling any substitutions of captured text.</p>
  1320. *
  1321. * <p>For simple, prepackaged, non-incremental find-and-replace
  1322. * operations, see replaceFirst() or replaceAll().</p>
  1323. *
  1324. * @param dest A UnicodeString to which the results of the find-and-replace are appended.
  1325. * @param replacement A UnicodeString that provides the text to be substituted for
  1326. * the input text that matched the regexp pattern. The replacement
  1327. * text may contain references to captured text from the
  1328. * input.
  1329. * @param status A reference to a UErrorCode to receive any errors. Possible
  1330. * errors are U_REGEX_INVALID_STATE if no match has been
  1331. * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
  1332. * if the replacement text specifies a capture group that
  1333. * does not exist in the pattern.
  1334. *
  1335. * @return this RegexMatcher
  1336. * @stable ICU 2.4
  1337. *
  1338. */
  1339. virtual RegexMatcher &appendReplacement(UnicodeString &dest,
  1340. const UnicodeString &replacement, UErrorCode &status);
  1341. /**
  1342. * Implements a replace operation intended to be used as part of an
  1343. * incremental find-and-replace.
  1344. *
  1345. * <p>The input string, starting from the end of the previous replacement and ending at
  1346. * the start of the current match, is appended to the destination string. Then the
  1347. * replacement string is appended to the output string,
  1348. * including handling any substitutions of captured text.</p>
  1349. *
  1350. * <p>For simple, prepackaged, non-incremental find-and-replace
  1351. * operations, see replaceFirst() or replaceAll().</p>
  1352. *
  1353. * @param dest A mutable UText to which the results of the find-and-replace are appended.
  1354. * Must not be NULL.
  1355. * @param replacement A UText that provides the text to be substituted for
  1356. * the input text that matched the regexp pattern. The replacement
  1357. * text may contain references to captured text from the input.
  1358. * @param status A reference to a UErrorCode to receive any errors. Possible
  1359. * errors are U_REGEX_INVALID_STATE if no match has been
  1360. * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
  1361. * if the replacement text specifies a capture group that
  1362. * does not exist in the pattern.
  1363. *
  1364. * @return this RegexMatcher
  1365. *
  1366. * @stable ICU 4.6
  1367. */
  1368. virtual RegexMatcher &appendReplacement(UText *dest,
  1369. UText *replacement, UErrorCode &status);
  1370. /**
  1371. * As the final step in a find-and-replace operation, append the remainder
  1372. * of the input string, starting at the position following the last appendReplacement(),
  1373. * to the destination string. <code>appendTail()</code> is intended to be invoked after one
  1374. * or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
  1375. *
  1376. * @param dest A UnicodeString to which the results of the find-and-replace are appended.
  1377. * @return the destination string.
  1378. * @stable ICU 2.4
  1379. */
  1380. virtual UnicodeString &appendTail(UnicodeString &dest);
  1381. /**
  1382. * As the final step in a find-and-replace operation, append the remainder
  1383. * of the input string, starting at the position following the last appendReplacement(),
  1384. * to the destination string. <code>appendTail()</code> is intended to be invoked after one
  1385. * or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
  1386. *
  1387. * @param dest A mutable UText to which the results of the find-and-replace are appended.
  1388. * Must not be NULL.
  1389. * @param status A reference to a UErrorCode to receive any errors.
  1390. * @return the destination UText.
  1391. *
  1392. * @stable ICU 4.6
  1393. */
  1394. virtual UText *appendTail(UText *dest, UErrorCode &status);
  1395. /**
  1396. * Split a string into fields. Somewhat like split() from Perl.
  1397. * The pattern matches identify delimiters that separate the input
  1398. * into fields. The input data between the matches becomes the
  1399. * fields themselves.
  1400. *
  1401. * @param input The string to be split into fields. The field delimiters
  1402. * match the pattern (in the "this" object). This matcher
  1403. * will be reset to this input string.
  1404. * @param dest An array of UnicodeStrings to receive the results of the split.
  1405. * This is an array of actual UnicodeString objects, not an
  1406. * array of pointers to strings. Local (stack based) arrays can
  1407. * work well here.
  1408. * @param destCapacity The number of elements in the destination array.
  1409. * If the number of fields found is less than destCapacity, the
  1410. * extra strings in the destination array are not altered.
  1411. * If the number of destination strings is less than the number
  1412. * of fields, the trailing part of the input string, including any
  1413. * field delimiters, is placed in the last destination string.
  1414. * @param status A reference to a UErrorCode to receive any errors.
  1415. * @return The number of fields into which the input string was split.
  1416. * @stable ICU 2.6
  1417. */
  1418. virtual int32_t split(const UnicodeString &input,
  1419. UnicodeString dest[],
  1420. int32_t destCapacity,
  1421. UErrorCode &status);
  1422. /**
  1423. * Split a string into fields. Somewhat like split() from Perl.
  1424. * The pattern matches identify delimiters that separate the input
  1425. * into fields. The input data between the matches becomes the
  1426. * fields themselves.
  1427. *
  1428. * @param input The string to be split into fields. The field delimiters
  1429. * match the pattern (in the "this" object). This matcher
  1430. * will be reset to this input string.
  1431. * @param dest An array of mutable UText structs to receive the results of the split.
  1432. * If a field is NULL, a new UText is allocated to contain the results for
  1433. * that field. This new UText is not guaranteed to be mutable.
  1434. * @param destCapacity The number of elements in the destination array.
  1435. * If the number of fields found is less than destCapacity, the
  1436. * extra strings in the destination array are not altered.
  1437. * If the number of destination strings is less than the number
  1438. * of fields, the trailing part of the input string, including any
  1439. * field delimiters, is placed in the last destination string.
  1440. * @param status A reference to a UErrorCode to receive any errors.
  1441. * @return The number of fields into which the input string was split.
  1442. *
  1443. * @stable ICU 4.6
  1444. */
  1445. virtual int32_t split(UText *input,
  1446. UText *dest[],
  1447. int32_t destCapacity,
  1448. UErrorCode &status);
  1449. /**
  1450. * Set a processing time limit for match operations with this Matcher.
  1451. *
  1452. * Some patterns, when matching certain strings, can run in exponential time.
  1453. * For practical purposes, the match operation may appear to be in an
  1454. * infinite loop.
  1455. * When a limit is set a match operation will fail with an error if the
  1456. * limit is exceeded.
  1457. * <p>
  1458. * The units of the limit are steps of the match engine.
  1459. * Correspondence with actual processor time will depend on the speed
  1460. * of the processor and the details of the specific pattern, but will
  1461. * typically be on the order of milliseconds.
  1462. * <p>
  1463. * By default, the matching time is not limited.
  1464. * <p>
  1465. *
  1466. * @param limit The limit value, or 0 for no limit.
  1467. * @param status A reference to a UErrorCode to receive any errors.
  1468. * @stable ICU 4.0
  1469. */
  1470. virtual void setTimeLimit(int32_t limit, UErrorCode &status);
  1471. /**
  1472. * Get the time limit, if any, for match operations made with this Matcher.
  1473. *
  1474. * @return the maximum allowed time for a match, in units of processing steps.
  1475. * @stable ICU 4.0
  1476. */
  1477. virtual int32_t getTimeLimit() const;
  1478. /**
  1479. * Set the amount of heap storage available for use by the match backtracking stack.
  1480. * The matcher is also reset, discarding any results from previous matches.
  1481. * <p>
  1482. * ICU uses a backtracking regular expression engine, with the backtrack stack
  1483. * maintained on the heap. This function sets the limit to the amount of memory
  1484. * that can be used for this purpose. A backtracking stack overflow will
  1485. * result in an error from the match operation that caused it.
  1486. * <p>
  1487. * A limit is desirable because a malicious or poorly designed pattern can use
  1488. * excessive memory, potentially crashing the process. A limit is enabled
  1489. * by default.
  1490. * <p>
  1491. * @param limit The maximum size, in bytes, of the matching backtrack stack.
  1492. * A value of zero means no limit.
  1493. * The limit must be greater than or equal to zero.
  1494. *
  1495. * @param status A reference to a UErrorCode to receive any errors.
  1496. *
  1497. * @stable ICU 4.0
  1498. */
  1499. virtual void setStackLimit(int32_t limit, UErrorCode &status);
  1500. /**
  1501. * Get the size of the heap storage available for use by the back tracking stack.
  1502. *
  1503. * @return the maximum backtracking stack size, in bytes, or zero if the
  1504. * stack size is unlimited.
  1505. * @stable ICU 4.0
  1506. */
  1507. virtual int32_t getStackLimit() const;
  1508. /**
  1509. * Set a callback function for use with this Matcher.
  1510. * During matching operations the function will be called periodically,
  1511. * giving the application the opportunity to terminate a long-running
  1512. * match.
  1513. *
  1514. * @param callback A pointer to the user-supplied callback function.
  1515. * @param context User context pointer. The value supplied at the
  1516. * time the callback function is set will be saved
  1517. * and passed to the callback each time that it is called.
  1518. * @param status A reference to a UErrorCode to receive any errors.
  1519. * @stable ICU 4.0
  1520. */
  1521. virtual void setMatchCallback(URegexMatchCallback *callback,
  1522. const void *context,
  1523. UErrorCode &status);
  1524. /**
  1525. * Get the match callback function for this Matcher.
  1526. *
  1527. * @param callback Out parameter, receives a pointer to the user-supplied
  1528. * callback function.
  1529. * @param context Out parameter, receives the user context pointer that
  1530. * was set when setMatchCallback() was called.
  1531. * @param status A reference to a UErrorCode to receive any errors.
  1532. * @stable ICU 4.0
  1533. */
  1534. virtual void getMatchCallback(URegexMatchCallback *&callback,
  1535. const void *&context,
  1536. UErrorCode &status);
  1537. /**
  1538. * Set a progress callback function for use with find operations on this Matcher.
  1539. * During find operations, the callback will be invoked after each return from a
  1540. * match attempt, giving the application the opportunity to terminate a long-running
  1541. * find operation.
  1542. *
  1543. * @param callback A pointer to the user-supplied callback function.
  1544. * @param context User context pointer. The value supplied at the
  1545. * time the callback function is set will be saved
  1546. * and passed to the callback each time that it is called.
  1547. * @param status A reference to a UErrorCode to receive any errors.
  1548. * @stable ICU 4.6
  1549. */
  1550. virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
  1551. const void *context,
  1552. UErrorCode &status);
  1553. /**
  1554. * Get the find progress callback function for this Matcher.
  1555. *
  1556. * @param callback Out parameter, receives a pointer to the user-supplied
  1557. * callback function.
  1558. * @param context Out parameter, receives the user context pointer that
  1559. * was set when setFindProgressCallback() was called.
  1560. * @param status A reference to a UErrorCode to receive any errors.
  1561. * @stable ICU 4.6
  1562. */
  1563. virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
  1564. const void *&context,
  1565. UErrorCode &status);
  1566. #ifndef U_HIDE_INTERNAL_API
  1567. /**
  1568. * setTrace Debug function, enable/disable tracing of the matching engine.
  1569. * For internal ICU development use only. DO NOT USE!!!!
  1570. * @internal
  1571. */
  1572. void setTrace(UBool state);
  1573. #endif /* U_HIDE_INTERNAL_API */
  1574. /**
  1575. * ICU "poor man's RTTI", returns a UClassID for this class.
  1576. *
  1577. * @stable ICU 2.2
  1578. */
  1579. static UClassID U_EXPORT2 getStaticClassID();
  1580. /**
  1581. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  1582. *
  1583. * @stable ICU 2.2
  1584. */
  1585. virtual UClassID getDynamicClassID() const;
  1586. private:
  1587. // Constructors and other object boilerplate are private.
  1588. // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
  1589. RegexMatcher(); // default constructor not implemented
  1590. RegexMatcher(const RegexPattern *pat);
  1591. RegexMatcher(const RegexMatcher &other);
  1592. RegexMatcher &operator =(const RegexMatcher &rhs);
  1593. void init(UErrorCode &status); // Common initialization
  1594. void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
  1595. friend class RegexPattern;
  1596. friend class RegexCImpl;
  1597. public:
  1598. #ifndef U_HIDE_INTERNAL_API
  1599. /** @internal */
  1600. void resetPreserveRegion(); // Reset matcher state, but preserve any region.
  1601. #endif /* U_HIDE_INTERNAL_API */
  1602. private:
  1603. //
  1604. // MatchAt This is the internal interface to the match engine itself.
  1605. // Match status comes back in matcher member variables.
  1606. //
  1607. void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
  1608. inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
  1609. UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
  1610. UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
  1611. REStackFrame *resetStack();
  1612. inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
  1613. void IncrementTime(UErrorCode &status);
  1614. // Call user find callback function, if set. Return TRUE if operation should be interrupted.
  1615. inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
  1616. int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
  1617. UBool findUsingChunk(UErrorCode &status);
  1618. void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
  1619. UBool isChunkWordBoundary(int32_t pos);
  1620. const RegexPattern *fPattern;
  1621. RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
  1622. // should delete it when through.
  1623. const UnicodeString *fInput; // The string being matched. Only used for input()
  1624. UText *fInputText; // The text being matched. Is never NULL.
  1625. UText *fAltInputText; // A shallow copy of the text being matched.
  1626. // Only created if the pattern contains backreferences.
  1627. int64_t fInputLength; // Full length of the input text.
  1628. int32_t fFrameSize; // The size of a frame in the backtrack stack.
  1629. int64_t fRegionStart; // Start of the input region, default = 0.
  1630. int64_t fRegionLimit; // End of input region, default to input.length.
  1631. int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
  1632. int64_t fAnchorLimit; // See useAnchoringBounds
  1633. int64_t fLookStart; // Region bounds for look-ahead/behind and
  1634. int64_t fLookLimit; // other boundary tests. See
  1635. // useTransparentBounds
  1636. int64_t fActiveStart; // Currently active bounds for matching.
  1637. int64_t fActiveLimit; // Usually is the same as region, but
  1638. // is changed to fLookStart/Limit when
  1639. // entering look around regions.
  1640. UBool fTransparentBounds; // True if using transparent bounds.
  1641. UBool fAnchoringBounds; // True if using anchoring bounds.
  1642. UBool fMatch; // True if the last attempted match was successful.
  1643. int64_t fMatchStart; // Position of the start of the most recent match
  1644. int64_t fMatchEnd; // First position after the end of the most recent match
  1645. // Zero if no previous match, even when a region
  1646. // is active.
  1647. int64_t fLastMatchEnd; // First position after the end of the previous match,
  1648. // or -1 if there was no previous match.
  1649. int64_t fAppendPosition; // First position after the end of the previous
  1650. // appendReplacement(). As described by the
  1651. // JavaDoc for Java Matcher, where it is called
  1652. // "append position"
  1653. UBool fHitEnd; // True if the last match touched the end of input.
  1654. UBool fRequireEnd; // True if the last match required end-of-input
  1655. // (matched $ or Z)
  1656. UVector64 *fStack;
  1657. REStackFrame *fFrame; // After finding a match, the last active stack frame,
  1658. // which will contain the capture group results.
  1659. // NOT valid while match engine is running.
  1660. int64_t *fData; // Data area for use by the compiled pattern.
  1661. int64_t fSmallData[8]; // Use this for data if it's enough.
  1662. int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
  1663. // match engine run. Zero for unlimited.
  1664. int32_t fTime; // Match time, accumulates while matching.
  1665. int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
  1666. // Kept separately from fTime to keep as much
  1667. // code as possible out of the inline
  1668. // StateSave function.
  1669. int32_t fStackLimit; // Maximum memory size to use for the backtrack
  1670. // stack, in bytes. Zero for unlimited.
  1671. URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
  1672. // NULL if there is no callback.
  1673. const void *fCallbackContext; // User Context ptr for callback function.
  1674. URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
  1675. // NULL if there is no callback.
  1676. const void *fFindProgressCallbackContext; // User Context ptr for find progress callback function.
  1677. UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
  1678. UBool fTraceDebug; // Set true for debug tracing of match engine.
  1679. UErrorCode fDeferredStatus; // Save error state that cannot be immediately
  1680. // reported, or that permanently disables this matcher.
  1681. RuleBasedBreakIterator *fWordBreakItr;
  1682. };
  1683. U_NAMESPACE_END
  1684. #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
  1685. #endif