rbbi.h 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ***************************************************************************
  5. * Copyright (C) 1999-2016 International Business Machines Corporation *
  6. * and others. All rights reserved. *
  7. ***************************************************************************
  8. **********************************************************************
  9. * Date Name Description
  10. * 10/22/99 alan Creation.
  11. * 11/11/99 rgillam Complete port from Java.
  12. **********************************************************************
  13. */
  14. #ifndef RBBI_H
  15. #define RBBI_H
  16. #include "unicode/utypes.h"
  17. /**
  18. * \file
  19. * \brief C++ API: Rule Based Break Iterator
  20. */
  21. #if !UCONFIG_NO_BREAK_ITERATION
  22. #include "unicode/brkiter.h"
  23. #include "unicode/udata.h"
  24. #include "unicode/parseerr.h"
  25. #include "unicode/schriter.h"
  26. #include "unicode/uchriter.h"
  27. struct UTrie; // forward declaration only; not referenced elsewhere in this header
  28. U_NAMESPACE_BEGIN
  29. /** @internal Forward declarations of implementation types that this header uses only by pointer or reference. */
  30. struct RBBIDataHeader; // parameter type of the private "flattened data" constructor
  31. class RuleBasedBreakIteratorTables; // not referenced elsewhere in this header
  32. class BreakIterator; // base class of RuleBasedBreakIterator
  33. class RBBIDataWrapper; // type of the fData member (rule data)
  34. class UStack; // type of the fLanguageBreakEngines member
  35. class LanguageBreakEngine; // returned by getLanguageBreakEngine()
  36. class UnhandledEngine; // type of the fUnhandledBreakEngine member
  37. struct RBBIStateTable; // parameter type of handleNext() and handlePrevious()
  38. /**
  39. *
  40. * A subclass of BreakIterator whose behavior is specified using a list of rules.
  41. * <p>Instances of this class are most commonly created by the factory methods of
  42. * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
  43. * and then used via the abstract API in class BreakIterator</p>
  44. *
  45. * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
  46. *
  47. * <p>This class is not intended to be subclassed.</p>
  48. */
  49. class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator {
  50. private:
  51. /**
  52. * The UText through which this BreakIterator accesses the text
  53. * @internal
  54. */
  55. UText *fText;
  56. /**
  57. * A character iterator that refers to the same text as the UText, above.
  58. * Only included for compatibility with old API, which was based on CharacterIterators.
  59. * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below.
  60. */
  61. CharacterIterator *fCharIter;
  62. /**
  63. * When the input text is provided by a UnicodeString, this will point to
  64. * a characterIterator that wraps that data. Needed only for the
  65. * implementation of getText(), a backwards compatibility issue.
  66. */
  67. StringCharacterIterator *fSCharIter;
  68. /**
  69. * When the input text is provided by a UText, this
  70. * dummy CharacterIterator over an empty string will
  71. * be returned from getText()
  72. */
  73. UCharCharacterIterator *fDCharIter;
  74. /**
  75. * The rule data for this BreakIterator instance
  76. * @internal
  77. */
  78. RBBIDataWrapper *fData;
  79. /** Index of the Rule {tag} values for the most recent match.
  80. * @internal
  81. */
  82. int32_t fLastRuleStatusIndex;
  83. /**
  84. * Rule tag value valid flag.
  85. * Some iterator operations don't intrinsically set the correct tag value.
  86. * This flag lets us lazily compute the value if we are ever asked for it.
  87. * @internal
  88. */
  89. UBool fLastStatusIndexValid;
  90. /**
  91. * Counter for the number of characters encountered with the "dictionary"
  92. * flag set.
  93. * @internal
  94. */
  95. uint32_t fDictionaryCharCount;
  96. /**
  97. * When a range of characters is divided up using the dictionary, the break
  98. * positions that are discovered are stored here, preventing us from having
  99. * to use either the dictionary or the state table again until the iterator
  100. * leaves this range of text. Has the most impact for line breaking.
  101. * @internal
  102. */
  103. int32_t* fCachedBreakPositions;
  104. /**
  105. * The number of elements in fCachedBreakPositions
  106. * @internal
  107. */
  108. int32_t fNumCachedBreakPositions;
  109. /**
  110. * if fCachedBreakPositions is not null, this indicates which item in the
  111. * cache the current iteration position refers to
  112. * @internal
  113. */
  114. int32_t fPositionInCache;
  115. /**
  116. *
  117. * If present, UStack of LanguageBreakEngine objects that might handle
  118. * dictionary characters. Searched from top to bottom to find an object to
  119. * handle a given character.
  120. * @internal
  121. */
  122. UStack *fLanguageBreakEngines;
  123. /**
  124. *
  125. * If present, the special LanguageBreakEngine used for handling
  126. * characters that are in the dictionary set, but not handled by any
  127. * LanguageBreakEngine.
  128. * @internal
  129. */
  130. UnhandledEngine *fUnhandledBreakEngine;
  131. /**
  132. *
  133. * The type of the break iterator, or -1 if it has not been set.
  134. * @internal
  135. */
  136. int32_t fBreakType;
  137. //=======================================================================
  138. // constructors
  139. //=======================================================================
  140. /**
  141. * Constructor from a flattened set of RBBI data in malloced memory.
  142. * RuleBasedBreakIterators built from a custom set of rules
  143. * are created via this constructor; the rules are compiled
  144. * into memory, then the break iterator is constructed here.
  145. *
  146. * The break iterator adopts the memory, and will
  147. * free it when done.
  148. * @internal
  149. */
  150. RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
  151. friend class RBBIRuleBuilder; // presumably lets the rule builder call the private data constructor above - confirm
  152. /** @internal */
  153. friend class BreakIterator;
  154. public:
  155. /** Default constructor. Creates an empty shell of an iterator, with no
  156. * rules or text to iterate over. Object can subsequently be assigned to.
  157. * @stable ICU 2.2
  158. */
  159. RuleBasedBreakIterator();
  160. /**
  161. * Copy constructor. Will produce a break iterator with the same behavior,
  162. * and which iterates over the same text, as the one passed in.
  163. * @param that The RuleBasedBreakIterator passed to be copied
  164. * @stable ICU 2.0
  165. */
  166. RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
  167. /**
  168. * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
  169. * @param rules The break rules to be used.
  170. * @param parseError In the event of a syntax error in the rules, provides the location
  171. * within the rules of the problem.
  172. * @param status Information on any errors encountered.
  173. * @stable ICU 2.2
  174. */
  175. RuleBasedBreakIterator( const UnicodeString &rules,
  176. UParseError &parseError,
  177. UErrorCode &status);
  178. /**
  179. * Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
  180. * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules().
  181. * Construction of a break iterator in this way is substantially faster than
  182. * construction from source rules.
  183. *
  184. * Ownership of the storage containing the compiled rules remains with the
  185. * caller of this function. The compiled rules must not be modified or
  186. * deleted during the life of the break iterator.
  187. *
  188. * The compiled rules are not compatible across different major versions of ICU.
  189. * The compiled rules are compatible only between machines with the same
  190. * byte ordering (little or big endian) and the same base character set family
  191. * (ASCII or EBCDIC).
  192. *
  193. * @see #getBinaryRules
  194. * @param compiledRules A pointer to the compiled break rules to be used.
  195. * @param ruleLength The length of the compiled break rules, in bytes. This
  196. * corresponds to the length value produced by getBinaryRules().
  197. * @param status Information on any errors encountered, including invalid
  198. * binary rules.
  199. * @stable ICU 4.8
  200. */
  201. RuleBasedBreakIterator(const uint8_t *compiledRules,
  202. uint32_t ruleLength,
  203. UErrorCode &status);
  204. /**
  205. * This constructor uses the udata interface to create a BreakIterator
  206. * whose internal tables live in a memory-mapped file. "image" is an
  207. * ICU UDataMemory handle for the pre-compiled break iterator tables.
  208. * @param image handle to the memory image for the break iterator data.
  209. * Ownership of the UDataMemory handle passes to the Break Iterator,
  210. * which will be responsible for closing it when it is no longer needed.
  211. * @param status Information on any errors encountered.
  212. * @see udata_open
  213. * @see #getBinaryRules
  214. * @stable ICU 2.8
  215. */
  216. RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
  217. /**
  218. * Destructor
  219. * @stable ICU 2.0
  220. */
  221. virtual ~RuleBasedBreakIterator();
  222. /**
  223. * Assignment operator. Sets this iterator to have the same behavior,
  224. * and iterate over the same text, as the one passed in.
  225. * @param that The RuleBasedBreakIterator passed in
  226. * @return a RuleBasedBreakIterator reference (presumably *this, per assignment-operator convention)
  227. * @stable ICU 2.0
  228. */
  229. RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
  230. /**
  231. * Equality operator. Returns TRUE if both BreakIterators are of the
  232. * same class, have the same behavior, and iterate over the same text.
  233. * @param that The BreakIterator to be compared for equality
  234. * @return TRUE if both BreakIterators are of the
  235. * same class, have the same behavior, and iterate over the same text.
  236. * @stable ICU 2.0
  237. */
  238. virtual UBool operator==(const BreakIterator& that) const;
  239. /**
  240. * Not-equal operator. If operator== returns TRUE, this returns FALSE,
  241. * and vice versa.
  242. * @param that The BreakIterator to be compared for inequality
  243. * @return TRUE if both BreakIterators are not the same.
  244. * @stable ICU 2.0
  245. */
  246. UBool operator!=(const BreakIterator& that) const;
  247. /**
  248. * Returns a newly-constructed RuleBasedBreakIterator with the same
  249. * behavior, and iterating over the same text, as this one.
  250. * Differs from the copy constructor in that it is polymorphic, and
  251. * will correctly clone (copy) a derived class.
  252. * clone() is thread safe. Multiple threads may simultaneously
  253. * clone the same source break iterator.
  254. * @return a newly-constructed RuleBasedBreakIterator
  255. * @stable ICU 2.0
  256. */
  257. virtual BreakIterator* clone() const;
  258. /**
  259. * Compute a hash code for this BreakIterator
  260. * @return A hash code
  261. * @stable ICU 2.0
  262. */
  263. virtual int32_t hashCode(void) const;
  264. /**
  265. * Returns the description used to create this iterator
  266. * @return the description used to create this iterator
  267. * @stable ICU 2.0
  268. */
  269. virtual const UnicodeString& getRules(void) const;
  270. //=======================================================================
  271. // BreakIterator overrides
  272. //=======================================================================
  273. /**
  274. * <p>
  275. * Return a CharacterIterator over the text being analyzed.
  276. * The returned character iterator is owned by the break iterator, and must
  277. * not be deleted by the caller. Repeated calls to this function may
  278. * return the same CharacterIterator.
  279. * </p>
  280. * <p>
  281. * The returned character iterator must not be used concurrently with
  282. * the break iterator. If concurrent operation is needed, clone the
  283. * returned character iterator first and operate on the clone.
  284. * </p>
  285. * <p>
  286. * When the break iterator is operating on text supplied via a UText,
  287. * this function will fail. Lacking any way to signal failures, it
  288. * returns a CharacterIterator containing no text.
  289. * The function getUText() provides similar functionality,
  290. * is reliable, and is more efficient.
  291. * </p>
  292. *
  293. * TODO: deprecate this function?
  294. *
  295. * @return An iterator over the text being analyzed.
  296. * @stable ICU 2.0
  297. */
  298. virtual CharacterIterator& getText(void) const;
  299. /**
  300. * Get a UText for the text being analyzed.
  301. * The returned UText is a shallow clone of the UText used internally
  302. * by the break iterator implementation. It can safely be used to
  303. * access the text without impacting any break iterator operations,
  304. * but the underlying text itself must not be altered.
  305. *
  306. * @param fillIn A UText to be filled in. If NULL, a new UText will be
  307. * allocated to hold the result.
  308. * @param status receives any error codes.
  309. * @return The current UText for this break iterator. If an input
  310. * UText was provided, it will always be returned.
  311. * @stable ICU 3.4
  312. */
  313. virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
  314. /**
  315. * Set the iterator to analyze a new piece of text. This function resets
  316. * the current iteration position to the beginning of the text.
  317. * @param newText An iterator over the text to analyze. The BreakIterator
  318. * takes ownership of the character iterator. The caller MUST NOT delete it!
  319. * @stable ICU 2.0
  320. */
  321. virtual void adoptText(CharacterIterator* newText);
  322. /**
  323. * Set the iterator to analyze a new piece of text. This function resets
  324. * the current iteration position to the beginning of the text.
  325. *
  326. * The BreakIterator will retain a reference to the supplied string.
  327. * The caller must not modify or delete the text while the BreakIterator
  328. * retains the reference.
  329. *
  330. * @param newText The text to analyze.
  331. * @stable ICU 2.0
  332. */
  333. virtual void setText(const UnicodeString& newText);
  334. /**
  335. * Reset the break iterator to operate over the text represented by
  336. * the UText. The iterator position is reset to the start.
  337. *
  338. * This function makes a shallow clone of the supplied UText. This means
  339. * that the caller is free to immediately close or otherwise reuse the
  340. * UText that was passed as a parameter, but that the underlying text itself
  341. * must not be altered while being referenced by the break iterator.
  342. *
  343. * @param text The UText used to change the text.
  344. * @param status Receives any error codes.
  345. * @stable ICU 3.4
  346. */
  347. virtual void setText(UText *text, UErrorCode &status);
  348. /**
  349. * Sets the current iteration position to the beginning of the text, position zero.
  350. * @return The offset of the beginning of the text, zero.
  351. * @stable ICU 2.0
  352. */
  353. virtual int32_t first(void);
  354. /**
  355. * Sets the current iteration position to the end of the text.
  356. * @return The text's past-the-end offset.
  357. * @stable ICU 2.0
  358. */
  359. virtual int32_t last(void);
  360. /**
  361. * Advances the iterator either forward or backward the specified number of steps.
  362. * Negative values move backward, and positive values move forward. This is
  363. * equivalent to repeatedly calling next() or previous().
  364. * @param n The number of steps to move. The sign indicates the direction
  365. * (negative is backwards, and positive is forwards).
  366. * @return The character offset of the boundary position n boundaries away from
  367. * the current one.
  368. * @stable ICU 2.0
  369. */
  370. virtual int32_t next(int32_t n);
  371. /**
  372. * Advances the iterator to the next boundary position.
  373. * @return The position of the first boundary after this one.
  374. * @stable ICU 2.0
  375. */
  376. virtual int32_t next(void);
  377. /**
  378. * Moves the iterator backwards, to the last boundary preceding this one.
  379. * @return The position of the last boundary position preceding this one.
  380. * @stable ICU 2.0
  381. */
  382. virtual int32_t previous(void);
  383. /**
  384. * Sets the iterator to refer to the first boundary position following
  385. * the specified position.
  386. * @param offset The position from which to begin searching for a break position.
  387. * @return The position of the first break after the current position.
  388. * @stable ICU 2.0
  389. */
  390. virtual int32_t following(int32_t offset);
  391. /**
  392. * Sets the iterator to refer to the last boundary position before the
  393. * specified position.
  394. * @param offset The position to begin searching for a break from.
  395. * @return The position of the last boundary before the starting position.
  396. * @stable ICU 2.0
  397. */
  398. virtual int32_t preceding(int32_t offset);
  399. /**
  400. * Returns true if the specified position is a boundary position. As a side
  401. * effect, leaves the iterator pointing to the first boundary position at
  402. * or after "offset".
  403. * @param offset the offset to check.
  404. * @return True if "offset" is a boundary position.
  405. * @stable ICU 2.0
  406. */
  407. virtual UBool isBoundary(int32_t offset);
  408. /**
  409. * Returns the current iteration position.
  410. * @return The current iteration position.
  411. * @stable ICU 2.0
  412. */
  413. virtual int32_t current(void) const;
  414. /**
  415. * Return the status tag from the break rule that determined the most recently
  416. * returned break position. For break rules that do not specify a
  417. * status, a default value of 0 is returned. If more than one break rule
  418. * would cause a boundary to be located at some position in the text,
  419. * the numerically largest of the applicable status values is returned.
  420. * <p>
  421. * Of the standard types of ICU break iterators, only word break and
  422. * line break provide status values. The values are defined in
  423. * the header file ubrk.h. For Word breaks, the status allows distinguishing between words
  424. * that contain alphabetic letters, "words" that appear to be numbers,
  425. * punctuation and spaces, words containing ideographic characters, and
  426. * more. For Line Break, the status distinguishes between hard (mandatory) breaks
  427. * and soft (potential) break positions.
  428. * <p>
  429. * <code>getRuleStatus()</code> can be called after obtaining a boundary
  430. * position from <code>next()</code>, <code>previous()</code>, or
  431. * any other break iterator function that returns a boundary position.
  432. * <p>
  433. * When creating custom break rules, one is free to define whatever
  434. * status values may be convenient for the application.
  435. * <p>
  436. * Note: this function is not thread safe. It should not have been
  437. * declared const, and the const remains only for compatibility
  438. * reasons. (The function is logically const, but not bit-wise const).
  439. * <p>
  440. * @return the status from the break rule that determined the most recently
  441. * returned break position.
  442. *
  443. * @see UWordBreak
  444. * @stable ICU 2.2
  445. */
  446. virtual int32_t getRuleStatus() const;
  447. /**
  448. * Get the status (tag) values from the break rule(s) that determined the most
  449. * recently returned break position.
  450. * <p>
  451. * The returned status value(s) are stored into an array provided by the caller.
  452. * The values are stored in sorted (ascending) order.
  453. * If the capacity of the output array is insufficient to hold the data,
  454. * the output will be truncated to the available length, and a
  455. * U_BUFFER_OVERFLOW_ERROR will be signaled.
  456. *
  457. * @param fillInVec an array to be filled in with the status values.
  458. * @param capacity the length of the supplied vector. A length of zero causes
  459. * the function to return the number of status values, in the
  460. * normal way, without attempting to store any values.
  461. * @param status receives error codes.
  462. * @return The number of rule status values from rules that determined
  463. * the most recent boundary returned by the break iterator.
  464. * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
  465. * is the total number of status values that were available,
  466. * not the reduced number that were actually returned.
  467. * @see getRuleStatus
  468. * @stable ICU 3.0
  469. */
  470. virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
  471. /**
  472. * Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
  473. * This method is to implement a simple version of RTTI, since not all
  474. * C++ compilers support genuine RTTI. Polymorphic operator==() and
  475. * clone() methods call this method.
  476. *
  477. * @return The class ID for this object. All objects of a
  478. * given class have the same class ID. Objects of
  479. * other classes have different class IDs.
  480. * @stable ICU 2.0
  481. */
  482. virtual UClassID getDynamicClassID(void) const;
  483. /**
  484. * Returns the class ID for this class. This is useful only for
  485. * comparing to a return value from getDynamicClassID(). For example:
  486. *
  487. * Base* polymorphic_pointer = createPolymorphicObject();
  488. * if (polymorphic_pointer->getDynamicClassID() ==
  489. * Derived::getStaticClassID()) ...
  490. *
  491. * @return The class ID for all objects of this class.
  492. * @stable ICU 2.0
  493. */
  494. static UClassID U_EXPORT2 getStaticClassID(void);
  495. /**
  496. * Deprecated functionality. Use clone() instead.
  497. *
  498. * Create a clone (copy) of this break iterator in memory provided
  499. * by the caller. The idea is to increase performance by avoiding
  500. * a storage allocation. Use of this function is NOT RECOMMENDED.
  501. * Performance gains are minimal, and correct buffer management is
  502. * tricky. Use clone() instead.
  503. *
  504. * @param stackBuffer The pointer to the memory into which the cloned object
  505. * should be placed. If NULL, allocate heap memory
  506. * for the cloned object.
  507. * @param BufferSize The size of the buffer. If zero, return the required
  508. * buffer size, but do not clone the object. If the
  509. * size was too small (but not zero), allocate heap
  510. * storage for the cloned object.
  511. *
  512. * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
  513. * returned if the provided buffer was too small, and
  514. * the clone was therefore put on the heap.
  515. *
  516. * @return Pointer to the clone object. This may differ from the stackBuffer
  517. * address if the byte alignment of the stack buffer was not suitable
  518. * or if the stackBuffer was too small to hold the clone.
  519. * @deprecated ICU 52. Use clone() instead.
  520. */
  521. virtual BreakIterator * createBufferClone(void *stackBuffer,
  522. int32_t &BufferSize,
  523. UErrorCode &status);
  524. /**
  525. * Return the binary form of compiled break rules,
  526. * which can then be used to create a new break iterator at some
  527. * time in the future. Creating a break iterator from pre-compiled rules
  528. * is much faster than building one from the source form of the
  529. * break rules.
  530. *
  531. * The binary data can only be used with the same version of ICU
  532. * and on the same platform type (processor endian-ness)
  533. *
  534. * @param length Returns the length of the binary data. (Out parameter.)
  535. *
  536. * @return A pointer to the binary (compiled) rule data. The storage
  537. * belongs to the RuleBasedBreakIterator object, not the
  538. * caller, and must not be modified or deleted.
  539. * @stable ICU 4.8
  540. */
  541. virtual const uint8_t *getBinaryRules(uint32_t &length);
  542. /**
  543. * Set the subject text string upon which the break iterator is operating
  544. * without changing any other aspect of the matching state.
  545. * The new and previous text strings must have the same content.
  546. *
  547. * This function is intended for use in environments where ICU is operating on
  548. * strings that may move around in memory. It provides a mechanism for notifying
  549. * ICU that the string has been relocated, and providing a new UText to access the
  550. * string in its new position.
  551. *
  552. * Note that the break iterator implementation never copies the underlying text
  553. * of a string being processed, but always operates directly on the original text
  554. * provided by the user. Refreshing simply drops the references to the old text
  555. * and replaces them with references to the new.
  556. *
  557. * Caution: this function is normally used only by very specialized,
  558. * system-level code. One example use case is with garbage collection that moves
  559. * the text in memory.
  560. *
  561. * @param input The new (moved) text string.
  562. * @param status Receives errors detected by this function.
  563. * @return *this
  564. *
  565. * @stable ICU 49
  566. */
  567. virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
  568. private:
  569. //=======================================================================
  570. // implementation
  571. //=======================================================================
  572. /**
  573. * Dumps caches and performs other actions associated with a complete change
  574. * in text or iteration position.
  575. * @internal
  576. */
  577. void reset(void);
  578. /**
  579. * Set the type of the break iterator.
  580. * @internal
  581. */
  582. void setBreakType(int32_t type);
  583. /**
  584. * Common initialization function, used by constructors and bufferClone.
  585. * @internal
  586. */
  587. void init();
  588. /**
  589. * This method backs the iterator back up to a "safe position" in the text.
  590. * This is a position that we know, without any context, must be a break position.
  591. * The various calling methods then iterate forward from this safe position to
  592. * the appropriate position to return. (For more information, see the description
  593. * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
  594. * @param statetable state table used for moving backwards
  595. * @internal
  596. */
  597. int32_t handlePrevious(const RBBIStateTable *statetable);
  598. /**
  599. * This method is the actual implementation of the next() method. All iteration
  600. * vectors through here. This method initializes the state machine to state 1
  601. * and advances through the text character by character until we reach the end
  602. * of the text or the state machine transitions to state 0. We update our return
  603. * value every time the state machine passes through a possible end state.
  604. * @param statetable state table used for moving forwards
  605. * @internal
  606. */
  607. int32_t handleNext(const RBBIStateTable *statetable);
  608. /**
  609. * This is the function that actually implements dictionary-based
  610. * breaking. Covering at least the range from startPos to endPos,
  611. * it checks for dictionary characters, and if it finds them determines
  612. * the appropriate object to deal with them. It may cache found breaks in
  613. * fCachedBreakPositions as it goes. It may well also look at text outside
  614. * the range startPos to endPos.
  615. * If going forward, endPos is the normal Unicode break result, and
  616. * if going in reverse, startPos is the normal Unicode break result.
  617. * @param startPos The start position of a range of text
  618. * @param endPos The end position of a range of text
  619. * @param reverse The call is for the reverse direction
  620. * @internal
  621. */
  622. int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
  623. /**
  624. * This function returns the appropriate LanguageBreakEngine for a
  625. * given character c.
  626. * @param c A character in the dictionary set
  627. * @internal
  628. */
  629. const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
  630. /** NOTE(review): presumably recomputes fLastRuleStatusIndex when fLastStatusIndexValid is false - confirm against the implementation.
  631. * @internal
  632. */
  633. void makeRuleStatusValid();
  634. };
  635. //------------------------------------------------------------------------------
  636. //
  637. // Inline Functions Definitions ...
  638. //
  639. //------------------------------------------------------------------------------
  640. inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
  641. return !this->operator==(that); // inequality is the negation of the virtual operator==
  642. }
  643. U_NAMESPACE_END
  644. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  645. #endif