ICU 73.2  73.2
rbbi.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation *
6 * and others. All rights reserved. *
7 ***************************************************************************
8 
9 **********************************************************************
10 * Date Name Description
11 * 10/22/99 alan Creation.
12 * 11/11/99 rgillam Complete port from Java.
13 **********************************************************************
14 */
15 
16 #ifndef RBBI_H
17 #define RBBI_H
18 
19 #include "unicode/utypes.h"
20 
21 #if U_SHOW_CPLUSPLUS_API
22 
28 #if !UCONFIG_NO_BREAK_ITERATION
29 
30 #include "unicode/brkiter.h"
31 #include "unicode/udata.h"
32 #include "unicode/parseerr.h"
33 #include "unicode/schriter.h"
34 
35 struct UCPTrie;
36 
37 U_NAMESPACE_BEGIN
38 
40 class LanguageBreakEngine;
41 struct RBBIDataHeader;
42 class RBBIDataWrapper;
43 class UnhandledEngine;
44 class UStack;
45 
58 
59 private:
// Data members of the break iterator. Everything here is in the private
// section except fData (see the conditional `public:` label below).
// UText member, default-initialized with UTEXT_INITIALIZER.
64  UText fText = UTEXT_INITIALIZER;
65 
// fData is public unless U_HIDE_INTERNAL_API is defined; in that case the
// `public:` label is compiled out and fData stays in the private section.
66 #ifndef U_HIDE_INTERNAL_API
67 public:
68 #endif /* U_HIDE_INTERNAL_API */
69 
// Pointer to an RBBIDataWrapper (only forward-declared in this header); null by default.
74  RBBIDataWrapper *fData = nullptr;
75 
76 private:
// Stored error code; starts as U_ZERO_ERROR (no error, no warning).
81  UErrorCode fErrorCode = U_ZERO_ERROR;
82 
// NOTE(review): presumably the current iteration position reported by current() - confirm in rbbi.cpp.
87  int32_t fPosition = 0;
88 
// NOTE(review): presumably an index consulted by getRuleStatus()/getRuleStatusVec() - confirm.
92  int32_t fRuleStatusIndex = 0;
93 
// BreakCache is a nested class that is only forward-declared here;
// the iterator holds it through a raw pointer that is null by default.
97  class BreakCache;
98  BreakCache *fBreakCache = nullptr;
99 
// DictionaryCache follows the same pattern: nested, forward-declared, held by pointer.
104  class DictionaryCache;
105  DictionaryCache *fDictionaryCache = nullptr;
106 
// UStack pointer, null by default.
// NOTE(review): presumably populated via getLanguageBreakEngine() - confirm.
114  UStack *fLanguageBreakEngines = nullptr;
115 
// UnhandledEngine pointer, null by default. NOTE(review): creation/ownership not visible here.
123  UnhandledEngine *fUnhandledBreakEngine = nullptr;
124 
// Unsigned counter starting at 0. NOTE(review): what exactly is counted is not visible here.
130  uint32_t fDictionaryCharCount = 0;
131 
// The default member initializer takes the address of fSCharIter, which is
// declared (and therefore constructed) after this member. Only the address is
// taken here, so the declaration order is safe as long as nothing dereferences
// fCharIter during member initialization.
137  CharacterIterator *fCharIter = &fSCharIter;
138 
// Character iterator constructed over the empty string literal with length 0;
// it is the default target of fCharIter.
144  UCharCharacterIterator fSCharIter {u"", 0};
145 
// Plain C++ bool, unlike fIsPhraseBreaking below, which uses ICU's UBool.
149  bool fDone = false;
150 
// Raw int32_t pointer, null by default. NOTE(review): allocation and ownership not visible here.
154  int32_t *fLookAheadMatches = nullptr;
155 
// Flag of type UBool; set from the isPhraseBreaking constructor argument
// presumably - TODO confirm in rbbi.cpp.
159  UBool fIsPhraseBreaking = false;
160 
161  //=======================================================================
162  // constructors
163  //=======================================================================
164 
// Private constructor taking a pointer to an RBBIDataHeader plus an error-code
// in/out parameter. Being private, it is reachable only from the class itself
// and from the friend classes declared below.
// NOTE(review): ownership of `data` is not visible from this declaration.
175  RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
176 
// Private constructor taking a UDataMemory image and a phrase-breaking flag.
// NOTE(review): presumably isPhraseBreaking initializes fIsPhraseBreaking - confirm.
190  RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
191 
// Friends get access to all private members, including the constructors above.
193  friend class RBBIRuleBuilder;
195  friend class BreakIterator;
196 
204 
205 public:
206 
213 
221 
231  UParseError &parseError,
232  UErrorCode &status);
233 
// Public constructor taking a byte buffer of compiled rules, the buffer length,
// and an error-code in/out parameter.
// NOTE(review): presumably `compiledRules` is the format produced by
// getBinaryRules(); buffer lifetime requirements are not visible here - confirm.
257  RuleBasedBreakIterator(const uint8_t *compiledRules,
258  uint32_t ruleLength,
259  UErrorCode &status);
260 
274 
// Virtual destructor.
279  virtual ~RuleBasedBreakIterator();
280 
289 
// Overrides BreakIterator::operator==, whose contract is to return true if
// another object is semantically equal to this one.
298  virtual bool operator==(const BreakIterator& that) const override;
299 
// Not-equal operator: the exact negation of operator==. The call goes through
// the virtual operator==, so the result follows the dynamic type of *this.
307  inline bool operator!=(const BreakIterator& that) const {
308  return !operator==(that);
309  }
310 
// Overrides BreakIterator::clone (return a polymorphic copy of this object),
// narrowing the return type to RuleBasedBreakIterator*.
321  virtual RuleBasedBreakIterator* clone() const override;
322 
// Virtual, but not marked `override`.
// NOTE(review): presumably consistent with operator== - confirm in rbbi.cpp.
328  virtual int32_t hashCode(void) const;
329 
// Returns a const reference to a UnicodeString.
// NOTE(review): presumably the rule source text; the referent's lifetime is not visible here.
335  virtual const UnicodeString& getRules(void) const;
336 
337  //=======================================================================
338  // BreakIterator overrides
339  //=======================================================================
340 
// The declarations below override BreakIterator virtuals. Each comment restates
// the base-class contract that the override is expected to honor.

// BreakIterator::getText: return a CharacterIterator over the text being analyzed.
365  virtual CharacterIterator& getText(void) const override;
366 
367 
// BreakIterator::getUText: get a UText for the text being analyzed.
382  virtual UText *getUText(UText *fillIn, UErrorCode &status) const override;
383 
// BreakIterator::adoptText: change the text over which this operates.
// NOTE(review): the name suggests ownership of newText is transferred - confirm.
391  virtual void adoptText(CharacterIterator* newText) override;
392 
// BreakIterator::setText(const UnicodeString&): change the text over which this operates.
404  virtual void setText(const UnicodeString& newText) override;
405 
// UText overload of setText, reporting failure through `status`.
419  virtual void setText(UText *text, UErrorCode &status) override;
420 
// BreakIterator::first: set the current iteration position to the beginning
// of the text, position zero.
426  virtual int32_t first(void) override;
427 
// BreakIterator::last: set the iterator position to the index immediately
// beyond the last character in the text.
433  virtual int32_t last(void) override;
434 
// Overload of next taking a count.
// NOTE(review): presumably moves n boundaries, backwards when n is negative - confirm.
445  virtual int32_t next(int32_t n) override;
446 
// BreakIterator::next: advance the iterator to the boundary following the current boundary.
452  virtual int32_t next(void) override;
453 
// BreakIterator::previous: set the iterator position to the boundary preceding
// the current boundary.
459  virtual int32_t previous(void) override;
460 
// BreakIterator::following: advance the iterator to the first boundary
// following the specified offset.
468  virtual int32_t following(int32_t offset) override;
469 
// BreakIterator::preceding: set the iterator position to the first boundary
// preceding the specified offset.
477  virtual int32_t preceding(int32_t offset) override;
478 
// BreakIterator::isBoundary: return true if the specified position is a boundary position.
487  virtual UBool isBoundary(int32_t offset) override;
488 
// BreakIterator::current: return the character index of the current iterator
// position within the text.
497  virtual int32_t current(void) const override;
498 
499 
// BreakIterator::getRuleStatus: return the status tag from the break rule
// that determined the boundary.
531  virtual int32_t getRuleStatus() const override;
532 
// BreakIterator::getRuleStatusVec: get the status (tag) values from the break
// rule(s) that determined the boundary, written into fillInVec (capacity entries).
// NOTE(review): the meaning of the return value is not visible here - confirm.
556  virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override;
557 
// BreakIterator::getDynamicClassID: return a polymorphic class ID for this object.
// UClassID identifies classes without using the compiler's RTTI.
569  virtual UClassID getDynamicClassID(void) const override;
570 
// Static counterpart of getDynamicClassID.
// NOTE(review): presumably returns the ID that getDynamicClassID() reports for this class - confirm.
582  static UClassID U_EXPORT2 getStaticClassID(void);
583 
// createBufferClone is compiled out when U_FORCE_HIDE_DEPRECATED_API is defined.
584 #ifndef U_FORCE_HIDE_DEPRECATED_API
585 
// Overrides BreakIterator::createBufferClone, which is deprecated functionality;
// the return type is narrowed to RuleBasedBreakIterator*.
611  virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,
612  int32_t &BufferSize,
613  UErrorCode &status) override;
614 #endif // U_FORCE_HIDE_DEPRECATED_API
615 
// Returns a pointer to bytes and reports a size through `length`.
// Virtual, not an override, and non-const.
// NOTE(review): presumably the compiled form accepted by the compiled-rules
// constructor; lifetime of the returned buffer is not visible here - confirm.
633  virtual const uint8_t *getBinaryRules(uint32_t &length);
634 
// Overrides BreakIterator::refreshInputText (set the subject text the break
// iterator is operating on); returns a reference typed as RuleBasedBreakIterator&.
660  virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status) override;
661 
662 
663 private:
664  //=======================================================================
665  // implementation
666  //=======================================================================
// Non-template entry points.
// NOTE(review): presumably these select one of the template instantiations
// declared further down - confirm in rbbi.cpp.
676  int32_t handleSafePrevious(int32_t fromPosition);
677 
690  int32_t handleNext();
691 
692  /*
693  * Templatized version of handleNext() and handleSafePrevious().
694  *
695  * There will be exactly four instantiations, two each for 8 and 16 bit tables,
696  * two each for 8 and 16 bit trie.
697  * Having separate instantiations for the table types keeps conditional tests of
698  * the table type out of the inner loops, at the expense of replicated code.
699  *
700  * The template parameter for the Trie access function is a value, not a type.
701  * Doing it this way, the compiler will inline the Trie function in the
702  * expanded functions. (Both the 8 and 16 bit access functions have the same type
703  * signature)
704  */
705 
// Pointer to a trie access function: takes a UCPTrie and a code point and
// returns a 16-bit value. Used as the non-type template parameter below.
706  typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
707 
// RowType is a type parameter (the table variant); trieFunc is a function-pointer value.
708  template<typename RowType, PTrieFunc trieFunc>
709  int32_t handleSafePrevious(int32_t fromPosition);
710 
711  template<typename RowType, PTrieFunc trieFunc>
712  int32_t handleNext();
713 
714 
// Returns a pointer to a const LanguageBreakEngine for code point c.
// NOTE(review): whether null can be returned, and who owns the engine, is not visible here.
721  const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
722 
723  public:
724 #ifndef U_HIDE_INTERNAL_API
725 
// Both functions are public but declared only when U_HIDE_INTERNAL_API is not
// defined (see the enclosing #ifndef), i.e. they are internal API.
// NOTE(review): presumably debugging aids that print internal state - confirm.
729  void dumpCache();
730 
735  void dumpTables();
736 #endif /* U_HIDE_INTERNAL_API */
737 };
738 
739 U_NAMESPACE_END
740 
741 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
742 
743 #endif /* U_SHOW_CPLUSPLUS_API */
744 
745 #endif
icu::BreakIterator
The BreakIterator class implements methods for finding the location of boundaries in text.
Definition: brkiter.h:106
parseerr.h
C API: Parse Error Information.
utypes.h
Basic definitions for ICU, for both C and C++ APIs.
icu::BreakIterator::current
virtual int32_t current(void) const =0
Return character index of the current iterator position within the text.
UBool
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247
icu::BreakIterator::getRuleStatus
virtual int32_t getRuleStatus() const
For RuleBasedBreakIterators, return the status tag from the break rule that determined the boundary at the current iteration position.
U_COMMON_API
#define U_COMMON_API
Definition: utypes.h:300
icu::BreakIterator::operator=
BreakIterator & operator=(const BreakIterator &other)
icu::BreakIterator::createBufferClone
virtual BreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status)=0
Deprecated functionality.
icu::BreakIterator::setText
virtual void setText(const UnicodeString &text)=0
Change the text over which this operates.
icu::BreakIterator::isBoundary
virtual UBool isBoundary(int32_t offset)=0
Return true if the specified position is a boundary position.
icu::RuleBasedBreakIterator::operator!=
bool operator!=(const BreakIterator &that) const
Not-equal operator.
Definition: rbbi.h:307
UTEXT_INITIALIZER
#define UTEXT_INITIALIZER
initializer to be used with local (stack) instances of a UText struct.
Definition: utext.h:1558
brkiter.h
C++ API: Break Iterator.
UParseError
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
UCPTrie
Immutable Unicode code point trie structure.
Definition: ucptrie.h:59
icu::BreakIterator::getDynamicClassID
virtual UClassID getDynamicClassID(void) const override=0
Return a polymorphic class ID for this object.
icu::BreakIterator::preceding
virtual int32_t preceding(int32_t offset)=0
Set the iterator position to the first boundary preceding the specified offset.
icu::UnicodeString
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition: unistr.h:295
schriter.h
C++ API: String Character Iterator.
UChar32
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:435
UClassID
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
icu::BreakIterator::refreshInputText
virtual BreakIterator & refreshInputText(UText *input, UErrorCode &status)=0
Set the subject text string upon which the break iterator is operating without changing any other aspect of the state.
UErrorCode
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
icu::BreakIterator::operator==
virtual bool operator==(const BreakIterator &) const =0
Return true if another object is semantically equal to this one.
icu::RuleBasedBreakIterator
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:57
udata.h
C API: Data loading interface.
icu::BreakIterator::following
virtual int32_t following(int32_t offset)=0
Advance the iterator to the first boundary following the specified offset.
UText
UText struct.
Definition: utext.h:1328
icu::BreakIterator::adoptText
virtual void adoptText(CharacterIterator *it)=0
Change the text over which this operates.
icu::BreakIterator::clone
virtual BreakIterator * clone() const =0
Return a polymorphic copy of this object.
U_ZERO_ERROR
@ U_ZERO_ERROR
No error, no warning.
Definition: utypes.h:449
UDataMemory
struct UDataMemory UDataMemory
Forward declaration of the data memory type.
Definition: udata.h:161
icu::BreakIterator::last
virtual int32_t last(void)=0
Set the iterator position to the index immediately BEYOND the last character in the text being scanned.
icu::BreakIterator::getRuleStatusVec
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status)
For RuleBasedBreakIterators, get the status (tag) values from the break rule(s) that determined the boundary at the current iteration position.
icu::CharacterIterator
Abstract class that defines an API for iteration on text objects.
Definition: chariter.h:361
icu::BreakIterator::first
virtual int32_t first(void)=0
Sets the current iteration position to the beginning of the text, position zero.
icu::BreakIterator::getUText
virtual UText * getUText(UText *fillIn, UErrorCode &status) const =0
Get a UText for the text being analyzed.
icu::BreakIterator::next
virtual int32_t next(void)=0
Advance the iterator to the boundary following the current boundary.
icu::UCharCharacterIterator
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points) in a char16_t array.
Definition: uchriter.h:38
icu::BreakIterator::previous
virtual int32_t previous(void)=0
Set the iterator position to the boundary preceding the current boundary.
icu::BreakIterator::getText
virtual CharacterIterator & getText(void) const =0
Return a CharacterIterator over the text being analyzed.