ICU 78.3  78.3
rbbi.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation *
6 * and others. All rights reserved. *
7 ***************************************************************************
8 
9 **********************************************************************
10 * Date Name Description
11 * 10/22/99 alan Creation.
12 * 11/11/99 rgillam Complete port from Java.
13 **********************************************************************
14 */
15 
16 #ifndef RBBI_H
17 #define RBBI_H
18 
19 #include "unicode/utypes.h"
20 
21 #if U_SHOW_CPLUSPLUS_API
22 
28 #if !UCONFIG_NO_BREAK_ITERATION
29 
30 #include "unicode/brkiter.h"
31 #include "unicode/udata.h"
32 #include "unicode/parseerr.h"
33 #include "unicode/schriter.h"
34 
35 struct UCPTrie;
36 
37 U_NAMESPACE_BEGIN
38 
40 class LanguageBreakEngine;
41 struct RBBIDataHeader;
42 class RBBIDataWrapper;
43 class UnhandledEngine;
44 class UStack;
45 
46 
47 #ifndef U_HIDE_INTERNAL_API
48 
// Abstract interface that lets the host environment provide a break engine
// (see the class summary in the reference section of this listing).
// The class is only declared when U_HIDE_INTERNAL_API is not defined.
// Every member except the destructor is pure virtual, so implementers must
// override isFor(), handles() and fillBreaks().
60 class ExternalBreakEngine : public UObject {
61  public:
// Virtual destructor with an empty body.
66  virtual ~ExternalBreakEngine() {}
67 
// NOTE(review): presumably reports whether this engine should be used for
// code point `c` under the given `locale` -- confirm against the ICU API docs.
77  virtual bool isFor(UChar32 c, const char* locale) const = 0;
78 
// NOTE(review): presumably reports whether this engine can process code
// point `c` (no locale involved) -- confirm against the ICU API docs.
87  virtual bool handles(UChar32 c) const = 0;
88 
// NOTE(review): presumably scans `text` between `start` and `end`, stores up
// to `foundBreaksCapacity` boundary positions into `foundBreaks`, reports
// failures through `status`, and returns the number of breaks -- the exact
// contract is not visible in this listing; confirm before relying on it.
102  virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end,
103  int32_t* foundBreaks, int32_t foundBreaksCapacity,
104  UErrorCode& status) const = 0;
105 };
106 #endif /* U_HIDE_INTERNAL_API */
107 
108 
121 
122 private:
126  UText fText = UTEXT_INITIALIZER;
127 
128 #ifndef U_HIDE_INTERNAL_API
129 public:
130 #endif /* U_HIDE_INTERNAL_API */
131 
136  RBBIDataWrapper *fData = nullptr;
137 
138 private:
143  UErrorCode fErrorCode = U_ZERO_ERROR;
144 
149  int32_t fPosition = 0;
150 
154  int32_t fRuleStatusIndex = 0;
155 
159  class BreakCache;
160  BreakCache *fBreakCache = nullptr;
161 
166  class DictionaryCache;
167  DictionaryCache *fDictionaryCache = nullptr;
168 
175  UStack *fLanguageBreakEngines = nullptr;
176 
183  UnhandledEngine *fUnhandledBreakEngine = nullptr;
184 
189  uint32_t fDictionaryCharCount = 0;
190 
196  CharacterIterator *fCharIter = &fSCharIter;
197 
203  UCharCharacterIterator fSCharIter {u"", 0};
204 
208  bool fDone = false;
209 
213  int32_t *fLookAheadMatches = nullptr;
214 
218  UBool fIsPhraseBreaking = false;
219 
220  //=======================================================================
221  // constructors
222  //=======================================================================
223 
233  RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
234 
247  RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
248 
249  friend class RBBIRuleBuilder;
250  friend class BreakIterator;
251 
258 
259 public:
260 
267 
275 
285  UParseError &parseError,
286  UErrorCode &status);
287 
311  RuleBasedBreakIterator(const uint8_t *compiledRules,
312  uint32_t ruleLength,
313  UErrorCode &status);
314 
328 
333  virtual ~RuleBasedBreakIterator();
334 
343 
// Overrides BreakIterator::operator==, which this listing describes as
// returning true if another object is semantically equal to this one.
352  virtual bool operator==(const BreakIterator& that) const override;
353 
// Not-equal operator: returns the logical negation of operator==(that).
361  inline bool operator!=(const BreakIterator& that) const {
362  return !operator==(that);
363  }
364 
// Overrides BreakIterator::clone(), described in this listing as returning a
// polymorphic copy of this object. The return type is narrowed to
// RuleBasedBreakIterator* here.
375  virtual RuleBasedBreakIterator* clone() const override;
376 
382  virtual int32_t hashCode() const;
383 
389  virtual const UnicodeString& getRules() const;
390 
391  //=======================================================================
392  // BreakIterator overrides
393  //=======================================================================
394 
// Overrides BreakIterator::getText(): returns a CharacterIterator over the
// text being analyzed.
419  virtual CharacterIterator& getText() const override;
420 
// Overrides BreakIterator::getUText(): gets a UText for the text being
// analyzed. NOTE(review): the role of `fillIn` (presumably an optional UText
// to reuse) is not visible in this listing -- confirm against the ICU docs.
435  virtual UText *getUText(UText *fillIn, UErrorCode &status) const override;
436 
// Overrides BreakIterator::adoptText(): changes the text over which this
// iterator operates. NOTE(review): the name suggests ownership of `newText`
// is transferred to the iterator -- confirm against the ICU docs.
444  virtual void adoptText(CharacterIterator* newText) override;
445 
// Overrides BreakIterator::setText(const UnicodeString&): changes the text
// over which this iterator operates.
457  virtual void setText(const UnicodeString& newText) override;
458 
472  virtual void setText(UText *text, UErrorCode &status) override;
473 
// Overrides BreakIterator::first(): sets the current iteration position to
// the beginning of the text, position zero.
479  virtual int32_t first() override;
480 
486  virtual int32_t last() override;
487 
498  virtual int32_t next(int32_t n) override;
499 
// Overrides BreakIterator::next(): advances the iterator to the boundary
// following the current boundary.
505  virtual int32_t next() override;
506 
// Overrides BreakIterator::previous(): sets the iterator position to the
// boundary preceding the current boundary.
512  virtual int32_t previous() override;
513 
// Overrides BreakIterator::following(): advances the iterator to the first
// boundary following the specified offset.
521  virtual int32_t following(int32_t offset) override;
522 
// Overrides BreakIterator::preceding(): sets the iterator position to the
// first boundary preceding the specified offset.
530  virtual int32_t preceding(int32_t offset) override;
531 
// Overrides BreakIterator::isBoundary(): returns true if the specified
// position is a boundary position.
540  virtual UBool isBoundary(int32_t offset) override;
541 
// Overrides BreakIterator::current(): returns the character index of the
// current iterator position within the text.
550  virtual int32_t current() const override;
551 
583  virtual int32_t getRuleStatus() const override;
584 
608  virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override;
609 
// Overrides BreakIterator::getDynamicClassID(): returns a polymorphic class
// ID for this object.
621  virtual UClassID getDynamicClassID() const override;
622 
634  static UClassID U_EXPORT2 getStaticClassID();
635 
636 #ifndef U_FORCE_HIDE_DEPRECATED_API
637 
663  virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,
664  int32_t &BufferSize,
665  UErrorCode &status) override;
666 #endif // U_FORCE_HIDE_DEPRECATED_API
667 
685  virtual const uint8_t *getBinaryRules(uint32_t &length);
686 
712  virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status) override;
713 
714 
715 private:
716  //=======================================================================
717  // implementation
718  //=======================================================================
727  int32_t handleSafePrevious(int32_t fromPosition);
728 
739  int32_t handleNext();
740 
741  /*
742  * Templatized version of handleNext() and handleSafePrevious().
743  *
744  * There will be exactly four instantiations, two each for 8 and 16 bit tables,
745  * two each for 8 and 16 bit trie.
746  * Having separate instantiations for the table types keeps conditional tests of
747  * the table type out of the inner loops, at the expense of replicated code.
748  *
749  * The template parameter for the Trie access function is a value, not a type.
750  * Doing it this way, the compiler will inline the Trie function in the
751  * expanded functions. (Both the 8 and 16 bit access functions have the same type
752  * signature)
753  */
754 
// Pointer-to-function type for a trie access function: takes a UCPTrie
// pointer and a code point, returns a uint16_t. It is used as a non-type
// template parameter by the templated handleSafePrevious() / handleNext()
// declared below, which is why it is a function pointer value, not a type.
755  typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
756 
757  template<typename RowType, PTrieFunc trieFunc>
758  int32_t handleSafePrevious(int32_t fromPosition);
759 
760  template<typename RowType, PTrieFunc trieFunc>
761  int32_t handleNext();
762 
763 
770  const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
771 
772  public:
773 #ifndef U_HIDE_INTERNAL_API
774 
778  void dumpCache();
779 
784  void dumpTables();
785 #endif /* U_HIDE_INTERNAL_API */
786 
787 #ifndef U_HIDE_INTERNAL_API
788 
797  static void U_EXPORT2 registerExternalBreakEngine(
798  ExternalBreakEngine* toAdopt, UErrorCode& status);
799 #endif /* U_HIDE_INTERNAL_API */
800 
801 };
802 
803 
804 U_NAMESPACE_END
805 
806 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
807 
808 #endif /* U_SHOW_CPLUSPLUS_API */
809 
810 #endif
virtual ~ExternalBreakEngine()
destructor
Definition: rbbi.h:66
C++ API: Break Iterator.
virtual U_COMMON_API UClassID getDynamicClassID() const override=0
Return a polymorphic class ID for this object.
virtual U_COMMON_API void adoptText(CharacterIterator *it)=0
Change the text over which this operates.
The ExternalBreakEngine class defines an abstract interface for the host environment to provide a low ...
Definition: rbbi.h:60
bool operator!=(const BreakIterator &that) const
Not-equal operator.
Definition: rbbi.h:361
virtual U_COMMON_API int32_t last()=0
Set the iterator position to the index immediately BEYOND the last character in the text being scanne...
U_COMMON_API BreakIterator & operator=(const BreakIterator &other)
Immutable Unicode code point trie structure.
Definition: ucptrie.h:59
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
virtual U_COMMON_API int32_t next()=0
Advance the iterator to the boundary following the current boundary.
No error, no warning.
Definition: utypes.h:544
virtual U_COMMON_API BreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status)=0
Deprecated functionality.
virtual U_COMMON_API int32_t getRuleStatus() const
For RuleBasedBreakIterators, return the status tag from the break rule that determined the boundary a...
virtual U_COMMON_API int32_t following(int32_t offset)=0
Advance the iterator to the first boundary following the specified offset.
virtual U_COMMON_API CharacterIterator & getText() const =0
Return a CharacterIterator over the text being analyzed.
virtual U_COMMON_API bool operator==(const BreakIterator &) const =0
Return true if another object is semantically equal to this one.
virtual U_COMMON_API int32_t first()=0
Sets the current iteration position to the beginning of the text, position zero.
virtual U_COMMON_API void setText(const UnicodeString &text)=0
Change the text over which this operates.
Abstract class that defines an API for iteration on text objects.
Definition: chariter.h:361
C++ API: String Character Iterator.
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: uchriter.h:38
The BreakIterator class implements methods for finding the location of boundaries in text...
Definition: brkiter.h:106
virtual U_COMMON_API int32_t current() const =0
Return character index of the current iterator position within the text.
#define UTEXT_INITIALIZER
initializer to be used with local (stack) instances of a UText struct.
Definition: utext.h:1558
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:449
virtual U_COMMON_API BreakIterator * clone() const =0
Return a polymorphic copy of this object.
C API: Data loading interface.
struct UDataMemory UDataMemory
Forward declaration of the data memory type.
Definition: udata.h:161
virtual U_COMMON_API UText * getUText(UText *fillIn, UErrorCode &status) const =0
Get a UText for the text being analyzed.
C API: Parse Error Information.
virtual U_COMMON_API int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status)
For RuleBasedBreakIterators, get the status (tag) values from the break rule(s) that determined the b...
virtual U_COMMON_API int32_t preceding(int32_t offset)=0
Set the iterator position to the first boundary preceding the specified offset.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:509
virtual U_COMMON_API int32_t previous()=0
Set the iterator position to the boundary preceding the current boundary.
UText struct.
Definition: utext.h:1328
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:120
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
Basic definitions for ICU, for both C and C++ APIs.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:315
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:302
virtual U_COMMON_API BreakIterator & refreshInputText(UText *input, UErrorCode &status)=0
Set the subject text string upon which the break iterator is operating without changing any other asp...
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:222
virtual U_COMMON_API UBool isBoundary(int32_t offset)=0
Return true if the specified position is a boundary position.
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:269