Update the Unicode Data and Algorithms up to Unicode 6.3.0

* Mongolian and Phags-pa characters have been given a Joining_Type
  classification for contextual shaping. As a part of these additions,
  one Phags-pa character has the Joining_Type value of L (Left Joining),
  which no character had been assigned before.
* The unassigned code points in the Currency Symbols block have been
  given the Bidi_Class property value ET and the Line_Break property
  value PR, to help implementations support new currency symbols,
  when they are encoded.
* Hebrew letters and basic punctuation marks have been assigned
  the newly introduced Word_Break property values Hebrew_Letter,
  Single_Quote, and Double_Quote.
* The Bidi_Class property has been extended with four new values
  for directional isolates.
For more details, see http://www.unicode.org/versions/Unicode6.3.0/

Change-Id: Iad62d02edc58a8497898dcd6d6c70d5aece317ea
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
This commit is contained in:
Konstantin Ritt 2014-01-12 21:14:25 +02:00 committed by The Qt Project
parent a6046be428
commit edfce46a6c
6 changed files with 199 additions and 66 deletions

View File

@ -185,8 +185,9 @@ QT_BEGIN_NAMESPACE
\value Unicode_6_0 Version 6.0 \value Unicode_6_0 Version 6.0
\value Unicode_6_1 Version 6.1 \value Unicode_6_1 Version 6.1
\value Unicode_6_2 Version 6.2 \value Unicode_6_2 Version 6.2
\value Unicode_6_3 Version 6.3 Since Qt 5.3
\value Unicode_Unassigned The value is not assigned to any character \value Unicode_Unassigned The value is not assigned to any character
in version 6.2 of Unicode. in version 6.3 of Unicode.
\sa unicodeVersion(), currentUnicodeVersion() \sa unicodeVersion(), currentUnicodeVersion()
*/ */
@ -408,14 +409,18 @@ QT_BEGIN_NAMESPACE
\value DirEN \value DirEN
\value DirES \value DirES
\value DirET \value DirET
\value DirFSI Since Qt 5.3
\value DirL \value DirL
\value DirLRE \value DirLRE
\value DirLRI Since Qt 5.3
\value DirLRO \value DirLRO
\value DirNSM \value DirNSM
\value DirON \value DirON
\value DirPDF \value DirPDF
\value DirPDI Since Qt 5.3
\value DirR \value DirR
\value DirRLE \value DirRLE
\value DirRLI Since Qt 5.3
\value DirRLO \value DirRLO
\value DirS \value DirS
\value DirWS \value DirWS

View File

@ -262,7 +262,8 @@ public:
enum Direction enum Direction
{ {
DirL, DirR, DirEN, DirES, DirET, DirAN, DirCS, DirB, DirS, DirWS, DirON, DirL, DirR, DirEN, DirES, DirET, DirAN, DirCS, DirB, DirS, DirWS, DirON,
DirLRE, DirLRO, DirAL, DirRLE, DirRLO, DirPDF, DirNSM, DirBN DirLRE, DirLRO, DirAL, DirRLE, DirRLO, DirPDF, DirNSM, DirBN,
DirLRI, DirRLI, DirFSI, DirPDI
}; };
enum Decomposition enum Decomposition
@ -332,7 +333,8 @@ public:
Unicode_5_2, Unicode_5_2,
Unicode_6_0, Unicode_6_0,
Unicode_6_1, Unicode_6_1,
Unicode_6_2 Unicode_6_2,
Unicode_6_3
}; };
// ****** WHEN ADDING FUNCTIONS, CONSIDER ADDING TO QCharRef TOO // ****** WHEN ADDING FUNCTIONS, CONSIDER ADDING TO QCharRef TOO

View File

@ -57,7 +57,7 @@ namespace QUnicodeTools {
// ----------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------
// //
// The text boundaries determination algorithm. // The text boundaries determination algorithm.
// See http://www.unicode.org/reports/tr29/tr29-21.html // See http://www.unicode.org/reports/tr29/tr29-23.html
// //
// ----------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------
@ -112,26 +112,30 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes
namespace WB { namespace WB {
enum Action { enum Action {
NoBreak = 0, NoBreak,
Break = 1, Break,
Lookup = 2 Lookup,
LookupW
}; };
static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnicodeTables::WordBreak_ExtendNumLet + 1] = { static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnicodeTables::WordBreak_ExtendNumLet + 1] = {
// Other CR LF Newline Extend RI Katakana ALetter MidNumLet MidLetter MidNum Numeric ExtendNumLet // Other CR LF Newline Extend RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtendNumLet
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // Other { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Other
{ Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // Extend { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
{ Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , NoBreak }, // Katakana { Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // Katakana
{ Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, Lookup , Lookup , Break , NoBreak, NoBreak }, // ALetter { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak }, // HebrewLetter
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak }, // ALetter
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
{ Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
{ Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet
}; };
} // namespace WB } // namespace WB
@ -160,8 +164,8 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
if (qt_initcharattributes_default_algorithm_only) { if (qt_initcharattributes_default_algorithm_only) {
// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
// which caused "hi.there" to be treated like if it were just a single word; // which caused "hi.there" to be treated like if it were just a single word;
// by remapping those characters in the Unicode tables generator. // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
// this code is needed to pass the coverage tests; remove once the issue is fixed. // and this code is needed to pass the coverage tests; remove once the issue is fixed.
if (ucs4 == 0x002E) // FULL STOP if (ucs4 == 0x002E) // FULL STOP
ncls = QUnicodeTables::WordBreak_MidNumLet; ncls = QUnicodeTables::WordBreak_MidNumLet;
else if (ucs4 == 0x003A) // COLON else if (ucs4 == 0x003A) // COLON
@ -170,8 +174,17 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
#endif #endif
uchar action = WB::breakTable[cls][ncls]; uchar action = WB::breakTable[cls][ncls];
if (Q_UNLIKELY(action == WB::Lookup)) { switch (action) {
action = WB::Break; case WB::Break:
break;
case WB::NoBreak:
if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) {
// WB4: X(Extend|Format)* -> X
continue;
}
break;
case WB::Lookup:
case WB::LookupW:
for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) { for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
ucs4 = string[lookahead]; ucs4 = string[lookahead];
if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) { if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
@ -184,20 +197,28 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
prop = QUnicodeTables::properties(ucs4); prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass; QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend))
if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) {
// WB4: X(Extend|Format)* -> X
continue; continue;
if (Q_LIKELY(tcls == cls)) { }
if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
|| tcls == QUnicodeTables::WordBreak_ALetter)))) {
i = lookahead; i = lookahead;
ncls = tcls; ncls = tcls;
action = WB::NoBreak; action = WB::NoBreak;
} }
break; break;
} }
} else if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) { if (action != WB::NoBreak) {
// WB4: X(Extend|Format)* -> X action = WB::Break;
if (Q_LIKELY(action != WB::Break)) if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
continue; action = WB::NoBreak; // WB7a
} }
break;
}
cls = ncls; cls = ncls;
if (action == WB::Break) { if (action == WB::Break) {
attributes[pos].wordBreak = true; attributes[pos].wordBreak = true;
@ -208,6 +229,7 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
currentWordType = WordTypeHiraganaKatakana; currentWordType = WordTypeHiraganaKatakana;
attributes[pos].wordStart = true; attributes[pos].wordStart = true;
break; break;
case QUnicodeTables::WordBreak_HebrewLetter:
case QUnicodeTables::WordBreak_ALetter: case QUnicodeTables::WordBreak_ALetter:
case QUnicodeTables::WordBreak_Numeric: case QUnicodeTables::WordBreak_Numeric:
currentWordType = WordTypeAlphaNumeric; currentWordType = WordTypeAlphaNumeric;
@ -327,7 +349,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes
// ----------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------
// //
// The line breaking algorithm. // The line breaking algorithm.
// See http://www.unicode.org/reports/tr14/tr14-30.html // See http://www.unicode.org/reports/tr14/tr14-32.html
// //
// ----------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------

View File

@ -241,7 +241,8 @@ using namespace std;
static const char *directions[] = { static const char *directions[] = {
"DirL", "DirR", "DirEN", "DirES", "DirET", "DirAN", "DirCS", "DirB", "DirS", "DirWS", "DirON", "DirL", "DirR", "DirEN", "DirES", "DirET", "DirAN", "DirCS", "DirB", "DirS", "DirWS", "DirON",
"DirLRE", "DirLRO", "DirAL", "DirRLE", "DirRLO", "DirPDF", "DirNSM", "DirBN" "DirLRE", "DirLRO", "DirAL", "DirRLE", "DirRLO", "DirPDF", "DirNSM", "DirBN",
"DirLRI", "DirRLI", "DirFSI", "DirPDI"
}; };
#endif #endif
@ -2536,7 +2537,8 @@ static inline bool nextCharJoins(const QString &string, int pos)
++pos; ++pos;
if (pos == string.length()) if (pos == string.length())
return false; return false;
return string.at(pos).joining() != QChar::OtherJoining; // ### U+A872 has joining type L
return string.at(pos) == QChar(0xA872) || string.at(pos).joining() != QChar::OtherJoining;
} }
static inline bool prevCharJoins(const QString &string, int pos) static inline bool prevCharJoins(const QString &string, int pos)
@ -2551,13 +2553,9 @@ static inline bool prevCharJoins(const QString &string, int pos)
static inline bool isRetainableControlCode(QChar c) static inline bool isRetainableControlCode(QChar c)
{ {
return (c.unicode() == 0x202a // LRE return (c.unicode() >= 0x202a && c.unicode() <= 0x202e) // LRE, RLE, PDF, LRO, RLO
|| c.unicode() == 0x202b // LRE || (c.unicode() >= 0x200e && c.unicode() <= 0x200f) // LRM, RLM
|| c.unicode() == 0x202c // PDF || (c.unicode() >= 0x2066 && c.unicode() <= 0x2069); // LRI, RLI, FSI, PDI
|| c.unicode() == 0x202d // LRO
|| c.unicode() == 0x202e // RLO
|| c.unicode() == 0x200e // LRM
|| c.unicode() == 0x200f); // RLM
} }
static QString stringMidRetainingBidiCC(const QString &string, static QString stringMidRetainingBidiCC(const QString &string,

View File

@ -450,6 +450,18 @@ void tst_QChar::category()
void tst_QChar::direction() void tst_QChar::direction()
{ {
QVERIFY(QChar::direction(0x200E) == QChar::DirL);
QVERIFY(QChar::direction(0x200F) == QChar::DirR);
QVERIFY(QChar::direction(0x202A) == QChar::DirLRE);
QVERIFY(QChar::direction(0x202B) == QChar::DirRLE);
QVERIFY(QChar::direction(0x202C) == QChar::DirPDF);
QVERIFY(QChar::direction(0x202D) == QChar::DirLRO);
QVERIFY(QChar::direction(0x202E) == QChar::DirRLO);
QVERIFY(QChar::direction(0x2066) == QChar::DirLRI);
QVERIFY(QChar::direction(0x2067) == QChar::DirRLI);
QVERIFY(QChar::direction(0x2068) == QChar::DirFSI);
QVERIFY(QChar::direction(0x2069) == QChar::DirPDI);
QVERIFY(QChar('a').direction() == QChar::DirL); QVERIFY(QChar('a').direction() == QChar::DirL);
QVERIFY(QChar('0').direction() == QChar::DirEN); QVERIFY(QChar('0').direction() == QChar::DirEN);
QVERIFY(QChar((ushort)0x627).direction() == QChar::DirAL); QVERIFY(QChar((ushort)0x627).direction() == QChar::DirAL);
@ -492,6 +504,9 @@ void tst_QChar::joining()
QVERIFY(QChar::joining(0xf0000u) == QChar::OtherJoining); QVERIFY(QChar::joining(0xf0000u) == QChar::OtherJoining);
QVERIFY(QChar::joining(0xE0030u) == QChar::OtherJoining); QVERIFY(QChar::joining(0xE0030u) == QChar::OtherJoining);
QVERIFY(QChar::joining(0x2FA17u) == QChar::OtherJoining); QVERIFY(QChar::joining(0x2FA17u) == QChar::OtherJoining);
// ### U+A872 has joining type L
QVERIFY(QChar::joining((uint)0xA872) == QChar::OtherJoining);
} }
void tst_QChar::combiningClass() void tst_QChar::combiningClass()
@ -605,6 +620,11 @@ void tst_QChar::unicodeVersion()
QVERIFY(QChar::unicodeVersion((uint)0x20ba) == QChar::Unicode_6_2); QVERIFY(QChar::unicodeVersion((uint)0x20ba) == QChar::Unicode_6_2);
QVERIFY(QChar::unicodeVersion((uint)0x20ba) == QChar::Unicode_6_2); QVERIFY(QChar::unicodeVersion((uint)0x20ba) == QChar::Unicode_6_2);
QVERIFY(QChar(0x061c).unicodeVersion() == QChar::Unicode_6_3);
QVERIFY(QChar::unicodeVersion((ushort)0x061c) == QChar::Unicode_6_3);
QVERIFY(QChar::unicodeVersion((uint)0x061c) == QChar::Unicode_6_3);
QVERIFY(QChar::unicodeVersion((uint)0x061c) == QChar::Unicode_6_3);
QVERIFY(QChar(0x09ff).unicodeVersion() == QChar::Unicode_Unassigned); QVERIFY(QChar(0x09ff).unicodeVersion() == QChar::Unicode_Unassigned);
QVERIFY(QChar::unicodeVersion((ushort)0x09ff) == QChar::Unicode_Unassigned); QVERIFY(QChar::unicodeVersion((ushort)0x09ff) == QChar::Unicode_Unassigned);
QVERIFY(QChar::unicodeVersion((uint)0x09ff) == QChar::Unicode_Unassigned); QVERIFY(QChar::unicodeVersion((uint)0x09ff) == QChar::Unicode_Unassigned);

View File

@ -77,6 +77,7 @@ static void initAgeMap()
{ QChar::Unicode_6_0, "6.0" }, { QChar::Unicode_6_0, "6.0" },
{ QChar::Unicode_6_1, "6.1" }, { QChar::Unicode_6_1, "6.1" },
{ QChar::Unicode_6_2, "6.2" }, { QChar::Unicode_6_2, "6.2" },
{ QChar::Unicode_6_3, "6.3" },
{ QChar::Unicode_Unassigned, 0 } { QChar::Unicode_Unassigned, 0 }
}; };
AgeMap *d = ageMap; AgeMap *d = ageMap;
@ -176,34 +177,66 @@ static void initDecompositionMap()
} }
static QHash<QByteArray, QChar::Direction> directionMap; enum Direction {
DirL = QChar::DirL,
DirR = QChar::DirR,
DirEN = QChar::DirEN,
DirES = QChar::DirES,
DirET = QChar::DirET,
DirAN = QChar::DirAN,
DirCS = QChar::DirCS,
DirB = QChar::DirB,
DirS = QChar::DirS,
DirWS = QChar::DirWS,
DirON = QChar::DirON,
DirLRE = QChar::DirLRE,
DirLRO = QChar::DirLRO,
DirAL = QChar::DirAL,
DirRLE = QChar::DirRLE,
DirRLO = QChar::DirRLO,
DirPDF = QChar::DirPDF,
DirNSM = QChar::DirNSM,
DirBN = QChar::DirBN,
DirLRI = QChar::DirLRI,
DirRLI = QChar::DirRLI,
DirFSI = QChar::DirFSI,
DirPDI = QChar::DirPDI
, Dir_Unassigned
};
static QHash<QByteArray, Direction> directionMap;
static void initDirectionMap() static void initDirectionMap()
{ {
struct Dir { struct Dir {
QChar::Direction dir; Direction dir;
const char *name; const char *name;
} directions[] = { } directions[] = {
{ QChar::DirL, "L" }, { DirL, "L" },
{ QChar::DirR, "R" }, { DirR, "R" },
{ QChar::DirEN, "EN" }, { DirEN, "EN" },
{ QChar::DirES, "ES" }, { DirES, "ES" },
{ QChar::DirET, "ET" }, { DirET, "ET" },
{ QChar::DirAN, "AN" }, { DirAN, "AN" },
{ QChar::DirCS, "CS" }, { DirCS, "CS" },
{ QChar::DirB, "B" }, { DirB, "B" },
{ QChar::DirS, "S" }, { DirS, "S" },
{ QChar::DirWS, "WS" }, { DirWS, "WS" },
{ QChar::DirON, "ON" }, { DirON, "ON" },
{ QChar::DirLRE, "LRE" }, { DirLRE, "LRE" },
{ QChar::DirLRO, "LRO" }, { DirLRO, "LRO" },
{ QChar::DirAL, "AL" }, { DirAL, "AL" },
{ QChar::DirRLE, "RLE" }, { DirRLE, "RLE" },
{ QChar::DirRLO, "RLO" }, { DirRLO, "RLO" },
{ QChar::DirPDF, "PDF" }, { DirPDF, "PDF" },
{ QChar::DirNSM, "NSM" }, { DirNSM, "NSM" },
{ QChar::DirBN, "BN" }, { DirBN, "BN" },
{ QChar::DirL, 0 } { DirLRI, "LRI" },
{ DirRLI, "RLI" },
{ DirFSI, "FSI" },
{ DirPDI, "PDI" },
{ Dir_Unassigned, 0 }
}; };
Dir *d = directions; Dir *d = directions;
while (d->name) { while (d->name) {
@ -323,7 +356,10 @@ static const char *word_break_class_string =
" WordBreak_Extend,\n" " WordBreak_Extend,\n"
" WordBreak_RegionalIndicator,\n" " WordBreak_RegionalIndicator,\n"
" WordBreak_Katakana,\n" " WordBreak_Katakana,\n"
" WordBreak_HebrewLetter,\n"
" WordBreak_ALetter,\n" " WordBreak_ALetter,\n"
" WordBreak_SingleQuote,\n"
" WordBreak_DoubleQuote,\n"
" WordBreak_MidNumLet,\n" " WordBreak_MidNumLet,\n"
" WordBreak_MidLetter,\n" " WordBreak_MidLetter,\n"
" WordBreak_MidNum,\n" " WordBreak_MidNum,\n"
@ -339,7 +375,10 @@ enum WordBreakClass {
WordBreak_Extend, WordBreak_Extend,
WordBreak_RegionalIndicator, WordBreak_RegionalIndicator,
WordBreak_Katakana, WordBreak_Katakana,
WordBreak_HebrewLetter,
WordBreak_ALetter, WordBreak_ALetter,
WordBreak_SingleQuote,
WordBreak_DoubleQuote,
WordBreak_MidNumLet, WordBreak_MidNumLet,
WordBreak_MidLetter, WordBreak_MidLetter,
WordBreak_MidNum, WordBreak_MidNum,
@ -365,7 +404,10 @@ static void initWordBreak()
{ WordBreak_Extend, "Format" }, { WordBreak_Extend, "Format" },
{ WordBreak_RegionalIndicator, "Regional_Indicator" }, { WordBreak_RegionalIndicator, "Regional_Indicator" },
{ WordBreak_Katakana, "Katakana" }, { WordBreak_Katakana, "Katakana" },
{ WordBreak_HebrewLetter, "Hebrew_Letter" },
{ WordBreak_ALetter, "ALetter" }, { WordBreak_ALetter, "ALetter" },
{ WordBreak_SingleQuote, "Single_Quote" },
{ WordBreak_DoubleQuote, "Double_Quote" },
{ WordBreak_MidNumLet, "MidNumLet" }, { WordBreak_MidNumLet, "MidNumLet" },
{ WordBreak_MidLetter, "MidLetter" }, { WordBreak_MidLetter, "MidLetter" },
{ WordBreak_MidNum, "MidNum" }, { WordBreak_MidNum, "MidNum" },
@ -815,6 +857,31 @@ static int appendToSpecialCaseMap(const QList<int> &map)
return pos; return pos;
} }
static inline bool isDefaultIgnorable(uint ucs4)
{
    // Tells whether ucs4 carries the Default_Ignorable_Code_Point property.
    // The property is generated from
    //   Other_Default_Ignorable_Code_Point + Cf + Variation_Selector
    //   - White_Space - FFF9..FFFB (Annotation Characters)
    //   - 0600..0604, 06DD, 070F, 110BD (exceptional Cf characters that should be visible)
    // and is spelled out below as inclusive { first, last } code point ranges.
    static const uint ignorableRanges[][2] = {
        { 0x00ad,  0x00ad  },   // the only member at or below U+00FF
        { 0x034f,  0x034f  },
        { 0x115f,  0x1160  },
        { 0x17b4,  0x17b5  },
        { 0x180b,  0x180d  },
        { 0x200b,  0x200f  },
        { 0x202a,  0x202e  },
        { 0x2060,  0x206f  },
        { 0x3164,  0x3164  },
        { 0xfe00,  0xfe0f  },
        { 0xfeff,  0xfeff  },
        { 0xffa0,  0xffa0  },
        { 0xfff0,  0xfff8  },
        { 0x1d173, 0x1d17a },
        { 0xe0000, 0xe0fff }
    };
    const unsigned rangeCount = sizeof(ignorableRanges) / sizeof(ignorableRanges[0]);

    for (unsigned idx = 0; idx < rangeCount; ++idx) {
        const uint first = ignorableRanges[idx][0];
        const uint last = ignorableRanges[idx][1];
        if (ucs4 >= first && ucs4 <= last)
            return true;
    }
    return false;
}
struct UnicodeData { struct UnicodeData {
UnicodeData(int codepoint = 0) { UnicodeData(int codepoint = 0) {
p.category = QChar::Other_NotAssigned; // Cn p.category = QChar::Other_NotAssigned; // Cn
@ -842,6 +909,17 @@ struct UnicodeData {
|| (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) { || (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) {
p.direction = QChar::DirR; p.direction = QChar::DirR;
} }
// The unassigned code points that default to ET are in the range:
// [U+20A0..U+20CF]
else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
p.direction = QChar::DirET;
}
// The unassigned code points that default to BN have one of the following properties:
// Default_Ignorable_Code_Point
// Noncharacter_Code_Point
else if (QChar::isNonCharacter(codepoint) || isDefaultIgnorable(codepoint)) {
p.direction = QChar::DirBN;
}
p.lineBreakClass = LineBreak_AL; // XX -> AL p.lineBreakClass = LineBreak_AL; // XX -> AL
// LineBreak.txt // LineBreak.txt
@ -858,6 +936,11 @@ struct UnicodeData {
|| (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) { || (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {
p.lineBreakClass = LineBreak_ID; p.lineBreakClass = LineBreak_ID;
} }
// The unassigned code points that default to "PR" comprise a range in the following block:
// [U+20A0..U+20CF]
else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
p.lineBreakClass = LineBreak_PR;
}
mirroredChar = 0; mirroredChar = 0;
decompositionType = QChar::NoDecomposition; decompositionType = QChar::NoDecomposition;
@ -1008,7 +1091,10 @@ static void readUnicodeData()
else else
++combiningClassUsage[data.p.combiningClass]; ++combiningClassUsage[data.p.combiningClass];
data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction); Direction dir = directionMap.value(properties[UD_BidiCategory], Dir_Unassigned);
if (dir == Dir_Unassigned)
qFatal("unhandled direction value: %s", properties[UD_BidiCategory].constData());
data.p.direction = QChar::Direction(dir);
if (!properties[UD_UpperCase].isEmpty()) { if (!properties[UD_UpperCase].isEmpty()) {
int upperCase = properties[UD_UpperCase].toInt(&ok, 16); int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
@ -1180,8 +1266,8 @@ static void readArabicShaping()
qFatal("unassigned or unhandled joining value: %s", l[2].constData()); qFatal("unassigned or unhandled joining value: %s", l[2].constData());
if (joining == Joining_Left) { if (joining == Joining_Left) {
// There are currently no characters of joining type Left_Joining defined in Unicode. qWarning("ACHTUNG!!! joining type '%s' has been met for U+%X; the current implementation needs to be revised!",
qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData()); l[2].trimmed().constData(), codepoint);
} }
UnicodeData &d = UnicodeData::valueRef(codepoint); UnicodeData &d = UnicodeData::valueRef(codepoint);