Update the Unicode Data and Algorithms up to Unicode 6.3.0

* Mongolian and Phags-pa characters have been given a Joining_Type classification for contextual shaping. As a part of these additions, one Phags-pa character has the Joining_Type value of L (Left Joining), which no character had been assigned before. * The unassigned code points in the Currency Symbols block have been given the Bidi_Class property value ET and the Line_Break property value PR, to help implementations support new currency symbols, when they are encoded. * Hebrew letters and basic punctuation marks have been assigned the newly introduced Word_Break property values Hebrew_Letter, Single_Quote, and Double_Quote. * The Bidi_Class property has been extended with four new values for directional isolates. For more details, see http://www.unicode.org/versions/Unicode6.3.0/ Change-Id: Iad62d02edc58a8497898dcd6d6c70d5aece317ea Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2014-01-12 21:14:25 +02:00 · 2014-01-12 21:14:25 +02:00 · edfce46a6c
commit edfce46a6c
parent a6046be428
6 changed files with 199 additions and 66 deletions
--- a/src/corelib/tools/qchar.cpp
+++ b/src/corelib/tools/qchar.cpp
@ -185,8 +185,9 @@ QT_BEGIN_NAMESPACE
    \value Unicode_6_0  Version 6.0
    \value Unicode_6_1  Version 6.1
    \value Unicode_6_2  Version 6.2
+    \value Unicode_6_3  Version 6.3  Since Qt 5.3
    \value Unicode_Unassigned  The value is not assigned to any character
-                               in version 6.2 of Unicode.
+                               in version 6.3 of Unicode.

    \sa unicodeVersion(), currentUnicodeVersion()
 */
@ -408,14 +409,18 @@ QT_BEGIN_NAMESPACE
    \value DirEN
    \value DirES
    \value DirET
+    \value DirFSI Since Qt 5.3
    \value DirL
    \value DirLRE
+    \value DirLRI Since Qt 5.3
    \value DirLRO
    \value DirNSM
    \value DirON
    \value DirPDF
+    \value DirPDI Since Qt 5.3
    \value DirR
    \value DirRLE
+    \value DirRLI Since Qt 5.3
    \value DirRLO
    \value DirS
    \value DirWS
--- a/src/corelib/tools/qchar.h
+++ b/src/corelib/tools/qchar.h
@ -262,7 +262,8 @@ public:
    enum Direction
    {
        DirL, DirR, DirEN, DirES, DirET, DirAN, DirCS, DirB, DirS, DirWS, DirON,
-        DirLRE, DirLRO, DirAL, DirRLE, DirRLO, DirPDF, DirNSM, DirBN
+        DirLRE, DirLRO, DirAL, DirRLE, DirRLO, DirPDF, DirNSM, DirBN,
+        DirLRI, DirRLI, DirFSI, DirPDI
    };

    enum Decomposition
@ -332,7 +333,8 @@ public:
        Unicode_5_2,
        Unicode_6_0,
        Unicode_6_1,
-        Unicode_6_2
+        Unicode_6_2,
+        Unicode_6_3
    };
    // ****** WHEN ADDING FUNCTIONS, CONSIDER ADDING TO QCharRef TOO

--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@ -57,7 +57,7 @@ namespace QUnicodeTools {
 // -----------------------------------------------------------------------------------------------------
 //
 // The text boundaries determination algorithm.
-// See http://www.unicode.org/reports/tr29/tr29-21.html
+// See http://www.unicode.org/reports/tr29/tr29-23.html
 //
 // -----------------------------------------------------------------------------------------------------

@ -112,26 +112,30 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes
 namespace WB {

 enum Action {
-    NoBreak = 0,
-    Break = 1,
-    Lookup = 2
+    NoBreak,
+    Break,
+    Lookup,
+    LookupW
 };

 static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnicodeTables::WordBreak_ExtendNumLet + 1] = {
-//    Other      CR       LF    Newline   Extend    RI    Katakana ALetter MidNumLet MidLetter MidNum  Numeric  ExtendNumLet
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Other
-    { Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // CR
-    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // LF
-    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Newline
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Extend
-    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // RegionalIndicator
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , NoBreak }, // Katakana
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , NoBreak, Lookup , Lookup , Break  , NoBreak, NoBreak }, // ALetter
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNumLet
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidLetter
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNum
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , NoBreak, Lookup , Break  , Lookup , NoBreak, NoBreak }, // Numeric
-    { Break  , Break  , Break  , Break  , NoBreak, Break  , NoBreak, NoBreak, Break  , Break  , Break  , NoBreak, NoBreak }, // ExtendNumLet
+//    Other      CR       LF    Newline   Extend    RI    Katakana HLetter  ALetter  SQuote   DQuote  MidNumLet MidLetter MidNum  Numeric  ExtendNumLet
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Other
+    { Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // CR
+    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // LF
+    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Newline
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Extend
+    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // RegionalIndicator
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak }, // Katakana
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break  , NoBreak, NoBreak }, // HebrewLetter
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , NoBreak, NoBreak, LookupW, Break  , LookupW, LookupW, Break  , NoBreak, NoBreak }, // ALetter
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // SingleQuote
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // DoubleQuote
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNumLet
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidLetter
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNum
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , NoBreak, NoBreak, Lookup , Break  , Lookup , Break  , Lookup , NoBreak, NoBreak }, // Numeric
+    { Break  , Break  , Break  , Break  , NoBreak, Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , NoBreak, NoBreak }, // ExtendNumLet
 };

 } // namespace WB
@ -160,8 +164,8 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
        if (qt_initcharattributes_default_algorithm_only) {
            // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
            // which caused "hi.there" to be treated like if it were just a single word;
-            // by remapping those characters in the Unicode tables generator.
-            // this code is needed to pass the coverage tests; remove once the issue is fixed.
+            // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
+            // and this code is needed to pass the coverage tests; remove once the issue is fixed.
            if (ucs4 == 0x002E) // FULL STOP
                ncls = QUnicodeTables::WordBreak_MidNumLet;
            else if (ucs4 == 0x003A) // COLON
@ -170,8 +174,17 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
 #endif

        uchar action = WB::breakTable[cls][ncls];
-        if (Q_UNLIKELY(action == WB::Lookup)) {
-            action = WB::Break;
+        switch (action) {
+        case WB::Break:
+            break;
+        case WB::NoBreak:
+            if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) {
+                // WB4: X(Extend|Format)* -> X
+                continue;
+            }
+            break;
+        case WB::Lookup:
+        case WB::LookupW:
            for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
                ucs4 = string[lookahead];
                if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
@ -184,20 +197,28 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at

                prop = QUnicodeTables::properties(ucs4);
                QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
-                if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend))
+
+                if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) {
+                    // WB4: X(Extend|Format)* -> X
                    continue;
-                if (Q_LIKELY(tcls == cls)) {
+                }
+
+                if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
+                                                                       || tcls == QUnicodeTables::WordBreak_ALetter)))) {
                    i = lookahead;
                    ncls = tcls;
                    action = WB::NoBreak;
                }
                break;
            }
-        } else if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) {
-            // WB4: X(Extend|Format)* -> X
-            if (Q_LIKELY(action != WB::Break))
-                continue;
+            if (action != WB::NoBreak) {
+                action = WB::Break;
+                if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
+                    action = WB::NoBreak; // WB7a
+            }
+            break;
        }
+
        cls = ncls;
        if (action == WB::Break) {
            attributes[pos].wordBreak = true;
@ -208,6 +229,7 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
                currentWordType = WordTypeHiraganaKatakana;
                attributes[pos].wordStart = true;
                break;
+            case QUnicodeTables::WordBreak_HebrewLetter:
            case QUnicodeTables::WordBreak_ALetter:
            case QUnicodeTables::WordBreak_Numeric:
                currentWordType = WordTypeAlphaNumeric;
@ -327,7 +349,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes
 // -----------------------------------------------------------------------------------------------------
 //
 // The line breaking algorithm.
-// See http://www.unicode.org/reports/tr14/tr14-30.html
+// See http://www.unicode.org/reports/tr14/tr14-32.html
 //
 // -----------------------------------------------------------------------------------------------------

--- a/src/gui/text/qtextengine.cpp
+++ b/src/gui/text/qtextengine.cpp
@ -241,7 +241,8 @@ using namespace std;

 static const char *directions[] = {
    "DirL", "DirR", "DirEN", "DirES", "DirET", "DirAN", "DirCS", "DirB", "DirS", "DirWS", "DirON",
-    "DirLRE", "DirLRO", "DirAL", "DirRLE", "DirRLO", "DirPDF", "DirNSM", "DirBN"
+    "DirLRE", "DirLRO", "DirAL", "DirRLE", "DirRLO", "DirPDF", "DirNSM", "DirBN",
+    "DirLRI", "DirRLI", "DirFSI", "DirPDI"
 };

 #endif
@ -2536,7 +2537,8 @@ static inline bool nextCharJoins(const QString &string, int pos)
        ++pos;
    if (pos == string.length())
        return false;
-    return string.at(pos).joining() != QChar::OtherJoining;
+    // ### U+A872 has joining type L
+    return string.at(pos) == QChar(0xA872) || string.at(pos).joining() != QChar::OtherJoining;
 }

 static inline bool prevCharJoins(const QString &string, int pos)
@ -2551,13 +2553,9 @@ static inline bool prevCharJoins(const QString &string, int pos)

 static inline bool isRetainableControlCode(QChar c)
 {
-    return (c.unicode() == 0x202a       // LRE
-            || c.unicode() == 0x202b    // LRE
-            || c.unicode() == 0x202c    // PDF
-            || c.unicode() == 0x202d    // LRO
-            || c.unicode() == 0x202e    // RLO
-            || c.unicode() == 0x200e    // LRM
-            || c.unicode() == 0x200f);  // RLM
+    return (c.unicode() >= 0x202a && c.unicode() <= 0x202e) // LRE, RLE, PDF, LRO, RLO
+            || (c.unicode() >= 0x200e && c.unicode() <= 0x200f) // LRM, RLM
+            || (c.unicode() >= 0x2066 && c.unicode() <= 0x2069); // LRM, RLM
 }

 static QString stringMidRetainingBidiCC(const QString &string,
--- a/tests/auto/corelib/tools/qchar/tst_qchar.cpp
+++ b/tests/auto/corelib/tools/qchar/tst_qchar.cpp
@ -450,6 +450,18 @@ void tst_QChar::category()

 void tst_QChar::direction()
 {
+    QVERIFY(QChar::direction(0x200E) == QChar::DirL);
+    QVERIFY(QChar::direction(0x200F) == QChar::DirR);
+    QVERIFY(QChar::direction(0x202A) == QChar::DirLRE);
+    QVERIFY(QChar::direction(0x202B) == QChar::DirRLE);
+    QVERIFY(QChar::direction(0x202C) == QChar::DirPDF);
+    QVERIFY(QChar::direction(0x202D) == QChar::DirLRO);
+    QVERIFY(QChar::direction(0x202E) == QChar::DirRLO);
+    QVERIFY(QChar::direction(0x2066) == QChar::DirLRI);
+    QVERIFY(QChar::direction(0x2067) == QChar::DirRLI);
+    QVERIFY(QChar::direction(0x2068) == QChar::DirFSI);
+    QVERIFY(QChar::direction(0x2069) == QChar::DirPDI);
+
    QVERIFY(QChar('a').direction() == QChar::DirL);
    QVERIFY(QChar('0').direction() == QChar::DirEN);
    QVERIFY(QChar((ushort)0x627).direction() == QChar::DirAL);
@ -492,6 +504,9 @@ void tst_QChar::joining()
    QVERIFY(QChar::joining(0xf0000u) == QChar::OtherJoining);
    QVERIFY(QChar::joining(0xE0030u) == QChar::OtherJoining);
    QVERIFY(QChar::joining(0x2FA17u) == QChar::OtherJoining);
+
+    // ### U+A872 has joining type L
+    QVERIFY(QChar::joining((uint)0xA872) == QChar::OtherJoining);
 }

 void tst_QChar::combiningClass()
@ -605,6 +620,11 @@ void tst_QChar::unicodeVersion()
    QVERIFY(QChar::unicodeVersion((uint)0x20ba) == QChar::Unicode_6_2);
    QVERIFY(QChar::unicodeVersion((uint)0x20ba) == QChar::Unicode_6_2);

+    QVERIFY(QChar(0x061c).unicodeVersion() == QChar::Unicode_6_3);
+    QVERIFY(QChar::unicodeVersion((ushort)0x061c) == QChar::Unicode_6_3);
+    QVERIFY(QChar::unicodeVersion((uint)0x061c) == QChar::Unicode_6_3);
+    QVERIFY(QChar::unicodeVersion((uint)0x061c) == QChar::Unicode_6_3);
+
    QVERIFY(QChar(0x09ff).unicodeVersion() == QChar::Unicode_Unassigned);
    QVERIFY(QChar::unicodeVersion((ushort)0x09ff) == QChar::Unicode_Unassigned);
    QVERIFY(QChar::unicodeVersion((uint)0x09ff) == QChar::Unicode_Unassigned);
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@ -77,6 +77,7 @@ static void initAgeMap()
        { QChar::Unicode_6_0,   "6.0" },
        { QChar::Unicode_6_1,   "6.1" },
        { QChar::Unicode_6_2,   "6.2" },
+        { QChar::Unicode_6_3,   "6.3" },
        { QChar::Unicode_Unassigned, 0 }
    };
    AgeMap *d = ageMap;
@ -176,34 +177,66 @@ static void initDecompositionMap()
 }


-static QHash<QByteArray, QChar::Direction> directionMap;
+enum Direction {
+    DirL = QChar::DirL,
+    DirR = QChar::DirR,
+    DirEN = QChar::DirEN,
+    DirES = QChar::DirES,
+    DirET = QChar::DirET,
+    DirAN = QChar::DirAN,
+    DirCS = QChar::DirCS,
+    DirB = QChar::DirB,
+    DirS = QChar::DirS,
+    DirWS = QChar::DirWS,
+    DirON = QChar::DirON,
+    DirLRE = QChar::DirLRE,
+    DirLRO = QChar::DirLRO,
+    DirAL = QChar::DirAL,
+    DirRLE = QChar::DirRLE,
+    DirRLO = QChar::DirRLO,
+    DirPDF = QChar::DirPDF,
+    DirNSM = QChar::DirNSM,
+    DirBN = QChar::DirBN,
+    DirLRI = QChar::DirLRI,
+    DirRLI = QChar::DirRLI,
+    DirFSI = QChar::DirFSI,
+    DirPDI = QChar::DirPDI
+
+    , Dir_Unassigned
+};
+
+static QHash<QByteArray, Direction> directionMap;

 static void initDirectionMap()
 {
    struct Dir {
-        QChar::Direction dir;
+        Direction dir;
        const char *name;
    } directions[] = {
-        { QChar::DirL, "L" },
-        { QChar::DirR, "R" },
-        { QChar::DirEN, "EN" },
-        { QChar::DirES, "ES" },
-        { QChar::DirET, "ET" },
-        { QChar::DirAN, "AN" },
-        { QChar::DirCS, "CS" },
-        { QChar::DirB, "B" },
-        { QChar::DirS, "S" },
-        { QChar::DirWS, "WS" },
-        { QChar::DirON, "ON" },
-        { QChar::DirLRE, "LRE" },
-        { QChar::DirLRO, "LRO" },
-        { QChar::DirAL, "AL" },
-        { QChar::DirRLE, "RLE" },
-        { QChar::DirRLO, "RLO" },
-        { QChar::DirPDF, "PDF" },
-        { QChar::DirNSM, "NSM" },
-        { QChar::DirBN, "BN" },
-        { QChar::DirL, 0 }
+        { DirL, "L" },
+        { DirR, "R" },
+        { DirEN, "EN" },
+        { DirES, "ES" },
+        { DirET, "ET" },
+        { DirAN, "AN" },
+        { DirCS, "CS" },
+        { DirB, "B" },
+        { DirS, "S" },
+        { DirWS, "WS" },
+        { DirON, "ON" },
+        { DirLRE, "LRE" },
+        { DirLRO, "LRO" },
+        { DirAL, "AL" },
+        { DirRLE, "RLE" },
+        { DirRLO, "RLO" },
+        { DirPDF, "PDF" },
+        { DirNSM, "NSM" },
+        { DirBN, "BN" },
+        { DirLRI, "LRI" },
+        { DirRLI, "RLI" },
+        { DirFSI, "FSI" },
+        { DirPDI, "PDI" },
+        { Dir_Unassigned, 0 }
    };
    Dir *d = directions;
    while (d->name) {
@ -323,7 +356,10 @@ static const char *word_break_class_string =
    "    WordBreak_Extend,\n"
    "    WordBreak_RegionalIndicator,\n"
    "    WordBreak_Katakana,\n"
+    "    WordBreak_HebrewLetter,\n"
    "    WordBreak_ALetter,\n"
+    "    WordBreak_SingleQuote,\n"
+    "    WordBreak_DoubleQuote,\n"
    "    WordBreak_MidNumLet,\n"
    "    WordBreak_MidLetter,\n"
    "    WordBreak_MidNum,\n"
@ -339,7 +375,10 @@ enum WordBreakClass {
    WordBreak_Extend,
    WordBreak_RegionalIndicator,
    WordBreak_Katakana,
+    WordBreak_HebrewLetter,
    WordBreak_ALetter,
+    WordBreak_SingleQuote,
+    WordBreak_DoubleQuote,
    WordBreak_MidNumLet,
    WordBreak_MidLetter,
    WordBreak_MidNum,
@ -365,7 +404,10 @@ static void initWordBreak()
        { WordBreak_Extend, "Format" },
        { WordBreak_RegionalIndicator, "Regional_Indicator" },
        { WordBreak_Katakana, "Katakana" },
+        { WordBreak_HebrewLetter, "Hebrew_Letter" },
        { WordBreak_ALetter, "ALetter" },
+        { WordBreak_SingleQuote, "Single_Quote" },
+        { WordBreak_DoubleQuote, "Double_Quote" },
        { WordBreak_MidNumLet, "MidNumLet" },
        { WordBreak_MidLetter, "MidLetter" },
        { WordBreak_MidNum, "MidNum" },
@ -815,6 +857,31 @@ static int appendToSpecialCaseMap(const QList<int> &map)
    return pos;
 }

+static inline bool isDefaultIgnorable(uint ucs4)
+{
+    // Default_Ignorable_Code_Point:
+    //  Generated from
+    //    Other_Default_Ignorable_Code_Point + Cf + Variation_Selector
+    //    - White_Space - FFF9..FFFB (Annotation Characters)
+    //    - 0600..0604, 06DD, 070F, 110BD (exceptional Cf characters that should be visible)
+    if (ucs4 <= 0xff)
+        return ucs4 == 0xad;
+
+    return ucs4 == 0x034f
+            || (ucs4 >= 0x115f && ucs4 <= 0x1160)
+            || (ucs4 >= 0x17b4 && ucs4 <= 0x17b5)
+            || (ucs4 >= 0x180b && ucs4 <= 0x180d)
+            || (ucs4 >= 0x200b && ucs4 <= 0x200f)
+            || (ucs4 >= 0x202a && ucs4 <= 0x202e)
+            || (ucs4 >= 0x2060 && ucs4 <= 0x206f)
+            || ucs4 == 0x3164
+            || (ucs4 >= 0xfe00 && ucs4 <= 0xfe0f)
+            || ucs4 == 0xfeff
+            || ucs4 == 0xffa0
+            || (ucs4 >= 0xfff0 && ucs4 <= 0xfff8)
+            || (ucs4 >= 0x1d173 && ucs4 <= 0xe0fff && (ucs4 <= 0x1d17a || ucs4 >= 0xe0000));
+}
+
 struct UnicodeData {
    UnicodeData(int codepoint = 0) {
        p.category = QChar::Other_NotAssigned; // Cn
@ -842,6 +909,17 @@ struct UnicodeData {
            || (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) {
            p.direction = QChar::DirR;
        }
+        // The unassigned code points that default to ET are in the range:
+        //     [U+20A0..U+20CF]
+        else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
+            p.direction = QChar::DirET;
+        }
+        // The unassigned code points that default to BN have one of the following properties:
+        //     Default_Ignorable_Code_Point
+        //     Noncharacter_Code_Point
+        else if (QChar::isNonCharacter(codepoint) || isDefaultIgnorable(codepoint)) {
+            p.direction = QChar::DirBN;
+        }

        p.lineBreakClass = LineBreak_AL; // XX -> AL
        // LineBreak.txt
@ -858,6 +936,11 @@ struct UnicodeData {
            || (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {
            p.lineBreakClass = LineBreak_ID;
        }
+        // The unassigned code points that default to "PR" comprise a range in the following block:
+        //     [U+20A0..U+20CF]
+        else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
+            p.lineBreakClass = LineBreak_PR;
+        }

        mirroredChar = 0;
        decompositionType = QChar::NoDecomposition;
@ -1008,7 +1091,10 @@ static void readUnicodeData()
        else
            ++combiningClassUsage[data.p.combiningClass];

-        data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
+        Direction dir = directionMap.value(properties[UD_BidiCategory], Dir_Unassigned);
+        if (dir == Dir_Unassigned)
+            qFatal("unhandled direction value: %s", properties[UD_BidiCategory].constData());
+        data.p.direction = QChar::Direction(dir);

        if (!properties[UD_UpperCase].isEmpty()) {
            int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
@ -1180,8 +1266,8 @@ static void readArabicShaping()
            qFatal("unassigned or unhandled joining value: %s", l[2].constData());

        if (joining == Joining_Left) {
-            // There are currently no characters of joining type Left_Joining defined in Unicode.
-            qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData());
+            qWarning("ACHTUNG!!! joining type '%s' has been met for U+%X; the current implementation needs to be revised!",
+                     l[2].trimmed().constData(), codepoint);
        }

        UnicodeData &d = UnicodeData::valueRef(codepoint);