Update Unicode data & algorithms up to v8.0

* Georgian lari currency symbol
* A large collection of CJK unified ideographs
* Emoji symbols and symbol modifiers
* Letters to support the Ik language in Uganda, Kulango in
  Côte d’Ivoire, and other languages of Africa
* A set of lowercase Cherokee syllables, forming case pairs
  with the existing Cherokee characters
* The Ahom script for support of the Tai Ahom language in India
* Arabic letters to support Arwi—the Tamil language written in the Arabic script

For more details, see http://www.unicode.org/versions/Unicode8.0.0/

[ChangeLog][QtCore] Unicode data updated to v.8.0

Change-Id: If255f95c9c45655b721369a116299da3cabbba0a
Reviewed-by: Lars Knoll <lars.knoll@theqtcompany.com>
This commit is contained in:
Konstantin Ritt 2015-11-02 08:28:14 +04:00
parent a98b541f26
commit 0e1f3aab11
8 changed files with 101 additions and 31 deletions

View File

@ -179,8 +179,9 @@ QT_BEGIN_NAMESPACE
\value Unicode_6_2 Version 6.2
\value Unicode_6_3 Version 6.3 Since Qt 5.3
\value Unicode_7_0 Version 7.0 Since Qt 5.5
\value Unicode_8_0 Version 8.0 Since Qt 5.6
\value Unicode_Unassigned The value is not assigned to any character
in version 6.3 of Unicode.
in version 8.0 of Unicode.
\sa unicodeVersion(), currentUnicodeVersion()
*/
@ -401,6 +402,12 @@ QT_BEGIN_NAMESPACE
\value Script_Khudawadi
\value Script_Tirhuta
\value Script_WarangCiti
\value Script_Ahom
\value Script_AnatolianHieroglyphs
\value Script_Hatran
\value Script_Multani
\value Script_OldHungarian
\value Script_SignWriting
\omitvalue ScriptCount

View File

@ -275,6 +275,14 @@ public:
Script_Tirhuta,
Script_WarangCiti,
// Unicode 8.0 additions
Script_Ahom,
Script_AnatolianHieroglyphs,
Script_Hatran,
Script_Multani,
Script_OldHungarian,
Script_SignWriting,
ScriptCount
};
@ -365,7 +373,8 @@ public:
Unicode_6_1,
Unicode_6_2,
Unicode_6_3,
Unicode_7_0
Unicode_7_0,
Unicode_8_0
};
// ****** WHEN ADDING FUNCTIONS, CONSIDER ADDING TO QCharRef TOO

View File

@ -49,7 +49,7 @@ namespace QUnicodeTools {
// -----------------------------------------------------------------------------------------------------
//
// The text boundaries determination algorithm.
// See http://www.unicode.org/reports/tr29/tr29-25.html
// See http://www.unicode.org/reports/tr29/tr29-27.html
//
// -----------------------------------------------------------------------------------------------------
@ -244,8 +244,9 @@ namespace SB {
enum State {
Initial,
Lower,
Upper,
UpATerm,
LUATerm,
ATerm,
ATermC,
ACS,
@ -260,10 +261,11 @@ enum State {
static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreak_Close + 1] = {
// Other CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Initial, Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, UpATerm, STerm , STerm , Initial }, // Upper
{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
{ Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, STerm , STerm , Initial }, // Upper
{ Lookup , BAfterC, BAfter , BAfter , UpATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // UpATerm
{ Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
{ Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
{ Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
{ Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
@ -341,7 +343,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes
// -----------------------------------------------------------------------------------------------------
//
// The line breaking algorithm.
// See http://www.unicode.org/reports/tr14/tr14-33.html
// See http://www.unicode.org/reports/tr14/tr14-35.html
//
// -----------------------------------------------------------------------------------------------------
@ -408,26 +410,29 @@ inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category categor
/* In order to support the tailored implementation of LB25 properly
the following changes were made in the pair table to allow breaks
where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
CL->PO from IB to DB
CP->PO from IB to DB
CL->PR from IB to DB
CP->PR from IB to DB
PO->OP from IB to DB
PR->OP from IB to DB
IS->NU from IB to DB
SY->NU from IB to DB
(CL)(PO) from IB to DB
(CP)(PO) from IB to DB
(CL)(PR) from IB to DB
(CP)(PR) from IB to DB
(PO)(OP) from IB to DB
(PR)(OP) from IB to DB
(IS)(NU) from IB to DB
(SY)(NU) from IB to DB
*/
// The following line break classes are not treated by the pair table
// and must be resolved outside:
// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
/* In order to implement LB21a properly a special rule HH has been introduced and
the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
(HL)(HY|BA) from IB to CI
(HY|BA)(!CB) from DB to HH
*/
enum Action {
ProhibitedBreak, PB = ProhibitedBreak,
DirectBreak, DB = DirectBreak,
IndirectBreak, IB = IndirectBreak,
CombiningIndirectBreak, CI = CombiningIndirectBreak,
CombiningProhibitedBreak, CP = CombiningProhibitedBreak
CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen
};
static const uchar breakTable[QUnicodeTables::LineBreak_CB + 1][QUnicodeTables::LineBreak_CB + 1] = {
@ -438,18 +443,18 @@ static const uchar breakTable[QUnicodeTables::LineBreak_CB + 1][QUnicodeTables::
/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB },
/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB },
/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB },
/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* HY */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* BA */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB },
/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB },
/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB },
/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
@ -464,6 +469,10 @@ static const uchar breakTable[QUnicodeTables::LineBreak_CB + 1][QUnicodeTables::
/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }
};
// The following line break classes are not treated by the pair table
// and must be resolved outside:
// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
} // namespace LB
static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
@ -555,6 +564,10 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at
if (lcls != QUnicodeTables::LineBreak_SP)
goto next_no_cls_update;
break;
case LB::ProhibitedBreakAfterHebrewPlusHyphen:
if (lcls != QUnicodeTables::LineBreak_HL)
attributes[pos].lineBreak = true;
break;
case LB::ProhibitedBreak:
// nothing to do
default:
@ -659,7 +672,7 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
// ----------------------------------------------------------------------------
//
// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-22.html
// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
//
// ----------------------------------------------------------------------------

View File

@ -188,7 +188,15 @@ static const hb_script_t _qtscript_to_hbscript[] = {
HB_SCRIPT_SIDDHAM,
HB_SCRIPT_KHUDAWADI,
HB_SCRIPT_TIRHUTA,
HB_SCRIPT_WARANG_CITI
HB_SCRIPT_WARANG_CITI,
// Unicode 8.0 additions
HB_SCRIPT_AHOM,
HB_SCRIPT_ANATOLIAN_HIEROGLYPHS,
HB_SCRIPT_HATRAN,
HB_SCRIPT_MULTANI,
HB_SCRIPT_OLD_HUNGARIAN,
HB_SCRIPT_SIGNWRITING
};
Q_STATIC_ASSERT(QChar::ScriptCount == sizeof(_qtscript_to_hbscript) / sizeof(_qtscript_to_hbscript[0]));

View File

@ -242,7 +242,13 @@ static const char *specialLanguages[] = {
"sa", // Siddham
"sd", // Khudawadi
"mai", // Tirhuta
"hoc" // WarangCiti
"hoc", // WarangCiti
"", // Ahom
"", // AnatolianHieroglyphs
"", // Hatran
"", // Multani
"", // OldHungarian
"" // SignWriting
};
Q_STATIC_ASSERT(sizeof(specialLanguages) / sizeof(const char *) == QChar::ScriptCount);

View File

@ -594,6 +594,10 @@ void tst_QChar::unicodeVersion()
QVERIFY(QChar::unicodeVersion(0x20bd) == QChar::Unicode_7_0);
QVERIFY(QChar::unicodeVersion(0x16b00) == QChar::Unicode_7_0);
QVERIFY(QChar(0x08b3).unicodeVersion() == QChar::Unicode_8_0);
QVERIFY(QChar::unicodeVersion(0x08b3) == QChar::Unicode_8_0);
QVERIFY(QChar::unicodeVersion(0x108e0) == QChar::Unicode_8_0);
QVERIFY(QChar(0x09ff).unicodeVersion() == QChar::Unicode_Unassigned);
QVERIFY(QChar::unicodeVersion(0x09ff) == QChar::Unicode_Unassigned);
QVERIFY(QChar::unicodeVersion(0x110000) == QChar::Unicode_Unassigned);

View File

@ -248,6 +248,15 @@ static const EnumLookup scriptEnumLookup[] =
{QChar::Script_Tirhuta, "Script_Tirhuta"},
{QChar::Script_WarangCiti, "Script_WarangCiti"},
#endif // Qt 5.5
#if QT_VERSION >= 0x050600
{QChar::Script_Ahom, "Script_Ahom"},
{QChar::Script_AnatolianHieroglyphs, "Script_AnatolianHieroglyphs"},
{QChar::Script_Hatran, "Script_Hatran"},
{QChar::Script_Multani, "Script_Multani"},
{QChar::Script_OldHungarian, "Script_OldHungarian"},
{QChar::Script_SignWriting, "Script_SignWriting"},
#endif // Qt 5.6
};
#endif // Qt 5.1
@ -364,6 +373,9 @@ static const EnumLookup unicodeVersionEnumLookup[] =
#if QT_VERSION >= 0x050500
{QChar::Unicode_7_0, "Unicode_7_0"},
#endif // Qt 5.5
#if QT_VERSION >= 0x050600
{QChar::Unicode_8_0, "Unicode_8_0"},
#endif // Qt 5.6
#endif // Qt 5
};

View File

@ -43,8 +43,8 @@
#include <private/qunicodetables_p.h>
#endif
#define DATA_VERSION_S "7.0"
#define DATA_VERSION_STR "QChar::Unicode_7_0"
#define DATA_VERSION_S "8.0"
#define DATA_VERSION_STR "QChar::Unicode_8_0"
static QHash<QByteArray, QChar::UnicodeVersion> age_map;
@ -71,6 +71,7 @@ static void initAgeMap()
{ QChar::Unicode_6_2, "6.2" },
{ QChar::Unicode_6_3, "6.3" },
{ QChar::Unicode_7_0, "7.0" },
{ QChar::Unicode_8_0, "8.0" },
{ QChar::Unicode_Unassigned, 0 }
};
AgeMap *d = ageMap;
@ -719,6 +720,13 @@ static void initScriptMap()
{ QChar::Script_Khudawadi, "Khudawadi" },
{ QChar::Script_Tirhuta, "Tirhuta" },
{ QChar::Script_WarangCiti, "WarangCiti" },
// 8.0
{ QChar::Script_Ahom, "Ahom" },
{ QChar::Script_AnatolianHieroglyphs, "AnatolianHieroglyphs" },
{ QChar::Script_Hatran, "Hatran" },
{ QChar::Script_Multani, "Multani" },
{ QChar::Script_OldHungarian, "OldHungarian" },
{ QChar::Script_SignWriting, "SignWriting" },
// unhandled
{ QChar::Script_Unknown, 0 }
};
@ -946,13 +954,16 @@ struct UnicodeData {
p.lineBreakClass = LineBreak_AL; // XX -> AL
// LineBreak.txt
// The unassigned code points that default to "ID" include ranges in the following blocks:
// [U+3400..U+4DBF, U+4E00..U+9FFF, U+F900..U+FAFF, U+20000..U+2A6DF, U+2A700..U+2B73F, U+2B740..U+2B81F, U+2F800..U+2FA1F, U+20000..U+2FFFD, U+30000..U+3FFFD]
// [U+3400..U+4DBF, U+4E00..U+9FFF, U+F900..U+FAFF, U+20000..U+2A6DF, U+2A700..U+2B73F, U+2B740..U+2B81F, U+2B820..U+2CEAF, U+2F800..U+2FA1F]
// and any other reserved code points on
// [U+20000..U+2FFFD, U+30000..U+3FFFD]
if ((codepoint >= 0x3400 && codepoint <= 0x4DBF)
|| (codepoint >= 0x4E00 && codepoint <= 0x9FFF)
|| (codepoint >= 0xF900 && codepoint <= 0xFAFF)
|| (codepoint >= 0x20000 && codepoint <= 0x2A6DF)
|| (codepoint >= 0x2A700 && codepoint <= 0x2B73F)
|| (codepoint >= 0x2B740 && codepoint <= 0x2B81F)
|| (codepoint >= 0x2B820 && codepoint <= 0x2CEAF)
|| (codepoint >= 0x2F800 && codepoint <= 0x2FA1F)
|| (codepoint >= 0x20000 && codepoint <= 0x2FFFD)
|| (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {