Add additional grapheme, word, and sentence break class tests from tr29
Stop turning THAI CHARACTER SARA AM into a grapheme boundary, because doing so breaks a test and Chromium does not consider it to be a separate grapheme.
Fixes: QTBUG-88545
Change-Id: Ib1aea8dbb66ac42b2129cf9fe04c39f5f76eeb36
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
76b4739e07
commit
09291eead4
@ -1503,10 +1503,6 @@ static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAtt
|
|||||||
for (j = 1; j < cell_length; j++)
|
for (j = 1; j < cell_length; j++)
|
||||||
attributes[i + j].graphemeBoundary = false;
|
attributes[i + j].graphemeBoundary = false;
|
||||||
|
|
||||||
/* Set graphemeBoundary for SARA AM */
|
|
||||||
if (cstr[i + cell_length - 1] == static_cast<char>(0xd3))
|
|
||||||
attributes[i + cell_length - 1].graphemeBoundary = true;
|
|
||||||
|
|
||||||
i += cell_length;
|
i += cell_length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -21,6 +21,7 @@ add_subdirectory(qstringmatcher)
|
|||||||
add_subdirectory(qstringtokenizer)
|
add_subdirectory(qstringtokenizer)
|
||||||
add_subdirectory(qstringview)
|
add_subdirectory(qstringview)
|
||||||
add_subdirectory(qtextboundaryfinder)
|
add_subdirectory(qtextboundaryfinder)
|
||||||
|
add_subdirectory(qunicodetools)
|
||||||
# QTBUG-87414 # special case
|
# QTBUG-87414 # special case
|
||||||
if(NOT ANDROID)
|
if(NOT ANDROID)
|
||||||
add_subdirectory(qlocale)
|
add_subdirectory(qlocale)
|
||||||
|
@ -30,7 +30,6 @@
|
|||||||
#include <qchar.h>
|
#include <qchar.h>
|
||||||
#include <qfile.h>
|
#include <qfile.h>
|
||||||
#include <qstringlist.h>
|
#include <qstringlist.h>
|
||||||
#include <private/qunicodetables_p.h>
|
|
||||||
|
|
||||||
class tst_QChar : public QObject
|
class tst_QChar : public QObject
|
||||||
{
|
{
|
||||||
@ -67,7 +66,6 @@ private slots:
|
|||||||
void digitValue();
|
void digitValue();
|
||||||
void mirroredChar();
|
void mirroredChar();
|
||||||
void decomposition();
|
void decomposition();
|
||||||
void lineBreakClass();
|
|
||||||
void script();
|
void script();
|
||||||
void normalization_data();
|
void normalization_data();
|
||||||
void normalization();
|
void normalization();
|
||||||
@ -748,24 +746,6 @@ void tst_QChar::decomposition()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void tst_QChar::lineBreakClass()
|
|
||||||
{
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x0029) == QUnicodeTables::LineBreak_CP);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x0041) == QUnicodeTables::LineBreak_AL);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x0033) == QUnicodeTables::LineBreak_NU);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x00ad) == QUnicodeTables::LineBreak_BA);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x05d0) == QUnicodeTables::LineBreak_HL);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0xfffc) == QUnicodeTables::LineBreak_CB);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0xe0164) == QUnicodeTables::LineBreak_CM);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x2f9a4) == QUnicodeTables::LineBreak_ID);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x10000) == QUnicodeTables::LineBreak_AL);
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x1f1e6) == QUnicodeTables::LineBreak_RI);
|
|
||||||
|
|
||||||
// mapped to AL:
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0xfffd) == QUnicodeTables::LineBreak_AL); // AI -> AL
|
|
||||||
QVERIFY(QUnicodeTables::lineBreakClass(0x100000) == QUnicodeTables::LineBreak_AL); // XX -> AL
|
|
||||||
}
|
|
||||||
|
|
||||||
void tst_QChar::script()
|
void tst_QChar::script()
|
||||||
{
|
{
|
||||||
QVERIFY(QChar::script(0x0020) == QChar::Script_Common);
|
QVERIFY(QChar::script(0x0020) == QChar::Script_Common);
|
||||||
|
11
tests/auto/corelib/text/qunicodetools/CMakeLists.txt
Normal file
11
tests/auto/corelib/text/qunicodetools/CMakeLists.txt
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#####################################################################
|
||||||
|
## tst_qunicodetools Test:
|
||||||
|
#####################################################################
|
||||||
|
|
||||||
|
qt_internal_add_test(tst_qunicodetools
|
||||||
|
SOURCES
|
||||||
|
tst_qunicodetools.cpp
|
||||||
|
PUBLIC_LIBRARIES
|
||||||
|
Qt::CorePrivate
|
||||||
|
)
|
||||||
|
|
224
tests/auto/corelib/text/qunicodetools/tst_qunicodetools.cpp
Normal file
224
tests/auto/corelib/text/qunicodetools/tst_qunicodetools.cpp
Normal file
@ -0,0 +1,224 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
**
|
||||||
|
** Copyright (C) 2021 The Qt Company Ltd.
|
||||||
|
** Contact: https://www.qt.io/licensing/
|
||||||
|
**
|
||||||
|
** This file is part of the test suite of the Qt Toolkit.
|
||||||
|
**
|
||||||
|
** $QT_BEGIN_LICENSE:GPL-EXCEPT$
|
||||||
|
** Commercial License Usage
|
||||||
|
** Licensees holding valid commercial Qt licenses may use this file in
|
||||||
|
** accordance with the commercial license agreement provided with the
|
||||||
|
** Software or, alternatively, in accordance with the terms contained in
|
||||||
|
** a written agreement between you and The Qt Company. For licensing terms
|
||||||
|
** and conditions see https://www.qt.io/terms-conditions. For further
|
||||||
|
** information use the contact form at https://www.qt.io/contact-us.
|
||||||
|
**
|
||||||
|
** GNU General Public License Usage
|
||||||
|
** Alternatively, this file may be used under the terms of the GNU
|
||||||
|
** General Public License version 3 as published by the Free Software
|
||||||
|
** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
|
||||||
|
** included in the packaging of this file. Please review the following
|
||||||
|
** information to ensure the GNU General Public License requirements will
|
||||||
|
** be met: https://www.gnu.org/licenses/gpl-3.0.html.
|
||||||
|
**
|
||||||
|
** $QT_END_LICENSE$
|
||||||
|
**
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
#include <QTest>
|
||||||
|
#include <qchar.h>
|
||||||
|
#include <qfile.h>
|
||||||
|
#include <qstringlist.h>
|
||||||
|
#include <private/qunicodetables_p.h>
|
||||||
|
#include <private/qunicodetools_p.h>
|
||||||
|
|
||||||
|
class tst_QUnicodeTools : public QObject
|
||||||
|
{
|
||||||
|
Q_OBJECT
|
||||||
|
private slots:
|
||||||
|
void lineBreakClass();
|
||||||
|
void graphemeBreakClass_data();
|
||||||
|
void graphemeBreakClass();
|
||||||
|
void wordBreakClass_data();
|
||||||
|
void wordBreakClass();
|
||||||
|
void sentenceBreakClass_data();
|
||||||
|
void sentenceBreakClass();
|
||||||
|
};
|
||||||
|
|
||||||
|
void tst_QUnicodeTools::lineBreakClass()
|
||||||
|
{
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x0029) == QUnicodeTables::LineBreak_CP);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x0041) == QUnicodeTables::LineBreak_AL);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x0033) == QUnicodeTables::LineBreak_NU);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x00ad) == QUnicodeTables::LineBreak_BA);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x05d0) == QUnicodeTables::LineBreak_HL);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0xfffc) == QUnicodeTables::LineBreak_CB);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0xe0164) == QUnicodeTables::LineBreak_CM);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x2f9a4) == QUnicodeTables::LineBreak_ID);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x10000) == QUnicodeTables::LineBreak_AL);
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x1f1e6) == QUnicodeTables::LineBreak_RI);
|
||||||
|
|
||||||
|
// mapped to AL:
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0xfffd) == QUnicodeTables::LineBreak_AL); // AI -> AL
|
||||||
|
QVERIFY(QUnicodeTables::lineBreakClass(0x100000) == QUnicodeTables::LineBreak_AL); // XX -> AL
|
||||||
|
}
|
||||||
|
|
||||||
|
/*!
    Runs QUnicodeTools::initCharAttributes() over \a str for the attribute
    kind \a type and compares the resulting flag of every QChar in \a str
    with the corresponding bit of \a pattern.

    \a pattern holds one bit per QChar, most significant bit first: bit
    (str.size() - 1) describes str[0] and bit 0 describes the last QChar.
    A set bit means the flag selected by \a type is expected to be set.

    \a type must be exactly one of GraphemeBreaks, WordBreaks or
    SentenceBreaks; any other value hits Q_UNREACHABLE().

    \a str must be shorter than the number of bits in qulonglong.

    A mismatch fails the current test through QVERIFY2, which returns from
    this helper only, not from the calling test function.
*/
static void verifyCharClassPattern(QString str, qulonglong pattern,
                                   QUnicodeTools::CharAttributeOptions type)
{
    // Check the length before shifting: a shift count that is not smaller
    // than the width of the type is undefined behaviour.
    Q_ASSERT(str.size() < std::numeric_limits<qulonglong>::digits);

    QUnicodeTools::ScriptItemArray scriptItems;
    QUnicodeTools::initScripts(str, &scriptItems);

    // Start from all-zero attributes; the list has one extra trailing entry.
    QCharAttributes cleared;
    memset(&cleared, 0, sizeof(QCharAttributes));
    QList<QCharAttributes> attributes(str.size() + 1, cleared);
    QUnicodeTools::initCharAttributes(str, scriptItems.data(), scriptItems.count(),
                                      attributes.data(), type);

    // Walk the pattern from its most significant used bit downwards.
    qulonglong bit = 1ull << str.size();
    for (qsizetype i = 0; i < str.size(); ++i) {
        bit >>= 1;
        const bool expected = (pattern & bit) != 0;

        bool isSet = false;
        switch (type) {
        case QUnicodeTools::GraphemeBreaks:
            isSet = attributes[i].graphemeBoundary;
            break;
        case QUnicodeTools::WordBreaks:
            isSet = attributes[i].wordBreak;
            break;
        case QUnicodeTools::SentenceBreaks:
            isSet = attributes[i].sentenceBoundary;
            break;
        default:
            Q_UNREACHABLE();
            break;
        }

        QVERIFY2(isSet == expected,
                 qPrintable(QString("Character #%1: 0x%2, isSet: %3")
                            .arg(i).arg(str[i].unicode(), 0, 16).arg(isSet)));
    }
}
|
||||||
|
|
||||||
|
void tst_QUnicodeTools::graphemeBreakClass_data()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QString>("str");
|
||||||
|
QTest::addColumn<int>("pattern");
|
||||||
|
|
||||||
|
// A grapheme cluster is a set of unicode code points that is
|
||||||
|
// seen as a single character.
|
||||||
|
// The pattern has one bit per code point.
|
||||||
|
// A pattern bit is set whenever a new grapheme cluster begins.
|
||||||
|
// A pattern bit is cleared for every code point that modifies
|
||||||
|
// the current graphene cluster.
|
||||||
|
|
||||||
|
QTest::addRow("g and combining diaeresis")
|
||||||
|
<< u8"g\u0308"
|
||||||
|
<< 0b10;
|
||||||
|
QTest::addRow("hangul gag single")
|
||||||
|
<< u8"\uAC01"
|
||||||
|
<< 0b1;
|
||||||
|
QTest::addRow("hangul gag cluster")
|
||||||
|
<< u8"\u1100\u1161\u11A8"
|
||||||
|
<< 0b100;
|
||||||
|
QTest::addRow("thai ko")
|
||||||
|
<< u8"\u0E01"
|
||||||
|
<< 0b1;
|
||||||
|
QTest::addRow("tamil ni")
|
||||||
|
<< u8"\u0BA8\u0BBF"
|
||||||
|
<< 0b10;
|
||||||
|
QTest::addRow("thai e")
|
||||||
|
<< u8"\u0E40"
|
||||||
|
<< 0b1;
|
||||||
|
QTest::addRow("thai kam")
|
||||||
|
<< u8"\u0E01\u0E33"
|
||||||
|
<< 0b10;
|
||||||
|
QTest::addRow("devanagari ssi")
|
||||||
|
<< u8"\u0937\u093F"
|
||||||
|
<< 0b10;
|
||||||
|
QTest::addRow("thai am")
|
||||||
|
<< u8"\u0E33"
|
||||||
|
<< 0b1;
|
||||||
|
QTest::addRow("devanagari ssa")
|
||||||
|
<< u8"\u0937"
|
||||||
|
<< 0b1;
|
||||||
|
QTest::addRow("devanagari i")
|
||||||
|
<< u8"\u093F"
|
||||||
|
<< 0b1;
|
||||||
|
QTest::addRow("devanagari kshi")
|
||||||
|
<< u8"\u0915\u094D\u0937\u093F"
|
||||||
|
<< 0b1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QUnicodeTools::graphemeBreakClass()
|
||||||
|
{
|
||||||
|
QFETCH(QString, str);
|
||||||
|
QFETCH(int, pattern);
|
||||||
|
|
||||||
|
verifyCharClassPattern(str, pattern, QUnicodeTools::GraphemeBreaks);
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QUnicodeTools::wordBreakClass_data()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QString>("str");
|
||||||
|
QTest::addColumn<qulonglong>("pattern");
|
||||||
|
|
||||||
|
// Word boundaries are used for things like selection and whole word search.
|
||||||
|
// Typically they are beginning of words, whitespaces and punctuation.
|
||||||
|
|
||||||
|
QTest::addRow("two words")
|
||||||
|
<< "two words"
|
||||||
|
<< 0b100110000ULL;
|
||||||
|
// breaks at beginning of words and space
|
||||||
|
QTest::addRow("three words")
|
||||||
|
<< "The quick fox"
|
||||||
|
<< 0b1001100001100ULL;
|
||||||
|
// breaks at beginning of words and spaces
|
||||||
|
QTest::addRow("quoted")
|
||||||
|
<< u8"The quick (\"brown\") fox"
|
||||||
|
<< 0b10011000011'110000'111100ULL;
|
||||||
|
// as above plus quotes and parentesis
|
||||||
|
QTest::addRow("long")
|
||||||
|
<< "The quick (\"brown\") fox can’t jump 32.3 feet, right?"
|
||||||
|
<< 0b10011000011'110000'11110011000011000110001100011100001ULL;
|
||||||
|
// as above plus commma and question mark
|
||||||
|
// but decimal separator and apostrophes are not word breaks
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QUnicodeTools::wordBreakClass()
|
||||||
|
{
|
||||||
|
QFETCH(QString, str);
|
||||||
|
QFETCH(qulonglong, pattern);
|
||||||
|
|
||||||
|
verifyCharClassPattern(str, pattern, QUnicodeTools::WordBreaks);
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QUnicodeTools::sentenceBreakClass_data()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QString>("str");
|
||||||
|
QTest::addColumn<qulonglong>("pattern");
|
||||||
|
|
||||||
|
// Sentence boundaries are at the beginning of each new sentence
|
||||||
|
|
||||||
|
QTest::addRow("one sentence")
|
||||||
|
<< "One sentence."
|
||||||
|
<< 0b1000000000000ULL;
|
||||||
|
QTest::addRow("two sentences")
|
||||||
|
<< "One sentence. One more."
|
||||||
|
<< 0b10000000000000100000000ULL;
|
||||||
|
QTest::addRow("question")
|
||||||
|
<< "Who said \"Hey you?\" I did."
|
||||||
|
<< 0b100000000'000000000'00100000ULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QUnicodeTools::sentenceBreakClass()
|
||||||
|
{
|
||||||
|
QFETCH(QString, str);
|
||||||
|
QFETCH(qulonglong, pattern);
|
||||||
|
|
||||||
|
verifyCharClassPattern(str, pattern, QUnicodeTools::SentenceBreaks);
|
||||||
|
}
|
||||||
|
|
||||||
|
QTEST_APPLESS_MAIN(tst_QUnicodeTools)
|
||||||
|
#include "tst_qunicodetools.moc"
|
Loading…
x
Reference in New Issue
Block a user