QCharAttributes: add wordStart/wordEnd flags

A simple heuristic detects word beginnings and endings by looking at
the word break property values of the surrounding characters.
This behaves better than the whitespace-based implementation used before
and makes it possible to tailor the default algorithm for complex scripts.

BIG FAT WARNING: The QCharAttributes buffer must now hold
                 string length + 1 entries; the extra entry stores the
                 flags for the end of the text.

Task-Id: QTBUG-6498

Change-Id: I5589b191ffde6a50d2af0c14a00430d3852c67b4
Reviewed-by: Konstantin Ritt <ritt.ks@gmail.com>
This commit is contained in:
Konstantin Ritt 2012-09-25 23:55:54 +03:00 committed by The Qt Project
parent aeb21c73c5
commit a798b956b9
6 changed files with 143 additions and 45 deletions

View File

@ -133,7 +133,9 @@ typedef struct {
hb_bitfield sentenceBoundary : 1;
hb_bitfield lineBreak : 1;
hb_bitfield whiteSpace : 1; /* A unicode whitespace character */
hb_bitfield unused : 3;
hb_bitfield wordStart : 1;
hb_bitfield wordEnd : 1;
hb_bitfield unused : 1;
} HB_CharAttributes;
void HB_GetTailoredCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,

View File

@ -395,8 +395,10 @@ static void HB_ThaiAssignAttributes(const HB_UChar16 *string, hb_uint32 len, HB_
to_tis620(string, len, cstr);
for (i = 0; i < len; ++i) {
attributes[i].lineBreak = FALSE;
attributes[i].wordBreak = FALSE;
attributes[i].wordStart = FALSE;
attributes[i].wordEnd = FALSE;
attributes[i].lineBreak = FALSE;
}
if (len > 128) {
@ -411,11 +413,17 @@ static void HB_ThaiAssignAttributes(const HB_UChar16 *string, hb_uint32 len, HB_
if (break_positions) {
attributes[0].wordBreak = TRUE;
attributes[0].wordStart = TRUE;
attributes[0].wordEnd = FALSE;
numbreaks = th_brk((const unsigned char *)cstr, break_positions, brp_size);
for (i = 0; i < numbreaks; ++i) {
attributes[break_positions[i]].wordBreak = TRUE;
attributes[break_positions[i]].wordStart = TRUE;
attributes[break_positions[i]].wordEnd = TRUE;
attributes[break_positions[i]].lineBreak = TRUE;
}
if (numbreaks > 0)
attributes[break_positions[numbreaks - 1]].wordStart = FALSE;
if (break_positions != brp)
free(break_positions);

View File

@ -89,7 +89,7 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int
scriptItems.append(item);
}
QUnicodeTools::CharAttributeOptions options = QUnicodeTools::WhiteSpaces;
QUnicodeTools::CharAttributeOptions options = 0;
switch (type) {
case QTextBoundaryFinder::Grapheme: options |= QUnicodeTools::GraphemeBreaks; break;
case QTextBoundaryFinder::Word: options |= QUnicodeTools::WordBreaks; break;
@ -189,9 +189,9 @@ QTextBoundaryFinder::QTextBoundaryFinder(const QTextBoundaryFinder &other)
, pos(other.pos)
, freePrivate(true)
{
d = (QTextBoundaryFinderPrivate *) malloc(length*sizeof(QCharAttributes));
d = (QTextBoundaryFinderPrivate *) malloc((length + 1) * sizeof(QCharAttributes));
Q_CHECK_PTR(d);
memcpy(d, other.d, length*sizeof(QCharAttributes));
memcpy(d, other.d, (length + 1) * sizeof(QCharAttributes));
}
/*!
@ -209,11 +209,11 @@ QTextBoundaryFinder &QTextBoundaryFinder::operator=(const QTextBoundaryFinder &o
pos = other.pos;
QTextBoundaryFinderPrivate *newD = (QTextBoundaryFinderPrivate *)
realloc(freePrivate ? d : 0, length*sizeof(QCharAttributes));
realloc(freePrivate ? d : 0, (length + 1) * sizeof(QCharAttributes));
Q_CHECK_PTR(newD);
freePrivate = true;
d = newD;
memcpy(d, other.d, length*sizeof(QCharAttributes));
memcpy(d, other.d, (length + 1) * sizeof(QCharAttributes));
return *this;
}
@ -238,7 +238,7 @@ QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &strin
, pos(0)
, freePrivate(true)
{
d = (QTextBoundaryFinderPrivate *) malloc(length*sizeof(QCharAttributes));
d = (QTextBoundaryFinderPrivate *) malloc((length + 1) * sizeof(QCharAttributes));
Q_CHECK_PTR(d);
init(t, chars, length, d->attributes);
}
@ -249,7 +249,8 @@ QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &strin
\a buffer is an optional working buffer of size \a bufferSize you can pass to
the QTextBoundaryFinder. If the buffer is large enough to hold the working
data required, it will use this instead of allocating its own buffer.
data required (bufferSize >= length + 1), it will use this
instead of allocating its own buffer.
\warning QTextBoundaryFinder does not create a copy of \a chars. It is the
application programmer's responsibility to ensure the array is allocated for
@ -262,11 +263,11 @@ QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QChar *chars,
, length(length)
, pos(0)
{
if (buffer && (uint)bufferSize >= length*sizeof(QCharAttributes)) {
if (buffer && (uint)bufferSize >= (length + 1) * sizeof(QCharAttributes)) {
d = (QTextBoundaryFinderPrivate *)buffer;
freePrivate = false;
} else {
d = (QTextBoundaryFinderPrivate *) malloc(length*sizeof(QCharAttributes));
d = (QTextBoundaryFinderPrivate *) malloc((length + 1) * sizeof(QCharAttributes));
Q_CHECK_PTR(d);
freePrivate = true;
}
@ -455,38 +456,30 @@ bool QTextBoundaryFinder::isAtBoundary() const
*/
QTextBoundaryFinder::BoundaryReasons QTextBoundaryFinder::boundaryReasons() const
{
if (!d)
return NotAtBoundary;
if (! isAtBoundary())
return NotAtBoundary;
if (pos == 0) {
if (d->attributes[pos].whiteSpace)
return NotAtBoundary;
return StartWord;
}
if (pos == length) {
if (d->attributes[length-1].whiteSpace)
return NotAtBoundary;
return EndWord;
BoundaryReasons reasons = NotAtBoundary;
if (!d || !isAtBoundary())
return reasons;
switch (t) {
case Word:
if (d->attributes[pos].wordStart)
reasons |= StartWord;
if (d->attributes[pos].wordEnd)
reasons |= EndWord;
break;
case Line:
if (pos > 0 && chars[pos - 1].unicode() == QChar::SoftHyphen)
reasons |= SoftHyphen;
// fall through
case Grapheme:
case Sentence:
reasons |= StartWord | EndWord;
break;
default:
break;
}
if (t == Line && chars[pos - 1].unicode() == QChar::SoftHyphen)
return SoftHyphen;
if (t != Word)
return BoundaryReasons(StartWord | EndWord);
const bool nextIsSpace = d->attributes[pos].whiteSpace;
const bool prevIsSpace = d->attributes[pos - 1].whiteSpace;
if (prevIsSpace && !nextIsSpace)
return StartWord;
else if (!prevIsSpace && nextIsSpace)
return EndWord;
else if (!prevIsSpace && !nextIsSpace)
return BoundaryReasons(StartWord | EndWord);
else
return NotAtBoundary;
return reasons;
}
QT_END_NAMESPACE

View File

@ -102,6 +102,8 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes
lcls = cls;
}
attributes[len].graphemeBoundary = true; // GB2
}
@ -133,6 +135,10 @@ static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnico
static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
{
enum WordType {
WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
} currentWordType = WordTypeNone;
QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
for (quint32 i = 0; i != len; ++i) {
quint32 pos = i;
@ -178,9 +184,30 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
continue;
}
cls = ncls;
if (action == WB::Break)
if (action == WB::Break) {
attributes[pos].wordBreak = true;
if (currentWordType != WordTypeNone)
attributes[pos].wordEnd = true;
switch (cls) {
case QUnicodeTables::WordBreak_Katakana:
currentWordType = WordTypeHiraganaKatakana;
attributes[pos].wordStart = true;
break;
case QUnicodeTables::WordBreak_ALetter:
case QUnicodeTables::WordBreak_Numeric:
currentWordType = WordTypeAlphaNumeric;
attributes[pos].wordStart = true;
break;
default:
currentWordType = WordTypeNone;
break;
}
}
}
if (currentWordType != WordTypeNone)
attributes[len].wordEnd = true;
attributes[len].wordBreak = true; // WB2
}
@ -277,6 +304,8 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes
state = SB::breakTable[SB::Initial][ncls];
}
}
attributes[len].sentenceBoundary = true; // SB2
}
@ -514,6 +543,7 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at
}
attributes[0].lineBreak = false; // LB2
attributes[len].lineBreak = true; // LB3
}
@ -543,7 +573,7 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
return;
if (!(options & DontClearAttributes))
::memset(attributes, 0, length * sizeof(QCharAttributes));
::memset(attributes, 0, (length + 1) * sizeof(QCharAttributes));
if (options & GraphemeBreaks)
getGraphemeBreaks(string, length, attributes);

View File

@ -64,7 +64,9 @@ struct Q_PACKED QCharAttributes
uchar sentenceBoundary : 1;
uchar lineBreak : 1;
uchar whiteSpace : 1;
uchar unused : 3;
uchar wordStart : 1;
uchar wordEnd : 1;
uchar unused : 1;
};
Q_DECLARE_TYPEINFO(QCharAttributes, Q_PRIMITIVE_TYPE);
@ -89,6 +91,7 @@ enum CharAttributeOption {
};
Q_DECLARE_FLAGS(CharAttributeOptions, CharAttributeOption)
// attributes buffer has to have a length of string length + 1
Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
const ScriptItem *items, int numItems,
QCharAttributes *attributes, CharAttributeOptions options = DefaultOptionsCompat);

View File

@ -72,6 +72,7 @@ private slots:
void lineBoundaries_manual();
void fastConstructor();
void wordBoundaries_qtbug6498();
void isAtSoftHyphen_data();
void isAtSoftHyphen();
void thaiLineBreak();
@ -544,6 +545,67 @@ void tst_QTextBoundaryFinder::fastConstructor()
QCOMPARE(finder.boundaryReasons(), QTextBoundaryFinder::NotAtBoundary);
}
// Regression test for QTBUG-6498: walks every word boundary of a string with
// a trailing space and checks the StartWord / EndWord reasons reported at
// each position.
//
// Character offsets in the test string "Please test me. Finish ":
//   "Please" = 0..5, ' ' = 6, "test" = 7..10, ' ' = 11, "me" = 12..13,
//   '.' = 14, ' ' = 15, "Finish" = 16..21, ' ' = 22, end of text = 23.
void tst_QTextBoundaryFinder::wordBoundaries_qtbug6498()
{
// text with trailing space
QString text("Please test me. Finish ");
QTextBoundaryFinder finder(QTextBoundaryFinder::Word, text);
// Position 0: in front of "Please".
QCOMPARE(finder.position(), 0);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::StartWord);
// Position 6: right after "Please".
QCOMPARE(finder.toNextBoundary(), 6);
QCOMPARE(finder.position(), 6);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::EndWord);
// Position 7: in front of "test".
QCOMPARE(finder.toNextBoundary(), 7);
QCOMPARE(finder.position(), 7);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::StartWord);
// Position 11: right after "test".
QCOMPARE(finder.toNextBoundary(), 11);
QCOMPARE(finder.position(), 11);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::EndWord);
// Position 12: in front of "me".
QCOMPARE(finder.toNextBoundary(), 12);
QCOMPARE(finder.position(), 12);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::StartWord);
// Position 14: between "me" and the full stop.
QCOMPARE(finder.toNextBoundary(), 14);
QCOMPARE(finder.position(), 14);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::EndWord);
// Position 15: between the full stop and the following space; this is
// expected to be a boundary that neither starts nor ends a word.
QCOMPARE(finder.toNextBoundary(), 15);
QCOMPARE(finder.position(), 15);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() == QTextBoundaryFinder::NotAtBoundary);
// Position 16: in front of "Finish".
QCOMPARE(finder.toNextBoundary(), 16);
QCOMPARE(finder.position(), 16);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::StartWord);
// Position 22: right after "Finish".
QCOMPARE(finder.toNextBoundary(), 22);
QCOMPARE(finder.position(), 22);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::EndWord);
// Position 23: end of text, after the trailing space; expected to be a
// boundary without a StartWord or EndWord reason.
QCOMPARE(finder.toNextBoundary(), 23);
QCOMPARE(finder.position(), 23);
QVERIFY(finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() == QTextBoundaryFinder::NotAtBoundary);
// No boundary is left: toNextBoundary() is expected to return -1 and the
// finder to report that it is no longer at a boundary.
QCOMPARE(finder.toNextBoundary(), -1);
QCOMPARE(finder.position(), -1);
QVERIFY(!finder.isAtBoundary());
QVERIFY(finder.boundaryReasons() == QTextBoundaryFinder::NotAtBoundary);
}
void tst_QTextBoundaryFinder::isAtSoftHyphen_data()
{
QTest::addColumn<QString>("testString");
@ -568,7 +630,7 @@ void tst_QTextBoundaryFinder::isAtSoftHyphen()
QVERIFY(expectedBreakPositions.contains(i + 1));
boundaryFinder.setPosition(i + 1);
QVERIFY(boundaryFinder.isAtBoundary());
QVERIFY(boundaryFinder.boundaryReasons() == QTextBoundaryFinder::SoftHyphen);
QVERIFY(boundaryFinder.boundaryReasons() & QTextBoundaryFinder::SoftHyphen);
}
}