summaryrefslogtreecommitdiffstats
path: root/util
diff options
context:
space:
mode:
Diffstat (limited to 'util')
-rw-r--r-- util/unicode/main.cpp 9
1 files changed, 9 insertions, 0 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index 7bc667ca14..8e612f0b03 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -1572,6 +1572,15 @@ static void readWordBreak()
qFatal("unassigned word break class: %s", l[1].constData());
for (int codepoint = from; codepoint <= to; ++codepoint) {
+ // ### [
+ // As of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet,
+ // which caused "hi.there" to be treated as if it were a single word.
+ // Until we have a tailoring mechanism, retain the old behavior by remapping those characters here.
+ if (codepoint == 0x002E) // FULL STOP
+ brk = WordBreak_MidNum;
+ else if (codepoint == 0x003A) // COLON
+ brk = WordBreak_Other;
+ // ] ###
UnicodeData &ud = UnicodeData::valueRef(codepoint);
ud.p.wordBreakClass = brk;
}