summaryrefslogtreecommitdiffstats
path: root/util
diff options
context:
space:
mode:
authorEdward Welbourne <edward.welbourne@qt.io>2017-05-30 14:55:33 +0200
committerEdward Welbourne <edward.welbourne@qt.io>2017-06-13 11:28:09 +0000
commit424d9e9e56314bae09a0dcbf09be0e3c0c9e0ac6 (patch)
tree24f213cbb1f3b6be0a8c865fc901007965a27bcf /util
parent536b918ecaed0b8a04ca2b7c0884eea12ddb6931 (diff)
Add byte-based units to CLDR data
Scan CLDR for {,kilo,mega,giga,tera,peta,exa}byte forms and their IEC equivalents, providing SI and IEC defaults when missing (which all of IEC are) in addition to the usual numeric data. Extrapolate from any present data (e.g. French's ko, Mo, Go, To imply Po, Eo and, for IEC, Kio, Mio, etc.), since CLDR only goes up to tera. Propagate this data to QLocale's database ready for use by QLocale::formattedDataSize(). Change-Id: Ie6ee978948c68be9f71ab784a128cbfae3d80ee1 Reviewed-by: Shawn Rutledge <shawn.rutledge@qt.io>
Diffstat (limited to 'util')
-rwxr-xr-xutil/local_database/cldr2qlocalexml.py69
-rw-r--r--util/local_database/localexml.py10
-rwxr-xr-xutil/local_database/qlocalexml2cpp.py14
3 files changed, 89 insertions, 4 deletions
diff --git a/util/local_database/cldr2qlocalexml.py b/util/local_database/cldr2qlocalexml.py
index fbc28ca712..58ea21edab 100755
--- a/util/local_database/cldr2qlocalexml.py
+++ b/util/local_database/cldr2qlocalexml.py
@@ -86,6 +86,47 @@ def parse_list_pattern_part_format(pattern):
# This is a very limited parsing of the format for list pattern part only.
return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")
+def unit_quantifiers(find, path, stem, suffix, known,
+ # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
+ # 1000^7 < zebi = 2^{70}, the next quantifiers up:
+ si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
+ """Work out the unit quantifiers.
+
+ Unfortunately, the CLDR data only go up to terabytes and we want
+ all the way to exabytes; but we can recognize the SI quantifiers
+ as prefixes, strip and identify the tail as the localized
+ translation for 'B' (e.g. French has 'octet' for 'byte' and uses
+ ko, Mo, Go, To from which we can extrapolate Po, Eo).
+
+ Should be called first for the SI quantifiers, with suffix = 'B',
+ then for the IEC ones, with suffix = 'iB'; the list known
+ (initially empty before first call) is used to let the second call
+ know what the first learned about the localized unit.
+ """
+ if suffix == 'B': # first call, known = []
+ tail = suffix
+ for q in si_quantifiers:
+ it = find(path, stem % q)
+ # kB for kilobyte, in contrast with KiB for IEC:
+ q = q[0] if q == 'kilo' else q[0].upper()
+ if not it:
+ it = q + tail
+ elif it.startswith(q):
+ rest = it[1:]
+ tail = rest if all(rest == k for k in known) else suffix
+ known.append(rest)
+ yield it
+ else: # second call, re-using first's known
+ assert suffix == 'iB'
+ if known:
+ byte = known.pop()
+ if all(byte == k for k in known):
+ suffix = 'i' + byte
+ for q in si_quantifiers:
+ yield find(path, stem % q[:2],
+ # Those don't (yet, v31) exist in CLDR, so we always fall back to:
+ q[0].upper() + suffix)
+
def generateLocaleInfo(path):
if not path.endswith(".xml"):
return {}
@@ -261,6 +302,34 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
'[count=%s]' % x for x in ('zero', 'one', 'two', 'few', 'many', 'other')
]) + ';'
+ def findUnitDef(path, stem, fallback=''):
+ # The displayName for a quantified unit in en.xml is kByte
+ # instead of kB (etc.), so prefer any unitPattern provided:
+ for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
+ try:
+ ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
+ except xpathlite.Error:
+ continue
+
+ # TODO: epxloit count-handling, instead of discarding placeholders
+ if ans.startswith('{0}'):
+ ans = ans[3:].lstrip()
+ if ans:
+ return ans
+
+ return findEntryDef(path, stem + 'displayName', fallback)
+
+ # First without quantifier, then quantified each way:
+ result['byte_unit'] = findEntryDef(
+ path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
+ 'bytes')
+ stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
+ known = [] # cases where we *do* have a given version:
+ result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
+ # IEC 60027-2
+ # http://physics.nist.gov/cuu/Units/binary.html
+ result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))
+
# Used for month and day data:
namings = (
('standaloneLong', 'stand-alone', 'wide'),
diff --git a/util/local_database/localexml.py b/util/local_database/localexml.py
index 6db10e2b9a..a47fa6a5ff 100644
--- a/util/local_database/localexml.py
+++ b/util/local_database/localexml.py
@@ -111,6 +111,7 @@ class Locale:
__astxt = ("language", "languageEndonym", "script", "country", "countryEndonym",
"listPatternPartStart", "listPatternPartMiddle",
"listPatternPartEnd", "listPatternPartTwo", "am", "pm",
+ 'byte_unit', 'byte_si_quantified', 'byte_iec_quantified',
"currencyIsoCode", "currencySymbol", "currencyDisplayName",
"currencyFormat", "currencyNegativeFormat"
) + tuple(propsMonthDay())
@@ -169,6 +170,7 @@ class Locale:
'alternateQuotationStart', 'alternateQuotationEnd',
'listPatternPartStart', 'listPatternPartMiddle',
'listPatternPartEnd', 'listPatternPartTwo',
+ 'byte_unit', 'byte_si_quantified', 'byte_iec_quantified',
'am', 'pm', 'firstDayOfWeek',
'weekendStart', 'weekendEnd',
'longDateFormat', 'shortDateFormat',
@@ -180,7 +182,7 @@ class Locale:
'standaloneLongDays', 'standaloneShortDays', 'standaloneNarrowDays',
'currencyIsoCode', 'currencySymbol', 'currencyDisplayName',
'currencyFormat', 'currencyNegativeFormat'):
- ent = camelCase(key.split('_')) if '_' in key else key
+ ent = camelCase(key.split('_')) if key.endswith('_endonym') else key
print inner + "<%s>%s</%s>" % (ent, escape(get(key)).encode('utf-8'), ent)
for key in ('currencyDigits', 'currencyRounding'):
@@ -198,7 +200,8 @@ class Locale:
months = ('January', 'February', 'March', 'April', 'May', 'June', 'July',
'August', 'September', 'October', 'November', 'December', ''),
days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
- 'Thursday', 'Friday', 'Saturday', '')):
+ 'Thursday', 'Friday', 'Saturday', ''),
+ quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
"""Returns an object representing the C locale."""
return cls(language='C', language_code='0', language_endonym='',
script='AnyScript', script_code='0',
@@ -211,6 +214,9 @@ class Locale:
listPatternPartMiddle='%1, %2',
listPatternPartEnd='%1, %2',
listPatternPartTwo='%1, %2',
+ byte_unit='bytes',
+ byte_si_quantified=';'.join(q + 'B' for q in quantifiers),
+ byte_iec_quantified=';'.join(q.upper() + 'iB' for q in quantifiers),
am='AM', pm='PM', firstDayOfWeek='mon',
weekendStart='sat', weekendEnd='sun',
longDateFormat='EEEE, d MMMM yyyy', shortDateFormat='d MMM yyyy',
diff --git a/util/local_database/qlocalexml2cpp.py b/util/local_database/qlocalexml2cpp.py
index baa5a60263..0f10f8ce2d 100755
--- a/util/local_database/qlocalexml2cpp.py
+++ b/util/local_database/qlocalexml2cpp.py
@@ -445,6 +445,7 @@ def main():
days_data = StringData('days_data')
am_data = StringData('am_data')
pm_data = StringData('pm_data')
+ byte_unit_data = StringData('byte_unit_data')
currency_symbol_data = StringData('currency_symbol_data')
currency_display_name_data = StringData('currency_display_name_data')
currency_format_data = StringData('currency_format_data')
@@ -494,6 +495,10 @@ def main():
+ ' nDays '
+ ' am ' # am/pm indicators
+ ' pm '
+ # Width 8 + comma
+ + ' byte '
+ + ' siQuant '
+ + 'iecQuant '
# Width 8+4 + comma
+ ' currISO '
# Width 11 + comma:
@@ -527,6 +532,8 @@ def main():
+ '%8d,' * 4
# List patterns, date/time formats, month/day names, am/pm:
+ '%11s,' * 22
+ # SI/IEC byte-unit abbreviations:
+ + '%8s,' * 3
# Currency ISO code:
+ ' %10s, '
# Currency and endonyms
@@ -574,6 +581,9 @@ def main():
days_data.append(l.narrowDays),
am_data.append(l.am),
pm_data.append(l.pm),
+ byte_unit_data.append(l.byte_unit),
+ byte_unit_data.append(l.byte_si_quantified),
+ byte_unit_data.append(l.byte_iec_quantified),
currencyIsoCodeData(l.currencyIsoCode),
currency_symbol_data.append(l.currencySymbol),
currency_display_name_data.append(l.currencyDisplayName),
@@ -588,7 +598,7 @@ def main():
l.weekendEnd)
+ ", // %s/%s/%s\n" % (l.language, l.script, l.country))
data_temp_file.write(line_format # All zeros, matching the format:
- % ( (0,) * (3 + 8 + 4) + ("0,0",) * 22
+ % ( (0,) * (3 + 8 + 4) + ("0,0",) * (22 + 3)
+ (currencyIsoCodeData(0),)
+ ("0,0",) * 6 + (0,) * (2 + 3))
+ " // trailing 0s\n")
@@ -597,7 +607,7 @@ def main():
# StringData tables:
for data in (list_pattern_part_data, date_format_data,
time_format_data, months_data, days_data,
- am_data, pm_data, currency_symbol_data,
+ byte_unit_data, am_data, pm_data, currency_symbol_data,
currency_display_name_data, currency_format_data,
endonyms_data):
data_temp_file.write("\nstatic const ushort %s[] = {\n" % data.name)