3 files changed, 89 insertions, 4 deletions
diff --git a/util/local_database/cldr2qlocalexml.py b/util/local_database/cldr2qlocalexml.py
index fbc28ca712..58ea21edab 100755
--- a/util/local_database/cldr2qlocalexml.py
+++ b/util/local_database/cldr2qlocalexml.py
@@ -86,6 +86,47 @@ def parse_list_pattern_part_format(pattern):
     # This is a very limited parsing of the format for list pattern part only.
     return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")
 
+def unit_quantifiers(find, path, stem, suffix, known,
+                     # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
+                     # 1000^7 < zebi = 2^{70}, the next quantifiers up:
+                     si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
+    """Work out the unit quantifiers.
+
+    Unfortunately, the CLDR data only go up to terabytes and we want
+    all the way to exabytes; but we can recognize the SI quantifiers
+    as prefixes, strip and identify the tail as the localized
+    translation for 'B' (e.g. French has 'octet' for 'byte' and uses
+    ko, Mo, Go, To from which we can extrapolate Po, Eo).
+
+    Should be called first for the SI quantifiers, with suffix = 'B',
+    then for the IEC ones, with suffix = 'iB'; the list known
+    (initially empty before first call) is used to let the second call
+    know what the first learned about the localized unit.
+    """
+    if suffix == 'B': # first call, known = []
+        tail = suffix
+        for q in si_quantifiers:
+            it = find(path, stem % q)
+            # kB for kilobyte, in contrast with KiB for IEC:
+            q = q[0] if q == 'kilo' else q[0].upper()
+            if not it:
+                it = q + tail
+            elif it.startswith(q):
+                rest = it[1:]
+                tail = rest if all(rest == k for k in known) else suffix
+                known.append(rest)
+            yield it
+    else: # second call, re-using first's known
+        assert suffix == 'iB'
+        if known:
+            byte = known.pop()
+            if all(byte == k for k in known):
+                suffix = 'i' + byte
+        for q in si_quantifiers:
+            yield find(path, stem % q[:2],
+                       # Those don't (yet, v31) exist in CLDR, so we always fall back to:
+                       q[0].upper() + suffix)
+
 def generateLocaleInfo(path):
     if not path.endswith(".xml"):
         return {}
@@ -261,6 +302,34 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
                 '[count=%s]' % x for x in ('zero', 'one', 'two', 'few', 'many', 'other')
                 ]) + ';'
 
+    def findUnitDef(path, stem, fallback=''):
+        # The displayName for a quantified unit in en.xml is kByte
+        # instead of kB (etc.), so prefer any unitPattern provided:
+        for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
+            try:
+                ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
+            except xpathlite.Error:
+                continue
+
+            # TODO: exploit count-handling, instead of discarding placeholders
+            if ans.startswith('{0}'):
+                ans = ans[3:].lstrip()
+            if ans:
+                return ans
+
+        return findEntryDef(path, stem + 'displayName', fallback)
+
+    # First without quantifier, then quantified each way:
+    result['byte_unit'] = findEntryDef(
+        path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
+        'bytes')
+    stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
+    known = [] # cases where we *do* have a given version:
+    result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
+    # IEC 60027-2
+    # http://physics.nist.gov/cuu/Units/binary.html
+    result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))
+
     # Used for month and day data:
     namings = (
         ('standaloneLong', 'stand-alone', 'wide'),
diff --git a/util/local_database/localexml.py b/util/local_database/localexml.py
index 6db10e2b9a..a47fa6a5ff 100644
--- a/util/local_database/localexml.py
+++ b/util/local_database/localexml.py
@@ -111,6 +111,7 @@ class Locale:
     __astxt = ("language", "languageEndonym", "script", "country", "countryEndonym",
                "listPatternPartStart", "listPatternPartMiddle",
                "listPatternPartEnd", "listPatternPartTwo", "am", "pm",
+               'byte_unit', 'byte_si_quantified', 'byte_iec_quantified',
                "currencyIsoCode", "currencySymbol", "currencyDisplayName",
                "currencyFormat", "currencyNegativeFormat"
                ) + tuple(propsMonthDay())
@@ -169,6 +170,7 @@ class Locale:
                     'alternateQuotationStart', 'alternateQuotationEnd',
                     'listPatternPartStart', 'listPatternPartMiddle',
                     'listPatternPartEnd', 'listPatternPartTwo',
+                    'byte_unit', 'byte_si_quantified', 'byte_iec_quantified',
                     'am', 'pm', 'firstDayOfWeek',
                     'weekendStart', 'weekendEnd',
                     'longDateFormat', 'shortDateFormat',
@@ -180,7 +182,7 @@ class Locale:
                     'standaloneLongDays', 'standaloneShortDays', 'standaloneNarrowDays',
                     'currencyIsoCode', 'currencySymbol', 'currencyDisplayName',
                     'currencyFormat', 'currencyNegativeFormat'):
-            ent = camelCase(key.split('_')) if '_' in key else key
+            ent = camelCase(key.split('_')) if key.endswith('_endonym') else key
             print inner + "<%s>%s</%s>" % (ent, escape(get(key)).encode('utf-8'), ent)
 
         for key in ('currencyDigits', 'currencyRounding'):
@@ -198,7 +200,8 @@ class Locale:
           months = ('January', 'February', 'March', 'April', 'May', 'June', 'July',
                     'August', 'September', 'October', 'November', 'December', ''),
           days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
-                  'Thursday', 'Friday', 'Saturday', '')):
+                  'Thursday', 'Friday', 'Saturday', ''),
+          quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
         """Returns an object representing the C locale."""
         return cls(language='C', language_code='0', language_endonym='',
                    script='AnyScript', script_code='0',
@@ -211,6 +214,9 @@ class Locale:
                    listPatternPartMiddle='%1, %2',
                    listPatternPartEnd='%1, %2',
                    listPatternPartTwo='%1, %2',
+                   byte_unit='bytes',
+                   byte_si_quantified=';'.join(q + 'B' for q in quantifiers),
+                   byte_iec_quantified=';'.join(q.upper() + 'iB' for q in quantifiers),
                    am='AM', pm='PM', firstDayOfWeek='mon',
                    weekendStart='sat', weekendEnd='sun',
                    longDateFormat='EEEE, d MMMM yyyy', shortDateFormat='d MMM yyyy',
diff --git a/util/local_database/qlocalexml2cpp.py b/util/local_database/qlocalexml2cpp.py
index baa5a60263..0f10f8ce2d 100755
--- a/util/local_database/qlocalexml2cpp.py
+++ b/util/local_database/qlocalexml2cpp.py
@@ -445,6 +445,7 @@ def main():
     days_data = StringData('days_data')
     am_data = StringData('am_data')
     pm_data = StringData('pm_data')
+    byte_unit_data = StringData('byte_unit_data')
     currency_symbol_data = StringData('currency_symbol_data')
     currency_display_name_data = StringData('currency_display_name_data')
     currency_format_data = StringData('currency_format_data')
@@ -494,6 +495,10 @@ def main():
                          + '    nDays   '
                          + '     am     ' # am/pm indicators
                          + '     pm     '
+                         # Width 8 + comma
+                         + '  byte   '
+                         + ' siQuant '
+                         + 'iecQuant '
                          # Width 8+4 + comma
                          + '   currISO   '
                          # Width 11 + comma:
@@ -527,6 +532,8 @@ def main():
                    + '%8d,' * 4
                    # List patterns, date/time formats, month/day names, am/pm:
                    + '%11s,' * 22
+                   # Byte unit name, then its SI- and IEC-quantified abbreviations:
+                   + '%8s,' * 3
                    # Currency ISO code:
                    + ' %10s, '
                    # Currency and endonyms
@@ -574,6 +581,9 @@ def main():
                         days_data.append(l.narrowDays),
                         am_data.append(l.am),
                         pm_data.append(l.pm),
+                        byte_unit_data.append(l.byte_unit),
+                        byte_unit_data.append(l.byte_si_quantified),
+                        byte_unit_data.append(l.byte_iec_quantified),
                         currencyIsoCodeData(l.currencyIsoCode),
                         currency_symbol_data.append(l.currencySymbol),
                         currency_display_name_data.append(l.currencyDisplayName),
@@ -588,7 +598,7 @@ def main():
                         l.weekendEnd)
                              + ", // %s/%s/%s\n" % (l.language, l.script, l.country))
     data_temp_file.write(line_format # All zeros, matching the format:
-                         % ( (0,) * (3 + 8 + 4) + ("0,0",) * 22
+                         % ( (0,) * (3 + 8 + 4) + ("0,0",) * (22 + 3)
                              + (currencyIsoCodeData(0),)
                              + ("0,0",) * 6 + (0,) * (2 + 3))
                          + " // trailing 0s\n")
@@ -597,7 +607,7 @@ def main():
     # StringData tables:
     for data in (list_pattern_part_data, date_format_data,
                  time_format_data, months_data, days_data,
-                 am_data, pm_data, currency_symbol_data,
+                 byte_unit_data, am_data, pm_data, currency_symbol_data,
                  currency_display_name_data, currency_format_data,
                  endonyms_data):
         data_temp_file.write("\nstatic const ushort %s[] = {\n" % data.name)