Round float->qfloat16 to even

Calibrated to match F16C and ARM-FP16 hardware conversions.

Change-Id: I3bdd4d3db3046fee4aeb24e4ce8b9bc9a06e0397
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2020-07-29 12:11:35 +0200
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2020-09-06 12:35:12 +0200
commit: d3ff95dcb84861e8f42b480910d822b4ca8715b1 (patch)
tree: 7ade0148126e83ed589258983b632fbe3488d7fb /util/qfloat16-tables
parent: a0e0b51001edfc1c7aea113c472ce995efa833fd (diff)
1 files changed, 31 insertions, 3 deletions
diff --git a/util/qfloat16-tables/gen_qfloat16_tables.cpp b/util/qfloat16-tables/gen_qfloat16_tables.cpp
index 5d7bab01d2..af172c5513 100644
--- a/util/qfloat16-tables/gen_qfloat16_tables.cpp
+++ b/util/qfloat16-tables/gen_qfloat16_tables.cpp
@@ -2,6 +2,7 @@
 **
 ** Copyright (C) 2016 by Southwest Research Institute (R)
 ** Copyright (C) 2019 Intel Corporation.
+** Copyright (C) 2020 The Qt Company Ltd.
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@@ -71,6 +72,7 @@ uint32_t convertmantissa(int32_t i)
 // to more closely map the implementation given in the paper.
 uint32_t basetable[512];
 uint32_t shifttable[512];
+uint32_t roundtable[512];
 
 int main()
 {
@@ -113,50 +115,76 @@ int main()
     int32_t e;
     for (i = 0; i < 256; ++i) {
         e = i - 127;
-        if (e < -24) {   // Very small numbers map to zero
+        if (e < -25) {   // Very small numbers map to zero
             basetable[i | 0x000] = 0x0000;
             basetable[i | 0x100] = 0x8000;
             shifttable[i | 0x000] = 24;
             shifttable[i | 0x100] = 24;
+            roundtable[i | 0x000] = 0;
+            roundtable[i | 0x100] = 0;
 
         } else if (e < -14) {             // Small numbers map to denorms
             basetable[i | 0x000] = (0x0400 >> (-e - 14));
             basetable[i | 0x100] = (0x0400 >> (-e - 14)) | 0x8000;
             shifttable[i | 0x000] = -e - 1;
             shifttable[i | 0x100] = -e - 1;
+            if (e == -25) {
+                // rounds up
+                roundtable[i | 0x000] = (1 << 24);
+                roundtable[i | 0x100] = (1 << 24);
+            } else if (e == -24) {
+                // rounds half up
+                roundtable[i | 0x000] = (1 << 22) + 1;
+                roundtable[i | 0x100] = (1 << 22) + 1;
+            } else {
+                roundtable[i | 0x000] = (1 << (-e - 2));
+                roundtable[i | 0x100] = (1 << (-e - 2));
+            }
 
         } else if (e <= 15) {            // Normal numbers just lose precision
             basetable[i | 0x000] = ((e + 15) << 10);
             basetable[i | 0x100] = ((e + 15) << 10) | 0x8000;
             shifttable[i | 0x000] = 13;
             shifttable[i | 0x100] = 13;
+            roundtable[i | 0x000] = (1 << 12);
+            roundtable[i | 0x100] = (1 << 12);
 
         } else if (e < 128) {            // Large numbers map to Infinity
             basetable[i | 0x000] = 0x7C00;
             basetable[i | 0x100] = 0xFC00;
             shifttable[i | 0x000] = 24;
             shifttable[i | 0x100] = 24;
+            roundtable[i | 0x000] = 0;
+            roundtable[i | 0x100] = 0;
 
         } else {                     // Infinity and NaN's stay Infinity and NaN's
             basetable[i | 0x000] = 0x7C00;
             basetable[i | 0x100] = 0xFC00;
             shifttable[i | 0x000] = 13;
             shifttable[i | 0x100] = 13;
+            roundtable[i | 0x000] = 0;
+            roundtable[i | 0x100] = 0;
         }
     }
 
-    printf("const quint32 qfloat16::basetable[512] = {\n");
+    printf("const quint16 qfloat16::basetable[512] = {\n");
     for (i = 0; i < 512; i++)
         printf("0x%XU,\n", basetable[i]);
 
     printf("};\n\n");
 
-    printf("const quint32 qfloat16::shifttable[512] = {\n");
+    printf("const quint16 qfloat16::shifttable[512] = {\n");
     for (i = 0; i < 512; i++)
         printf("0x%XU,\n", shifttable[i]);
 
     printf("};\n\n");
 
+    printf("const quint32 qfloat16::roundtable[512] = {\n");
+    for (i = 0; i < 512; i++)
+        printf("0x%XU,\n", roundtable[i]);
+
+    printf("};\n\n");
+
     printf("#endif // !__ARM_FP16_FORMAT_IEEE\n\n");
     printf("QT_END_NAMESPACE\n");
     return 0;
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2020-07-29 12:11:35 +0200
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2020-09-06 12:35:12 +0200
commit	d3ff95dcb84861e8f42b480910d822b4ca8715b1 (patch)
tree	7ade0148126e83ed589258983b632fbe3488d7fb /util/qfloat16-tables
parent	a0e0b51001edfc1c7aea113c472ce995efa833fd (diff)