aboutsummaryrefslogtreecommitdiffstats
path: root/src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h')
-rw-r--r--src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h157
1 files changed, 157 insertions, 0 deletions
diff --git a/src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h b/src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h
new file mode 100644
index 00000000..5e1d7818
--- /dev/null
+++ b/src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_DICTDEF_H__
+#define PINYINIME_INCLUDE_DICTDEF_H__
+
+#include <stdlib.h>
+#include "./utf16char.h"
+
+namespace ime_pinyin {
+
+// Enable the following line when building the binary dictionary model.
+// #define ___BUILD_MODEL___
+
+typedef unsigned char uint8;  // Unsigned byte.
+typedef unsigned short uint16;  // Name assumes a 16-bit short (not guaranteed).
+typedef unsigned int uint32;  // Name assumes a 32-bit int (not guaranteed).
+
+typedef signed char int8;  // Signed byte.
+typedef short int16;  // Name assumes a 16-bit short (not guaranteed).
+typedef int int32;  // Name assumes a 32-bit int (not guaranteed).
+typedef long long int64;  // long long is at least 64 bits wide.
+typedef unsigned long long uint64;  // unsigned long long is at least 64 bits.
+
+const bool kPrintDebug0 = false;  // Debug flag 0, off; presumably gates debug output.
+const bool kPrintDebug1 = false;  // Debug flag 1, off; presumably gates debug output.
+const bool kPrintDebug2 = false;  // Debug flag 2, off; presumably gates debug output.
+
+// The max length of a lemma (sizes the per-character arrays in LemmaEntry).
+const size_t kMaxLemmaSize = 8;
+
+// The max length of a Pinyin (spelling).
+const size_t kMaxPinyinSize = 6;
+
+// The number of half spelling ids. For Chinese Pinyin, there are 30 half ids
+// (NOTE(review): the constant below is 29 - confirm). See SpellingTrie.h.
+const size_t kHalfSpellingIdNum = 29;
+
+// The maximum number of full spellings. For Chinese Pinyin, there are only
+// about 410 spellings.
+// If this value is made bigger (so that it needs more bits), please also update
+// other structures like SpellingNode, to make sure that a spelling id can
+// still be stored.
+// The -1 is there because 0 is never used.
+const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;  // = 482
+const size_t kMaxSearchSteps = 40;  // NOTE(review): undocumented; presumably a search-step cap.
+
+// One character predicts its following characters.
+const size_t kMaxPredictSize = (kMaxLemmaSize - 1);  // = 7
+
+// LemmaIdType must always be size_t.
+typedef size_t LemmaIdType;
+const size_t kLemmaIdSize = 3; // Actually, an id occupies 3 bytes in storage.
+const size_t kLemmaIdComposing = 0xffffff;  // Largest value that fits in 3 bytes.
+
+typedef uint16 LmaScoreType;  // Alias of uint16.
+typedef uint16 KeyScoreType;  // Alias of uint16.
+
+// The number of highest-scoring items that are kept for prediction purposes.
+const size_t kTopScoreLemmaNum = 10;
+
+const size_t kMaxPredictNumByGt3 = 1;  // NOTE(review): undocumented; presumably a prediction cap.
+const size_t kMaxPredictNumBy3 = 2;  // NOTE(review): undocumented; presumably a prediction cap.
+const size_t kMaxPredictNumBy2 = 2;  // NOTE(review): undocumented; presumably a prediction cap.
+
+// The last lemma id (inclusive) for the system dictionary. The system
+// dictionary's ids always start from 1.
+const LemmaIdType kSysDictIdEnd = 500000;
+
+// The first lemma id for the user dictionary (the value after kSysDictIdEnd).
+const LemmaIdType kUserDictIdStart = 500001;
+
+// The last lemma id (inclusive) for the user dictionary.
+const LemmaIdType kUserDictIdEnd = 600000;
+
+typedef struct {  // Two bit-fields totalling 5 + 11 = 16 bits.
+ uint16 half_splid:5;  // 5-bit field (values 0..31); presumably the half spelling id.
+ uint16 full_splid:11;  // 11-bit field (values 0..2047); presumably the full spelling id.
+} SpellingId, *PSpellingId;
+
+
+/**
+ * We use different node types for different layers.
+ * Statistical data of the building result for a testing dictionary:
+ * root, level 0, level 1, level 2, level 3
+ * max son num of one node: 406 280 41 2 -
+ * max homo num of one node: 0 90 23 2 2
+ * total node num of a layer: 1 406 31766 13516 993
+ * total homo num of a layer: 9 5674 44609 12667 995
+ *
+ * The node number for root and level 0 won't be larger than 500.
+ * According to the information above, two kinds of nodes can be used: one for
+ * root and level 0, the other for the layers deeper than 0.
+ *
+ * LE = less than or equal.
+ * A node occupies 16 bytes, so in total less than 16 * 500 = 8K bytes.
+ */
+struct LmaNodeLE0 {
+ uint32 son_1st_off;  // presumably the offset of the first son node - confirm
+ uint32 homo_idx_buf_off;  // presumably an offset into the homo index buffer - confirm
+ uint16 spl_idx;  // presumably the spelling id of this node - confirm
+ uint16 num_of_son;  // number of son nodes
+ uint16 num_of_homo;  // number of homo words
+};
+
+/**
+ * GE = greater than or equal.
+ * A node occupies 8 bytes. NOTE(review): the fields below add up to 10 - confirm.
+ */
+struct LmaNodeGE1 {
+ uint16 son_1st_off_l; // Low bits of the son_1st_off
+ uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off
+ uint16 spl_idx;  // presumably the spelling id of this node - confirm
+ unsigned char num_of_son; // number of son nodes
+ unsigned char num_of_homo; // number of homo words
+ unsigned char son_1st_off_h; // high bits of the son_1st_off
+ unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off
+};
+
+#ifdef ___BUILD_MODEL___
+struct SingleCharItem {  // Only compiled when ___BUILD_MODEL___ is defined.
+ float freq;  // presumably the character's frequency - confirm
+ char16 hz;  // a single char16 value; presumably the Hanzi - confirm
+ SpellingId splid;  // half/full spelling id bit-field pair
+};
+
+struct LemmaEntry {  // Only compiled when ___BUILD_MODEL___ is defined.
+ LemmaIdType idx_by_py;  // presumably the index when ordered by Pinyin - confirm
+ LemmaIdType idx_by_hz;  // presumably the index when ordered by Hanzi - confirm
+ char16 hanzi_str[kMaxLemmaSize + 1];  // kMaxLemmaSize chars plus one spare slot
+
+ // The SingleCharItem id for each Hanzi.
+ uint16 hanzi_scis_ids[kMaxLemmaSize];
+
+ uint16 spl_idx_arr[kMaxLemmaSize + 1];  // kMaxLemmaSize entries plus one spare slot
+ char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];  // kMaxLemmaSize rows of kMaxPinyinSize + 1 chars
+ unsigned char hz_str_len;  // presumably the number of Hanzi in hanzi_str - confirm
+ float freq;  // presumably the lemma's frequency - confirm
+};
+#endif // ___BUILD_MODEL___
+
+} // namespace ime_pinyin
+
+#endif // PINYINIME_INCLUDE_DICTDEF_H__