diff options
Diffstat (limited to 'src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h')
-rw-r--r-- | src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h b/src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h new file mode 100644 index 00000000..5e1d7818 --- /dev/null +++ b/src/plugins/pinyin/3rdparty/pinyin/include/dictdef.h @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_DICTDEF_H__ +#define PINYINIME_INCLUDE_DICTDEF_H__ + +#include <stdlib.h> +#include "./utf16char.h" + +namespace ime_pinyin { + +// Enable the following line when building the binary dictionary model. +// #define ___BUILD_MODEL___ + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; + +typedef signed char int8; +typedef short int16; +typedef int int32; +typedef long long int64; +typedef unsigned long long uint64; + +const bool kPrintDebug0 = false; +const bool kPrintDebug1 = false; +const bool kPrintDebug2 = false; + +// The max length of a lemma. +const size_t kMaxLemmaSize = 8; + +// The max length of a Pinyin (spelling). +const size_t kMaxPinyinSize = 6; + +// The number of half spelling ids. For Chinese Pinyin, there 30 half ids. +// See SpellingTrie.h for details. +const size_t kHalfSpellingIdNum = 29; + +// The maximum number of full spellings. For Chinese Pinyin, there are only +// about 410 spellings. +// If change this value is bigger(needs more bits), please also update +// other structures like SpellingNode, to make sure than a spelling id can be +// stored. +// -1 is because that 0 is never used. +const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1; +const size_t kMaxSearchSteps = 40; + +// One character predicts its following characters. +const size_t kMaxPredictSize = (kMaxLemmaSize - 1); + +// LemmaIdType must always be size_t. +typedef size_t LemmaIdType; +const size_t kLemmaIdSize = 3; // Actually, a Id occupies 3 bytes in storage. +const size_t kLemmaIdComposing = 0xffffff; + +typedef uint16 LmaScoreType; +typedef uint16 KeyScoreType; + +// Number of items with highest score are kept for prediction purpose. +const size_t kTopScoreLemmaNum = 10; + +const size_t kMaxPredictNumByGt3 = 1; +const size_t kMaxPredictNumBy3 = 2; +const size_t kMaxPredictNumBy2 = 2; + +// The last lemma id (included) for the system dictionary. The system +// dictionary's ids always start from 1. +const LemmaIdType kSysDictIdEnd = 500000; + +// The first lemma id for the user dictionary. +const LemmaIdType kUserDictIdStart = 500001; + +// The last lemma id (included) for the user dictionary. +const LemmaIdType kUserDictIdEnd = 600000; + +typedef struct { + uint16 half_splid:5; + uint16 full_splid:11; +} SpellingId, *PSpellingId; + + +/** + * We use different node types for different layers + * Statistical data of the building result for a testing dictionary: + * root, level 0, level 1, level 2, level 3 + * max son num of one node: 406 280 41 2 - + * max homo num of one node: 0 90 23 2 2 + * total node num of a layer: 1 406 31766 13516 993 + * total homo num of a layer: 9 5674 44609 12667 995 + * + * The node number for root and level 0 won't be larger than 500 + * According to the information above, two kinds of nodes can be used; one for + * root and level 0, the other for these layers deeper than 0. + * + * LE = less and equal, + * A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K + */ +struct LmaNodeLE0 { + uint32 son_1st_off; + uint32 homo_idx_buf_off; + uint16 spl_idx; + uint16 num_of_son; + uint16 num_of_homo; +}; + +/** + * GE = great and equal + * A node occupies 8 bytes. + */ +struct LmaNodeGE1 { + uint16 son_1st_off_l; // Low bits of the son_1st_off + uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1 + uint16 spl_idx; + unsigned char num_of_son; // number of son nodes + unsigned char num_of_homo; // number of homo words + unsigned char son_1st_off_h; // high bits of the son_1st_off + unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off +}; + +#ifdef ___BUILD_MODEL___ +struct SingleCharItem { + float freq; + char16 hz; + SpellingId splid; +}; + +struct LemmaEntry { + LemmaIdType idx_by_py; + LemmaIdType idx_by_hz; + char16 hanzi_str[kMaxLemmaSize + 1]; + + // The SingleCharItem id for each Hanzi. + uint16 hanzi_scis_ids[kMaxLemmaSize]; + + uint16 spl_idx_arr[kMaxLemmaSize + 1]; + char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1]; + unsigned char hz_str_len; + float freq; +}; +#endif // ___BUILD_MODEL___ + +} // namespace ime_pinyin + +#endif // PINYINIME_INCLUDE_DICTDEF_H__ |