diff options
Diffstat (limited to 'src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp')
-rw-r--r-- | src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp | 941 |
1 files changed, 0 insertions, 941 deletions
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp deleted file mode 100644 index 0cdd0982..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp +++ /dev/null @@ -1,941 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <assert.h> -#include <stdio.h> -#include <string.h> -#include "../include/dicttrie.h" -#include "../include/dictbuilder.h" -#include "../include/lpicache.h" -#include "../include/mystdlib.h" -#include "../include/ngram.h" - -namespace ime_pinyin { - -DictTrie::DictTrie() { - spl_trie_ = SpellingTrie::get_cpinstance(); - - root_ = NULL; - splid_le0_index_ = NULL; - lma_node_num_le0_ = 0; - nodes_ge1_ = NULL; - lma_node_num_ge1_ = 0; - lma_idx_buf_ = NULL; - lma_idx_buf_len_ = 0; - total_lma_num_ = 0; - top_lmas_num_ = 0; - dict_list_ = NULL; - - parsing_marks_ = NULL; - mile_stones_ = NULL; - reset_milestones(0, kFirstValidMileStoneHandle); -} - -DictTrie::~DictTrie() { - free_resource(true); -} - -void DictTrie::free_resource(bool free_dict_list) { - if (NULL != root_) - free(root_); - root_ = NULL; - - if (NULL != splid_le0_index_) - free(splid_le0_index_); - splid_le0_index_ = NULL; - - if (NULL != nodes_ge1_) - free(nodes_ge1_); - nodes_ge1_ = NULL; - - if (NULL != lma_idx_buf_) - free(lma_idx_buf_); - lma_idx_buf_ = NULL; - - if (free_dict_list) { - if (NULL != dict_list_) { - delete dict_list_; - } - dict_list_ = NULL; - } - - if (parsing_marks_) - delete [] parsing_marks_; - parsing_marks_ = NULL; - - if (mile_stones_) - delete [] mile_stones_; - mile_stones_ = NULL; - - reset_milestones(0, kFirstValidMileStoneHandle); -} - -inline size_t DictTrie::get_son_offset(const LmaNodeGE1 *node) { - return ((size_t)node->son_1st_off_l + ((size_t)node->son_1st_off_h << 16)); -} - -inline size_t DictTrie::get_homo_idx_buf_offset(const LmaNodeGE1 *node) { - return ((size_t)node->homo_idx_buf_off_l + - ((size_t)node->homo_idx_buf_off_h << 16)); -} - -inline LemmaIdType DictTrie::get_lemma_id(size_t id_offset) { - LemmaIdType id = 0; - for (uint16 pos = kLemmaIdSize - 1; pos > 0; pos--) - id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + pos]; - id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize]; - return id; -} - -#ifdef ___BUILD_MODEL___ -bool DictTrie::build_dict(const char* fn_raw, const char* fn_validhzs) { - DictBuilder* dict_builder = new DictBuilder(); - - free_resource(true); - - return dict_builder->build_dict(fn_raw, fn_validhzs, this); -} - -bool DictTrie::save_dict(FILE *fp) { - if (NULL == fp) - return false; - - if (fwrite(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(&top_lmas_num_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp) - != lma_node_num_le0_) - return false; - - if (fwrite(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp) - != lma_node_num_ge1_) - return false; - - if (fwrite(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) != - lma_idx_buf_len_) - return false; - - return true; -} - -bool DictTrie::save_dict(const char *filename) { - if (NULL == filename) - return false; - - if (NULL == root_ || NULL == dict_list_) - return false; - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - NGram &ngram = NGram::get_instance(); - - FILE *fp = fopen(filename, "wb"); - if (NULL == fp) - return false; - - if (!spl_trie.save_spl_trie(fp) || !dict_list_->save_list(fp) || - !save_dict(fp) || !ngram.save_ngram(fp)) { - fclose(fp); - return false; - } - - fclose(fp); - return true; -} -#endif // ___BUILD_MODEL___ - -bool DictTrie::load_dict(FILE *fp) { - if (NULL == fp) - return false; - if (fread(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(&top_lmas_num_, sizeof(uint32), 1, fp) != 1 || - top_lmas_num_ >= lma_idx_buf_len_) - return false; - - free_resource(false); - - root_ = static_cast<LmaNodeLE0*> - (malloc(lma_node_num_le0_ * sizeof(LmaNodeLE0))); - nodes_ge1_ = static_cast<LmaNodeGE1*> - (malloc(lma_node_num_ge1_ * sizeof(LmaNodeGE1))); - lma_idx_buf_ = (unsigned char*)malloc(lma_idx_buf_len_); - total_lma_num_ = lma_idx_buf_len_ / kLemmaIdSize; - - size_t buf_size = SpellingTrie::get_instance().get_spelling_num() + 1; - assert(lma_node_num_le0_ <= buf_size); - splid_le0_index_ = static_cast<uint16*>(malloc(buf_size * sizeof(uint16))); - - // Init the space for parsing. - parsing_marks_ = new ParsingMark[kMaxParsingMark]; - mile_stones_ = new MileStone[kMaxMileStone]; - reset_milestones(0, kFirstValidMileStoneHandle); - - if (NULL == root_ || NULL == nodes_ge1_ || NULL == lma_idx_buf_ || - NULL == splid_le0_index_ || NULL == parsing_marks_ || - NULL == mile_stones_) { - free_resource(false); - return false; - } - - if (fread(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp) - != lma_node_num_le0_) - return false; - - if (fread(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp) - != lma_node_num_ge1_) - return false; - - if (fread(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) != - lma_idx_buf_len_) - return false; - - // The quick index for the first level sons - uint16 last_splid = kFullSplIdStart; - size_t last_pos = 0; - for (size_t i = 1; i < lma_node_num_le0_; i++) { - for (uint16 splid = last_splid; splid < root_[i].spl_idx; splid++) - splid_le0_index_[splid - kFullSplIdStart] = last_pos; - - splid_le0_index_[root_[i].spl_idx - kFullSplIdStart] = - static_cast<uint16>(i); - last_splid = root_[i].spl_idx; - last_pos = i; - } - - for (uint16 splid = last_splid + 1; - splid < buf_size + kFullSplIdStart; splid++) { - assert(static_cast<size_t>(splid - kFullSplIdStart) < buf_size); - splid_le0_index_[splid - kFullSplIdStart] = last_pos + 1; - } - - return true; -} - -bool DictTrie::load_dict(const char *filename, LemmaIdType start_id, - LemmaIdType end_id) { - if (NULL == filename || end_id <= start_id) - return false; - - FILE *fp = fopen(filename, "rb"); - if (NULL == fp) - return false; - - free_resource(true); - - dict_list_ = new DictList(); - if (NULL == dict_list_) { - fclose(fp); - return false; - } - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - NGram &ngram = NGram::get_instance(); - - if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) || - !load_dict(fp) || !ngram.load_ngram(fp) || - total_lma_num_ > end_id - start_id + 1) { - free_resource(true); - fclose(fp); - return false; - } - - fclose(fp); - return true; -} - -bool DictTrie::load_dict_fd(int sys_fd, long start_offset, - long length, LemmaIdType start_id, - LemmaIdType end_id) { - if (start_offset < 0 || length <= 0 || end_id <= start_id) - return false; - - FILE *fp = fdopen(sys_fd, "rb"); - if (NULL == fp) - return false; - - if (-1 == fseek(fp, start_offset, SEEK_SET)) { - fclose(fp); - return false; - } - - free_resource(true); - - dict_list_ = new DictList(); - if (NULL == dict_list_) { - fclose(fp); - return false; - } - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - NGram &ngram = NGram::get_instance(); - - if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) || - !load_dict(fp) || !ngram.load_ngram(fp) || - ftell(fp) < start_offset + length || - total_lma_num_ > end_id - start_id + 1) { - free_resource(true); - fclose(fp); - return false; - } - - fclose(fp); - return true; -} - -size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max, - LmaNodeLE0 *node) { - size_t lpi_num = 0; - NGram& ngram = NGram::get_instance(); - for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) { - lpi_items[lpi_num].id = get_lemma_id(node->homo_idx_buf_off + - homo); - lpi_items[lpi_num].lma_len = 1; - lpi_items[lpi_num].psb = - static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id)); - lpi_num++; - if (lpi_num >= lpi_max) - break; - } - - return lpi_num; -} - -size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max, - size_t homo_buf_off, LmaNodeGE1 *node, - uint16 lma_len) { - size_t lpi_num = 0; - NGram& ngram = NGram::get_instance(); - for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) { - lpi_items[lpi_num].id = get_lemma_id(homo_buf_off + homo); - lpi_items[lpi_num].lma_len = lma_len; - lpi_items[lpi_num].psb = - static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id)); - lpi_num++; - if (lpi_num >= lpi_max) - break; - } - - return lpi_num; -} - -void DictTrie::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { - if (0 == from_step) { - parsing_marks_pos_ = 0; - mile_stones_pos_ = kFirstValidMileStoneHandle; - } else { - if (from_handle > 0 && from_handle < mile_stones_pos_) { - mile_stones_pos_ = from_handle; - - MileStone *mile_stone = mile_stones_ + from_handle; - parsing_marks_pos_ = mile_stone->mark_start; - } - } -} - -MileStoneHandle DictTrie::extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, size_t lpi_max, - size_t *lpi_num) { - if (NULL == dep) - return 0; - - // from LmaNodeLE0 (root) to LmaNodeLE0 - if (0 == from_handle) { - assert(0 == dep->splids_extended); - return extend_dict0(from_handle, dep, lpi_items, lpi_max, lpi_num); - } - - // from LmaNodeLE0 to LmaNodeGE1 - if (1 == dep->splids_extended) - return extend_dict1(from_handle, dep, lpi_items, lpi_max, lpi_num); - - // From LmaNodeGE1 to LmaNodeGE1 - return extend_dict2(from_handle, dep, lpi_items, lpi_max, lpi_num); -} - -MileStoneHandle DictTrie::extend_dict0(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num) { - assert(NULL != dep && 0 == from_handle); - *lpi_num = 0; - MileStoneHandle ret_handle = 0; - - uint16 splid = dep->splids[dep->splids_extended]; - uint16 id_start = dep->id_start; - uint16 id_num = dep->id_num; - - LpiCache& lpi_cache = LpiCache::get_instance(); - bool cached = lpi_cache.is_cached(splid); - - // 2. Begin exgtending - // 2.1 Get the LmaPsbItem list - LmaNodeLE0 *node = root_; - size_t son_start = splid_le0_index_[id_start - kFullSplIdStart]; - size_t son_end = splid_le0_index_[id_start + id_num - kFullSplIdStart]; - for (size_t son_pos = son_start; son_pos < son_end; son_pos++) { - assert(1 == node->son_1st_off); - LmaNodeLE0 *son = root_ + son_pos; - assert(son->spl_idx >= id_start && son->spl_idx < id_start + id_num); - - if (!cached && *lpi_num < lpi_max) { - bool need_lpi = true; - if (spl_trie_->is_half_id_yunmu(splid) && son_pos != son_start) - need_lpi = false; - - if (need_lpi) - *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), - lpi_max - *lpi_num, son); - } - - // If necessary, fill in a new mile stone. - if (son->spl_idx == id_start) { - if (mile_stones_pos_ < kMaxMileStone && - parsing_marks_pos_ < kMaxParsingMark) { - parsing_marks_[parsing_marks_pos_].node_offset = son_pos; - parsing_marks_[parsing_marks_pos_].node_num = id_num; - mile_stones_[mile_stones_pos_].mark_start = parsing_marks_pos_; - mile_stones_[mile_stones_pos_].mark_num = 1; - ret_handle = mile_stones_pos_; - parsing_marks_pos_++; - mile_stones_pos_++; - } - } - - if (son->spl_idx >= id_start + id_num -1) - break; - } - - // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, - // mile_stones_pos_); - return ret_handle; -} - -MileStoneHandle DictTrie::extend_dict1(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num) { - assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_); - - MileStoneHandle ret_handle = 0; - - // 1. If this is a half Id, get its corresponding full starting Id and - // number of full Id. - size_t ret_val = 0; - - uint16 id_start = dep->id_start; - uint16 id_num = dep->id_num; - - // 2. Begin extending. - MileStone *mile_stone = mile_stones_ + from_handle; - - for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) { - ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos]; - uint16 ext_num = p_mark.node_num; - for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) { - LmaNodeLE0 *node = root_ + p_mark.node_offset + ext_pos; - size_t found_start = 0; - size_t found_num = 0; - for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) { - assert(node->son_1st_off <= lma_node_num_ge1_); - LmaNodeGE1 *son = nodes_ge1_ + node->son_1st_off + son_pos; - if (son->spl_idx >= id_start - && son->spl_idx < id_start + id_num) { - if (*lpi_num < lpi_max) { - size_t homo_buf_off = get_homo_idx_buf_offset(son); - *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), - lpi_max - *lpi_num, homo_buf_off, son, - 2); - } - - // If necessary, fill in the new DTMI - if (0 == found_num) { - found_start = son_pos; - } - found_num++; - } - if (son->spl_idx >= id_start + id_num - 1 || son_pos == - (size_t)node->num_of_son - 1) { - if (found_num > 0) { - if (mile_stones_pos_ < kMaxMileStone && - parsing_marks_pos_ < kMaxParsingMark) { - parsing_marks_[parsing_marks_pos_].node_offset = - node->son_1st_off + found_start; - parsing_marks_[parsing_marks_pos_].node_num = found_num; - if (0 == ret_val) - mile_stones_[mile_stones_pos_].mark_start = - parsing_marks_pos_; - parsing_marks_pos_++; - } - - ret_val++; - } - break; - } // for son_pos - } // for ext_pos - } // for h_pos - } - - if (ret_val > 0) { - mile_stones_[mile_stones_pos_].mark_num = ret_val; - ret_handle = mile_stones_pos_; - mile_stones_pos_++; - ret_val = 1; - } - - // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, - // mile_stones_pos_); - return ret_handle; -} - -MileStoneHandle DictTrie::extend_dict2(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num) { - assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_); - - MileStoneHandle ret_handle = 0; - - // 1. If this is a half Id, get its corresponding full starting Id and - // number of full Id. - size_t ret_val = 0; - - uint16 id_start = dep->id_start; - uint16 id_num = dep->id_num; - - // 2. Begin extending. - MileStone *mile_stone = mile_stones_ + from_handle; - - for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) { - ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos]; - uint16 ext_num = p_mark.node_num; - for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) { - LmaNodeGE1 *node = nodes_ge1_ + p_mark.node_offset + ext_pos; - size_t found_start = 0; - size_t found_num = 0; - - for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) { - assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0); - LmaNodeGE1 *son = nodes_ge1_ + get_son_offset(node) + son_pos; - if (son->spl_idx >= id_start - && son->spl_idx < id_start + id_num) { - if (*lpi_num < lpi_max) { - size_t homo_buf_off = get_homo_idx_buf_offset(son); - *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), - lpi_max - *lpi_num, homo_buf_off, son, - dep->splids_extended + 1); - } - - // If necessary, fill in the new DTMI - if (0 == found_num) { - found_start = son_pos; - } - found_num++; - } - if (son->spl_idx >= id_start + id_num - 1 || son_pos == - (size_t)node->num_of_son - 1) { - if (found_num > 0) { - if (mile_stones_pos_ < kMaxMileStone && - parsing_marks_pos_ < kMaxParsingMark) { - parsing_marks_[parsing_marks_pos_].node_offset = - get_son_offset(node) + found_start; - parsing_marks_[parsing_marks_pos_].node_num = found_num; - if (0 == ret_val) - mile_stones_[mile_stones_pos_].mark_start = - parsing_marks_pos_; - parsing_marks_pos_++; - } - - ret_val++; - } - break; - } - } // for son_pos - } // for ext_pos - } // for h_pos - - if (ret_val > 0) { - mile_stones_[mile_stones_pos_].mark_num = ret_val; - ret_handle = mile_stones_pos_; - mile_stones_pos_++; - } - - // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, - // mile_stones_pos_); - return ret_handle; -} - -bool DictTrie::try_extend(const uint16 *splids, uint16 splid_num, - LemmaIdType id_lemma) { - if (0 == splid_num || NULL == splids) - return false; - - void *node = root_ + splid_le0_index_[splids[0] - kFullSplIdStart]; - - for (uint16 pos = 1; pos < splid_num; pos++) { - if (1 == pos) { - LmaNodeLE0 *node_le0 = reinterpret_cast<LmaNodeLE0*>(node); - LmaNodeGE1 *node_son; - uint16 son_pos; - for (son_pos = 0; son_pos < static_cast<uint16>(node_le0->num_of_son); - son_pos++) { - assert(node_le0->son_1st_off <= lma_node_num_ge1_); - node_son = nodes_ge1_ + node_le0->son_1st_off - + son_pos; - if (node_son->spl_idx == splids[pos]) - break; - } - if (son_pos < node_le0->num_of_son) - node = reinterpret_cast<void*>(node_son); - else - return false; - } else { - LmaNodeGE1 *node_ge1 = reinterpret_cast<LmaNodeGE1*>(node); - LmaNodeGE1 *node_son; - uint16 son_pos; - for (son_pos = 0; son_pos < static_cast<uint16>(node_ge1->num_of_son); - son_pos++) { - assert(node_ge1->son_1st_off_l > 0 || node_ge1->son_1st_off_h > 0); - node_son = nodes_ge1_ + get_son_offset(node_ge1) + son_pos; - if (node_son->spl_idx == splids[pos]) - break; - } - if (son_pos < node_ge1->num_of_son) - node = reinterpret_cast<void*>(node_son); - else - return false; - } - } - - if (1 == splid_num) { - LmaNodeLE0* node_le0 = reinterpret_cast<LmaNodeLE0*>(node); - size_t num_of_homo = (size_t)node_le0->num_of_homo; - for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { - LemmaIdType id_this = get_lemma_id(node_le0->homo_idx_buf_off + homo_pos); - char16 str[2]; - get_lemma_str(id_this, str, 2); - if (id_this == id_lemma) - return true; - } - } else { - LmaNodeGE1* node_ge1 = reinterpret_cast<LmaNodeGE1*>(node); - size_t num_of_homo = (size_t)node_ge1->num_of_homo; - for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { - size_t node_homo_off = get_homo_idx_buf_offset(node_ge1); - if (get_lemma_id(node_homo_off + homo_pos) == id_lemma) - return true; - } - } - - return false; -} - -size_t DictTrie::get_lpis(const uint16* splid_str, uint16 splid_str_len, - LmaPsbItem* lma_buf, size_t max_lma_buf) { - if (splid_str_len > kMaxLemmaSize) - return 0; - -#define MAX_EXTENDBUF_LEN 200 - - size_t* node_buf1[MAX_EXTENDBUF_LEN]; // use size_t for data alignment - size_t* node_buf2[MAX_EXTENDBUF_LEN]; - LmaNodeLE0** node_fr_le0 = - reinterpret_cast<LmaNodeLE0**>(node_buf1); // Nodes from. - LmaNodeLE0** node_to_le0 = - reinterpret_cast<LmaNodeLE0**>(node_buf2); // Nodes to. - LmaNodeGE1** node_fr_ge1 = NULL; - LmaNodeGE1** node_to_ge1 = NULL; - size_t node_fr_num = 1; - size_t node_to_num = 0; - node_fr_le0[0] = root_; - if (NULL == node_fr_le0[0]) - return 0; - - size_t spl_pos = 0; - - while (spl_pos < splid_str_len) { - uint16 id_num = 1; - uint16 id_start = splid_str[spl_pos]; - // If it is a half id - if (spl_trie_->is_half_id(splid_str[spl_pos])) { - id_num = spl_trie_->half_to_full(splid_str[spl_pos], &id_start); - assert(id_num > 0); - } - - // Extend the nodes - if (0 == spl_pos) { // From LmaNodeLE0 (root) to LmaNodeLE0 nodes - for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { - LmaNodeLE0 *node = node_fr_le0[node_fr_pos]; - assert(node == root_ && 1 == node_fr_num); - size_t son_start = splid_le0_index_[id_start - kFullSplIdStart]; - size_t son_end = - splid_le0_index_[id_start + id_num - kFullSplIdStart]; - for (size_t son_pos = son_start; son_pos < son_end; son_pos++) { - assert(1 == node->son_1st_off); - LmaNodeLE0 *node_son = root_ + son_pos; - assert(node_son->spl_idx >= id_start - && node_son->spl_idx < id_start + id_num); - if (node_to_num < MAX_EXTENDBUF_LEN) { - node_to_le0[node_to_num] = node_son; - node_to_num++; - } - // id_start + id_num - 1 is the last one, which has just been - // recorded. - if (node_son->spl_idx >= id_start + id_num - 1) - break; - } - } - - spl_pos++; - if (spl_pos >= splid_str_len || node_to_num == 0) - break; - // Prepare the nodes for next extending - // next time, from LmaNodeLE0 to LmaNodeGE1 - LmaNodeLE0** node_tmp = node_fr_le0; - node_fr_le0 = node_to_le0; - node_to_le0 = NULL; - node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_tmp); - } else if (1 == spl_pos) { // From LmaNodeLE0 to LmaNodeGE1 nodes - for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { - LmaNodeLE0 *node = node_fr_le0[node_fr_pos]; - for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; - son_pos++) { - assert(node->son_1st_off <= lma_node_num_ge1_); - LmaNodeGE1 *node_son = nodes_ge1_ + node->son_1st_off - + son_pos; - if (node_son->spl_idx >= id_start - && node_son->spl_idx < id_start + id_num) { - if (node_to_num < MAX_EXTENDBUF_LEN) { - node_to_ge1[node_to_num] = node_son; - node_to_num++; - } - } - // id_start + id_num - 1 is the last one, which has just been - // recorded. - if (node_son->spl_idx >= id_start + id_num - 1) - break; - } - } - - spl_pos++; - if (spl_pos >= splid_str_len || node_to_num == 0) - break; - // Prepare the nodes for next extending - // next time, from LmaNodeGE1 to LmaNodeGE1 - node_fr_ge1 = node_to_ge1; - node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_fr_le0); - node_fr_le0 = NULL; - node_to_le0 = NULL; - } else { // From LmaNodeGE1 to LmaNodeGE1 nodes - for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { - LmaNodeGE1 *node = node_fr_ge1[node_fr_pos]; - for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; - son_pos++) { - assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0); - LmaNodeGE1 *node_son = nodes_ge1_ - + get_son_offset(node) + son_pos; - if (node_son->spl_idx >= id_start - && node_son->spl_idx < id_start + id_num) { - if (node_to_num < MAX_EXTENDBUF_LEN) { - node_to_ge1[node_to_num] = node_son; - node_to_num++; - } - } - // id_start + id_num - 1 is the last one, which has just been - // recorded. - if (node_son->spl_idx >= id_start + id_num - 1) - break; - } - } - - spl_pos++; - if (spl_pos >= splid_str_len || node_to_num == 0) - break; - // Prepare the nodes for next extending - // next time, from LmaNodeGE1 to LmaNodeGE1 - LmaNodeGE1 **node_tmp = node_fr_ge1; - node_fr_ge1 = node_to_ge1; - node_to_ge1 = node_tmp; - } - - // The number of node for next extending - node_fr_num = node_to_num; - node_to_num = 0; - } // while - - if (0 == node_to_num) - return 0; - - NGram &ngram = NGram::get_instance(); - size_t lma_num = 0; - - // If the length is 1, and the splid is a one-char Yunmu like 'a', 'o', 'e', - // only those candidates for the full matched one-char id will be returned. - if (1 == splid_str_len && spl_trie_->is_half_id_yunmu(splid_str[0])) - node_to_num = node_to_num > 0 ? 1 : 0; - - for (size_t node_pos = 0; node_pos < node_to_num; node_pos++) { - size_t num_of_homo = 0; - if (spl_pos <= 1) { // Get from LmaNodeLE0 nodes - LmaNodeLE0* node_le0 = node_to_le0[node_pos]; - num_of_homo = (size_t)node_le0->num_of_homo; - for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { - size_t ch_pos = lma_num + homo_pos; - lma_buf[ch_pos].id = - get_lemma_id(node_le0->homo_idx_buf_off + homo_pos); - lma_buf[ch_pos].lma_len = 1; - lma_buf[ch_pos].psb = - static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id)); - - if (lma_num + homo_pos >= max_lma_buf - 1) - break; - } - } else { // Get from LmaNodeGE1 nodes - LmaNodeGE1* node_ge1 = node_to_ge1[node_pos]; - num_of_homo = (size_t)node_ge1->num_of_homo; - for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { - size_t ch_pos = lma_num + homo_pos; - size_t node_homo_off = get_homo_idx_buf_offset(node_ge1); - lma_buf[ch_pos].id = get_lemma_id(node_homo_off + homo_pos); - lma_buf[ch_pos].lma_len = splid_str_len; - lma_buf[ch_pos].psb = - static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id)); - - if (lma_num + homo_pos >= max_lma_buf - 1) - break; - } - } - - lma_num += num_of_homo; - if (lma_num >= max_lma_buf) { - lma_num = max_lma_buf; - break; - } - } - return lma_num; -} - -uint16 DictTrie::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, - uint16 str_max) { - return dict_list_->get_lemma_str(id_lemma, str_buf, str_max); -} - -uint16 DictTrie::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid) { - char16 lma_str[kMaxLemmaSize + 1]; - uint16 lma_len = get_lemma_str(id_lemma, lma_str, kMaxLemmaSize + 1); - assert((!arg_valid && splids_max >= lma_len) || lma_len == splids_max); - - uint16 spl_mtrx[kMaxLemmaSize * 5]; - uint16 spl_start[kMaxLemmaSize + 1]; - spl_start[0] = 0; - uint16 try_num = 1; - - for (uint16 pos = 0; pos < lma_len; pos++) { - uint16 cand_splids_this = 0; - if (arg_valid && spl_trie_->is_full_id(splids[pos])) { - spl_mtrx[spl_start[pos]] = splids[pos]; - cand_splids_this = 1; - } else { - cand_splids_this = dict_list_->get_splids_for_hanzi(lma_str[pos], - arg_valid ? splids[pos] : 0, spl_mtrx + spl_start[pos], - kMaxLemmaSize * 5 - spl_start[pos]); - assert(cand_splids_this > 0); - } - spl_start[pos + 1] = spl_start[pos] + cand_splids_this; - try_num *= cand_splids_this; - } - - for (uint16 try_pos = 0; try_pos < try_num; try_pos++) { - uint16 mod = 1; - for (uint16 pos = 0; pos < lma_len; pos++) { - uint16 radix = spl_start[pos + 1] - spl_start[pos]; - splids[pos] = spl_mtrx[ spl_start[pos] + try_pos / mod % radix]; - mod *= radix; - } - - if (try_extend(splids, lma_len, id_lemma)) - return lma_len; - } - - return 0; -} - -void DictTrie::set_total_lemma_count_of_others(size_t count) { - NGram& ngram = NGram::get_instance(); - ngram.set_total_freq_none_sys(count); -} - -void DictTrie::convert_to_hanzis(char16 *str, uint16 str_len) { - return dict_list_->convert_to_hanzis(str, str_len); -} - -void DictTrie::convert_to_scis_ids(char16 *str, uint16 str_len) { - return dict_list_->convert_to_scis_ids(str, str_len); -} - -LemmaIdType DictTrie::get_lemma_id(const char16 lemma_str[], uint16 lemma_len) { - if (NULL == lemma_str || lemma_len > kMaxLemmaSize) - return 0; - - return dict_list_->get_lemma_id(lemma_str, lemma_len); -} - -size_t DictTrie::predict_top_lmas(size_t his_len, NPredictItem *npre_items, - size_t npre_max, size_t b4_used) { - NGram &ngram = NGram::get_instance(); - - size_t item_num = 0; - size_t top_lmas_id_offset = lma_idx_buf_len_ / kLemmaIdSize - top_lmas_num_; - size_t top_lmas_pos = 0; - while (item_num < npre_max && top_lmas_pos < top_lmas_num_) { - memset(npre_items + item_num, 0, sizeof(NPredictItem)); - LemmaIdType top_lma_id = get_lemma_id(top_lmas_id_offset + top_lmas_pos); - top_lmas_pos += 1; - if (dict_list_->get_lemma_str(top_lma_id, - npre_items[item_num].pre_hzs, - kMaxLemmaSize - 1) == 0) { - continue; - } - npre_items[item_num].psb = ngram.get_uni_psb(top_lma_id); - npre_items[item_num].his_len = his_len; - item_num++; - } - return item_num; -} - -size_t DictTrie::predict(const char16 *last_hzs, uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used) { - return dict_list_->predict(last_hzs, hzs_len, npre_items, npre_max, b4_used); -} -} // namespace ime_pinyin |