diff options
Diffstat (limited to 'src/plugins/pinyin/3rdparty/pinyin/share/matrixsearch.cpp')
-rw-r--r-- | src/plugins/pinyin/3rdparty/pinyin/share/matrixsearch.cpp | 1981 |
1 files changed, 1981 insertions, 0 deletions
diff --git a/src/plugins/pinyin/3rdparty/pinyin/share/matrixsearch.cpp b/src/plugins/pinyin/3rdparty/pinyin/share/matrixsearch.cpp new file mode 100644 index 00000000..41e11433 --- /dev/null +++ b/src/plugins/pinyin/3rdparty/pinyin/share/matrixsearch.cpp @@ -0,0 +1,1981 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <assert.h> +#include <math.h> +#include <stdio.h> +#include <string.h> +#include "../include/lpicache.h" +#include "../include/matrixsearch.h" +#include "../include/mystdlib.h" +#include "../include/ngram.h" +#include "../include/userdict.h" + +namespace ime_pinyin { + +#define PRUMING_SCORE 8000.0 + +MatrixSearch::MatrixSearch() { + inited_ = false; + spl_trie_ = SpellingTrie::get_cpinstance(); + + reset_pointers_to_null(); + + pys_decoded_len_ = 0; + mtrx_nd_pool_used_ = 0; + dmi_pool_used_ = 0; + xi_an_enabled_ = false; + dmi_c_phrase_ = false; + + assert(kMaxSearchSteps > 0); + max_sps_len_ = kMaxSearchSteps - 1; + max_hzs_len_ = kMaxSearchSteps; +} + +MatrixSearch::~MatrixSearch() { + free_resource(); +} + +void MatrixSearch::reset_pointers_to_null() { + dict_trie_ = NULL; + user_dict_ = NULL; + spl_parser_ = NULL; + + share_buf_ = NULL; + + // The following four buffers are used for decoding, and they are based on + // share_buf_, no need to delete them. + mtrx_nd_pool_ = NULL; + dmi_pool_ = NULL; + matrix_ = NULL; + dep_ = NULL; + + // Based on share_buf_, no need to delete them. + npre_items_ = NULL; +} + +bool MatrixSearch::alloc_resource() { + free_resource(); + + dict_trie_ = new DictTrie(); + user_dict_ = static_cast<AtomDictBase*>(new UserDict()); + spl_parser_ = new SpellingParser(); + + size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize; + mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t); + size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize; + dmi_size = align_to_size_t(dmi_size) / sizeof(size_t); + size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum; + matrix_size = align_to_size_t(matrix_size) / sizeof(size_t); + size_t dep_size = sizeof(DictExtPara); + dep_size = align_to_size_t(dep_size) / sizeof(size_t); + + // share_buf's size is determined by the buffers for search. + share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size]; + + if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ || + NULL == share_buf_) + return false; + + // The buffers for search are based on the share buffer + mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_); + dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size); + matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size); + dep_ = reinterpret_cast<DictExtPara*> + (share_buf_ + mtrx_nd_size + dmi_size + matrix_size); + + // The prediction buffer is also based on the share buffer. + npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_); + npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) * + sizeof(size_t) / sizeof(NPredictItem); + return true; +} + +void MatrixSearch::free_resource() { + if (NULL != dict_trie_) + delete dict_trie_; + + if (NULL != user_dict_) + delete user_dict_; + + if (NULL != spl_parser_) + delete spl_parser_; + + if (NULL != share_buf_) + delete [] share_buf_; + + reset_pointers_to_null(); +} + +bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) { + if (NULL == fn_sys_dict || NULL == fn_usr_dict) + return false; + + if (!alloc_resource()) + return false; + + if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd)) + return false; + + // If engine fails to load the user dictionary, reset the user dictionary + // to NULL. + if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { + delete user_dict_; + user_dict_ = NULL; + } else{ + user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq); + } + + reset_search0(); + + inited_ = true; + return true; +} + +bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length, + const char *fn_usr_dict) { + if (NULL == fn_usr_dict) + return false; + + if (!alloc_resource()) + return false; + + if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd)) + return false; + + if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { + delete user_dict_; + user_dict_ = NULL; + } else { + user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq); + } + + reset_search0(); + + inited_ = true; + return true; +} + +void MatrixSearch::init_user_dictionary(const char *fn_usr_dict) { + assert(inited_); + + if (NULL != user_dict_) { + delete user_dict_; + user_dict_ = NULL; + } + + if (NULL != fn_usr_dict) { + user_dict_ = static_cast<AtomDictBase*>(new UserDict()); + if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { + delete user_dict_; + user_dict_ = NULL; + } + } + + reset_search0(); +} + +bool MatrixSearch::is_user_dictionary_enabled() const { + return NULL != user_dict_; +} + +void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) { + if (0 != max_sps_len) + max_sps_len_ = max_sps_len; + if (0 != max_hzs_len) + max_hzs_len_ = max_hzs_len; +} + +void MatrixSearch::close() { + flush_cache(); + free_resource(); + inited_ = false; +} + +void MatrixSearch::flush_cache() { + if (NULL != user_dict_) + user_dict_->flush_cache(); +} + +void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) { + xi_an_enabled_ = xi_an_enabled; +} + +bool MatrixSearch::get_xi_an_switch() { + return xi_an_enabled_; +} + +bool MatrixSearch::reset_search() { + if (!inited_) + return false; + return reset_search0(); +} + +bool MatrixSearch::reset_search0() { + if (!inited_) + return false; + + pys_decoded_len_ = 0; + mtrx_nd_pool_used_ = 0; + dmi_pool_used_ = 0; + + // Get a MatrixNode from the pool + matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_; + matrix_[0].mtrx_nd_num = 1; + mtrx_nd_pool_used_ += 1; + + // Update the node, and make it to be a starting node + MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos; + node->id = 0; + node->score = 0; + node->from = NULL; + node->step = 0; + node->dmi_fr = (PoolPosType)-1; + + matrix_[0].dmi_pos = 0; + matrix_[0].dmi_num = 0; + matrix_[0].dmi_has_full_id = 1; + matrix_[0].mtrx_nd_fixed = node; + + lma_start_[0] = 0; + fixed_lmas_ = 0; + spl_start_[0] = 0; + fixed_hzs_ = 0; + + dict_trie_->reset_milestones(0, 0); + if (NULL != user_dict_) + user_dict_->reset_milestones(0, 0); + + return true; +} + +bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step, + bool clear_dmi_this_step, + bool clear_mtrx_this_step) { + if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum) + return false; + + if (0 == ch_pos) { + reset_search0(); + } else { + // Prepare mile stones of this step to clear. + MileStoneHandle *dict_handles_to_clear = NULL; + if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) { + dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles; + } + + // If there are more steps, and this step is not allowed to clear, find + // milestones of next step. + if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) { + dict_handles_to_clear = NULL; + if (matrix_[ch_pos + 1].dmi_num > 0) { + dict_handles_to_clear = + dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles; + } + } + + if (NULL != dict_handles_to_clear) { + dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]); + if (NULL != user_dict_) + user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]); + } + + pys_decoded_len_ = ch_pos; + + if (clear_dmi_this_step) { + dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos + + matrix_[ch_pos - 1].dmi_num; + matrix_[ch_pos].dmi_num = 0; + } else { + dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num; + } + + if (clear_mtrx_this_step) { + mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos + + matrix_[ch_pos - 1].mtrx_nd_num; + matrix_[ch_pos].mtrx_nd_num = 0; + } else { + mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos + + matrix_[ch_pos].mtrx_nd_num; + } + + // Modify fixed_hzs_ + if (fixed_hzs_ > 0 && + ((kLemmaIdComposing != lma_id_[0]) || + (kLemmaIdComposing == lma_id_[0] && + spl_start_[c_phrase_.length] <= ch_pos))) { + size_t fixed_ch_pos = ch_pos; + if (clear_fixed_this_step) + fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0; + while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0) + fixed_ch_pos--; + + fixed_lmas_ = 0; + fixed_hzs_ = 0; + if (fixed_ch_pos > 0) { + while (spl_start_[fixed_hzs_] < fixed_ch_pos) + fixed_hzs_++; + assert(spl_start_[fixed_hzs_] == fixed_ch_pos); + + while (lma_start_[fixed_lmas_] < fixed_hzs_) + fixed_lmas_++; + assert(lma_start_[fixed_lmas_] == fixed_hzs_); + } + + // Re-search the Pinyin string for the unlocked lemma + // which was previously fixed. + // + // Prepare mile stones of this step to clear. + MileStoneHandle *dict_handles_to_clear = NULL; + if (clear_dmi_this_step && ch_pos == fixed_ch_pos && + matrix_[fixed_ch_pos].dmi_num > 0) { + dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles; + } + + // If there are more steps, and this step is not allowed to clear, find + // milestones of next step. + if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) { + dict_handles_to_clear = NULL; + if (matrix_[fixed_ch_pos + 1].dmi_num > 0) { + dict_handles_to_clear = + dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles; + } + } + + if (NULL != dict_handles_to_clear) { + dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]); + if (NULL != user_dict_) + user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]); + } + + + pys_decoded_len_ = fixed_ch_pos; + + if (clear_dmi_this_step && ch_pos == fixed_ch_pos) { + dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos + + matrix_[fixed_ch_pos - 1].dmi_num; + matrix_[fixed_ch_pos].dmi_num = 0; + } else { + dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos + + matrix_[fixed_ch_pos].dmi_num; + } + + if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) { + mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos + + matrix_[fixed_ch_pos - 1].mtrx_nd_num; + matrix_[fixed_ch_pos].mtrx_nd_num = 0; + } else { + mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos + + matrix_[fixed_ch_pos].mtrx_nd_num; + } + + for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) { + add_char(pys_[re_pos]); + } + } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) { + for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) { + uint16 splpos_begin = c_phrase_.sublma_start[subpos]; + uint16 splpos_end = c_phrase_.sublma_start[subpos + 1]; + for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) { + // If ch_pos is in this spelling + uint16 spl_start = c_phrase_.spl_start[splpos]; + uint16 spl_end = c_phrase_.spl_start[splpos + 1]; + if (ch_pos >= spl_start && ch_pos < spl_end) { + // Clear everything after this position + c_phrase_.chn_str[splpos] = static_cast<char16>('\0'); + c_phrase_.sublma_start[subpos + 1] = splpos; + c_phrase_.sublma_num = subpos + 1; + c_phrase_.length = splpos; + + if (splpos == splpos_begin) { + c_phrase_.sublma_num = subpos; + } + } + } + } + + // Extend the composing phrase. + reset_search0(); + dmi_c_phrase_ = true; + uint16 c_py_pos = 0; + while (c_py_pos < spl_start_[c_phrase_.length]) { + bool b_ac_tmp = add_char(pys_[c_py_pos]); + assert(b_ac_tmp); + c_py_pos++; + } + dmi_c_phrase_ = false; + + lma_id_num_ = 1; + fixed_lmas_ = 1; + fixed_lmas_no1_[0] = 0; // A composing string is always modified. + fixed_hzs_ = c_phrase_.length; + lma_start_[1] = fixed_hzs_; + lma_id_[0] = kLemmaIdComposing; + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; + } + } + + return true; +} + +void MatrixSearch::del_in_pys(size_t start, size_t len) { + while (start < kMaxRowNum - len && '\0' != pys_[start]) { + pys_[start] = pys_[start + len]; + start++; + } +} + +size_t MatrixSearch::search(const char *py, size_t py_len) { + if (!inited_ || NULL == py) + return 0; + + // If the search Pinyin string is too long, it will be truncated. + if (py_len > kMaxRowNum - 1) + py_len = kMaxRowNum - 1; + + // Compare the new string with the previous one. Find their prefix to + // increase search efficiency. + size_t ch_pos = 0; + for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) { + if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos]) + break; + } + + bool clear_fix = true; + if (ch_pos == pys_decoded_len_) + clear_fix = false; + + reset_search(ch_pos, clear_fix, false, false); + + memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos); + pys_[py_len] = '\0'; + + while ('\0' != pys_[ch_pos]) { + if (!add_char(py[ch_pos])) { + pys_decoded_len_ = ch_pos; + break; + } + ch_pos++; + } + + // Get spelling ids and starting positions. + get_spl_start_id(); + + // If there are too many spellings, remove the last letter until the spelling + // number is acceptable. + while (spl_id_num_ > 9) { + py_len--; + reset_search(py_len, false, false, false); + pys_[py_len] = '\0'; + get_spl_start_id(); + } + + prepare_candidates(); + + if (kPrintDebug0) { + printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_); + printf("--DMI Pool Used: %d\n", dmi_pool_used_); + + if (kPrintDebug1) { + for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) { + debug_print_dmi(pos, 1); + } + } + } + + return ch_pos; +} + +size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid, + bool clear_fixed_this_step) { + if (!inited_) + return 0; + + size_t reset_pos = pos; + + // Out of range for both Pinyin mode and Spelling id mode. + if (pys_decoded_len_ <= pos) { + del_in_pys(pos, 1); + + reset_pos = pys_decoded_len_; + // Decode the string after the un-decoded position + while ('\0' != pys_[reset_pos]) { + if (!add_char(pys_[reset_pos])) { + pys_decoded_len_ = reset_pos; + break; + } + reset_pos++; + } + get_spl_start_id(); + prepare_candidates(); + return pys_decoded_len_; + } + + // Spelling id mode, but out of range. + if (is_pos_in_splid && spl_id_num_ <= pos) + return pys_decoded_len_; + + // Begin to handle two modes respectively. + // Pinyin mode by default + size_t c_py_len = 0; // The length of composing phrase's Pinyin + size_t del_py_len = 1; + if (!is_pos_in_splid) { + // Pinyin mode is only allowed to delete beyond the fixed lemmas. + if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]]) + return pys_decoded_len_; + + del_in_pys(pos, 1); + + // If the deleted character is just the one after the last fixed lemma + if (pos == spl_start_[lma_start_[fixed_lmas_]]) { + // If all fixed lemmas have been merged, and the caller of the function + // request to unlock the last fixed lemma. + if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) { + // Unlock the last sub lemma in the composing phrase. Because it is not + // easy to unlock it directly. Instead, we re-decode the modified + // composing phrase. + c_phrase_.sublma_num--; + c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num]; + reset_pos = spl_start_[c_phrase_.length]; + c_py_len = reset_pos; + } + } + } else { + del_py_len = spl_start_[pos + 1] - spl_start_[pos]; + + del_in_pys(spl_start_[pos], del_py_len); + + if (pos >= lma_start_[fixed_lmas_]) { + c_py_len = 0; + reset_pos = spl_start_[pos + 1] - del_py_len; + } else { + c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len; + reset_pos = c_py_len; + if (c_py_len > 0) + merge_fixed_lmas(pos); + } + } + + if (c_py_len > 0) { + assert(c_phrase_.length > 0 && c_py_len == + c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]); + // The composing phrase is valid, reset all search space, + // and begin a new search which will only extend the composing + // phrase. + reset_search0(); + + dmi_c_phrase_ = true; + // Extend the composing phrase. + uint16 c_py_pos = 0; + while (c_py_pos < c_py_len) { + bool b_ac_tmp = add_char(pys_[c_py_pos]); + assert(b_ac_tmp); + c_py_pos++; + } + dmi_c_phrase_ = false; + + // Fixd the composing phrase as the first choice. + lma_id_num_ = 1; + fixed_lmas_ = 1; + fixed_lmas_no1_[0] = 0; // A composing string is always modified. + fixed_hzs_ = c_phrase_.length; + lma_start_[1] = fixed_hzs_; + lma_id_[0] = kLemmaIdComposing; + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; + } else { + // Reseting search only clear pys_decoded_len_, but the string is kept. + reset_search(reset_pos, clear_fixed_this_step, false, false); + } + + // Decode the string after the delete position. + while ('\0' != pys_[reset_pos]) { + if (!add_char(pys_[reset_pos])) { + pys_decoded_len_ = reset_pos; + break; + } + reset_pos++; + } + + get_spl_start_id(); + prepare_candidates(); + return pys_decoded_len_; +} + +size_t MatrixSearch::get_candidate_num() { + if (!inited_ || 0 == pys_decoded_len_ || + 0 == matrix_[pys_decoded_len_].mtrx_nd_num) + return 0; + + return 1 + lpi_total_; +} + +char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str, + size_t max_len) { + if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str) + return NULL; + + if (0 == cand_id) { + return get_candidate0(cand_str, max_len, NULL, false); + } else { + cand_id--; + } + + // For this case: the current sentence is a word only, and the user fixed it, + // so the result will be fixed to the sentence space, and + // lpi_total_ will be set to 0. + if (0 == lpi_total_) { + return get_candidate0(cand_str, max_len, NULL, false); + } + + LemmaIdType id = lpi_items_[cand_id].id; + char16 s[kMaxLemmaSize + 1]; + + uint16 s_len = lpi_items_[cand_id].lma_len; + if (s_len > 1) { + s_len = get_lemma_str(id, s, kMaxLemmaSize + 1); + } else { + // For a single character, Hanzi is ready. + s[0] = lpi_items_[cand_id].hanzi; + s[1] = static_cast<char16>(0); + } + + if (s_len > 0 && max_len > s_len) { + utf16_strncpy(cand_str, s, s_len); + cand_str[s_len] = (char16)'\0'; + return cand_str; + } + + return NULL; +} + +void MatrixSearch::update_dict_freq() { + if (NULL != user_dict_) { + // Update the total frequency of all lemmas, including system lemmas and + // user dictionary lemmas. + size_t total_freq = user_dict_->get_total_lemma_count(); + dict_trie_->set_total_lemma_count_of_others(total_freq); + } +} + +bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to, + float score) { + if (lma_to - lma_fr <= 1 || NULL == user_dict_) + return false; + + char16 word_str[kMaxLemmaSize + 1]; + uint16 spl_ids[kMaxLemmaSize]; + + uint16 spl_id_fr = 0; + + for (uint16 pos = lma_fr; pos < lma_to; pos++) { + LemmaIdType lma_id = lma_id_[pos]; + if (is_user_lemma(lma_id)) { + user_dict_->update_lemma(lma_id, 1, true); + } + uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos]; + utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len); + + uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr, + kMaxLemmaSize + 1 - spl_id_fr); + assert(tmp == lma_len); + + tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true); + if (tmp != lma_len) { + return false; + } + + spl_id_fr += lma_len; + } + + assert(spl_id_fr <= kMaxLemmaSize); + + return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids, + spl_id_fr, 1); +} + +void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) { + if (dmi_pos >= dmi_pool_used_) return; + + DictMatchInfo *dmi = dmi_pool_ + dmi_pos; + + if (1 == nest_level) { + printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos); + } + if (dmi->dict_level > 1) { + debug_print_dmi(dmi->dmi_fr, nest_level + 1); + } + printf("---%d\n", dmi->dict_level); + printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]); + printf(" Spelling : %s, %d\n", SpellingTrie::get_instance(). + get_spelling_str(dmi->spl_id), dmi->spl_id); + printf(" Total Pinyin Len: %d\n", dmi->splstr_len); + if (1 == nest_level) { + printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos); + } +} + +bool MatrixSearch::try_add_cand0_to_userdict() { + size_t new_cand_num = get_candidate_num(); + if (fixed_hzs_ > 0 && 1 == new_cand_num) { + float score_from = 0; + uint16 lma_id_from = 0; + uint16 pos = 0; + bool modified = false; + while (pos < fixed_lmas_) { + if (lma_start_[pos + 1] - lma_start_[lma_id_from] > + static_cast<uint16>(kMaxLemmaSize)) { + float score_to_add = + mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]] + .mtrx_nd_pos].score - score_from; + if (modified) { + score_to_add += 1.0; + if (score_to_add > NGram::kMaxScore) { + score_to_add = NGram::kMaxScore; + } + add_lma_to_userdict(lma_id_from, pos, score_to_add); + } + lma_id_from = pos; + score_from += score_to_add; + + // Clear the flag for next user lemma. + modified = false; + } + + if (0 == fixed_lmas_no1_[pos]) { + modified = true; + } + pos++; + } + + // Single-char word is not allowed to add to userdict. + if (lma_start_[pos] - lma_start_[lma_id_from] > 1) { + float score_to_add = + mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]] + .mtrx_nd_pos].score - score_from; + if (modified) { + score_to_add += 1.0; + if (score_to_add > NGram::kMaxScore) { + score_to_add = NGram::kMaxScore; + } + add_lma_to_userdict(lma_id_from, pos, score_to_add); + } + } + } + return true; +} + +// Choose a candidate, and give new candidates for next step. +// If user finishes selection, we will try to communicate with user dictionary +// to add new items or update score of some existing items. +// +// Basic rule: +// 1. If user selects the first choice: +// 1.1. If the first choice is not a sentence, instead, it is a lemma: +// 1.1.1. If the first choice is a user lemma, notify the user +// dictionary that a user lemma is hit, and add occuring count +// by 1. +// 1.1.2. If the first choice is a system lemma, do nothing. +// 1.2. If the first choice is a sentence containing more than one lemma: +// 1.2.1. The whole sentence will be added as a user lemma. If the +// sentence contains user lemmas, -> hit, and add occuring count +// by 1. +size_t MatrixSearch::choose(size_t cand_id) { + if (!inited_ || 0 == pys_decoded_len_) + return 0; + + if (0 == cand_id) { + fixed_hzs_ = spl_id_num_; + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; + for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) { + fixed_lmas_no1_[pos] = 1; + } + fixed_lmas_ = lma_id_num_; + lpi_total_ = 0; // Clean all other candidates. + + // 1. It is the first choice + if (1 == lma_id_num_) { + // 1.1. The first choice is not a sentence but a lemma + if (is_user_lemma(lma_id_[0])) { + // 1.1.1. The first choice is a user lemma, notify the user dictionary + // that it is hit. + if (NULL != user_dict_) + user_dict_->update_lemma(lma_id_[0], 1, true); + } else { + // 1.1.2. do thing for a system lemma. + } + } else { + // 1.2. The first choice is a sentence. + // 1.2.1 Try to add the whole sentence to user dictionary, the whole + // sentence may be splitted into many items. + if (NULL != user_dict_) { + try_add_cand0_to_userdict(); + } + } + update_dict_freq(); + return 1; + } else { + cand_id--; + } + + // 2. It is not the full sentence candidate. + // Find the length of the candidate. + LemmaIdType id_chosen = lpi_items_[cand_id].id; + LmaScoreType score_chosen = lpi_items_[cand_id].psb; + size_t cand_len = lpi_items_[cand_id].lma_len; + + assert(cand_len > 0); + + // Notify the atom dictionary that this item is hit. + if (is_user_lemma(id_chosen)) { + if (NULL != user_dict_) { + user_dict_->update_lemma(id_chosen, 1, true); + } + update_dict_freq(); + } + + // 3. Fixed the chosen item. + // 3.1 Get the steps number. + size_t step_fr = spl_start_[fixed_hzs_]; + size_t step_to = spl_start_[fixed_hzs_ + cand_len]; + + // 3.2 Save the length of the original string. + size_t pys_decoded_len = pys_decoded_len_; + + // 3.2 Reset the space of the fixed part. + reset_search(step_to, false, false, true); + + // 3.3 For the last character of the fixed part, the previous DMI + // information will be kept, while the MTRX information will be re-extended, + // and only one node will be extended. + matrix_[step_to].mtrx_nd_num = 0; + + LmaPsbItem lpi_item; + lpi_item.psb = score_chosen; + lpi_item.id = id_chosen; + + PoolPosType step_to_dmi_fr = match_dmi(step_to, + spl_id_ + fixed_hzs_, cand_len); + //assert(step_to_dmi_fr != static_cast<PoolPosType>(-1)); + + extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1, + step_to_dmi_fr, step_to); + + matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos; + mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos + + matrix_[step_to].mtrx_nd_num; + + if (id_chosen == lma_id_[fixed_lmas_]) + fixed_lmas_no1_[fixed_lmas_] = 1; + else + fixed_lmas_no1_[fixed_lmas_] = 0; + lma_id_[fixed_lmas_] = id_chosen; + lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len; + fixed_lmas_++; + fixed_hzs_ = fixed_hzs_ + cand_len; + + while (step_to != pys_decoded_len) { + bool b = add_char(pys_[step_to]); + assert(b); + step_to++; + } + + if (fixed_hzs_ < spl_id_num_) { + prepare_candidates(); + } else { + lpi_total_ = 0; + if (NULL != user_dict_) { + try_add_cand0_to_userdict(); + } + } + + return get_candidate_num(); +} + +size_t MatrixSearch::cancel_last_choice() { + if (!inited_ || 0 == pys_decoded_len_) + return 0; + + size_t step_start = 0; + if (fixed_hzs_ > 0) { + size_t step_end = spl_start_[fixed_hzs_]; + MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed; + assert(NULL != end_node); + + step_start = end_node->from->step; + + if (step_start > 0) { + DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr; + fixed_hzs_ -= dmi->dict_level; + } else { + fixed_hzs_ = 0; + } + + reset_search(step_start, false, false, false); + + while (pys_[step_start] != '\0') { + bool b = add_char(pys_[step_start]); + assert(b); + step_start++; + } + + prepare_candidates(); + } + return get_candidate_num(); +} + +size_t MatrixSearch::get_fixedlen() { + if (!inited_ || 0 == pys_decoded_len_) + return 0; + return fixed_hzs_; +} + +bool MatrixSearch::prepare_add_char(char ch) { + if (pys_decoded_len_ >= kMaxRowNum - 1 || + (!spl_parser_->is_valid_to_parse(ch) && ch != '\'')) + return false; + + if (dmi_pool_used_ >= kDmiPoolSize) return false; + + pys_[pys_decoded_len_] = ch; + pys_decoded_len_++; + + MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_; + mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_; + mtrx_this_row->mtrx_nd_num = 0; + mtrx_this_row->dmi_pos = dmi_pool_used_; + mtrx_this_row->dmi_num = 0; + mtrx_this_row->dmi_has_full_id = 0; + + return true; +} + +bool MatrixSearch::is_split_at(uint16 pos) { + return !spl_parser_->is_valid_to_parse(pys_[pos - 1]); +} + +void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, + PoolPosType dmi_fr, uint16 spl_id, + uint16 node_num, unsigned char dict_level, + bool splid_end_split, unsigned char splstr_len, + unsigned char all_full_id) { + dmi->dict_handles[0] = handles[0]; + dmi->dict_handles[1] = handles[1]; + dmi->dmi_fr = dmi_fr; + dmi->spl_id = spl_id; + dmi->dict_level = dict_level; + dmi->splid_end_split = splid_end_split ? 1 : 0; + dmi->splstr_len = splstr_len; + dmi->all_full_id = all_full_id; + dmi->c_phrase = 0; +} + +bool MatrixSearch::add_char(char ch) { + if (!prepare_add_char(ch)) + return false; + return add_char_qwerty(); +} + +bool MatrixSearch::add_char_qwerty() { + matrix_[pys_decoded_len_].mtrx_nd_num = 0; + + bool spl_matched = false; + uint16 longest_ext = 0; + // Extend the search matrix, from the oldest unfixed row. ext_len means + // extending length. + for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) { + if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_]) + continue; + + // Refer to the declaration of the variable dmi_has_full_id for the + // explanation of this piece of code. In one word, it is used to prevent + // from the unwise extending of "shoud ou" but allow the reasonable + // extending of "heng ao", "lang a", etc. + if (ext_len > 1 && 0 != longest_ext && + 0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) { + if (xi_an_enabled_) + continue; + else + break; + } + + uint16 oldrow = pys_decoded_len_ - ext_len; + + // 0. If that row is before the last fixed step, ignore. + if (spl_start_[fixed_hzs_] > oldrow) + continue; + + // 1. Check if that old row has valid MatrixNode. If no, means that row is + // not a boundary, either a word boundary or a spelling boundary. + // If it is for extending composing phrase, it's OK to ignore the 0. + if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_) + continue; + + // 2. Get spelling id(s) for the last ext_len chars. + uint16 spl_idx; + bool is_pre = false; + spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow, + ext_len, &is_pre); + if (is_pre) + spl_matched = true; + + if (0 == spl_idx) + continue; + + bool splid_end_split = is_split_at(oldrow + ext_len); + + // 3. Extend the DMI nodes of that old row + // + 1 is to extend an extra node from the root + for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos; + dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1; + dmi_pos++) { + DictMatchInfo *dmi = dmi_pool_ + dmi_pos; + if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) { + dmi = NULL; // The last one, NULL means extending from the root. + } else { + // If the dmi is covered by the fixed arrange, ignore it. + if (fixed_hzs_ > 0 && + pys_decoded_len_ - ext_len - dmi->splstr_len < + spl_start_[fixed_hzs_]) { + continue; + } + // If it is not in mode for composing phrase, and the source DMI node + // is marked for composing phrase, ignore this node. + if (dmi->c_phrase != 0 && !dmi_c_phrase_) { + continue; + } + } + + // For example, if "gao" is extended, "g ao" is not allowed. + // or "zh" has been passed, "z h" is not allowed. + // Both word and word-connection will be prevented. + if (longest_ext > ext_len) { + if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) { + continue; + } + + // "z h" is not allowed. + if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) { + continue; + } + } + + dep_->splids_extended = 0; + if (NULL != dmi) { + uint16 prev_ids_num = dmi->dict_level; + if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) || + (dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) { + continue; + } + + DictMatchInfo *d = dmi; + while (d) { + dep_->splids[--prev_ids_num] = d->spl_id; + if ((PoolPosType)-1 == d->dmi_fr) + break; + d = dmi_pool_ + d->dmi_fr; + } + assert(0 == prev_ids_num); + dep_->splids_extended = dmi->dict_level; + } + dep_->splids[dep_->splids_extended] = spl_idx; + dep_->ext_len = ext_len; + dep_->splid_end_split = splid_end_split; + + dep_->id_num = 1; + dep_->id_start = spl_idx; + if (spl_trie_->is_half_id(spl_idx)) { + // Get the full id list + dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start)); + assert(dep_->id_num > 0); + } + + uint16 new_dmi_num; + + new_dmi_num = extend_dmi(dep_, dmi); + + if (new_dmi_num > 0) { + if (dmi_c_phrase_) { + dmi_pool_[dmi_pool_used_].c_phrase = 1; + } + matrix_[pys_decoded_len_].dmi_num += new_dmi_num; + dmi_pool_used_ += new_dmi_num; + + if (!spl_trie_->is_half_id(spl_idx)) + matrix_[pys_decoded_len_].dmi_has_full_id = 1; + } + + // If get candiate lemmas, try to extend the path + if (lpi_total_ > 0) { + uint16 fr_row; + if (NULL == dmi) { + fr_row = oldrow; + } else { + assert(oldrow >= dmi->splstr_len); + fr_row = oldrow - dmi->splstr_len; + } + for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos; + mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos + + matrix_[fr_row].mtrx_nd_num; + mtrx_nd_pos++) { + MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos; + + extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_, + dmi_pool_used_ - new_dmi_num, pys_decoded_len_); + if (longest_ext == 0) + longest_ext = ext_len; + } + } + } // for dmi_pos + } // for ext_len + mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num; + + if (dmi_c_phrase_) + return true; + + return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched); +} + +void MatrixSearch::prepare_candidates() { + // Get candiates from the first un-fixed step. + uint16 lma_size_max = kMaxLemmaSize; + if (lma_size_max > spl_id_num_ - fixed_hzs_) + lma_size_max = spl_id_num_ - fixed_hzs_; + + uint16 lma_size = lma_size_max; + + // If the full sentense candidate's unfixed part may be the same with a normal + // lemma. Remove the lemma candidate in this case. + char16 fullsent[kMaxLemmaSize + 1]; + char16 *pfullsent = NULL; + uint16 sent_len; + pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true); + + // If the unfixed part contains more than one ids, it is not necessary to + // check whether a lemma's string is the same to the unfixed part of the full + // sentence candidate, so, set it to NULL; + if (sent_len > kMaxLemmaSize) + pfullsent = NULL; + + lpi_total_ = 0; + size_t lpi_num_full_match = 0; // Number of items which are fully-matched. + while (lma_size > 0) { + size_t lma_num; + lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size, + lpi_items_ + lpi_total_, + size_t(kMaxLmaPsbItems - lpi_total_), + pfullsent, lma_size == lma_size_max); + + if (lma_num > 0) { + lpi_total_ += lma_num; + // For next lemma candidates which are not the longest, it is not + // necessary to compare with the full sentence candiate. + pfullsent = NULL; + } + if (lma_size == lma_size_max) { + lpi_num_full_match = lpi_total_; + } + lma_size--; + } + + // Sort those partially-matched items by their unified scores. + myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match, + sizeof(LmaPsbItem), cmp_lpi_with_unified_psb); + + if (kPrintDebug0) { + printf("-----Prepare candidates, score:\n"); + for (size_t a = 0; a < lpi_total_; a++) { + printf("[%03d]%d ", a, lpi_items_[a].psb); + if ((a + 1) % 6 == 0) printf("\n"); + } + printf("\n"); + } + + if (kPrintDebug0) { + printf("--- lpi_total_ = %d\n", lpi_total_); + } +} + +const char* MatrixSearch::get_pystr(size_t *decoded_len) { + if (!inited_ || NULL == decoded_len) + return NULL; + + *decoded_len = pys_decoded_len_; + return pys_; +} + +void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) { + if (fixed_lmas_ == 0) + return; + // Update spelling segmentation information first. + spl_id_num_ -= 1; + uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos]; + for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) { + spl_start_[pos] = spl_start_[pos + 1] - del_py_len; + if (pos == spl_id_num_) + break; + spl_id_[pos] = spl_id_[pos + 1]; + } + + // Begin to merge. + uint16 phrase_len = 0; + + // Update the spelling ids to the composing phrase. + // We need to convert these ids into full id in the future. + memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16)); + memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16)); + + // If composing phrase has not been created, first merge all fixed + // lemmas into a composing phrase without deletion. + if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) { + uint16 bp = 1; // Begin position of real fixed lemmas. + // There is no existing composing phrase. + if (kLemmaIdComposing != lma_id_[0]) { + c_phrase_.sublma_num = 0; + bp = 0; + } + + uint16 sub_num = c_phrase_.sublma_num; + for (uint16 pos = bp; pos <= fixed_lmas_; pos++) { + c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos]; + if (lma_start_[pos] > del_spl_pos) { + c_phrase_.sublma_start[sub_num + pos - bp] -= 1; + } + + if (pos == fixed_lmas_) + break; + + uint16 lma_len; + char16 *lma_str = c_phrase_.chn_str + + c_phrase_.sublma_start[sub_num] + phrase_len; + + lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len); + assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]); + phrase_len += lma_len; + } + assert(phrase_len == lma_start_[fixed_lmas_]); + c_phrase_.length = phrase_len; // will be deleted by 1 + c_phrase_.sublma_num += fixed_lmas_ - bp; + } else { + for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) { + if (c_phrase_.sublma_start[pos] > del_spl_pos) { + c_phrase_.sublma_start[pos] -= 1; + } + } + phrase_len = c_phrase_.length; + } + + assert(phrase_len > 0); + if (1 == phrase_len) { + // After the only one is deleted, nothing will be left. + fixed_lmas_ = 0; + return; + } + + // Delete the Chinese character in the merged phrase. + // The corresponding elements in spl_ids and spl_start of the + // phrase have been deleted. + char16 *chn_str = c_phrase_.chn_str + del_spl_pos; + for (uint16 pos = 0; + pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos; + pos++) { + chn_str[pos] = chn_str[pos + 1]; + } + c_phrase_.length -= 1; + + // If the deleted spelling id is in a sub lemma which contains more than + // one id, del_a_sub will be false; but if the deleted id is in a sub lemma + // which only contains 1 id, the whole sub lemma needs to be deleted, so + // del_a_sub will be true. + bool del_a_sub = false; + for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) { + if (c_phrase_.sublma_start[pos - 1] == + c_phrase_.sublma_start[pos]) { + del_a_sub = true; + } + if (del_a_sub) { + c_phrase_.sublma_start[pos - 1] = + c_phrase_.sublma_start[pos]; + } + } + if (del_a_sub) + c_phrase_.sublma_num -= 1; + + return; +} + +void MatrixSearch::get_spl_start_id() { + lma_id_num_ = 0; + lma_start_[0] = 0; + + spl_id_num_ = 0; + spl_start_[0] = 0; + if (!inited_ || 0 == pys_decoded_len_ || + 0 == matrix_[pys_decoded_len_].mtrx_nd_num) + return; + + // Calculate number of lemmas and spellings + // Only scan those part which is not fixed. + lma_id_num_ = fixed_lmas_; + spl_id_num_ = fixed_hzs_; + + MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos; + while (mtrx_nd != mtrx_nd_pool_) { + if (fixed_hzs_ > 0) { + if (mtrx_nd->step <= spl_start_[fixed_hzs_]) + break; + } + + // Update the spelling segamentation information + unsigned char word_splstr_len = 0; + PoolPosType dmi_fr = mtrx_nd->dmi_fr; + if ((PoolPosType)-1 != dmi_fr) + word_splstr_len = dmi_pool_[dmi_fr].splstr_len; + + while ((PoolPosType)-1 != dmi_fr) { + spl_start_[spl_id_num_ + 1] = mtrx_nd->step - + (word_splstr_len - dmi_pool_[dmi_fr].splstr_len); + spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id; + spl_id_num_++; + dmi_fr = dmi_pool_[dmi_fr].dmi_fr; + } + + // Update the lemma segmentation information + lma_start_[lma_id_num_ + 1] = spl_id_num_; + lma_id_[lma_id_num_] = mtrx_nd->id; + lma_id_num_++; + + mtrx_nd = mtrx_nd->from; + } + + // Reverse the result of spelling info + for (size_t pos = fixed_hzs_; + pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) { + if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) { + spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_]; + spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1]; + spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_]; + + spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1]; + spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos]; + spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1]; + } + } + + // Reverse the result of lemma info + for (size_t pos = fixed_lmas_; + pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) { + assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos); + + if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) { + lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_]; + lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1]; + lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_]; + + lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_]; + lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos]; + lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_]; + } + } + + for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) { + if (pos < lma_id_num_) + lma_start_[pos] = lma_start_[pos - 1] + + (lma_start_[pos] - lma_start_[pos + 1]); + else + lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] - + lma_start_[fixed_lmas_]; + } + + // Find the last fixed position + fixed_hzs_ = 0; + for (size_t pos = spl_id_num_; pos > 0; pos--) { + if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) { + fixed_hzs_ = pos; + break; + } + } + + return; +} + +size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) { + get_spl_start_id(); + spl_start = spl_start_; + return spl_id_num_; +} + +size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) { + if (dmi_pool_used_ >= kDmiPoolSize) return 0; + + if (dmi_c_phrase_) + return extend_dmi_c(dep, dmi_s); + + LpiCache& lpi_cache = LpiCache::get_instance(); + uint16 splid = dep->splids[dep->splids_extended]; + + bool cached = false; + if (0 == dep->splids_extended) + cached = lpi_cache.is_cached(splid); + + // 1. If this is a half Id, get its corresponding full starting Id and + // number of full Id. + size_t ret_val = 0; + PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node + + lpi_total_ = 0; + + MileStoneHandle from_h[3]; + from_h[0] = 0; + from_h[1] = 0; + + if (0 != dep->splids_extended) { + from_h[0] = dmi_s->dict_handles[0]; + from_h[1] = dmi_s->dict_handles[1]; + } + + // 2. Begin exgtending in the system dictionary + size_t lpi_num = 0; + MileStoneHandle handles[2]; + handles[0] = handles[1] = 0; + if (from_h[0] > 0 || NULL == dmi_s) { + handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_, + kMaxLmaPsbItems, &lpi_num); + } + if (handles[0] > 0) + lpi_total_ = lpi_num; + + if (NULL == dmi_s) { // from root + assert(0 != handles[0]); + mtrx_dmi_fr = dmi_pool_used_; + } + + // 3. Begin extending in the user dictionary + if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) { + handles[1] = user_dict_->extend_dict(from_h[1], dep, + lpi_items_ + lpi_total_, + kMaxLmaPsbItems - lpi_total_, + &lpi_num); + if (handles[1] > 0) { + if (kPrintDebug0) { + for (size_t t = 0; t < lpi_num; t++) { + printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id, + lpi_items_[lpi_total_ + t].psb); + } + } + lpi_total_ += lpi_num; + } + } + + if (0 != handles[0] || 0 != handles[1]) { + if (dmi_pool_used_ >= kDmiPoolSize) return 0; + + DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_; + if (NULL == dmi_s) { + fill_dmi(dmi_add, handles, + (PoolPosType)-1, splid, + 1, 1, dep->splid_end_split, dep->ext_len, + spl_trie_->is_half_id(splid) ? 0 : 1); + } else { + fill_dmi(dmi_add, handles, + dmi_s - dmi_pool_, splid, 1, + dmi_s->dict_level + 1, dep->splid_end_split, + dmi_s->splstr_len + dep->ext_len, + spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id); + } + + ret_val = 1; + } + + if (!cached) { + if (0 == lpi_total_) + return ret_val; + + if (kPrintDebug0) { + printf("--- lpi_total_ = %d\n", lpi_total_); + } + + myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb); + if (NULL == dmi_s && spl_trie_->is_half_id(splid)) + lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_); + } else { + assert(spl_trie_->is_half_id(splid)); + lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems); + } + + return ret_val; +} + +size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) { + lpi_total_ = 0; + + uint16 pos = dep->splids_extended; + assert(dmi_c_phrase_); + if (pos >= c_phrase_.length) + return 0; + + uint16 splid = dep->splids[pos]; + if (splid == c_phrase_.spl_ids[pos]) { + DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_; + MileStoneHandle handles[2]; // Actually never used. + if (NULL == dmi_s) + fill_dmi(dmi_add, handles, + (PoolPosType)-1, splid, + 1, 1, dep->splid_end_split, dep->ext_len, + spl_trie_->is_half_id(splid) ? 0 : 1); + else + fill_dmi(dmi_add, handles, + dmi_s - dmi_pool_, splid, 1, + dmi_s->dict_level + 1, dep->splid_end_split, + dmi_s->splstr_len + dep->ext_len, + spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id); + + if (pos == c_phrase_.length - 1) { + lpi_items_[0].id = kLemmaIdComposing; + lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score. + lpi_total_ = 1; + } + return 1; + } + return 0; +} + +size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], + size_t lpi_num, PoolPosType dmi_fr, + size_t res_row) { + assert(NULL != mtrx_nd); + matrix_[res_row].mtrx_nd_fixed = NULL; + + if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow) + return 0; + + if (0 == mtrx_nd->step) { + // Because the list is sorted, if the source step is 0, it is only + // necessary to pick up the first kMaxNodeARow items. + if (lpi_num > kMaxNodeARow) + lpi_num = kMaxNodeARow; + } + + MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos; + for (size_t pos = 0; pos < lpi_num; pos++) { + float score = mtrx_nd->score + lpi_items[pos].psb; + if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score) + break; + + // Try to add a new node + size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num; + MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num; + bool replace = false; + // Find its position + while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) { + if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow) + *mtrx_nd_res = *(mtrx_nd_res - 1); + mtrx_nd_res--; + replace = true; + } + if (replace || (mtrx_nd_num < kMaxNodeARow && + matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) { + mtrx_nd_res->id = lpi_items[pos].id; + mtrx_nd_res->score = score; + mtrx_nd_res->from = mtrx_nd; + mtrx_nd_res->dmi_fr = dmi_fr; + mtrx_nd_res->step = res_row; + if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow) + matrix_[res_row].mtrx_nd_num++; + } + } + return matrix_[res_row].mtrx_nd_num; +} + +PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[], + uint16 spl_id_num) { + if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) { + return static_cast<PoolPosType>(-1); + } + + for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) { + DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos; + + if (dmi->dict_level != spl_id_num) + continue; + + bool matched = true; + for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) { + if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) { + matched = false; + break; + } + + dmi = dmi_pool_ + dmi->dmi_fr; + } + if (matched) { + return matrix_[step_to].dmi_pos + dmi_pos; + } + } + + return static_cast<PoolPosType>(-1); +} + +char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len, + uint16 *retstr_len, + bool only_unfixed) { + if (pys_decoded_len_ == 0 || + matrix_[pys_decoded_len_].mtrx_nd_num == 0) + return NULL; + + LemmaIdType idxs[kMaxRowNum]; + size_t id_num = 0; + + MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos; + + if (kPrintDebug0) { + printf("--- sentence score: %f\n", mtrx_nd->score); + } + + if (kPrintDebug1) { + printf("==============Sentence DMI (reverse order) begin===========>>\n"); + } + + while (mtrx_nd != NULL) { + idxs[id_num] = mtrx_nd->id; + id_num++; + + if (kPrintDebug1) { + printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n", + mtrx_nd->step, mtrx_nd->id, mtrx_nd->score); + debug_print_dmi(mtrx_nd->dmi_fr, 1); + } + + mtrx_nd = mtrx_nd->from; + } + + if (kPrintDebug1) { + printf("<<==============Sentence DMI (reverse order) end=============\n"); + } + + size_t ret_pos = 0; + do { + id_num--; + if (0 == idxs[id_num]) + continue; + + char16 str[kMaxLemmaSize + 1]; + uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1); + if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) || + (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) { + if (!only_unfixed) + utf16_strncpy(cand_str + ret_pos, str, str_len); + else if (ret_pos >= fixed_hzs_) + utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len); + + ret_pos += str_len; + } else { + return NULL; + } + } while (id_num != 0); + + if (!only_unfixed) { + if (NULL != retstr_len) + *retstr_len = ret_pos; + cand_str[ret_pos] = (char16)'\0'; + } else { + if (NULL != retstr_len) + *retstr_len = ret_pos - fixed_hzs_; + cand_str[ret_pos - fixed_hzs_] = (char16)'\0'; + } + return cand_str; +} + +size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len, + LmaPsbItem* lma_buf, size_t max_lma_buf, + const char16 *pfullsent, bool sort_by_psb) { + if (splid_str_len > kMaxLemmaSize) + return 0; + + size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len, + lma_buf, max_lma_buf); + size_t num2 = 0; + if (NULL != user_dict_) { + num2 = user_dict_->get_lpis(splid_str, splid_str_len, + lma_buf + num1, max_lma_buf - num1); + } + + size_t num = num1 + num2; + + if (0 == num) + return 0; + + // Remove repeated items. + if (splid_str_len > 1) { + LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num); + size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) / + sizeof(LmaPsbStrItem); + //assert(lpsi_num > num); + if (num > lpsi_num) num = lpsi_num; + lpsi_num = num; + + for (size_t pos = 0; pos < lpsi_num; pos++) { + lpsis[pos].lpi = lma_buf[pos]; + get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1); + } + + myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str); + + size_t remain_num = 0; + for (size_t pos = 0; pos < lpsi_num; pos++) { + if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) { + if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) { + assert(remain_num > 0); + lma_buf[remain_num - 1] = lpsis[pos].lpi; + } + continue; + } + if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0) + continue; + + lma_buf[remain_num] = lpsis[pos].lpi; + remain_num++; + } + + // Update the result number + num = remain_num; + } else { + // For single character, some characters have more than one spelling, for + // example, "de" and "di" are all valid for a Chinese character, so when + // the user input "d", repeated items are generated. + // For single character lemmas, Hanzis will be gotten + for (size_t pos = 0; pos < num; pos++) { + char16 hanzis[2]; + get_lemma_str(lma_buf[pos].id, hanzis, 2); + lma_buf[pos].hanzi = hanzis[0]; + } + + myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi); + + size_t remain_num = 0; + for (size_t pos = 0; pos < num; pos++) { + if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) { + if (NULL != pfullsent && + static_cast<char16>(0) == pfullsent[1] && + lma_buf[pos].hanzi == pfullsent[0]) + continue; + + if (lma_buf[pos].psb < lma_buf[pos - 1].psb) { + assert(remain_num > 0); + assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi); + lma_buf[remain_num - 1] = lma_buf[pos]; + } + continue; + } + if (NULL != pfullsent && + static_cast<char16>(0) == pfullsent[1] && + lma_buf[pos].hanzi == pfullsent[0]) + continue; + + lma_buf[remain_num] = lma_buf[pos]; + remain_num++; + } + + num = remain_num; + } + + if (sort_by_psb) { + myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb); + } + return num; +} + +uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, + uint16 str_max) { + uint16 str_len = 0; + + if (is_system_lemma(id_lemma)) { + str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max); + } else if (is_user_lemma(id_lemma)) { + if (NULL != user_dict_) { + str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max); + } else { + str_len = 0; + str_buf[0] = static_cast<char16>('\0'); + } + } else if (is_composing_lemma(id_lemma)) { + if (str_max <= 1) + return 0; + str_len = c_phrase_.sublma_start[c_phrase_.sublma_num]; + if (str_len > str_max - 1) + str_len = str_max - 1; + utf16_strncpy(str_buf, c_phrase_.chn_str, str_len); + str_buf[str_len] = (char16)'\0'; + return str_len; + } + + return str_len; +} + +uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, + uint16 splids_max, bool arg_valid) { + uint16 splid_num = 0; + + if (arg_valid) { + for (splid_num = 0; splid_num < splids_max; splid_num++) { + if (spl_trie_->is_half_id(splids[splid_num])) + break; + } + if (splid_num == splids_max) + return splid_num; + } + + if (is_system_lemma(id_lemma)) { + splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max, + arg_valid); + } else if (is_user_lemma(id_lemma)) { + if (NULL != user_dict_) { + splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max, + arg_valid); + } else { + splid_num = 0; + } + } else if (is_composing_lemma(id_lemma)) { + if (c_phrase_.length > splids_max) { + return 0; + } + for (uint16 pos = 0; pos < c_phrase_.length; pos++) { + splids[pos] = c_phrase_.spl_ids[pos]; + if (spl_trie_->is_half_id(splids[pos])) { + return 0; + } + } + } + return splid_num; +} + +size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len, + char16 predict_buf[][kMaxPredictSize + 1], + size_t buf_len) { + size_t res_total = 0; + memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_); + // In order to shorten the comments, j-character candidates predicted by + // i-character prefix are called P(i,j). All candiates predicted by + // i-character prefix are called P(i,*) + // Step 1. Get P(kMaxPredictSize, *) and sort them, here + // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1) + for (size_t len = fixed_len; len >0; len--) { + // How many blank items are available + size_t this_max = npre_items_len_ - res_total; + size_t res_this; + // If the history is longer than 1, and we can not get prediction from + // lemmas longer than 2, in this case, we will add lemmas with + // highest scores as the prediction result. + if (fixed_len > 1 && 1 == len && 0 == res_total) { + // Try to find if recent n (n>1) characters can be a valid lemma in system + // dictionary. + bool nearest_n_word = false; + for (size_t nlen = 2; nlen <= fixed_len; nlen++) { + if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) { + nearest_n_word = true; + break; + } + } + res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0, + npre_items_ + res_total, + this_max, res_total); + res_total += res_this; + } + + // How many blank items are available + this_max = npre_items_len_ - res_total; + res_this = 0; + if (!kOnlyUserDictPredict) { + res_this = + dict_trie_->predict(fixed_buf + fixed_len - len, len, + npre_items_ + res_total, this_max, + res_total); + } + + if (NULL != user_dict_) { + res_this = res_this + + user_dict_->predict(fixed_buf + fixed_len - len, len, + npre_items_ + res_total + res_this, + this_max - res_this, res_total + res_this); + } + + if (kPredictLimitGt1) { + myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem), + cmp_npre_by_score); + + if (len > 3) { + if (res_this > kMaxPredictNumByGt3) + res_this = kMaxPredictNumByGt3; + } else if (3 == len) { + if (res_this > kMaxPredictNumBy3) + res_this = kMaxPredictNumBy3; + } else if (2 == len) { + if (res_this > kMaxPredictNumBy2) + res_this = kMaxPredictNumBy2; + } + } + + res_total += res_this; + } + + res_total = remove_duplicate_npre(npre_items_, res_total); + + if (kPreferLongHistoryPredict) { + myqsort(npre_items_, res_total, sizeof(NPredictItem), + cmp_npre_by_hislen_score); + } else { + myqsort(npre_items_, res_total, sizeof(NPredictItem), + cmp_npre_by_score); + } + + if (buf_len < res_total) { + res_total = buf_len; + } + + if (kPrintDebug2) { + printf("/////////////////Predicted Items Begin////////////////////>>\n"); + for (size_t i = 0; i < res_total; i++) { + printf("---"); + for (size_t j = 0; j < kMaxPredictSize; j++) { + printf("%d ", npre_items_[i].pre_hzs[j]); + } + printf("\n"); + } + printf("<<///////////////Predicted Items End////////////////////////\n"); + } + + for (size_t i = 0; i < res_total; i++) { + utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs, + kMaxPredictSize); + predict_buf[i][kMaxPredictSize] = '\0'; + } + + return res_total; +} + +size_t MatrixSearch::get_predicts(const char16 fixed_buf[], + char16 predict_buf[][kMaxPredictSize + 1], + size_t buf_len) { + size_t fixed_len = utf16_strlen(fixed_buf); + if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len) + return 0; + + return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len); +} + +} // namespace ime_pinyin |