/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "../include/dictlist.h" #include "../include/mystdlib.h" #include "../include/ngram.h" #include "../include/searchutility.h" namespace ime_pinyin { DictList::DictList() { initialized_ = false; scis_num_ = 0; scis_hz_ = NULL; scis_splid_ = NULL; buf_ = NULL; spl_trie_ = SpellingTrie::get_cpinstance(); assert(kMaxLemmaSize == 8); cmp_func_[0] = cmp_hanzis_1; cmp_func_[1] = cmp_hanzis_2; cmp_func_[2] = cmp_hanzis_3; cmp_func_[3] = cmp_hanzis_4; cmp_func_[4] = cmp_hanzis_5; cmp_func_[5] = cmp_hanzis_6; cmp_func_[6] = cmp_hanzis_7; cmp_func_[7] = cmp_hanzis_8; } DictList::~DictList() { free_resource(); } bool DictList::alloc_resource(size_t buf_size, size_t scis_num) { // Allocate memory buf_ = static_cast(malloc(buf_size * sizeof(char16))); if (NULL == buf_) return false; scis_num_ = scis_num; scis_hz_ = static_cast(malloc(scis_num_ * sizeof(char16))); if (NULL == scis_hz_) return false; scis_splid_ = static_cast (malloc(scis_num_ * sizeof(SpellingId))); if (NULL == scis_splid_) return false; return true; } void DictList::free_resource() { if (NULL != buf_) free(buf_); buf_ = NULL; if (NULL != scis_hz_) free(scis_hz_); scis_hz_ = NULL; if (NULL != scis_splid_) free(scis_splid_); scis_splid_ = NULL; } #ifdef ___BUILD_MODEL___ bool DictList::init_list(const SingleCharItem *scis, size_t scis_num, const LemmaEntry *lemma_arr, size_t lemma_num) { if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num) return false; initialized_ = false; if (NULL != buf_) free(buf_); // calculate the size size_t buf_size = calculate_size(lemma_arr, lemma_num); if (0 == buf_size) return false; if (!alloc_resource(buf_size, scis_num)) return false; fill_scis(scis, scis_num); // Copy the related content from the array to inner buffer fill_list(lemma_arr, lemma_num); initialized_ = true; return true; } size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) { size_t last_hz_len = 0; size_t list_size = 0; size_t id_num = 0; for (size_t i = 0; i < lemma_num; i++) { if (0 == i) { last_hz_len = lemma_arr[i].hz_str_len; assert(last_hz_len > 0); assert(lemma_arr[0].idx_by_hz == 1); id_num++; start_pos_[0] = 0; start_id_[0] = id_num; last_hz_len = 1; list_size += last_hz_len; } else { size_t current_hz_len = lemma_arr[i].hz_str_len; assert(current_hz_len >= last_hz_len); if (current_hz_len == last_hz_len) { list_size += current_hz_len; id_num++; } else { for (size_t len = last_hz_len; len < current_hz_len - 1; len++) { start_pos_[len] = start_pos_[len - 1]; start_id_[len] = start_id_[len - 1]; } start_pos_[current_hz_len - 1] = list_size; id_num++; start_id_[current_hz_len - 1] = id_num; last_hz_len = current_hz_len; list_size += current_hz_len; } } } for (size_t i = last_hz_len; i <= kMaxLemmaSize; i++) { if (0 == i) { start_pos_[0] = 0; start_id_[0] = 1; } else { start_pos_[i] = list_size; start_id_[i] = id_num; } } return start_pos_[kMaxLemmaSize]; } void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) { assert(scis_num_ == scis_num); for (size_t pos = 0; pos < scis_num_; pos++) { scis_hz_[pos] = scis[pos].hz; scis_splid_[pos] = scis[pos].splid; } } void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) { size_t current_pos = 0; utf16_strncpy(buf_, lemma_arr[0].hanzi_str, lemma_arr[0].hz_str_len); current_pos = lemma_arr[0].hz_str_len; size_t id_num = 1; for (size_t i = 1; i < lemma_num; i++) { utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str, lemma_arr[i].hz_str_len); id_num++; current_pos += lemma_arr[i].hz_str_len; } assert(current_pos == start_pos_[kMaxLemmaSize]); assert(id_num == start_id_[kMaxLemmaSize]); } char16* DictList::find_pos2_startedbyhz(char16 hz_char) { char16 *found_2w = static_cast (mybsearch(&hz_char, buf_ + start_pos_[1], (start_pos_[2] - start_pos_[1]) / 2, sizeof(char16) * 2, cmp_hanzis_1)); if (NULL == found_2w) return NULL; while (found_2w > buf_ + start_pos_[1] && *found_2w == *(found_2w - 1)) found_2w -= 2; return found_2w; } #endif // ___BUILD_MODEL___ char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[], size_t word_len, int (*cmp_func)(const void *, const void *)) { char16 *found_w = static_cast (mybsearch(last_hzs, buf_ + start_pos_[word_len - 1], (start_pos_[word_len] - start_pos_[word_len - 1]) / word_len, sizeof(char16) * word_len, cmp_func)); if (NULL == found_w) return NULL; while (found_w > buf_ + start_pos_[word_len -1] && cmp_func(found_w, found_w - word_len) == 0) found_w -= word_len; return found_w; } size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used) { assert(hzs_len <= kMaxPredictSize && hzs_len > 0); // 1. Prepare work int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1]; NGram& ngram = NGram::get_instance(); size_t item_num = 0; // 2. Do prediction for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len; pre_len++) { uint16 word_len = hzs_len + pre_len; char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func); if (NULL == w_buf) continue; while (w_buf < buf_ + start_pos_[word_len] && cmp_func(w_buf, last_hzs) == 0 && item_num < npre_max) { memset(npre_items + item_num, 0, sizeof(NPredictItem)); utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len); npre_items[item_num].psb = ngram.get_uni_psb((size_t)(w_buf - buf_ - start_pos_[word_len - 1]) / word_len + start_id_[word_len - 1]); npre_items[item_num].his_len = hzs_len; item_num++; w_buf += word_len; } } size_t new_num = 0; for (size_t i = 0; i < item_num; i++) { // Try to find it in the existing items size_t e_pos; for (e_pos = 1; e_pos <= b4_used; e_pos++) { if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs, kMaxPredictSize) == 0) break; } if (e_pos <= b4_used) continue; // If not found, append it to the buffer npre_items[new_num] = npre_items[i]; new_num++; } return new_num; } uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max) { if (!initialized_ || id_lemma >= start_id_[kMaxLemmaSize] || NULL == str_buf || str_max <= 1) return 0; // Find the range for (uint16 i = 0; i < kMaxLemmaSize; i++) { if (i + 1 > str_max - 1) return 0; if (start_id_[i] <= id_lemma && start_id_[i + 1] > id_lemma) { size_t id_span = id_lemma - start_id_[i]; uint16 *buf = buf_ + start_pos_[i] + id_span * (i + 1); for (uint16 len = 0; len <= i; len++) { str_buf[len] = buf[len]; } str_buf[i+1] = (char16)'\0'; return i + 1; } } return 0; } uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid, uint16 *splids, uint16 max_splids) { char16 *hz_found = static_cast (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1)); assert(NULL != hz_found && hanzi == *hz_found); // Move to the first one. while (hz_found > scis_hz_ && hanzi == *(hz_found - 1)) hz_found--; // First try to found if strict comparison result is not zero. char16 *hz_f = hz_found; bool strict = false; while (hz_f < scis_hz_ + scis_num_ && hanzi == *hz_f) { uint16 pos = hz_f - scis_hz_; if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) { strict = true; } hz_f++; } uint16 found_num = 0; while (hz_found < scis_hz_ + scis_num_ && hanzi == *hz_found) { uint16 pos = hz_found - scis_hz_; if (0 == half_splid || (strict && scis_splid_[pos].half_splid == half_splid) || (!strict && spl_trie_->half_full_compatible(half_splid, scis_splid_[pos].full_splid))) { assert(found_num + 1 < max_splids); splids[found_num] = scis_splid_[pos].full_splid; found_num++; } hz_found++; } return found_num; } LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) { if (NULL == str || str_len > kMaxLemmaSize) return 0; char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]); if (NULL == found) return 0; assert(found > buf_); assert(static_cast(found - buf_) >= start_pos_[str_len - 1]); return static_cast (start_id_[str_len - 1] + (found - buf_ - start_pos_[str_len - 1]) / str_len); } void DictList::convert_to_hanzis(char16 *str, uint16 str_len) { assert(NULL != str); for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { str[str_pos] = scis_hz_[str[str_pos]]; } } void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) { assert(NULL != str); for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { str[str_pos] = 0x100; } } bool DictList::save_list(FILE *fp) { if (!initialized_ || NULL == fp) return false; if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] || NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_) return false; if (fwrite(&scis_num_, sizeof(uint32), 1, fp) != 1) return false; if (fwrite(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) != kMaxLemmaSize + 1) return false; if (fwrite(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) != kMaxLemmaSize + 1) return false; if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) return false; if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) return false; if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != start_pos_[kMaxLemmaSize]) return false; return true; } bool DictList::load_list(FILE *fp) { if (NULL == fp) return false; initialized_ = false; if (fread(&scis_num_, sizeof(uint32), 1, fp) != 1) return false; if (fread(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) != kMaxLemmaSize + 1) return false; if (fread(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) != kMaxLemmaSize + 1) return false; free_resource(); if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_)) return false; if (fread(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) return false; if (fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) return false; if (fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != start_pos_[kMaxLemmaSize]) return false; initialized_ = true; return true; } } // namespace ime_pinyin