diff options
Diffstat (limited to 'src/plugins/pinyin/3rdparty/pinyin/share/dictbuilder.cpp')
-rw-r--r-- | src/plugins/pinyin/3rdparty/pinyin/share/dictbuilder.cpp | 1070 |
1 files changed, 1070 insertions, 0 deletions
diff --git a/src/plugins/pinyin/3rdparty/pinyin/share/dictbuilder.cpp b/src/plugins/pinyin/3rdparty/pinyin/share/dictbuilder.cpp new file mode 100644 index 00000000..6f0bd4f7 --- /dev/null +++ b/src/plugins/pinyin/3rdparty/pinyin/share/dictbuilder.cpp @@ -0,0 +1,1070 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "../include/dictbuilder.h" +#include "../include/dicttrie.h" +#include "../include/mystdlib.h" +#include "../include/ngram.h" +#include "../include/searchutility.h" +#include "../include/spellingtable.h" +#include "../include/spellingtrie.h" +#include "../include/splparser.h" +#include "../include/utf16reader.h" + +namespace ime_pinyin { + +#ifdef ___BUILD_MODEL___ + +static const size_t kReadBufLen = 512; +static const size_t kSplTableHashLen = 2000; + +// Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by +// frequencies. +int cmp_scis_hz_splid_freq(const void* p1, const void* p2) { + const SingleCharItem *s1, *s2; + s1 = static_cast<const SingleCharItem*>(p1); + s2 = static_cast<const SingleCharItem*>(p2); + + if (s1->hz < s2->hz) + return -1; + if (s1->hz > s2->hz) + return 1; + + if (s1->splid.half_splid < s2->splid.half_splid) + return -1; + if (s1->splid.half_splid > s2->splid.half_splid) + return 1; + + if (s1->splid.full_splid < s2->splid.full_splid) + return -1; + if (s1->splid.full_splid > s2->splid.full_splid) + return 1; + + if (s1->freq > s2->freq) + return -1; + if (s1->freq < s2->freq) + return 1; + return 0; +} + +int cmp_scis_hz_splid(const void* p1, const void* p2) { + const SingleCharItem *s1, *s2; + s1 = static_cast<const SingleCharItem*>(p1); + s2 = static_cast<const SingleCharItem*>(p2); + + if (s1->hz < s2->hz) + return -1; + if (s1->hz > s2->hz) + return 1; + + if (s1->splid.half_splid < s2->splid.half_splid) + return -1; + if (s1->splid.half_splid > s2->splid.half_splid) + return 1; + + if (s1->splid.full_splid < s2->splid.full_splid) + return -1; + if (s1->splid.full_splid > s2->splid.full_splid) + return 1; + + return 0; +} + +int cmp_lemma_entry_hzs(const void* p1, const void* p2) { + size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); + size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); + if (size1 < size2) + return -1; + else if (size1 > size2) + return 1; + + return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, + ((const LemmaEntry*)p2)->hanzi_str); +} + +int compare_char16(const void* p1, const void* p2) { + if (*((const char16*)p1) < *((const char16*)p2)) + return -1; + if (*((const char16*)p1) > *((const char16*)p2)) + return 1; + return 0; +} + +int compare_py(const void* p1, const void* p2) { + int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, + ((const LemmaEntry*)p2)->spl_idx_arr); + + if (0 != ret) + return ret; + + return static_cast<int>(((const LemmaEntry*)p2)->freq) - + static_cast<int>(((const LemmaEntry*)p1)->freq); +} + +// First hanzi, if the same, then Pinyin +int cmp_lemma_entry_hzspys(const void* p1, const void* p2) { + size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); + size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); + if (size1 < size2) + return -1; + else if (size1 > size2) + return 1; + int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, + ((const LemmaEntry*)p2)->hanzi_str); + + if (0 != ret) + return ret; + + ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, + ((const LemmaEntry*)p2)->spl_idx_arr); + return ret; +} + +int compare_splid2(const void* p1, const void* p2) { + int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, + ((const LemmaEntry*)p2)->spl_idx_arr); + return ret; +} + +DictBuilder::DictBuilder() { + lemma_arr_ = NULL; + lemma_num_ = 0; + + scis_ = NULL; + scis_num_ = 0; + + lma_nodes_le0_ = NULL; + lma_nodes_ge1_ = NULL; + + lma_nds_used_num_le0_ = 0; + lma_nds_used_num_ge1_ = 0; + + homo_idx_buf_ = NULL; + homo_idx_num_eq1_ = 0; + homo_idx_num_gt1_ = 0; + + top_lmas_ = NULL; + top_lmas_num_ = 0; + + spl_table_ = NULL; + spl_parser_ = NULL; +} + +DictBuilder::~DictBuilder() { + free_resource(); +} + +bool DictBuilder::alloc_resource(size_t lma_num) { + if (0 == lma_num) + return false; + + free_resource(); + + lemma_num_ = lma_num; + lemma_arr_ = new LemmaEntry[lemma_num_]; + + top_lmas_num_ = 0; + top_lmas_ = new LemmaEntry[kTopScoreLemmaNum]; + + // New the scis_ buffer to the possible maximum size. + scis_num_ = lemma_num_ * kMaxLemmaSize; + scis_ = new SingleCharItem[scis_num_]; + + // The root and first level nodes is less than kMaxSpellingNum + 1 + lma_nds_used_num_le0_ = 0; + lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1]; + + // Other nodes is less than lemma_num + lma_nds_used_num_ge1_ = 0; + lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_]; + + homo_idx_buf_ = new LemmaIdType[lemma_num_]; + spl_table_ = new SpellingTable(); + spl_parser_ = new SpellingParser(); + + if (NULL == lemma_arr_ || NULL == top_lmas_ || + NULL == scis_ || NULL == spl_table_ || + NULL == spl_parser_ || NULL == lma_nodes_le0_ || + NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) { + free_resource(); + return false; + } + + memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_); + memset(scis_, 0, sizeof(SingleCharItem) * scis_num_); + memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1)); + memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_); + memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_); + spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true); + + return true; +} + +char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) { + if (NULL == fn_validhzs || NULL == num) + return NULL; + + *num = 0; + FILE *fp = fopen(fn_validhzs, "rb"); + if (NULL == fp) + return NULL; + + char16 utf16header; + if (fread(&utf16header, sizeof(char16), 1, fp) != 1 || + 0xfeff != utf16header) { + fclose(fp); + return NULL; + } + + fseek(fp, 0, SEEK_END); + *num = ftell(fp) / sizeof(char16); + assert(*num >= 1); + *num -= 1; + + char16 *hzs = new char16[*num]; + if (NULL == hzs) { + fclose(fp); + return NULL; + } + + fseek(fp, 2, SEEK_SET); + + if (fread(hzs, sizeof(char16), *num, fp) != *num) { + fclose(fp); + delete [] hzs; + return NULL; + } + fclose(fp); + + myqsort(hzs, *num, sizeof(char16), compare_char16); + return hzs; +} + +bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, + char16 hz) { + if (NULL == hzs) + return false; + + char16 *found; + found = static_cast<char16*>( + mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16)); + if (NULL == found) + return false; + + assert(*found == hz); + return true; +} + +// The caller makes sure that the parameters are valid. +bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len, + const char16 *str, size_t str_len) { + if (NULL == hzs || NULL == str) + return false; + + for (size_t pos = 0; pos < str_len; pos++) { + if (!hz_in_hanzis_list(hzs, hzs_len, str[pos])) + return false; + } + return true; +} + +void DictBuilder::get_top_lemmas() { + top_lmas_num_ = 0; + if (NULL == lemma_arr_) + return; + + for (size_t pos = 0; pos < lemma_num_; pos++) { + if (0 == top_lmas_num_) { + top_lmas_[0] = lemma_arr_[pos]; + top_lmas_num_ = 1; + continue; + } + + if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) { + if (kTopScoreLemmaNum > top_lmas_num_) + top_lmas_num_ += 1; + + size_t move_pos; + for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) { + top_lmas_[move_pos] = top_lmas_[move_pos - 1]; + if (0 == move_pos - 1 || + (move_pos - 1 > 0 && + top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) { + break; + } + } + assert(move_pos > 0); + top_lmas_[move_pos - 1] = lemma_arr_[pos]; + } else if (kTopScoreLemmaNum > top_lmas_num_) { + top_lmas_[top_lmas_num_] = lemma_arr_[pos]; + top_lmas_num_ += 1; + } + } + + if (kPrintDebug0) { + printf("\n------Top Lemmas------------------\n"); + for (size_t pos = 0; pos < top_lmas_num_; pos++) { + printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz, + top_lmas_[pos].freq); + } + } +} + +void DictBuilder::free_resource() { + if (NULL != lemma_arr_) + delete [] lemma_arr_; + + if (NULL != scis_) + delete [] scis_; + + if (NULL != lma_nodes_le0_) + delete [] lma_nodes_le0_; + + if (NULL != lma_nodes_ge1_) + delete [] lma_nodes_ge1_; + + if (NULL != homo_idx_buf_) + delete [] homo_idx_buf_; + + if (NULL != spl_table_) + delete spl_table_; + + if (NULL != spl_parser_) + delete spl_parser_; + + lemma_arr_ = NULL; + scis_ = NULL; + lma_nodes_le0_ = NULL; + lma_nodes_ge1_ = NULL; + homo_idx_buf_ = NULL; + spl_table_ = NULL; + spl_parser_ = NULL; + + lemma_num_ = 0; + lma_nds_used_num_le0_ = 0; + lma_nds_used_num_ge1_ = 0; + homo_idx_num_eq1_ = 0; + homo_idx_num_gt1_ = 0; +} + +size_t DictBuilder::read_raw_dict(const char* fn_raw, + const char *fn_validhzs, + size_t max_item) { + if (NULL == fn_raw) return 0; + + Utf16Reader utf16_reader; + if (!utf16_reader.open(fn_raw, kReadBufLen * 10)) + return false; + + char16 read_buf[kReadBufLen]; + + // Read the number of lemmas in the file + size_t lemma_num = 240000; + + // allocate resource required + if (!alloc_resource(lemma_num)) { + utf16_reader.close(); + } + + // Read the valid Hanzi list. + char16 *valid_hzs = NULL; + size_t valid_hzs_num = 0; + valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num); + + // Begin reading the lemma entries + for (size_t i = 0; i < max_item; i++) { + // read next entry + if (!utf16_reader.readline(read_buf, kReadBufLen)) { + lemma_num = i; + break; + } + + size_t token_size; + char16 *token; + char16 *to_tokenize = read_buf; + + // Get the Hanzi string + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + if (NULL == token) { + free_resource(); + utf16_reader.close(); + return false; + } + + size_t lemma_size = utf16_strlen(token); + + if (lemma_size > kMaxLemmaSize) { + i--; + continue; + } + + if (lemma_size > 4) { + i--; + continue; + } + + // Copy to the lemma entry + utf16_strcpy(lemma_arr_[i].hanzi_str, token); + + lemma_arr_[i].hz_str_len = token_size; + + // Get the freq string + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + if (NULL == token) { + free_resource(); + utf16_reader.close(); + return false; + } + lemma_arr_[i].freq = utf16_atof(token); + + if (lemma_size > 1 && lemma_arr_[i].freq < 60) { + i--; + continue; + } + + // Get GBK mark, if no valid Hanzi list available, all items which contains + // GBK characters will be discarded. Otherwise, all items which contains + // characters outside of the valid Hanzi list will be discarded. + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + assert(NULL != token); + int gbk_flag = utf16_atoi(token); + if (NULL == valid_hzs || 0 == valid_hzs_num) { + if (0 != gbk_flag) { + i--; + continue; + } + } else { + if (!str_in_hanzis_list(valid_hzs, valid_hzs_num, + lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) { + i--; + continue; + } + } + + // Get spelling String + bool spelling_not_support = false; + for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; + hz_pos++) { + // Get a Pinyin + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + if (NULL == token) { + free_resource(); + utf16_reader.close(); + return false; + } + + assert(utf16_strlen(token) <= kMaxPinyinSize); + + utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token); + + format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]); + + // Put the pinyin to the spelling table + if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos], + lemma_arr_[i].freq)) { + spelling_not_support = true; + break; + } + } + + // The whole line must have been parsed fully, otherwise discard this one. + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + if (spelling_not_support || NULL != token) { + i--; + continue; + } + } + + delete [] valid_hzs; + utf16_reader.close(); + + printf("read succesfully, lemma num: %d\n", lemma_num); + + return lemma_num; +} + +bool DictBuilder::build_dict(const char *fn_raw, + const char *fn_validhzs, + DictTrie *dict_trie) { + if (NULL == fn_raw || NULL == dict_trie) + return false; + + lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000); + if (0 == lemma_num_) + return false; + + // Arrange the spelling table, and build a spelling tree + // The size of an spelling. '\0' is included. If the spelling table is + // initialized to calculate the spelling scores, the last char in the + // spelling string will be score, and it is also included in spl_item_size. + size_t spl_item_size; + size_t spl_num; + const char* spl_buf; + spl_buf = spl_table_->arrange(&spl_item_size, &spl_num); + if (NULL == spl_buf) { + free_resource(); + return false; + } + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + + if (!spl_trie.construct(spl_buf, spl_item_size, spl_num, + spl_table_->get_score_amplifier(), + spl_table_->get_average_score())) { + free_resource(); + return false; + } + + printf("spelling tree construct successfully.\n"); + + // Convert the spelling string to idxs + for (size_t i = 0; i < lemma_num_; i++) { + for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; + hz_pos++) { + uint16 spl_idxs[2]; + uint16 spl_start_pos[3]; + bool is_pre = true; + int spl_idx_num = + spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos], + strlen(lemma_arr_[i].pinyin_str[hz_pos]), + spl_idxs, spl_start_pos, 2, is_pre); + assert(1 == spl_idx_num); + + if (spl_trie.is_half_id(spl_idxs[0])) { + uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs); + assert(0 != num); + } + lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0]; + } + } + + // Sort the lemma items according to the hanzi, and give each unique item a + // id + sort_lemmas_by_hz(); + + scis_num_ = build_scis(); + + // Construct the dict list + dict_trie->dict_list_ = new DictList(); + bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_, + lemma_arr_, lemma_num_); + assert(dl_success); + + // Construct the NGram information + NGram& ngram = NGram::get_instance(); + ngram.build_unigram(lemma_arr_, lemma_num_, + lemma_arr_[lemma_num_ - 1].idx_by_hz + 1); + + // sort the lemma items according to the spelling idx string + myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py); + + get_top_lemmas(); + +#ifdef ___DO_STATISTICS___ + stat_init(); +#endif + + lma_nds_used_num_le0_ = 1; // The root node + bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_), + lemma_arr_, 0, lemma_num_, 0); + if (!dt_success) { + free_resource(); + return false; + } + +#ifdef ___DO_STATISTICS___ + stat_print(); +#endif + + // Move the node data and homo data to the DictTrie + dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_]; + dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_]; + size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_; + dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize]; + assert(NULL != dict_trie->root_); + assert(NULL != dict_trie->lma_idx_buf_); + dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_; + dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_; + dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize; + dict_trie->top_lmas_num_ = top_lmas_num_; + + memcpy(dict_trie->root_, lma_nodes_le0_, + sizeof(LmaNodeLE0) * lma_nds_used_num_le0_); + memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_, + sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_); + + for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) { + id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, + homo_idx_buf_[pos]); + } + + for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_; + pos < lma_idx_num; pos++) { + LemmaIdType idx = + top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz; + id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx); + } + + if (kPrintDebug0) { + printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_); + printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_); + printf("top_lmas_num_: %d\n", top_lmas_num_); + } + + free_resource(); + + if (kPrintDebug0) { + printf("Building dict succeds\n"); + } + return dt_success; +} + +void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) { + if (NULL == buf) return; + for (size_t pos = 0; pos < kLemmaIdSize; pos++) { + (buf)[pos] = (unsigned char)(id >> (pos * 8)); + } +} + +void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) { + node->son_1st_off_l = static_cast<uint16>(offset); + node->son_1st_off_h = static_cast<unsigned char>(offset >> 16); +} + +void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) { + node->homo_idx_buf_off_l = static_cast<uint16>(offset); + node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16); + +} + +// All spelling strings will be converted to upper case, except that +// spellings started with "ZH"/"CH"/"SH" will be converted to +// "Zh"/"Ch"/"Sh" +void DictBuilder::format_spelling_str(char *spl_str) { + if (NULL == spl_str) + return; + + uint16 pos = 0; + while ('\0' != spl_str[pos]) { + if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z') + spl_str[pos] = spl_str[pos] - 'a' + 'A'; + + if (1 == pos && 'H' == spl_str[pos]) { + if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) { + spl_str[pos] = 'h'; + } + } + pos++; + } +} + +LemmaIdType DictBuilder::sort_lemmas_by_hz() { + if (NULL == lemma_arr_ || 0 == lemma_num_) + return 0; + + myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs); + + lemma_arr_[0].idx_by_hz = 1; + LemmaIdType idx_max = 1; + for (size_t i = 1; i < lemma_num_; i++) { + if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) { + idx_max++; + lemma_arr_[i].idx_by_hz = idx_max; + } else { + idx_max++; + lemma_arr_[i].idx_by_hz = idx_max; + } + } + return idx_max + 1; +} + +size_t DictBuilder::build_scis() { + if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_) + return 0; + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + + // This first one is blank, because id 0 is invalid. + scis_[0].freq = 0; + scis_[0].hz = 0; + scis_[0].splid.full_splid = 0; + scis_[0].splid.half_splid = 0; + scis_num_ = 1; + + // Copy the hanzis to the buffer + for (size_t pos = 0; pos < lemma_num_; pos++) { + size_t hz_num = lemma_arr_[pos].hz_str_len; + for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { + scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos]; + scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; + scis_[scis_num_].splid.half_splid = + spl_trie.full_to_half(scis_[scis_num_].splid.full_splid); + if (1 == hz_num) + scis_[scis_num_].freq = lemma_arr_[pos].freq; + else + scis_[scis_num_].freq = 0.000001; + scis_num_++; + } + } + + myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq); + + // Remove repeated items + size_t unique_scis_num = 1; + for (size_t pos = 1; pos < scis_num_; pos++) { + if (scis_[pos].hz == scis_[pos - 1].hz && + scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid) + continue; + scis_[unique_scis_num] = scis_[pos]; + scis_[unique_scis_num].splid.half_splid = + spl_trie.full_to_half(scis_[pos].splid.full_splid); + unique_scis_num++; + } + + scis_num_ = unique_scis_num; + + // Update the lemma list. + for (size_t pos = 0; pos < lemma_num_; pos++) { + size_t hz_num = lemma_arr_[pos].hz_str_len; + for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { + SingleCharItem key; + key.hz = lemma_arr_[pos].hanzi_str[hzpos]; + key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; + key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid); + + SingleCharItem *found; + found = static_cast<SingleCharItem*>(mybsearch(&key, scis_, + unique_scis_num, + sizeof(SingleCharItem), + cmp_scis_hz_splid)); + + assert(found); + + lemma_arr_[pos].hanzi_scis_ids[hzpos] = + static_cast<uint16>(found - scis_); + lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid; + } + } + + return scis_num_; +} + +bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr, + size_t item_start, size_t item_end, + size_t level) { + if (level >= kMaxLemmaSize || item_end <= item_start) + return false; + + // 1. Scan for how many sons + size_t parent_son_num = 0; + // LemmaNode *son_1st = NULL; + // parent.num_of_son = 0; + + LemmaEntry *lma_last_start = lemma_arr_ + item_start; + uint16 spl_idx_node = lma_last_start->spl_idx_arr[level]; + + // Scan for how many sons to be allocaed + for (size_t i = item_start + 1; i< item_end; i++) { + LemmaEntry *lma_current = lemma_arr + i; + uint16 spl_idx_current = lma_current->spl_idx_arr[level]; + if (spl_idx_current != spl_idx_node) { + parent_son_num++; + spl_idx_node = spl_idx_current; + } + } + parent_son_num++; + +#ifdef ___DO_STATISTICS___ + // Use to indicate whether all nodes of this layer have no son. + bool allson_noson = true; + + assert(level < kMaxLemmaSize); + if (parent_son_num > max_sonbuf_len_[level]) + max_sonbuf_len_[level] = parent_son_num; + + total_son_num_[level] += parent_son_num; + total_sonbuf_num_[level] += 1; + + if (parent_son_num == 1) + sonbufs_num1_++; + else + sonbufs_numgt1_++; + total_lma_node_num_ += parent_son_num; +#endif + + // 2. Update the parent's information + // Update the parent's son list; + LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used + LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used. + if (0 == level) { // the parent is root + (static_cast<LmaNodeLE0*>(parent))->son_1st_off = + lma_nds_used_num_le0_; + son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_; + lma_nds_used_num_le0_ += parent_son_num; + + assert(parent_son_num <= 65535); + (static_cast<LmaNodeLE0*>(parent))->num_of_son = + static_cast<uint16>(parent_son_num); + } else if (1 == level) { // the parent is a son of root + (static_cast<LmaNodeLE0*>(parent))->son_1st_off = + lma_nds_used_num_ge1_; + son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; + lma_nds_used_num_ge1_ += parent_son_num; + + assert(parent_son_num <= 65535); + (static_cast<LmaNodeLE0*>(parent))->num_of_son = + static_cast<uint16>(parent_son_num); + } else { + set_son_offset((static_cast<LmaNodeGE1*>(parent)), + lma_nds_used_num_ge1_); + son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; + lma_nds_used_num_ge1_ += parent_son_num; + + assert(parent_son_num <= 255); + (static_cast<LmaNodeGE1*>(parent))->num_of_son = + (unsigned char)parent_son_num; + } + + // 3. Now begin to construct the son one by one + size_t son_pos = 0; + + lma_last_start = lemma_arr_ + item_start; + spl_idx_node = lma_last_start->spl_idx_arr[level]; + + size_t homo_num = 0; + if (lma_last_start->spl_idx_arr[level + 1] == 0) + homo_num = 1; + + size_t item_start_next = item_start; + + for (size_t i = item_start + 1; i < item_end; i++) { + LemmaEntry* lma_current = lemma_arr_ + i; + uint16 spl_idx_current = lma_current->spl_idx_arr[level]; + + if (spl_idx_current == spl_idx_node) { + if (lma_current->spl_idx_arr[level + 1] == 0) + homo_num++; + } else { + // Construct a node + LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid + LmaNodeGE1 *node_cur_ge1 = NULL; + if (0 == level) { + node_cur_le0 = son_1st_le0 + son_pos; + node_cur_le0->spl_idx = spl_idx_node; + node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; + node_cur_le0->son_1st_off = 0; + homo_idx_num_eq1_ += homo_num; + } else { + node_cur_ge1 = son_1st_ge1 + son_pos; + node_cur_ge1->spl_idx = spl_idx_node; + + set_homo_id_buf_offset(node_cur_ge1, + (homo_idx_num_eq1_ + homo_idx_num_gt1_)); + set_son_offset(node_cur_ge1, 0); + homo_idx_num_gt1_ += homo_num; + } + + if (homo_num > 0) { + LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + + homo_idx_num_gt1_ - homo_num; + if (0 == level) { + assert(homo_num <= 65535); + node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); + } else { + assert(homo_num <= 255); + node_cur_ge1->num_of_homo = (unsigned char)homo_num; + } + + for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { + idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz; + } + +#ifdef ___DO_STATISTICS___ + if (homo_num > max_homobuf_len_[level]) + max_homobuf_len_[level] = homo_num; + + total_homo_num_[level] += homo_num; +#endif + } + + if (i - item_start_next > homo_num) { + void *next_parent; + if (0 == level) + next_parent = static_cast<void*>(node_cur_le0); + else + next_parent = static_cast<void*>(node_cur_ge1); + construct_subset(next_parent, lemma_arr, + item_start_next + homo_num, i, level + 1); +#ifdef ___DO_STATISTICS___ + + total_node_hasson_[level] += 1; + allson_noson = false; +#endif + } + + // for the next son + lma_last_start = lma_current; + spl_idx_node = spl_idx_current; + item_start_next = i; + homo_num = 0; + if (lma_current->spl_idx_arr[level + 1] == 0) + homo_num = 1; + + son_pos++; + } + } + + // 4. The last one to construct + LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid + LmaNodeGE1 *node_cur_ge1 = NULL; + if (0 == level) { + node_cur_le0 = son_1st_le0 + son_pos; + node_cur_le0->spl_idx = spl_idx_node; + node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; + node_cur_le0->son_1st_off = 0; + homo_idx_num_eq1_ += homo_num; + } else { + node_cur_ge1 = son_1st_ge1 + son_pos; + node_cur_ge1->spl_idx = spl_idx_node; + + set_homo_id_buf_offset(node_cur_ge1, + (homo_idx_num_eq1_ + homo_idx_num_gt1_)); + set_son_offset(node_cur_ge1, 0); + homo_idx_num_gt1_ += homo_num; + } + + if (homo_num > 0) { + LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + + homo_idx_num_gt1_ - homo_num; + if (0 == level) { + assert(homo_num <= 65535); + node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); + } else { + assert(homo_num <= 255); + node_cur_ge1->num_of_homo = (unsigned char)homo_num; + } + + for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { + idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz; + } + +#ifdef ___DO_STATISTICS___ + if (homo_num > max_homobuf_len_[level]) + max_homobuf_len_[level] = homo_num; + + total_homo_num_[level] += homo_num; +#endif + } + + if (item_end - item_start_next > homo_num) { + void *next_parent; + if (0 == level) + next_parent = static_cast<void*>(node_cur_le0); + else + next_parent = static_cast<void*>(node_cur_ge1); + construct_subset(next_parent, lemma_arr, + item_start_next + homo_num, item_end, level + 1); +#ifdef ___DO_STATISTICS___ + + total_node_hasson_[level] += 1; + allson_noson = false; +#endif + } + +#ifdef ___DO_STATISTICS___ + if (allson_noson) { + total_sonbuf_allnoson_[level] += 1; + total_node_in_sonbuf_allnoson_[level] += parent_son_num; + } +#endif + + assert(son_pos + 1 == parent_son_num); + return true; +} + +#ifdef ___DO_STATISTICS___ +void DictBuilder::stat_init() { + memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize); + + sonbufs_num1_ = 0; + sonbufs_numgt1_ = 0; + total_lma_node_num_ = 0; +} + +void DictBuilder::stat_print() { + printf("\n------------STAT INFO-------------\n"); + printf("[root is layer -1]\n"); + printf(".. max_sonbuf_len per layer(from layer 0):\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", max_sonbuf_len_[i]); + printf("-, \n"); + + printf(".. max_homobuf_len per layer:\n -, "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", max_homobuf_len_[i]); + printf("\n"); + + printf(".. total_son_num per layer:\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_son_num_[i]); + printf("-, \n"); + + printf(".. total_node_hasson per layer:\n 1, "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_node_hasson_[i]); + printf("\n"); + + printf(".. total_sonbuf_num per layer:\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_sonbuf_num_[i]); + printf("-, \n"); + + printf(".. total_sonbuf_allnoson per layer:\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_sonbuf_allnoson_[i]); + printf("-, \n"); + + printf(".. total_node_in_sonbuf_allnoson per layer:\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_node_in_sonbuf_allnoson_[i]); + printf("-, \n"); + + printf(".. total_homo_num per layer:\n 0, "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_homo_num_[i]); + printf("\n"); + + printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_); + printf(".. son buf allocation number with more than 1 son: %d\n", + sonbufs_numgt1_); + printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1); +} +#endif // ___DO_STATISTICS___ + +#endif // ___BUILD_MODEL___ +} // namespace ime_pinyin |