diff options
Diffstat (limited to 'src/virtualkeyboard/3rdparty/pinyin/include')
18 files changed, 0 insertions, 3056 deletions
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h b/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h deleted file mode 100644 index 0a70a510..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * This class defines AtomDictBase class which is the base class for all atom - * dictionaries. Atom dictionaries are managed by the decoder class - * MatrixSearch. - * - * When the user appends a new character to the Pinyin string, all enabled atom - * dictionaries' extend_dict() will be called at least once to get candidates - * ended in this step (the information of starting step is also given in the - * parameter). Usually, when extend_dict() is called, a MileStoneHandle object - * returned by a previous calling for a earlier step is given to speed up the - * look-up process, and a new MileStoneHandle object will be returned if - * the extension is successful. - * - * A returned MileStoneHandle object should keep alive until Function - * reset_milestones() is called and this object is noticed to be reset. - * - * Usually, the atom dictionary can use step information to manage its - * MileStoneHandle objects, or it can make the objects in ascendant order to - * make the reset easier. 
- * - * When the decoder loads the dictionary, it will give a starting lemma id for - * this atom dictionary to map an inner id to a global id. Global ids should be - * used when an atom dictionary talks to any component outside. - */ -#ifndef PINYINIME_INCLUDE_ATOMDICTBASE_H__ -#define PINYINIME_INCLUDE_ATOMDICTBASE_H__ - -#include <stdlib.h> -#include "./dictdef.h" -#include "./searchutility.h" - -namespace ime_pinyin { -class AtomDictBase { - public: - virtual ~AtomDictBase() {} - - /** - * Load an atom dictionary from a file. - * - * @param file_name The file name to load dictionary. - * @param start_id The starting id used for this atom dictionary. - * @param end_id The end id (included) which can be used for this atom - * dictionary. User dictionary will always use the last id space, so it can - * ignore this parameter. All other atom dictionaries should check this - * parameter. - * @return True if succeed. - */ - virtual bool load_dict(const char *file_name, LemmaIdType start_id, - LemmaIdType end_id) = 0; - - /** - * Close this atom dictionary. - * - * @return True if succeed. - */ - virtual bool close_dict() = 0; - - /** - * Get the total number of lemmas in this atom dictionary. - * - * @return The total number of lemmas. - */ - virtual size_t number_of_lemmas() = 0; - - /** - * This function is called by the decoder when user deletes a character from - * the input string, or begins a new input string. - * - * Different atom dictionaries may implement this function in different ways. - * An atom dictionary can use one of these two parameters (or both) to reset - * its corresponding MileStoneHandle objects according to its detailed - * implementation. 
- * - * For example, if an atom dictionary uses step information to manage its - * MileStoneHandle objects, parameter from_step can be used to identify which - * objects should be reset; otherwise, if another atom dictionary does not - * use the detailed step information, it only uses ascendant handles - * (according to step. For the same step, earlier call, smaller handle), it - * can easily reset those MileStoneHandle which are larger than from_handle. - * - * The decoder always resets the decoding state by step. So when it begins - * resetting, it will call reset_milestones() of its atom dictionaries with - * the step information, and the MileStoneHandle objects returned by the - * earliest calling of extend_dict() for that step. - * - * If an atom dictionary does not implement incremental search, this function - * can be totally ignored. - * - * @param from_step From which step(included) the MileStoneHandle - * objects should be reset. - * @param from_handle The earliest MileStoneHandle object for step from_step - */ - virtual void reset_milestones(uint16 from_step, - MileStoneHandle from_handle) = 0; - - /** - * Used to extend in this dictionary. The handle returned should keep valid - * until reset_milestones() is called. - * - * @param from_handle Its previous returned extended handle without the new - * spelling id, it can be used to speed up the extending. - * @param dep The parameter used for extending. - * @param lpi_items Used to fill in the lemmas matched. - * @param lpi_max The length of the buffer - * @param lpi_num Used to return the newly added items. - * @return The new mile stone for this extending. 0 if fail. - */ - virtual MileStoneHandle extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num) = 0; - - /** - * Get lemma items with scores according to a spelling id stream. - * This atom dictionary does not need to sort the returned items. 
- * - * @param splid_str The spelling id stream buffer. - * @param splid_str_len The length of the spelling id stream buffer. - * @param lpi_items Used to return matched lemma items with scores. - * @param lpi_max The maximum size of the buffer to return result. - * @return The number of matched items which have been filled in to lpi_items. - */ - virtual size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max) = 0; - - /** - * Get a lemma string (The Chinese string) by the given lemma id. - * - * @param id_lemma The lemma id to get the string. - * @param str_buf The buffer to return the Chinese string. - * @param str_max The maximum size of the buffer. - * @return The length of the string, 0 if fail. - */ - virtual uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, - uint16 str_max) = 0; - - /** - * Get the full spelling ids for the given lemma id. - * If the given buffer is too short, return 0. - * - * @param splids Used to return the spelling ids. - * @param splids_max The maximum buffer length of splids. - * @param arg_valid Used to indicate if the incoming parameters have been - * initialized and are valid. If it is true, the splids and splids_max are valid - * and there may be half ids in splids to be updated to full ids. In this - * case, splids_max is the number of valid ids in splids. - * @return The number of ids in the buffer. - */ - virtual uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid) = 0; - - /** - * Function used for prediction. - * No need to sort the newly added items. - * - * @param last_hzs The last n Chinese characters (called Hanzi), its length - * should be less than or equal to kMaxPredictSize. - * @param hzs_len specifies the length(<= kMaxPredictSize) of the history. - * @param npre_items Used to return the result. 
- * @param npre_max The length of the buffer to return result - * @param b4_used Number of prediction results (from npre_items[-b4_used]) - * from other atom dictionaries. An atom dictionary can just ignore it. - * @return The number of prediction results from this atom dictionary. - */ - virtual size_t predict(const char16 last_hzs[], uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used) = 0; - - /** - * Add a lemma to the dictionary. If the dictionary allows to add new - * items and this item does not exist, add it. - * - * @param lemma_str The Chinese string of the lemma. - * @param splids The spelling ids of the lemma. - * @param lemma_len The length of the Chinese lemma. - * @param count The frequency count for this lemma. - */ - virtual LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count) = 0; - - /** - * Update a lemma's occurring count. - * - * @param lemma_id The lemma id to update. - * @param delta_count The frequency count to adjust. - * @param selected Indicate whether this lemma is selected by user and - * submitted to target edit box. - * @return The id if succeed, 0 if fail. - */ - virtual LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, - bool selected) = 0; - - /** - * Get the lemma id for the given lemma. - * - * @param lemma_str The Chinese string of the lemma. - * @param splids The spelling ids of the lemma. - * @param lemma_len The length of the lemma. - * @return The matched lemma id, or 0 if fail. - */ - virtual LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], - uint16 lemma_len) = 0; - - /** - * Get the lemma score. - * - * @param lemma_id The lemma id to get score. - * @return The score of the lemma, or 0 if fail. - */ - virtual LmaScoreType get_lemma_score(LemmaIdType lemma_id) = 0; - - /** - * Get the lemma score. - * - * @param lemma_str The Chinese string of the lemma. - * @param splids The spelling ids of the lemma. 
- * @param lemma_len The length of the lemma. - * @return The score of the lemma, or 0 if fail. - */ - virtual LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], - uint16 lemma_len) = 0; - - /** - * If the dictionary allows it, remove a lemma from it. - * - * @param lemma_id The id of the lemma to remove. - * @return True if succeed. - */ - virtual bool remove_lemma(LemmaIdType lemma_id) = 0; - - /** - * Get the total occurring count of this atom dictionary. - * - * @return The total occurring count of this atom dictionary. - */ - virtual size_t get_total_lemma_count() = 0; - - /** - * Set the total occurring count of other atom dictionaries. - * - * @param count The total occurring count of other atom dictionaries. - */ - virtual void set_total_lemma_count_of_others(size_t count) = 0; - - /** - * Notify this atom dictionary to flush the cached data to persistent storage - * if necessary. - */ - virtual void flush_cache() = 0; -}; -} - -#endif // PINYINIME_INCLUDE_ATOMDICTBASE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h deleted file mode 100644 index da0d6cd3..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__ -#define PINYINIME_INCLUDE_DICTBUILDER_H__ - -#include <stdlib.h> -#include "./utf16char.h" -#include "./dictdef.h" -#include "./dictlist.h" -#include "./spellingtable.h" -#include "./spellingtrie.h" -#include "./splparser.h" - -namespace ime_pinyin { - -#ifdef ___BUILD_MODEL___ - -#define ___DO_STATISTICS___ - -class DictTrie; - -class DictBuilder { - private: - // The raw lemma array buffer. - LemmaEntry *lemma_arr_; - size_t lemma_num_; - - // Used to store all possible single char items. - // Two items may have the same Hanzi while their spelling ids are different. - SingleCharItem *scis_; - size_t scis_num_; - - // In the tree, root's level is -1. - // Lemma nodes for root, and level 0 - LmaNodeLE0 *lma_nodes_le0_; - - // Lemma nodes for layers whose levels are deeper than 0 - LmaNodeGE1 *lma_nodes_ge1_; - - // Number of used lemma nodes - size_t lma_nds_used_num_le0_; - size_t lma_nds_used_num_ge1_; - - // Used to store homophonies' ids. - LemmaIdType *homo_idx_buf_; - // Number of homophonies each of which only contains one Chinese character. - size_t homo_idx_num_eq1_; - // Number of homophonies each of which contains more than one character. - size_t homo_idx_num_gt1_; - - // The items with highest scores. 
- LemmaEntry *top_lmas_; - size_t top_lmas_num_; - - SpellingTable *spl_table_; - SpellingParser *spl_parser_; - -#ifdef ___DO_STATISTICS___ - size_t max_sonbuf_len_[kMaxLemmaSize]; - size_t max_homobuf_len_[kMaxLemmaSize]; - - size_t total_son_num_[kMaxLemmaSize]; - size_t total_node_hasson_[kMaxLemmaSize]; - size_t total_sonbuf_num_[kMaxLemmaSize]; - size_t total_sonbuf_allnoson_[kMaxLemmaSize]; - size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize]; - size_t total_homo_num_[kMaxLemmaSize]; - - size_t sonbufs_num1_; // Number of son buffer with only 1 son - size_t sonbufs_numgt1_; // Number of son buffer with more 1 son; - - size_t total_lma_node_num_; - - void stat_init(); - void stat_print(); -#endif - - public: - - DictBuilder(); - ~DictBuilder(); - - // Build dictionary trie from the file fn_raw. File fn_validhzs provides - // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be - // included. - bool build_dict(const char* fn_raw, const char* fn_validhzs, - DictTrie *dict_trie); - - private: - // Fill in the buffer with id. The caller guarantees that the paramters are - // vaild. - void id_to_charbuf(unsigned char *buf, LemmaIdType id); - - // Update the offset of sons for a node. - void set_son_offset(LmaNodeGE1 *node, size_t offset); - - // Update the offset of homophonies' ids for a node. - void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset); - - // Format a speling string. - void format_spelling_str(char *spl_str); - - // Sort the lemma_arr by the hanzi string, and give each of unique items - // a id. Why we need to sort the lemma list according to their Hanzi string - // is to find items started by a given prefix string to do prediction. - // Actually, the single char items are be in other order, for example, - // in spelling id order, etc. - // Return value is next un-allocated idx available. - LemmaIdType sort_lemmas_by_hz(); - - // Build the SingleCharItem list, and fill the hanzi_scis_ids in the - // lemma buffer lemma_arr_. 
- // This function should be called after the lemma array is ready. - // Return the number of unique SingleCharItem elements. - size_t build_scis(); - - // Construct a subtree using a subset of the spelling array (from - // item_star to item_end) - // parent is the parent node to update the necessary information - // parent can be a member of LmaNodeLE0 or LmaNodeGE1 - bool construct_subset(void* parent, LemmaEntry* lemma_arr, - size_t item_start, size_t item_end, size_t level); - - - // Read valid Chinese Hanzis from the given file. - // num is used to return number of chars. - // The return buffer is sorted and caller needs to free the returned buffer. - char16* read_valid_hanzis(const char *fn_validhzs, size_t *num); - - - // Read a raw dictionary. max_item is the maximum number of items. If there - // are more items in the ditionary, only the first max_item will be read. - // Returned value is the number of items successfully read from the file. - size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs, - size_t max_item); - - // Try to find if a character is in hzs buffer. - bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz); - - // Try to find if all characters in str are in hzs buffer. - bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len, - const char16 *str, size_t str_len); - - // Get these lemmas with toppest scores. - void get_top_lemmas(); - - // Allocate resource to build dictionary. - // lma_num is the number of items to be loaded - bool alloc_resource(size_t lma_num); - - // Free resource. 
- void free_resource(); -}; -#endif // ___BUILD_MODEL___ -} - -#endif // PINYINIME_INCLUDE_DICTBUILDER_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h deleted file mode 100644 index 5e1d7818..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_DICTDEF_H__ -#define PINYINIME_INCLUDE_DICTDEF_H__ - -#include <stdlib.h> -#include "./utf16char.h" - -namespace ime_pinyin { - -// Enable the following line when building the binary dictionary model. -// #define ___BUILD_MODEL___ - -typedef unsigned char uint8; -typedef unsigned short uint16; -typedef unsigned int uint32; - -typedef signed char int8; -typedef short int16; -typedef int int32; -typedef long long int64; -typedef unsigned long long uint64; - -const bool kPrintDebug0 = false; -const bool kPrintDebug1 = false; -const bool kPrintDebug2 = false; - -// The max length of a lemma. -const size_t kMaxLemmaSize = 8; - -// The max length of a Pinyin (spelling). -const size_t kMaxPinyinSize = 6; - -// The number of half spelling ids. For Chinese Pinyin, there 30 half ids. -// See SpellingTrie.h for details. -const size_t kHalfSpellingIdNum = 29; - -// The maximum number of full spellings. For Chinese Pinyin, there are only -// about 410 spellings. 
-// If this value is changed to be bigger (needs more bits), please also update -// other structures like SpellingNode, to make sure that a spelling id can be -// stored. -// -1 is because 0 is never used. -const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1; -const size_t kMaxSearchSteps = 40; - -// One character predicts its following characters. -const size_t kMaxPredictSize = (kMaxLemmaSize - 1); - -// LemmaIdType must always be size_t. -typedef size_t LemmaIdType; -const size_t kLemmaIdSize = 3; // Actually, an Id occupies 3 bytes in storage. -const size_t kLemmaIdComposing = 0xffffff; - -typedef uint16 LmaScoreType; -typedef uint16 KeyScoreType; - -// Number of items with highest score are kept for prediction purpose. -const size_t kTopScoreLemmaNum = 10; - -const size_t kMaxPredictNumByGt3 = 1; -const size_t kMaxPredictNumBy3 = 2; -const size_t kMaxPredictNumBy2 = 2; - -// The last lemma id (included) for the system dictionary. The system -// dictionary's ids always start from 1. -const LemmaIdType kSysDictIdEnd = 500000; - -// The first lemma id for the user dictionary. -const LemmaIdType kUserDictIdStart = 500001; - -// The last lemma id (included) for the user dictionary. -const LemmaIdType kUserDictIdEnd = 600000; - -typedef struct { - uint16 half_splid:5; - uint16 full_splid:11; -} SpellingId, *PSpellingId; - - -/** - * We use different node types for different layers - * Statistical data of the building result for a testing dictionary: - * root, level 0, level 1, level 2, level 3 - * max son num of one node: 406 280 41 2 - - * max homo num of one node: 0 90 23 2 2 - * total node num of a layer: 1 406 31766 13516 993 - * total homo num of a layer: 9 5674 44609 12667 995 - * - * The node number for root and level 0 won't be larger than 500 - * According to the information above, two kinds of nodes can be used; one for - * root and level 0, the other for these layers deeper than 0. - * - * LE = less than or equal, - * A node occupies 16 bytes. 
so, totallly less than 16 * 500 = 8K - */ -struct LmaNodeLE0 { - uint32 son_1st_off; - uint32 homo_idx_buf_off; - uint16 spl_idx; - uint16 num_of_son; - uint16 num_of_homo; -}; - -/** - * GE = great and equal - * A node occupies 8 bytes. - */ -struct LmaNodeGE1 { - uint16 son_1st_off_l; // Low bits of the son_1st_off - uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1 - uint16 spl_idx; - unsigned char num_of_son; // number of son nodes - unsigned char num_of_homo; // number of homo words - unsigned char son_1st_off_h; // high bits of the son_1st_off - unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off -}; - -#ifdef ___BUILD_MODEL___ -struct SingleCharItem { - float freq; - char16 hz; - SpellingId splid; -}; - -struct LemmaEntry { - LemmaIdType idx_by_py; - LemmaIdType idx_by_hz; - char16 hanzi_str[kMaxLemmaSize + 1]; - - // The SingleCharItem id for each Hanzi. - uint16 hanzi_scis_ids[kMaxLemmaSize]; - - uint16 spl_idx_arr[kMaxLemmaSize + 1]; - char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1]; - unsigned char hz_str_len; - float freq; -}; -#endif // ___BUILD_MODEL___ - -} // namespace ime_pinyin - -#endif // PINYINIME_INCLUDE_DICTDEF_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h deleted file mode 100644 index 1c1daef4..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_DICTLIST_H__ -#define PINYINIME_INCLUDE_DICTLIST_H__ - -#include <stdlib.h> -#include <stdio.h> -#include "./dictdef.h" -#include "./searchutility.h" -#include "./spellingtrie.h" -#include "./utf16char.h" - -namespace ime_pinyin { - -class DictList { - private: - bool initialized_; - - const SpellingTrie *spl_trie_; - - // Number of SingCharItem. The first is blank, because id 0 is invalid. - uint32 scis_num_; - char16 *scis_hz_; - SpellingId *scis_splid_; - - // The large memory block to store the word list. - char16 *buf_; - - // Starting position of those words whose lengths are i+1, counted in - // char16 - uint32 start_pos_[kMaxLemmaSize + 1]; - - uint32 start_id_[kMaxLemmaSize + 1]; - - int (*cmp_func_[kMaxLemmaSize])(const void *, const void *); - - bool alloc_resource(size_t buf_size, size_t scim_num); - - void free_resource(); - -#ifdef ___BUILD_MODEL___ - // Calculate the requsted memory, including the start_pos[] buffer. - size_t calculate_size(const LemmaEntry *lemma_arr, size_t lemma_num); - - void fill_scis(const SingleCharItem *scis, size_t scis_num); - - // Copy the related content to the inner buffer - // It should be called after calculate_size() - void fill_list(const LemmaEntry *lemma_arr, size_t lemma_num); - - // Find the starting position for the buffer of those 2-character Chinese word - // whose first character is the given Chinese character. - char16* find_pos2_startedbyhz(char16 hz_char); -#endif - - // Find the starting position for the buffer of those words whose lengths are - // word_len. The given parameter cmp_func decides how many characters from - // beginning will be used to compare. 
- char16* find_pos_startedbyhzs(const char16 last_hzs[], - size_t word_Len, - int (*cmp_func)(const void *, const void *)); - - public: - - DictList(); - ~DictList(); - - bool save_list(FILE *fp); - bool load_list(QFile *fp); - -#ifdef ___BUILD_MODEL___ - // Init the list from the LemmaEntry array. - // lemma_arr should have been sorted by the hanzi_str, and have been given - // ids from 1 - bool init_list(const SingleCharItem *scis, size_t scis_num, - const LemmaEntry *lemma_arr, size_t lemma_num); -#endif - - // Get the hanzi string for the given id - uint16 get_lemma_str(LemmaIdType id_hz, char16 *str_buf, uint16 str_max); - - void convert_to_hanzis(char16 *str, uint16 str_len); - - void convert_to_scis_ids(char16 *str, uint16 str_len); - - // last_hzs stores the last n Chinese characters history, its length should be - // less or equal than kMaxPredictSize. - // hzs_len specifies the length(<= kMaxPredictSize). - // predict_buf is used to store the result. - // buf_len specifies the buffer length. - // b4_used specifies how many items before predict_buf have been used. - // Returned value is the number of newly added items. - size_t predict(const char16 last_hzs[], uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used); - - // If half_splid is a valid half spelling id, return those full spelling - // ids which share this half id. 
- uint16 get_splids_for_hanzi(char16 hanzi, uint16 half_splid, - uint16 *splids, uint16 max_splids); - - LemmaIdType get_lemma_id(const char16 *str, uint16 str_len); -}; -} - -#endif // PINYINIME_INCLUDE_DICTLIST_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h b/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h deleted file mode 100644 index 86a8ee25..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_DICTTRIE_H__ -#define PINYINIME_INCLUDE_DICTTRIE_H__ - -#include <stdlib.h> -#include "./atomdictbase.h" -#include "./dictdef.h" -#include "./dictlist.h" -#include "./searchutility.h" -#include <QFile> - -namespace ime_pinyin { - -class DictTrie : AtomDictBase { - private: - struct ParsingMark { - size_t node_offset:24; - size_t node_num:8; // Number of nodes with this spelling id given - // by spl_id. If spl_id is a Shengmu, for nodes - // in the first layer of DictTrie, it equals to - // SpellingTrie::shm2full_num(); but for those - // nodes which are not in the first layer, - // node_num < SpellingTrie::shm2full_num(). - // For a full spelling id, node_num = 1; - }; - - // Used to indicate an extended mile stone. - // An extended mile stone is used to mark a partial match in the dictionary - // trie to speed up further potential extending. 
- // For example, when the user inputs "w", a mile stone is created to mark the - // partial match status, so that when user inputs another char 'm', it will be - // faster to extend search space based on this mile stone. - // - // For partial match status of "wm", there can be more than one sub mile - // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so - // there may be more one parsing mark used to mark these partial matchings. - // A mile stone records the starting position in the mark list and number of - // marks. - struct MileStone { - uint16 mark_start; - uint16 mark_num; - }; - - DictList* dict_list_; - - const SpellingTrie *spl_trie_; - - LmaNodeLE0* root_; // Nodes for root and the first layer. - LmaNodeGE1* nodes_ge1_; // Nodes for other layers. - - // An quick index from spelling id to the LmaNodeLE0 node buffer, or - // to the root_ buffer. - // Index length: - // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used - // to get the end. - // All Shengmu ids are not indexed because they will be converted into - // corresponding full ids. - // So, given an id splid, the son is: - // root_[splid_le0_index_[splid - kFullSplIdStart]] - uint16 *splid_le0_index_; - - uint32 lma_node_num_le0_; - uint32 lma_node_num_ge1_; - - // The first part is for homophnies, and the last top_lma_num_ items are - // lemmas with highest scores. - unsigned char *lma_idx_buf_; - uint32 lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte. - uint32 total_lma_num_; // Total number of lemmas in this dictionary. - uint32 top_lmas_num_; // Number of lemma with highest scores. - - // Parsing mark list used to mark the detailed extended statuses. - ParsingMark *parsing_marks_; - // The position for next available mark. - uint16 parsing_marks_pos_; - - // Mile stone list used to mark the extended status. - MileStone *mile_stones_; - // The position for the next available mile stone. We use positions (except 0) - // as handles. 
- MileStoneHandle mile_stones_pos_; - - // Get the offset of sons for a node. - inline size_t get_son_offset(const LmaNodeGE1 *node); - - // Get the offset of homonious ids for a node. - inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node); - - // Get the lemma id by the offset. - inline LemmaIdType get_lemma_id(size_t id_offset); - - void free_resource(bool free_dict_list); - - bool load_dict(QFile *fp); - - // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill - // them into the lpi_items buffer. - // This function is called by the search engine. - size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, - LmaNodeLE0 *node); - - // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill - // them into the lpi_items buffer. - // This function is called by inner functions extend_dict0(), extend_dict1() - // and extend_dict2(). - size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, - size_t homo_buf_off, LmaNodeGE1 *node, - uint16 lma_len); - - // Extend in the trie from level 0. - MileStoneHandle extend_dict0(MileStoneHandle from_handle, - const DictExtPara *dep, LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - // Extend in the trie from level 1. - MileStoneHandle extend_dict1(MileStoneHandle from_handle, - const DictExtPara *dep, LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - // Extend in the trie from level 2. - MileStoneHandle extend_dict2(MileStoneHandle from_handle, - const DictExtPara *dep, LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - // Try to extend the given spelling id buffer, and if the given id_lemma can - // be successfully gotten, return true; - // The given spelling ids are all valid full ids. 
- bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma); - -#ifdef ___BUILD_MODEL___ - bool save_dict(FILE *fp); -#endif // ___BUILD_MODEL___ - - static const int kMaxMileStone = 100; - static const int kMaxParsingMark = 600; - static const MileStoneHandle kFirstValidMileStoneHandle = 1; - - friend class DictParser; - friend class DictBuilder; - - public: - - DictTrie(); - ~DictTrie(); - -#ifdef ___BUILD_MODEL___ - // Construct the tree from the file fn_raw. - // fn_validhzs provide the valid hanzi list. If fn_validhzs is - // NULL, only chars in GB2312 will be included. - bool build_dict(const char *fn_raw, const char *fn_validhzs); - - // Save the binary dictionary - // Actually, the SpellingTrie/DictList instance will be also saved. - bool save_dict(const char *filename); -#endif // ___BUILD_MODEL___ - - void convert_to_hanzis(char16 *str, uint16 str_len); - - void convert_to_scis_ids(char16 *str, uint16 str_len); - - // Load a binary dictionary - // The SpellingTrie instance/DictList will be also loaded - bool load_dict(const char *filename, LemmaIdType start_id, - LemmaIdType end_id); - bool load_dict_fd(int sys_fd, long start_offset, long length, - LemmaIdType start_id, LemmaIdType end_id); - bool close_dict() {return true;} - size_t number_of_lemmas() {return 0;} - - void reset_milestones(uint16 from_step, MileStoneHandle from_handle); - - MileStoneHandle extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max); - - uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); - - uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid); - - size_t predict(const char16 *last_hzs, uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used); - - LemmaIdType put_lemma(char16 
/*lemma_str*/[], uint16 /*splids*/[], - uint16 /*lemma_len*/, uint16 /*count*/) {return 0;} - - LemmaIdType update_lemma(LemmaIdType /*lemma_id*/, int16 /*delta_count*/, - bool /*selected*/) {return 0;} - - LemmaIdType get_lemma_id(char16 /*lemma_str*/[], uint16 /*splids*/[], - uint16 /*lemma_len*/) {return 0;} - - LmaScoreType get_lemma_score(LemmaIdType /*lemma_id*/) {return 0;} - - LmaScoreType get_lemma_score(char16 /*lemma_str*/[], uint16 /*splids*/[], - uint16 /*lemma_len*/) {return 0;} - - bool remove_lemma(LemmaIdType /*lemma_id*/) {return false;} - - size_t get_total_lemma_count() {return 0;} - void set_total_lemma_count_of_others(size_t count); - - void flush_cache() {} - - LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len); - - // Fill the lemmas with highest scores to the prediction buffer. - // his_len is the history length to fill in the prediction buffer. - size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items, - size_t npre_max, size_t b4_used); -}; -} - -#endif // PINYINIME_INCLUDE_DICTTRIE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h b/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h deleted file mode 100644 index 60735971..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef PINYINIME_ANDPY_INCLUDE_LPICACHE_H__ -#define PINYINIME_ANDPY_INCLUDE_LPICACHE_H__ - -#include <stdlib.h> -#include "./searchutility.h" -#include "./spellingtrie.h" - -namespace ime_pinyin { - -// Used to cache LmaPsbItem list for half spelling ids. -class LpiCache { - private: - static LpiCache *instance_; - static const int kMaxLpiCachePerId = 15; - - LmaPsbItem *lpi_cache_; - uint16 *lpi_cache_len_; - - public: - LpiCache(); - ~LpiCache(); - - static LpiCache& get_instance(); - - // Test if the LPI list of the given splid has been cached. - // If splid is a full spelling id, it returns false, because we only cache - // list for half ids. - bool is_cached(uint16 splid); - - // Put LPI list to cahce. If the length of the list, lpi_num, is longer than - // the cache buffer. the list will be truncated, and function returns the - // maximum length of the cache buffer. - // Note: splid must be a half id, and lpi_items must be not NULL. The - // caller of this function should guarantee this. - size_t put_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_num); - - // Get the cached list for the given half id. - // Return the length of the cached buffer. - // Note: splid must be a half id, and lpi_items must be not NULL. The - // caller of this function should guarantee this. - size_t get_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_max); -}; - -} // namespace - -#endif // PINYINIME_ANDPY_INCLUDE_LPICACHE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h b/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h deleted file mode 100644 index 61e78aa6..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ -#define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ - -#include <stdlib.h> -#include "./atomdictbase.h" -#include "./dicttrie.h" -#include "./searchutility.h" -#include "./spellingtrie.h" -#include "./splparser.h" - -namespace ime_pinyin { - -static const size_t kMaxRowNum = kMaxSearchSteps; - -typedef struct { - // MileStoneHandle objects for the system and user dictionaries. - MileStoneHandle dict_handles[2]; - // From which DMI node. -1 means it's from root. - PoolPosType dmi_fr; - // The spelling id for the Pinyin string from the previous DMI to this node. - // If it is a half id like Shengmu, the node pointed by dict_node is the first - // node with this Shengmu, - uint16 spl_id; - // What's the level of the dict node. Level of root is 0, but root is never - // recorded by dict_node. - unsigned char dict_level:7; - // If this node is for composing phrase, this bit is 1. - unsigned char c_phrase:1; - // Whether the spl_id is parsed with a split character at the end. - unsigned char splid_end_split:1; - // What's the length of the spelling string for this match, for the whole - // word. - unsigned char splstr_len:7; - // Used to indicate whether all spelling ids from the root are full spelling - // ids. This information is useful for keymapping mode(not finished). Because - // in this mode, there is no clear boundaries, we prefer those results which - // have full spelling ids. 
- unsigned char all_full_id:1; -} DictMatchInfo, *PDictMatchInfo; - -typedef struct MatrixNode { - LemmaIdType id; - float score; - MatrixNode *from; - // From which DMI node. Used to trace the spelling segmentation. - PoolPosType dmi_fr; - uint16 step; -} MatrixNode, *PMatrixNode; - -typedef struct { - // The MatrixNode position in the matrix pool - PoolPosType mtrx_nd_pos; - // The DictMatchInfo position in the DictMatchInfo pool. - PoolPosType dmi_pos; - uint16 mtrx_nd_num; - uint16 dmi_num:15; - // Used to indicate whether there are dmi nodes in this step with full - // spelling id. This information is used to decide whether a substring of a - // valid Pinyin should be extended. - // - // Example1: shoudao - // When the last char 'o' is added, the parser will find "dao" is a valid - // Pinyin, and because all dmi nodes at location 'd' (including those for - // "shoud", and those for "d") have Shengmu id only, so it is not necessary - // to extend "ao", otherwise the result may be "shoud ao", that is not - // reasonable. - // - // Example2: hengao - // When the last 'o' is added, the parser finds "gao" is a valid Pinyin. - // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi - // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus - // "heng ao" can also be the result. - // - // Similarly, "ganga" is expanded to "gang a". - // - // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi - // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it - // is not valid either). If the parser uses break in the loop, the result - // always be "xian"; but if the parser uses continue in the loop, "xi an" will - // also be tried. This behaviour can be set via the function - // set_xi_an_switch(). - uint16 dmi_has_full_id:1; - // Points to a MatrixNode of the current step to indicate which choice the - // user selects. 
- MatrixNode *mtrx_nd_fixed; -} MatrixRow, *PMatrixRow; - -// When user inputs and selects candidates, the fixed lemma ids are stored in -// lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many -// lemmas from the beginning are fixed. If user deletes Pinyin characters one -// by one from the end, these fixed lemmas can be unlocked one by one when -// necessary. Whenever user deletes a Chinese character and its spelling string -// in these fixed lemmas, all fixed lemmas will be merged together into a unit -// named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing -// phrase will be the first lemma in the sentence. Because it contains some -// modified lemmas (by deleting a character), these merged lemmas are called -// sub lemmas (sublma), and each of them are represented individually, so that -// when user deletes Pinyin characters from the end, these sub lemmas can also -// be unlocked one by one. -typedef struct { - uint16 spl_ids[kMaxRowNum]; - uint16 spl_start[kMaxRowNum]; - char16 chn_str[kMaxRowNum]; // Chinese string. - uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters. - size_t sublma_num; - uint16 length; // Counted in Chinese characters. -} ComposingPhrase, *TComposingPhrase; - -class MatrixSearch { - private: - // If it is true, prediction list by string whose length is greater than 1 - // will be limited to a reasonable number. - static const bool kPredictLimitGt1 = false; - - // If it is true, the engine will prefer long history based prediction, - // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are - // based on the two-character history. - static const bool kPreferLongHistoryPredict = true; - - // If it is true, prediction will only be based on user dictionary. this flag - // is for debug purpose. - static const bool kOnlyUserDictPredict = false; - - // The maximum buffer to store LmaPsbItems. 
- static const size_t kMaxLmaPsbItems = 1450; - - // How many rows for each step. - static const size_t kMaxNodeARow = 5; - - // The maximum length of the sentence candidates counted in chinese - // characters - static const size_t kMaxSentenceLength = 16; - - // The size of the matrix node pool. - static const size_t kMtrxNdPoolSize = 200; - - // The size of the DMI node pool. - static const size_t kDmiPoolSize = 800; - - // Used to indicate whether this object has been initialized. - bool inited_; - - // Spelling trie. - const SpellingTrie *spl_trie_; - - // Used to indicate this switcher status: when "xian" is parseed, should - // "xi an" also be extended. Default is false. - // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string - // should be valid for a FULL spelling, or a combination of two spellings, - // first of which is a FULL id too. So even it is true, "da" will never be - // split into "d a", because "d" is not a full spelling id. - bool xi_an_enabled_; - - // System dictionary. - DictTrie* dict_trie_; - - // User dictionary. - AtomDictBase* user_dict_; - - // Spelling parser. - SpellingParser* spl_parser_; - - // The maximum allowed length of spelling string (such as a Pinyin string). - size_t max_sps_len_; - - // The maximum allowed length of a result Chinese string. - size_t max_hzs_len_; - - // Pinyin string. Max length: kMaxRowNum - 1 - char pys_[kMaxRowNum]; - - // The length of the string that has been decoded successfully. - size_t pys_decoded_len_; - - // Shared buffer for multiple purposes. - size_t *share_buf_; - - MatrixNode *mtrx_nd_pool_; - PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool - DictMatchInfo *dmi_pool_; - PoolPosType dmi_pool_used_; // How many items used in the pool - - MatrixRow *matrix_; // The first row is for starting - - DictExtPara *dep_; // Parameter used to extend DMI nodes. 
- - NPredictItem *npre_items_; // Used to do prediction - size_t npre_items_len_; - - // The starting positions and lemma ids for the full sentence candidate. - size_t lma_id_num_; - uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids. - LemmaIdType lma_id_[kMaxRowNum]; - size_t fixed_lmas_; - - // If fixed_lmas_ is bigger than i, Element i is used to indicate whether - // the i'th lemma id in lma_id_ is the first candidate for that step. - // If all candidates are the first one for that step, the whole string can be - // decoded by the engine automatically, so no need to add it to user - // dictionary. (We are considering to add it to user dictionary in the - // future). - uint8 fixed_lmas_no1_[kMaxRowNum]; - - // Composing phrase - ComposingPhrase c_phrase_; - - // If dmi_c_phrase_ is true, the decoder will try to match the - // composing phrase (And definitely it will match successfully). If it - // is false, the decoder will try to match lemmas items in dictionaries. - bool dmi_c_phrase_; - - // The starting positions and spelling ids for the first full sentence - // candidate. - size_t spl_id_num_; // Number of splling ids - uint16 spl_start_[kMaxRowNum]; // Starting positions - uint16 spl_id_[kMaxRowNum]; // Spelling ids - // Used to remember the last fixed position, counted in Hanzi. - size_t fixed_hzs_; - - // Lemma Items with possibility score, two purposes: - // 1. In Viterbi decoding, this buffer is used to get all possible candidates - // for current step; - // 2. When the search is done, this buffer is used to get candiates from the - // first un-fixed step and show them to the user. - LmaPsbItem lpi_items_[kMaxLmaPsbItems]; - size_t lpi_total_; - - // Assign the pointers with NULL. The caller makes sure that all pointers are - // not valid before calling it. This function only will be called in the - // construction function and free_resource(). 
- void reset_pointers_to_null(); - - bool alloc_resource(); - - void free_resource(); - - // Reset the search space totally. - bool reset_search0(); - - // Reset the search space from ch_pos step. For example, if the original - // input Pinyin is "an", reset_search(1) will reset the search space to the - // result of "a". If the given position is out of range, return false. - // if clear_fixed_this_step is true, and the ch_pos step is a fixed step, - // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes. - // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step. - // The DMI nodes will be kept. - // - // Note: this function should not destroy content of pys_. - bool reset_search(size_t ch_pos, bool clear_fixed_this_step, - bool clear_dmi_this_step, bool clear_mtrx_this_step); - - // Delete a part of the content in pys_. - void del_in_pys(size_t start, size_t len); - - // Delete a spelling id and its corresponding Chinese character, and merge - // the fixed lemmas into the composing phrase. - // del_spl_pos indicates which spelling id needs to be delete. - // This function will update the lemma and spelling segmentation information. - // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within - // the fixed lemmas. - void merge_fixed_lmas(size_t del_spl_pos); - - // Get spelling start posistions and ids. The result will be stored in - // spl_id_num_, spl_start_[], spl_id_[]. - // fixed_hzs_ will be also assigned. - void get_spl_start_id(); - - // Get all lemma ids with match the given spelling id stream(shorter than the - // maximum length of a word). - // If pfullsent is not NULL, means the full sentence candidate may be the - // same with the coming lemma string, if so, remove that lemma. - // The result is sorted in descendant order by the frequency score. 
- size_t get_lpis(const uint16* splid_str, size_t splid_str_len, - LmaPsbItem* lma_buf, size_t max_lma_buf, - const char16 *pfullsent, bool sort_by_psb); - - uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); - - uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid); - - - // Extend a DMI node with a spelling id. ext_len is the length of the rows - // to extend, actually, it is the size of the spelling string of splid. - // return value can be 1 or 0. - // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in - // the pool). - // 0 means either the dmi node can not be extended with splid, or the splid - // is a Shengmu id, which is only used to get lpi_items, or the result node - // in DictTrie has no son, it is not nccessary to keep the new DMI. - // - // This function modifies the content of lpi_items_ and lpi_total_. - // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size. - // The function's returned value has no relation with the value of lpi_num. - // - // If dmi == NULL, this function will extend the root node of DictTrie - // - // This function will not change dmi_nd_pool_used_. Please change it after - // calling this function if necessary. - // - // The caller should guarantees that NULL != dep. - size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s); - - // Extend dmi for the composing phrase. - size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s); - - // Extend a MatrixNode with the give LmaPsbItem list. - // res_row is the destination row number. - // This function does not change mtrx_nd_pool_used_. Please change it after - // calling this function if necessary. - // return 0 always. - size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], - size_t lpi_num, PoolPosType dmi_fr, size_t res_row); - - - // Try to find a dmi node at step_to position, and the found dmi node should - // match the given spelling id strings. 
- PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num); - - bool add_char(char ch); - bool prepare_add_char(char ch); - - // Called after prepare_add_char, so the input char has been saved. - bool add_char_qwerty(); - - // Prepare candidates from the last fixed hanzi position. - void prepare_candidates(); - - // Is the character in step pos a splitter character? - // The caller guarantees that the position is valid. - bool is_split_at(uint16 pos); - - void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, - PoolPosType dmi_fr, - uint16 spl_id, uint16 node_num, unsigned char dict_level, - bool splid_end_split, unsigned char splstr_len, - unsigned char all_full_id); - - size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num, - char16 predict_buf[][kMaxPredictSize + 1], - size_t buf_len); - - // Add the first candidate to the user dictionary. - bool try_add_cand0_to_userdict(); - - // Add a user lemma to the user dictionary. This lemma is a subset of - // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the - // number of lemmas to be combined together as a new lemma. The caller - // gurantees that the combined new lemma's length is less or equal to - // kMaxLemmaSize. - bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score); - - // Update dictionary frequencies. - void update_dict_freq(); - - void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level); - - public: - MatrixSearch(); - ~MatrixSearch(); - - bool init(const char *fn_sys_dict, const char *fn_usr_dict); - - bool init_fd(int sys_fd, long start_offset, long length, - const char *fn_usr_dict); - - void init_user_dictionary(const char *fn_usr_dict); - - bool is_user_dictionary_enabled() const; - - void set_max_lens(size_t max_sps_len, size_t max_hzs_len); - - void close(); - - void flush_cache(); - - void set_xi_an_switch(bool xi_an_enabled); - - bool get_xi_an_switch(); - - // Reset the search space. Equivalent to reset_search(0). 
- // If inited, always return true; - bool reset_search(); - - // Search a Pinyin string. - // Return value is the position successfully parsed. - size_t search(const char *py, size_t py_len); - - // Used to delete something in the Pinyin string kept by the engine, and do - // a re-search. - // Return value is the new length of Pinyin string kept by the engine which - // is parsed successfully. - // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin - // character needs to be deleted. If is_pos_in_splid is true, all Pinyin - // characters for pos-th spelling id needs to be deleted. - // If the deleted character(s) is just after a fixed lemma or sub lemma in - // composing phrase, clear_fixed_this_step indicates whether we needs to - // unlock the last fixed lemma or sub lemma. - // If is_pos_in_splid is false, and pos-th character is in the range for the - // fixed lemmas or composing string, this function will do nothing and just - // return the result of the previous search. - size_t delsearch(size_t pos, bool is_pos_in_splid, - bool clear_fixed_this_step); - - // Get the number of candiates, called after search(). - size_t get_candidate_num(); - - // Get the Pinyin string stored by the engine. - // *decoded_len returns the length of the successfully decoded string. - const char* get_pystr(size_t *decoded_len); - - // Get the spelling boundaries for the first sentence candidate. - // Number of spellings will be returned. The number of valid elements in - // spl_start is one more than the return value because the last one is used - // to indicate the beginning of the next un-input speling. - // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] . - size_t get_spl_start(const uint16 *&spl_start); - - // Get one candiate string. If full sentence candidate is available, it will - // be the first one. 
- char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len); - - // Get the first candiate, which is a "full sentence". - // retstr_len is not NULL, it will be used to return the string length. - // If only_unfixed is true, only unfixed part will be fetched. - char16* get_candidate0(char16* cand_str, size_t max_len, - uint16 *retstr_len, bool only_unfixed); - - // Choose a candidate. The decoder will do a search after the fixed position. - size_t choose(size_t cand_id); - - // Cancel the last choosing operation, and return the new number of choices. - size_t cancel_last_choice(); - - // Get the length of fixed Hanzis. - size_t get_fixedlen(); - - size_t get_predicts(const char16 fixed_buf[], - char16 predict_buf[][kMaxPredictSize + 1], - size_t buf_len); -}; -} - -#endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h b/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h deleted file mode 100644 index dfcf980b..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef PINYINIME_INCLUDE_MYSTDLIB_H__ -#define PINYINIME_INCLUDE_MYSTDLIB_H__ - -#include <stdlib.h> - -namespace ime_pinyin { - -void myqsort(void *p, size_t n, size_t es, - int (*cmp)(const void *, const void *)); - -void *mybsearch(const void *key, const void *base, - size_t nmemb, size_t size, - int (*compar)(const void *, const void *)); -} - -#endif // PINYINIME_INCLUDE_MYSTDLIB_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h b/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h deleted file mode 100644 index 1d3a86e6..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_NGRAM_H__ -#define PINYINIME_INCLUDE_NGRAM_H__ - -#include <stdio.h> -#include <stdlib.h> -#include "./dictdef.h" -#include <QFile> - -namespace ime_pinyin { - -typedef unsigned char CODEBOOK_TYPE; - -static const size_t kCodeBookSize = 256; - -class NGram { - public: - // The maximum score of a lemma item. - static const LmaScoreType kMaxScore = 0x3fff; - - // In order to reduce the storage size, the original log value is amplified by - // kScoreAmplifier, and we use LmaScoreType to store. - // After this process, an item with a lower score has a higher frequency. - static const int kLogValueAmplifier = -800; - - // System words' total frequency. 
It is not the real total frequency, instead, - // It is only used to adjust system lemmas' scores when the user dictionary's - // total frequency changes. - // In this version, frequencies of system lemmas are fixed. We are considering - // to make them changable in next version. - static const size_t kSysDictTotalFreq = 100000000; - - private: - - static NGram* instance_; - - bool initialized_; - uint32 idx_num_; - - size_t total_freq_none_sys_; - - // Score compensation for system dictionary lemmas. - // Because after user adds some user lemmas, the total frequency changes, and - // we use this value to normalize the score. - float sys_score_compensation_; - -#ifdef ___BUILD_MODEL___ - double *freq_codes_df_; -#endif - LmaScoreType *freq_codes_; - CODEBOOK_TYPE *lma_freq_idx_; - - public: - NGram(); - ~NGram(); - - static NGram& get_instance(); - - bool save_ngram(FILE *fp); - bool load_ngram(QFile *fp); - - // Set the total frequency of all none system dictionaries. - void set_total_freq_none_sys(size_t freq_none_sys); - - float get_uni_psb(LemmaIdType lma_id); - - // Convert a probability to score. Actually, the score will be limited to - // kMaxScore, but at runtime, we also need float expression to get accurate - // value of the score. - // After the conversion, a lower score indicates a higher probability of the - // item. - static float convert_psb_to_score(double psb); - -#ifdef ___BUILD_MODEL___ - // For constructing the unigram mode model. 
- bool build_unigram(LemmaEntry *lemma_arr, size_t num, - LemmaIdType next_idx_unused); -#endif -}; -} - -#endif // PINYINIME_INCLUDE_NGRAM_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h b/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h deleted file mode 100644 index e376c20c..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_ANDPYIME_H__ -#define PINYINIME_INCLUDE_ANDPYIME_H__ - -#include <stdlib.h> -#include "./dictdef.h" - -#ifdef __cplusplus -extern "C" { -#endif - - namespace ime_pinyin { - - /** - * Open the decoder engine via the system and user dictionary file names. - * - * @param fn_sys_dict The file name of the system dictionary. - * @param fn_usr_dict The file name of the user dictionary. - * @return true if open the decoder engine successfully. - */ - bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict); - - /** - * Open the decoder engine via the system dictionary FD and user dictionary - * file name. Because on Android, the system dictionary is embedded in the - * whole application apk file. - * - * @param sys_fd The file in which the system dictionary is embedded. - * @param start_offset The starting position of the system dictionary in the - * file sys_fd. 
- * @param length The length of the system dictionary in the file sys_fd, - * counted in byte. - * @return true if succeed. - */ - bool im_open_decoder_fd(int sys_fd, long start_offset, long length, - const char *fn_usr_dict); - - /** - * Close the decoder engine. - */ - void im_close_decoder(); - - /** - * Set maximum limitations for decoding. If this function is not called, - * default values will be used. For example, due to screen size limitation, - * the UI engine of the IME can only show a certain number of letters(input) - * to decode, and a certain number of Chinese characters(output). If after - * user adds a new letter, the input or the output string is longer than the - * limitations, the engine will discard the recent letter. - * - * @param max_sps_len Maximum length of the spelling string(Pinyin string). - * @max_hzs_len Maximum length of the decoded Chinese character string. - */ - void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len); - - /** - * Flush cached data to persistent memory. Because at runtime, in order to - * achieve best performance, some data is only store in memory. - */ - void im_flush_cache(); - - /** - * Use a spelling string(Pinyin string) to search. The engine will try to do - * an incremental search based on its previous search result, so if the new - * string has the same prefix with the previous one stored in the decoder, - * the decoder will only continue the search from the end of the prefix. - * If the caller needs to do a brand new search, please call im_reset_search() - * first. Calling im_search() is equivalent to calling im_add_letter() one by - * one. - * - * @param sps_buf The spelling string buffer to decode. - * @param sps_len The length of the spelling string buffer. - * @return The number of candidates. - */ - size_t im_search(const char* sps_buf, size_t sps_len); - - /** - * Make a delete operation in the current search result, and make research if - * necessary. 
- * - * @param pos The posistion of char in spelling string to delete, or the - * position of spelling id in result string to delete. - * @param is_pos_in_splid Indicate whether the pos parameter is the position - * in the spelling string, or the position in the result spelling id string. - * @return The number of candidates. - */ - size_t im_delsearch(size_t pos, bool is_pos_in_splid, - bool clear_fixed_this_step); - - /** - * Reset the previous search result. - */ - void im_reset_search(); - - /** - * Add a Pinyin letter to the current spelling string kept by decoder. If the - * decoder fails in adding the letter, it will do nothing. im_get_sps_str() - * can be used to get the spelling string kept by decoder currently. - * - * @param ch The letter to add. - * @return The number of candidates. - */ - size_t im_add_letter(char ch); - - /** - * Get the spelling string kept by the decoder. - * - * @param decoded_len Used to return how many characters in the spelling - * string is successfully parsed. - * @return The spelling string kept by the decoder. - */ - const char *im_get_sps_str(size_t *decoded_len); - - /** - * Get a candidate(or choice) string. - * - * @param cand_id The id to get a candidate. Started from 0. Usually, id 0 - * is a sentence-level candidate. - * @param cand_str The buffer to store the candidate. - * @param max_len The maximum length of the buffer. - * @return cand_str if succeeds, otherwise NULL. - */ - char16* im_get_candidate(size_t cand_id, char16* cand_str, - size_t max_len); - - /** - * Get the segmentation information(the starting positions) of the spelling - * string. - * - * @param spl_start Used to return the starting posistions. - * @return The number of spelling ids. If it is L, there will be L+1 valid - * elements in spl_start, and spl_start[L] is the posistion after the end of - * the last spelling id. - */ - size_t im_get_spl_start_pos(const uint16 *&spl_start); - - /** - * Choose a candidate and make it fixed. 
If the candidate does not match - * the end of all spelling ids, new candidates will be provided from the - * first unfixed position. If the candidate matches the end of the all - * spelling ids, there will be only one new candidates, or the whole fixed - * sentence. - * - * @param cand_id The id of candidate to select and make it fixed. - * @return The number of candidates. If after the selection, the whole result - * string has been fixed, there will be only one candidate. - */ - size_t im_choose(size_t cand_id); - - /** - * Cancel the last selection, or revert the last operation of im_choose(). - * - * @return The number of candidates. - */ - size_t im_cancel_last_choice(); - - /** - * Get the number of fixed spelling ids, or Chinese characters. - * - * @return The number of fixed spelling ids, of Chinese characters. - */ - size_t im_get_fixed_len(); - - /** - * Cancel the input state and reset the search workspace. - */ - bool im_cancel_input(); - - /** - * Get prediction candiates based on the given fixed Chinese string as the - * history. - * - * @param his_buf The history buffer to do the prediction. It should be ended - * with '\0'. - * @param pre_buf Used to return prediction result list. - * @return The number of predicted result string. - */ - size_t im_get_predicts(const char16 *his_buf, - char16 (*&pre_buf)[kMaxPredictSize + 1]); - - /** - * Enable Shengmus in ShouZiMu mode. - */ - void im_enable_shm_as_szm(bool enable); - - /** - * Enable Yunmus in ShouZiMu mode. - */ - void im_enable_ym_as_szm(bool enable); - - /** - * Initializes or uninitializes the user dictionary. - * - * @param fn_usr_dict The file name of the user dictionary. - */ - void im_init_user_dictionary(const char *fn_usr_dict); - - /** - * Returns the current status of user dictinary. 
- */ - bool im_is_user_dictionary_enabled(void); -} - -#ifdef __cplusplus -} -#endif - -#endif // PINYINIME_INCLUDE_ANDPYIME_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h b/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h deleted file mode 100644 index f1357107..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ -#define PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ - -#include <stdlib.h> -#include "./spellingtrie.h" - -namespace ime_pinyin { - -// Type used to identify the size of a pool, such as id pool, etc. -typedef uint16 PoolPosType; - -// Type used to identify a parsing mile stone in an atom dictionary. -typedef uint16 MileStoneHandle; - -// Type used to express a lemma and its probability score. -typedef struct { - size_t id:(kLemmaIdSize * 8); - size_t lma_len:4; - uint16 psb; // The score, the lower psb, the higher possibility. - // For single character items, we may also need Hanzi. - // For multiple characer items, ignore it. - char16 hanzi; -} LmaPsbItem, *PLmaPsbItem; - -// LmaPsbItem extended with string. 
-typedef struct { - LmaPsbItem lpi; - char16 str[kMaxLemmaSize + 1]; -} LmaPsbStrItem, *PLmaPsbStrItem; - - -typedef struct { - float psb; - char16 pre_hzs[kMaxPredictSize]; - uint16 his_len; // The length of the history used to do the prediction. -} NPredictItem, *PNPredictItem; - -// Parameter structure used to extend in a dictionary. All dictionaries -// receives the same DictExtPara and a dictionary specific MileStoneHandle for -// extending. -// -// When the user inputs a new character, AtomDictBase::extend_dict() will be -// called at least once for each dictionary. -// -// For example, when the user inputs "wm", extend_dict() will be called twice, -// and the DictExtPara parameter are as follows respectively: -// 1. splids = {w, m}; splids_extended = 1; ext_len = 1; step_no = 1; -// splid_end_split = false; id_start = wa(the first id start with 'w'); -// id_num = number of ids starting with 'w'. -// 2. splids = {m}; splids_extended = 0; ext_len = 1; step_no = 1; -// splid_end_split = false; id_start = wa; id_num = number of ids starting with -// 'w'. -// -// For string "women", one of the cases of the DictExtPara parameter is: -// splids = {wo, men}, splids_extended = 1, ext_len = 3 (length of "men"), -// step_no = 4; splid_end_split = false; id_start = men, id_num = 1. -// -typedef struct { - // Spelling ids for extending, there are splids_extended + 1 ids in the - // buffer. - // For a normal lemma, there can only be kMaxLemmaSize spelling ids in max, - // but for a composing phrase, there can kMaxSearchSteps spelling ids. - uint16 splids[kMaxSearchSteps]; - - // Number of ids that have been used before. splids[splids_extended] is the - // newly added id for the current extension. - uint16 splids_extended; - - // The step span of the extension. It is also the size of the string for - // the newly added spelling id. - uint16 ext_len; - - // The step number for the current extension. 
It is also the ending position - // in the input Pinyin string for the substring of spelling ids in splids[]. - // For example, when the user inputs "women", step_no = 4. - // This parameter may useful to manage the MileStoneHandle list for each - // step. When the user deletes a character from the string, MileStoneHandle - // objects for the the steps after that character should be reset; when the - // user begins a new string, all MileStoneHandle objects should be reset. - uint16 step_no; - - // Indicate whether the newly added spelling ends with a splitting character - bool splid_end_split; - - // If the newly added id is a half id, id_start is the first id of the - // corresponding full ids; if the newly added id is a full id, id_start is - // that id. - uint16 id_start; - - // If the newly added id is a half id, id_num is the number of corresponding - // ids; if it is a full id, id_num == 1. - uint16 id_num; -}DictExtPara, *PDictExtPara; - -bool is_system_lemma(LemmaIdType lma_id); -bool is_user_lemma(LemmaIdType lma_id); -bool is_composing_lemma(LemmaIdType lma_id); - -int cmp_lpi_with_psb(const void *p1, const void *p2); -int cmp_lpi_with_unified_psb(const void *p1, const void *p2); -int cmp_lpi_with_id(const void *p1, const void *p2); -int cmp_lpi_with_hanzi(const void *p1, const void *p2); - -int cmp_lpsi_with_str(const void *p1, const void *p2); - -int cmp_hanzis_1(const void *p1, const void *p2); -int cmp_hanzis_2(const void *p1, const void *p2); -int cmp_hanzis_3(const void *p1, const void *p2); -int cmp_hanzis_4(const void *p1, const void *p2); -int cmp_hanzis_5(const void *p1, const void *p2); -int cmp_hanzis_6(const void *p1, const void *p2); -int cmp_hanzis_7(const void *p1, const void *p2); -int cmp_hanzis_8(const void *p1, const void *p2); - -int cmp_npre_by_score(const void *p1, const void *p2); -int cmp_npre_by_hislen_score(const void *p1, const void *p2); -int cmp_npre_by_hanzi_score(const void *p1, const void *p2); - - -size_t 
remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num); - -size_t align_to_size_t(size_t size); - -} // namespace - -#endif // PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h b/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h deleted file mode 100644 index fd79c6ef..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_SPELLINGTABLE_H__ -#define PINYINIME_INCLUDE_SPELLINGTABLE_H__ - -#include <stdlib.h> -#include "./dictdef.h" - -namespace ime_pinyin { - -#ifdef ___BUILD_MODEL___ - -const size_t kMaxSpellingSize = kMaxPinyinSize; - -typedef struct { - char str[kMaxSpellingSize + 1]; - double freq; -} RawSpelling, *PRawSpelling; - -// This class is used to store the spelling strings -// The length of the input spelling string should be less or equal to the -// spelling_size_ (set by init_table). If the input string is too long, -// we only keep its first spelling_size_ chars. -class SpellingTable { - private: - static const size_t kNotSupportNum = 3; - static const char kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1]; - - bool need_score_; - - size_t spelling_max_num_; - - RawSpelling *raw_spellings_; - - // Used to store spelling strings. 
If the spelling table needs to calculate - // score, an extra char after each spelling string is the score. - // An item with a lower score has a higher probability. - char *spelling_buf_; - size_t spelling_size_; - - double total_freq_; - - size_t spelling_num_; - - double score_amplifier_; - - unsigned char average_score_; - - // If frozen is true, put_spelling() and contain() are not allowed to call. - bool frozen_; - - size_t get_hash_pos(const char* spelling_str); - size_t hash_pos_next(size_t hash_pos); - void free_resource(); - public: - SpellingTable(); - ~SpellingTable(); - - // pure_spl_size is the pure maximum spelling string size. For example, - // "zhuang" is the longgest item in Pinyin, so pure_spl_size should be 6. - // spl_max_num is the maximum number of spelling strings to store. - // need_score is used to indicate whether the caller needs to calculate a - // score for each spelling. - bool init_table(size_t pure_spl_size, size_t spl_max_num, bool need_score); - - // Put a spelling string to the table. - // It always returns false if called after arrange() withtout a new - // init_table() operation. - // freq is the spelling's occuring count. - // If the spelling has been in the table, occuring count will accumulated. - bool put_spelling(const char* spelling_str, double spl_count); - - // Test whether a spelling string is in the table. - // It always returns false, when being called after arrange() withtout a new - // init_table() operation. - bool contain(const char* spelling_str); - - // Sort the spelling strings and put them from the begin of the buffer. - // Return the pointer of the sorted spelling strings. - // item_size and spl_num return the item size and number of spelling. - // Because each spelling uses a '\0' as terminator, the returned item_size is - // at least one char longer than the spl_size parameter specified by - // init_table(). 
If the table is initialized to calculate score, item_size - // will be increased by 1, and current_spl_str[item_size - 1] stores an - // unsinged char score. - // An item with a lower score has a higher probability. - // Do not call put_spelling() and contains() after arrange(). - const char* arrange(size_t *item_size, size_t *spl_num); - - float get_score_amplifier(); - - unsigned char get_average_score(); -}; -#endif // ___BUILD_MODEL___ -} - -#endif // PINYINIME_INCLUDE_SPELLINGTABLE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h b/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h deleted file mode 100644 index f943a24d..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__ -#define PINYINIME_INCLUDE_SPELLINGTRIE_H__ - -#include <stdio.h> -#include <stdlib.h> -#include "./dictdef.h" -#include <QFile> - -namespace ime_pinyin { - -static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1; - -// Node used for the trie of spellings -struct SpellingNode { - SpellingNode *first_son; - // The spelling id for each node. If you need more bits to store - // spelling id, please adjust this structure. 
- uint16 spelling_idx:11; - uint16 num_of_son:5; - char char_this_node; - unsigned char score; -}; - -class SpellingTrie { - private: - static const int kMaxYmNum = 64; - static const size_t kValidSplCharNum = 26; - - static const uint16 kHalfIdShengmuMask = 0x01; - static const uint16 kHalfIdYunmuMask = 0x02; - static const uint16 kHalfIdSzmMask = 0x04; - - // Map from half spelling id to single char. - // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively. - // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ..., - // 28 to 'Z', 29 to 'z'. - // [0] is not used to achieve better efficiency. - static const char kHalfId2Sc_[kFullSplIdStart + 1]; - - static unsigned char char_flags_[]; - static SpellingTrie* instance_; - - // The spelling table - char *spelling_buf_; - - // The size of longest spelling string, includes '\0' and an extra char to - // store score. For example, "zhuang" is the longgest item in Pinyin list, - // so spelling_size_ is 8. - // Structure: The string ended with '\0' + score char. - // An item with a lower score has a higher probability. - uint32 spelling_size_; - - // Number of full spelling ids. - uint32 spelling_num_; - - float score_amplifier_; - unsigned char average_score_; - - // The Yunmu id list for the spelling ids (for half ids of Shengmu, - // the Yunmu id is 0). - // The length of the list is spelling_num_ + kFullSplIdStart, - // so that spl_ym_ids_[splid] is the Yunmu id of the splid. - uint8 *spl_ym_ids_; - - // The Yunmu table. - // Each Yunmu will be assigned with Yunmu id from 1. - char *ym_buf_; - size_t ym_size_; // The size of longest Yunmu string, '\0'included. 
- size_t ym_num_; - - // The spelling string just queried - char *splstr_queried_; - - // The spelling string just queried - char16 *splstr16_queried_; - - // The root node of the spelling tree - SpellingNode* root_; - - // If a none qwerty key such as a fnction key like ENTER is given, this node - // will be used to indicate that this is not a QWERTY node. - SpellingNode* dumb_node_; - - // If a splitter key is pressed, this node will be used to indicate that this - // is a splitter key. - SpellingNode* splitter_node_; - - // Used to get the first level sons. - SpellingNode* level1_sons_[kValidSplCharNum]; - - // The full spl_id range for specific half id. - // h2f means half to full. - // A half id can be a ShouZiMu id (id to represent the first char of a full - // spelling, including Shengmu and Yunmu), or id of zh/ch/sh. - // [1..kFullSplIdStart-1] is the arrange of half id. - uint16 h2f_start_[kFullSplIdStart]; - uint16 h2f_num_[kFullSplIdStart]; - - // Map from full id to half id. - uint16 *f2h_; - -#ifdef ___BUILD_MODEL___ - // How many node used to build the trie. - size_t node_num_; -#endif - - SpellingTrie(); - - void free_son_trie(SpellingNode* node); - - // Construct a subtree using a subset of the spelling array (from - // item_star to item_end). - // Member spelliing_buf_ and spelling_size_ should be valid. - // parent is used to update its num_of_son and score. - SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end, - size_t level, SpellingNode *parent); - bool build_f2h(); - - // The caller should guarantee ch >= 'A' && ch <= 'Z' - bool is_shengmu_char(char ch) const; - - // The caller should guarantee ch >= 'A' && ch <= 'Z' - bool is_yunmu_char(char ch) const; - -#ifdef ___BUILD_MODEL___ - // Given a spelling string, return its Yunmu string. - // The caller guaratees spl_str is valid. 
- const char* get_ym_str(const char *spl_str); - - // Build the Yunmu list, and the mapping relation between the full ids and the - // Yunmu ids. This functin is called after the spelling trie is built. - bool build_ym_info(); -#endif - - friend class SpellingParser; - friend class SmartSplParser; - friend class SmartSplParser2; - - public: - ~SpellingTrie(); - - inline static bool is_valid_spl_char(char ch) { - return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); - } - - // The caller guarantees that the two chars are valid spelling chars. - inline static bool is_same_spl_char(char ch1, char ch2) { - return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A'; - } - - // Construct the tree from the input pinyin array - // The given string list should have been sorted. - // score_amplifier is used to convert a possibility value into score. - // average_score is the average_score of all spellings. The dumb node is - // assigned with this score. - bool construct(const char* spelling_arr, size_t item_size, size_t item_num, - float score_amplifier, unsigned char average_score); - - // Test if the given id is a valid spelling id. - // If function returns true, the given splid may be updated like this: - // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is - // first given as a half id 1, but because 'A' is a one-char Yunmu and - // it is a valid id, it needs to updated to its corresponding full id. - bool if_valid_id_update(uint16 *splid) const; - - // Test if the given id is a half id. - bool is_half_id(uint16 splid) const; - - bool is_full_id(uint16 splid) const; - - // Test if the given id is a one-char Yunmu id (obviously, it is also a half - // id), such as 'A', 'E' and 'O'. - bool is_half_id_yunmu(uint16 splid) const; - - // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled. - // For Pinyin, only i/u/v is not a ShouZiMu char. 
- // The caller should guarantee that ch >= 'A' && ch <= 'Z' - bool is_szm_char(char ch) const; - - // Test If this char is enabled in ShouZiMu mode. - // The caller should guarantee that ch >= 'A' && ch <= 'Z' - bool szm_is_enabled(char ch) const; - - // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling - // to input). - void szm_enable_shm(bool enable); - - // Enable/disable Yunmus in ShouZiMu mode. - void szm_enable_ym(bool enable); - - // Test if this char is enabled in ShouZiMu mode. - // The caller should guarantee ch >= 'A' && ch <= 'Z' - bool is_szm_enabled(char ch) const; - - // Return the number of full ids for the given half id. - uint16 half2full_num(uint16 half_id) const; - - // Return the number of full ids for the given half id, and fill spl_id_start - // to return the first full id. - uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const; - - // Return the corresponding half id for the given full id. - // Not frequently used, low efficient. - // Return 0 if fails. - uint16 full_to_half(uint16 full_id) const; - - // To test whether a half id is compatible with a full id. - // Generally, when half_id == full_to_half(full_id), return true. - // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible - // with a full id like "Zhe". (Fussy mode is not ready). - bool half_full_compatible(uint16 half_id, uint16 full_id) const; - - static const SpellingTrie* get_cpinstance(); - - static SpellingTrie& get_instance(); - - // Save to the file stream - bool save_spl_trie(FILE *fp); - - // Load from the file stream - bool load_spl_trie(QFile *fp); - - // Get the number of spellings - size_t get_spelling_num(); - - // Return the Yunmu id for the given Yunmu string. 
- // If the string is not valid, return 0; - uint8 get_ym_id(const char* ym_str); - - // Get the readonly Pinyin string for a given spelling id - const char* get_spelling_str(uint16 splid); - - // Get the readonly Pinyin string for a given spelling id - const char16* get_spelling_str16(uint16 splid); - - // Get Pinyin string for a given spelling id. Return the length of the - // string, and fill-in '\0' at the end. - size_t get_spelling_str16(uint16 splid, char16 *splstr16, - size_t splstr16_len); -}; -} - -#endif // PINYINIME_INCLUDE_SPELLINGTRIE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h b/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h deleted file mode 100644 index d783bd73..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_SPLPARSER_H__ -#define PINYINIME_INCLUDE_SPLPARSER_H__ - -#include "./dictdef.h" -#include "./spellingtrie.h" - -namespace ime_pinyin { - -class SpellingParser { - protected: - const SpellingTrie *spl_trie_; - - public: - SpellingParser(); - - // Given a string, parse it into a spelling id stream. 
- // If the whole string are sucessfully parsed, last_is_pre will be true; - // if the whole string is not fullly parsed, last_is_pre will return whether - // the last part of the string is a prefix of a full spelling string. For - // example, given string "zhengzhon", "zhon" is not a valid speling, but it is - // the prefix of "zhong". - // - // If splstr starts with a character not in ['a'-z'] (it is a split char), - // return 0. - // Split char can only appear in the middle of the string or at the end. - uint16 splstr_to_idxs(const char *splstr, uint16 str_len, uint16 splidx[], - uint16 start_pos[], uint16 max_size, bool &last_is_pre); - - // Similar to splstr_to_idxs(), the only difference is that splstr_to_idxs() - // convert single-character Yunmus into half ids, while this function converts - // them into full ids. - uint16 splstr_to_idxs_f(const char *splstr, uint16 str_len, uint16 splidx[], - uint16 start_pos[], uint16 max_size, bool &last_is_pre); - - // Similar to splstr_to_idxs(), the only difference is that this function - // uses char16 instead of char8. - uint16 splstr16_to_idxs(const char16 *splstr, uint16 str_len, uint16 splidx[], - uint16 start_pos[], uint16 max_size, bool &last_is_pre); - - // Similar to splstr_to_idxs_f(), the only difference is that this function - // uses char16 instead of char8. - uint16 splstr16_to_idxs_f(const char16 *splstr16, uint16 str_len, - uint16 splidx[], uint16 start_pos[], - uint16 max_size, bool &last_is_pre); - - // If the given string is a spelling, return the id, others, return 0. - // If the give string is a single char Yunmus like "A", and the char is - // enabled in ShouZiMu mode, the returned spelling id will be a half id. - // When the returned spelling id is a half id, *is_pre returns whether it - // is a prefix of a full spelling string. - uint16 get_splid_by_str(const char *splstr, uint16 str_len, bool *is_pre); - - // If the given string is a spelling, return the id, others, return 0. 
- // If the give string is a single char Yunmus like "a", no matter the char - // is enabled in ShouZiMu mode or not, the returned spelling id will be - // a full id. - // When the returned spelling id is a half id, *p_is_pre returns whether it - // is a prefix of a full spelling string. - uint16 get_splid_by_str_f(const char *splstr, uint16 str_len, bool *is_pre); - - // Splitter chars are not included. - bool is_valid_to_parse(char ch); - - // When auto-correction is not enabled, get_splid_by_str() will be called to - // return the single result. When auto-correction is enabled, this function - // will be called to get the results. Auto-correction is not ready. - // full_id_num returns number of full spelling ids. - // is_pre returns whether the given string is the prefix of a full spelling - // string. - // If splstr starts with a character not in [a-zA-Z] (it is a split char), - // return 0. - // Split char can only appear in the middle of the string or at the end. - // The caller should guarantee NULL != splstr && str_len > 0 && NULL != splidx - uint16 get_splids_parallel(const char *splstr, uint16 str_len, - uint16 splidx[], uint16 max_size, - uint16 &full_id_num, bool &is_pre); -}; -} - -#endif // PINYINIME_INCLUDE_SPLPARSER_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/sync.h b/src/virtualkeyboard/3rdparty/pinyin/include/sync.h deleted file mode 100644 index bf42d1f1..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/sync.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_SYNC_H__ -#define PINYINIME_INCLUDE_SYNC_H__ - -#define ___SYNC_ENABLED___ - -#ifdef ___SYNC_ENABLED___ - -#include "userdict.h" - -namespace ime_pinyin { - -// Class for user dictionary synchronization -// This class is not thread safe -// Normal invoking flow will be -// begin() -> -// put_lemmas() x N -> -// { -// get_lemmas() -> -// [ get_last_got_count() ] -> -// clear_last_got() -> -// } x N -> -// finish() -class Sync { - public: - Sync(); - ~Sync(); - - static const int kUserDictMaxLemmaCount = 5000; - static const int kUserDictMaxLemmaSize = 200000; - static const int kUserDictRatio = 20; - - bool begin(const char * filename); - - // Merge lemmas downloaded from sync server into local dictionary - // lemmas, lemmas string encoded in UTF16LE - // len, length of lemmas string - // Return how many lemmas merged successfully - int put_lemmas(char16 * lemmas, int len); - - // Get local new user lemmas into UTF16LE string - // str, buffer ptr to store new user lemmas - // size, size of buffer - // Return length of returned buffer in measure of UTF16LE - int get_lemmas(char16 * str, int size); - - // Return lemmas count in last get_lemmas() - int get_last_got_count(); - - // Return total lemmas count need get_lemmas() - int get_total_count(); - - // Clear lemmas got by recent get_lemmas() - void clear_last_got(); - - void finish(); - - int get_capacity(); - - private: - UserDict * userdict_; - char * dictfile_; - int last_count_; -}; - -} - -#endif - -#endif // PINYINIME_INCLUDE_SYNC_H__ diff --git 
a/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h b/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h deleted file mode 100644 index db010912..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_USERDICT_H__ -#define PINYINIME_INCLUDE_USERDICT_H__ - -#define ___CACHE_ENABLED___ -#define ___SYNC_ENABLED___ -#define ___PREDICT_ENABLED___ - -// Debug performance for operations -// #define ___DEBUG_PERF___ - -#ifdef _WIN32 -#include <time.h> -#include <winsock.h> // timeval -#else -#include <pthread.h> -#include <sys/time.h> -#endif -#include "atomdictbase.h" - -namespace ime_pinyin { - -class UserDict : public AtomDictBase { - public: - UserDict(); - ~UserDict(); - - bool load_dict(const char *file_name, LemmaIdType start_id, - LemmaIdType end_id); - - bool close_dict(); - - size_t number_of_lemmas(); - - void reset_milestones(uint16 from_step, MileStoneHandle from_handle); - - MileStoneHandle extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max); - - uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf, - uint16 str_max); - - uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 
*splids, - uint16 splids_max, bool arg_valid); - - size_t predict(const char16 last_hzs[], uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used); - - // Full spelling ids are required - LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count); - - LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, - bool selected); - - LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], - uint16 lemma_len); - - LmaScoreType get_lemma_score(LemmaIdType lemma_id); - - LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], - uint16 lemma_len); - - bool remove_lemma(LemmaIdType lemma_id); - - size_t get_total_lemma_count(); - void set_total_lemma_count_of_others(size_t count); - - void flush_cache(); - - void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, - uint32 reclaim_ratio); - - void reclaim(); - - void defragment(); - -#ifdef ___SYNC_ENABLED___ - void clear_sync_lemmas(unsigned int start, unsigned int end); - - int get_sync_count(); - - LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt); - /** - * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. - * - * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' - * @param len length of lemmas string in UTF-16LE - * @return newly added lemma count - */ - int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len); - - /** - * Get lemmas need sync to a UTF-16LE string of above format. - * Note: input buffer (str) must not be too small. If str is too small to - * contain single one lemma, there might be a dead loop. 
- * - * @param str buffer to write lemmas - * @param size buffer size in UTF-16LE - * @param count output value of lemma returned - * @return UTF-16LE string length - */ - int get_sync_lemmas_in_utf16le_string_from_beginning( - char16 * str, int size, int * count); - -#endif - - struct UserDictStat { - uint32 version; - const char * file_name; - struct timeval load_time; - struct timeval last_update; - uint32 disk_size; - uint32 lemma_count; - uint32 lemma_size; - uint32 delete_count; - uint32 delete_size; -#ifdef ___SYNC_ENABLED___ - uint32 sync_count; -#endif - uint32 reclaim_ratio; - uint32 limit_lemma_count; - uint32 limit_lemma_size; - }; - - bool state(UserDictStat * stat); - - private: - uint32 total_other_nfreq_; - struct timeval load_time_; - LemmaIdType start_id_; - uint32 version_; - uint8 * lemmas_; - - // In-memory-only flag for each lemma - static const uint8 kUserDictLemmaFlagRemove = 1; - // Offsets of the in-use lemmas - uint32 * offsets_; - // Highest bit in an offset tells whether the corresponding lemma is removed - static const uint32 kUserDictOffsetFlagRemove = (1 << 31); - // Maximum possible value for the offset - static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); - // Bit width for last modified time, from 1 to 16 - static const uint32 kUserDictLMTBitWidth = 16; - // Granularity for last modified time, in seconds - static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; - // Maximum frequency count - static const uint16 kUserDictMaxFrequency = 0xFFFF; - -#define COARSE_UTC(year, month, day, hour, minute, second) \ - ( \ - (year - 1970) * 365 * 24 * 60 * 60 + \ - (month - 1) * 30 * 24 * 60 * 60 + \ - (day - 1) * 24 * 60 * 60 + \ - (hour - 0) * 60 * 60 + \ - (minute - 0) * 60 + \ - (second - 0) \ - ) - static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); - - // Corresponds to offsets_ - uint32 * scores_; - // Following two fields are only valid in memory - uint32 * ids_; -#ifdef ___PREDICT_ENABLED___ - uint32 * 
predicts_; -#endif -#ifdef ___SYNC_ENABLED___ - uint32 * syncs_; - size_t sync_count_size_; -#endif - uint32 * offsets_by_id_; - - size_t lemma_count_left_; - size_t lemma_size_left_; - - const char * dict_file_; - - // Make sure the size is 4xN - struct UserDictInfo { - // When the limit is reached, the percentage to be reclaimed (1 ~ 100) - uint32 reclaim_ratio; - // Maximum lemma count; 0 means no limit - uint32 limit_lemma_count; - // Maximum lemma size; it is different from the - // whole disk file size or in-mem dict size. - // 0 means no limit - uint32 limit_lemma_size; - // Total lemma count, including deleted and in-use ones. - // Also indicates the size of offsets_ - uint32 lemma_count; - // Total size of lemmas including used and freed - uint32 lemma_size; - // Freed lemma count - uint32 free_count; - // Freed lemma size in bytes - uint32 free_size; -#ifdef ___SYNC_ENABLED___ - uint32 sync_count; -#endif - int32 total_nfreq; - } dict_info_; - - static const uint32 kUserDictVersion = 0x0ABCDEF0; - - static const uint32 kUserDictPreAlloc = 32; - static const uint32 kUserDictAverageNchar = 8; - - enum UserDictState { - // Keep in order - USER_DICT_NONE = 0, - USER_DICT_SYNC, -#ifdef ___SYNC_ENABLED___ - USER_DICT_SYNC_DIRTY, -#endif - USER_DICT_SCORE_DIRTY, - USER_DICT_OFFSET_DIRTY, - USER_DICT_LEMMA_DIRTY, - - USER_DICT_DEFRAGMENTED, - } state_; - - struct UserDictSearchable { - uint16 splids_len; - uint16 splid_start[kMaxLemmaSize]; - uint16 splid_count[kMaxLemmaSize]; - // Compact initial letters for both FuzzyCompareSpellId and the cache system - uint32 signature[kMaxLemmaSize / 4]; - }; - -#ifdef ___CACHE_ENABLED___ - enum UserDictCacheType { - USER_DICT_CACHE, - USER_DICT_MISS_CACHE, - }; - - static const int kUserDictCacheSize = 4; - static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; - - struct UserDictMissCache { - uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; - uint16 head, tail; - } miss_caches_[kMaxLemmaSize]; - - struct UserDictCache { - 
uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; - uint32 offsets[kUserDictCacheSize]; - uint32 lengths[kUserDictCacheSize]; - // Ring buffer - uint16 head, tail; - } caches_[kMaxLemmaSize]; - - void cache_init(); - - void cache_push(UserDictCacheType type, - UserDictSearchable *searchable, - uint32 offset, uint32 length); - - bool cache_hit(UserDictSearchable *searchable, - uint32 *offset, uint32 *length); - - bool load_cache(UserDictSearchable *searchable, - uint32 *offset, uint32 *length); - - void save_cache(UserDictSearchable *searchable, - uint32 offset, uint32 length); - - void reset_cache(); - - bool load_miss_cache(UserDictSearchable *searchable); - - void save_miss_cache(UserDictSearchable *searchable); - - void reset_miss_cache(); -#endif - - LmaScoreType translate_score(int f); - - int extract_score_freq(int raw_score); - - uint64 extract_score_lmt(int raw_score); - - inline int build_score(uint64 lmt, int freq); - - inline int64 utf16le_atoll(uint16 *s, int len); - - inline int utf16le_lltoa(int64 v, uint16 *s, int size); - - LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt); - - size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend); - - int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); - - int _get_lemma_score(LemmaIdType lemma_id); - - int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1, - const UserDictSearchable *searchable); - - bool is_prefix_spell_id(const uint16 * fullids, - uint16 fulllen, const UserDictSearchable *searchable); - - uint32 get_dict_file_size(UserDictInfo * info); - - bool reset(const char *file); - - bool validate(const char *file); - - bool load(const char *file, LemmaIdType start_id); - - bool is_valid_state(); - - bool is_valid_lemma_id(LemmaIdType id); - - LemmaIdType get_max_lemma_id(); - - void set_lemma_flag(uint32 offset, uint8 flag); - - char 
get_lemma_flag(uint32 offset); - - char get_lemma_nchar(uint32 offset); - - uint16 * get_lemma_spell_ids(uint32 offset); - - uint16 * get_lemma_word(uint32 offset); - - // Prepare searchable to speed up the locate process - void prepare_locate(UserDictSearchable *searchable, - const uint16 * splids, uint16 len); - - // Compare initial letters only - int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1, - const UserDictSearchable *searchable); - - // Compare two spell ids exactly - // The first argument must be a full spelling id - bool equal_spell_id(const uint16 * fullids, - uint16 fulllen, const UserDictSearchable *searchable); - - // Find the first item by initial letters - int32 locate_first_in_offsets(const UserDictSearchable *searchable); - - LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt); - - // Check if a lemma is in the dictionary - int32 locate_in_offsets(char16 lemma_str[], - uint16 splid_str[], uint16 lemma_len); - - bool remove_lemma_by_offset_index(int offset_index); -#ifdef ___PREDICT_ENABLED___ - uint32 locate_where_to_insert_in_predicts(const uint16 * words, - int lemma_len); - - int32 locate_first_in_predicts(const uint16 * words, int lemma_len); - - void remove_lemma_from_predict_list(uint32 offset); -#endif -#ifdef ___SYNC_ENABLED___ - void queue_lemma_for_sync(LemmaIdType id); - - void remove_lemma_from_sync_list(uint32 offset); - - void write_back_sync(int fd); -#endif - void write_back_score(int fd); - void write_back_offset(int fd); - void write_back_lemma(int fd); - void write_back_all(int fd); - void write_back(); - - struct UserDictScoreOffsetPair { - int score; - uint32 offset_index; - }; - - inline void swap(UserDictScoreOffsetPair * sop, int i, int j); - - void shift_down(UserDictScoreOffsetPair * sop, int i, int n); - - // On-disk format (the second row is the layout of each lemma) - // +-------------+ - // | Version (4) | - // +-------------+ - // 
+-----------+-----------+--------------------+-------------------+ - // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | - // +-----------+-----------+--------------------+-------------------+ - // ... - // +-----------------------+ +-------------+ <---Offset of offset - // | Offset1 by_splids (4) | ... | OffsetN (4) | - // +-----------------------+ +-------------+ -#ifdef ___PREDICT_ENABLED___ - // +----------------------+ +-------------+ - // | Offset1 by_lemma (4) | ... | OffsetN (4) | - // +----------------------+ +-------------+ -#endif - // +------------+ +------------+ - // | Score1 (4) | ... | ScoreN (4) | - // +------------+ +------------+ -#ifdef ___SYNC_ENABLED___ - // +-------------+ +-------------+ - // | NewAdd1 (4) | ... | NewAddN (4) | - // +-------------+ +-------------+ -#endif - // +----------------+ - // | Dict Info (4x) | - // +----------------+ -}; -} - -#endif diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h b/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h deleted file mode 100644 index 7e957db5..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef PINYINIME_INCLUDE_UTF16CHAR_H__ -#define PINYINIME_INCLUDE_UTF16CHAR_H__ - -#include <stdlib.h> - -namespace ime_pinyin { - -#ifdef __cplusplus -extern "C" { -#endif - - typedef unsigned short char16; - - // Get a token from utf16_str. - // The returned pointer is a '\0'-terminated utf16 string, or NULL. - // *utf16_str_next returns the next part of the string for further tokenizing. - char16* utf16_strtok(char16 *utf16_str, size_t *token_size, - char16 **utf16_str_next); - - int utf16_atoi(const char16 *utf16_str); - - float utf16_atof(const char16 *utf16_str); - - size_t utf16_strlen(const char16 *utf16_str); - - int utf16_strcmp(const char16 *str1, const char16 *str2); - int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size); - - char16* utf16_strcpy(char16 *dst, const char16 *src); - char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size); - - - char* utf16_strcpy_tochar(char *dst, const char16 *src); - -#ifdef __cplusplus -} -#endif -} - -#endif // PINYINIME_INCLUDE_UTF16CHAR_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h b/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h deleted file mode 100644 index b6d6719e..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef PINYINIME_INCLUDE_UTF16READER_H__ -#define PINYINIME_INCLUDE_UTF16READER_H__ - -#include <stdio.h> -#include "./utf16char.h" - -namespace ime_pinyin { - -class Utf16Reader { - private: - FILE *fp_; - char16 *buffer_; - size_t buffer_total_len_; - size_t buffer_next_pos_; - - // Always less than buffer_total_len_ - buffer_next_pos_ - size_t buffer_valid_len_; - - public: - Utf16Reader(); - ~Utf16Reader(); - - // filename is the name of the file to open. - // buffer_len specifies the length of the buffer to allocate to speed up - // future reads. - bool open(const char* filename, size_t buffer_len); - char16* readline(char16* read_buf, size_t max_len); - bool close(); -}; -} - -#endif // PINYINIME_INCLUDE_UTF16READER_H__ |