diff options
Diffstat (limited to 'src/virtualkeyboard/3rdparty/pinyin')
42 files changed, 0 insertions, 12903 deletions
diff --git a/src/virtualkeyboard/3rdparty/pinyin/NOTICE b/src/virtualkeyboard/3rdparty/pinyin/NOTICE deleted file mode 100644 index 64aaa8db..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/NOTICE +++ /dev/null @@ -1,190 +0,0 @@ - - Copyright (c) 2009, The Android Open Source Project - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - diff --git a/src/virtualkeyboard/3rdparty/pinyin/command/Makefile b/src/virtualkeyboard/3rdparty/pinyin/command/Makefile deleted file mode 100644 index 8ef2315c..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/command/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -CC=gcc -CFLAGS= -g -Wall -std=c99 -CPP=g++ -CPPFLAGS= -g3 -Wall -lpthread -D___BUILD_MODEL___ - -PINYINIME_DICTBUILDER=pinyinime_dictbuilder - -LIBRARY_SRC= \ - ../share/dictbuilder.cpp \ - ../share/dictlist.cpp \ - ../share/dicttrie.cpp \ - ../share/lpicache.cpp \ - ../share/mystdlib.cpp \ - ../share/ngram.cpp \ - ../share/searchutility.cpp \ - ../share/spellingtable.cpp \ - ../share/spellingtrie.cpp \ - ../share/splparser.cpp \ - ../share/utf16char.cpp \ - ../share/utf16reader.cpp \ - -all: engine - -engine: $(PINYINIME_DICTBUILDER) - -$(PINYINIME_DICTBUILDER): $(LIBRARY_SRC) pinyinime_dictbuilder.cpp - @$(CPP) $(CPPFLAGS) -o $@ $? - - -clean: - -rm -rf $(PINYINIME_DICTBUILDER) - -.PHONY: clean diff --git a/src/virtualkeyboard/3rdparty/pinyin/command/pinyinime_dictbuilder.cpp b/src/virtualkeyboard/3rdparty/pinyin/command/pinyinime_dictbuilder.cpp deleted file mode 100644 index 41ea648d..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/command/pinyinime_dictbuilder.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <assert.h> -#include <stdlib.h> -#include <stdio.h> -#include <time.h> -#include <unistd.h> -#include "../include/dicttrie.h" - -using namespace ime_pinyin; - -/** - * Build binary dictionary model. Make sure that ___BUILD_MODEL___ is defined - * in dictdef.h. - */ -int main(int argc, char* argv[]) { - DictTrie* dict_trie = new DictTrie(); - bool success; - if (argc >= 3) - success = dict_trie->build_dict(argv[1], argv[2]); - else - success = dict_trie->build_dict("../data/rawdict_utf16_65105_freq.txt", - "../data/valid_utf16.txt"); - - if (success) { - printf("Build dictionary successfully.\n"); - } else { - printf("Build dictionary unsuccessfully.\n"); - return -1; - } - - success = dict_trie->save_dict("../data/dict_pinyin.dat"); - - if (success) { - printf("Save dictionary successfully.\n"); - } else { - printf("Save dictionary unsuccessfully.\n"); - return -1; - } - - return 0; -} diff --git a/src/virtualkeyboard/3rdparty/pinyin/data/dict_pinyin.dat b/src/virtualkeyboard/3rdparty/pinyin/data/dict_pinyin.dat Binary files differdeleted file mode 100644 index 1be3f9c7..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/data/dict_pinyin.dat +++ /dev/null diff --git a/src/virtualkeyboard/3rdparty/pinyin/data/rawdict_utf16_65105_freq.txt b/src/virtualkeyboard/3rdparty/pinyin/data/rawdict_utf16_65105_freq.txt Binary files differdeleted file mode 100644 index 28805ba6..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/data/rawdict_utf16_65105_freq.txt +++ /dev/null diff --git a/src/virtualkeyboard/3rdparty/pinyin/data/valid_utf16.txt b/src/virtualkeyboard/3rdparty/pinyin/data/valid_utf16.txt Binary files differdeleted file mode 100644 index fecc67eb..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/data/valid_utf16.txt +++ /dev/null diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h b/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h deleted file mode 100644 index 0a70a510..00000000 --- 
a/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * This class defines AtomDictBase class which is the base class for all atom - * dictionaries. Atom dictionaries are managed by the decoder class - * MatrixSearch. - * - * When the user appends a new character to the Pinyin string, all enabled atom - * dictionaries' extend_dict() will be called at least once to get candidates - * ended in this step (the information of starting step is also given in the - * parameter). Usually, when extend_dict() is called, a MileStoneHandle object - * returned by a previous calling for a earlier step is given to speed up the - * look-up process, and a new MileStoneHandle object will be returned if - * the extension is successful. - * - * A returned MileStoneHandle object should keep alive until Function - * reset_milestones() is called and this object is noticed to be reset. - * - * Usually, the atom dictionary can use step information to manage its - * MileStoneHandle objects, or it can make the objects in ascendant order to - * make the reset easier. - * - * When the decoder loads the dictionary, it will give a starting lemma id for - * this atom dictionary to map a inner id to a global id. Global ids should be - * used when an atom dictionary talks to any component outside. 
- */ -#ifndef PINYINIME_INCLUDE_ATOMDICTBASE_H__ -#define PINYINIME_INCLUDE_ATOMDICTBASE_H__ - -#include <stdlib.h> -#include "./dictdef.h" -#include "./searchutility.h" - -namespace ime_pinyin { -class AtomDictBase { - public: - virtual ~AtomDictBase() {} - - /** - * Load an atom dictionary from a file. - * - * @param file_name The file name to load dictionary. - * @param start_id The starting id used for this atom dictionary. - * @param end_id The end id (included) which can be used for this atom - * dictionary. User dictionary will always use the last id space, so it can - * ignore this paramter. All other atom dictionaries should check this - * parameter. - * @return True if succeed. - */ - virtual bool load_dict(const char *file_name, LemmaIdType start_id, - LemmaIdType end_id) = 0; - - /** - * Close this atom dictionary. - * - * @return True if succeed. - */ - virtual bool close_dict() = 0; - - /** - * Get the total number of lemmas in this atom dictionary. - * - * @return The total number of lemmas. - */ - virtual size_t number_of_lemmas() = 0; - - /** - * This function is called by the decoder when user deletes a character from - * the input string, or begins a new input string. - * - * Different atom dictionaries may implement this function in different way. - * an atom dictionary can use one of these two parameters (or both) to reset - * its corresponding MileStoneHandle objects according its detailed - * implementation. - * - * For example, if an atom dictionary uses step information to manage its - * MileStoneHandle objects, parameter from_step can be used to identify which - * objects should be reset; otherwise, if another atom dictionary does not - * use the detailed step information, it only uses ascendant handles - * (according to step. For the same step, earlier call, smaller handle), it - * can easily reset those MileStoneHandle which are larger than from_handle. - * - * The decoder always reset the decoding state by step. 
So when it begins - * resetting, it will call reset_milestones() of its atom dictionaries with - * the step information, and the MileStoneHandle objects returned by the - * earliest calling of extend_dict() for that step. - * - * If an atom dictionary does not implement incremental search, this function - * can be totally ignored. - * - * @param from_step From which step(included) the MileStoneHandle - * objects should be reset. - * @param from_handle The ealiest MileStoneHandle object for step from_step - */ - virtual void reset_milestones(uint16 from_step, - MileStoneHandle from_handle) = 0; - - /** - * Used to extend in this dictionary. The handle returned should keep valid - * until reset_milestones() is called. - * - * @param from_handle Its previous returned extended handle without the new - * spelling id, it can be used to speed up the extending. - * @param dep The paramter used for extending. - * @param lpi_items Used to fill in the lemmas matched. - * @param lpi_max The length of the buffer - * @param lpi_num Used to return the newly added items. - * @return The new mile stone for this extending. 0 if fail. - */ - virtual MileStoneHandle extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num) = 0; - - /** - * Get lemma items with scores according to a spelling id stream. - * This atom dictionary does not need to sort the returned items. - * - * @param splid_str The spelling id stream buffer. - * @param splid_str_len The length of the spelling id stream buffer. - * @param lpi_items Used to return matched lemma items with scores. - * @param lpi_max The maximum size of the buffer to return result. - * @return The number of matched items which have been filled in to lpi_items. - */ - virtual size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max) = 0; - - /** - * Get a lemma string (The Chinese string) by the given lemma id. 
- * - * @param id_lemma The lemma id to get the string. - * @param str_buf The buffer to return the Chinese string. - * @param str_max The maximum size of the buffer. - * @return The length of the string, 0 if fail. - */ - virtual uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, - uint16 str_max) = 0; - - /** - * Get the full spelling ids for the given lemma id. - * If the given buffer is too short, return 0. - * - * @param splids Used to return the spelling ids. - * @param splids_max The maximum buffer length of splids. - * @param arg_valid Used to indicate if the incoming parameters have been - * initialized are valid. If it is true, the splids and splids_max are valid - * and there may be half ids in splids to be updated to full ids. In this - * case, splids_max is the number of valid ids in splids. - * @return The number of ids in the buffer. - */ - virtual uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid) = 0; - - /** - * Function used for prediction. - * No need to sort the newly added items. - * - * @param last_hzs The last n Chinese chracters(called Hanzi), its length - * should be less than or equal to kMaxPredictSize. - * @param hzs_len specifies the length(<= kMaxPredictSize) of the history. - * @param npre_items Used used to return the result. - * @param npre_max The length of the buffer to return result - * @param b4_used Number of prediction result (from npre_items[-b4_used]) - * from other atom dictionaries. A atom ditionary can just ignore it. - * @return The number of prediction result from this atom dictionary. - */ - virtual size_t predict(const char16 last_hzs[], uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used) = 0; - - /** - * Add a lemma to the dictionary. If the dictionary allows to add new - * items and this item does not exist, add it. - * - * @param lemma_str The Chinese string of the lemma. - * @param splids The spelling ids of the lemma. 
- * @param lemma_len The length of the Chinese lemma. - * @param count The frequency count for this lemma. - */ - virtual LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count) = 0; - - /** - * Update a lemma's occuring count. - * - * @param lemma_id The lemma id to update. - * @param delta_count The frequnecy count to ajust. - * @param selected Indicate whether this lemma is selected by user and - * submitted to target edit box. - * @return The id if succeed, 0 if fail. - */ - virtual LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, - bool selected) = 0; - - /** - * Get the lemma id for the given lemma. - * - * @param lemma_str The Chinese string of the lemma. - * @param splids The spelling ids of the lemma. - * @param lemma_len The length of the lemma. - * @return The matched lemma id, or 0 if fail. - */ - virtual LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], - uint16 lemma_len) = 0; - - /** - * Get the lemma score. - * - * @param lemma_id The lemma id to get score. - * @return The score of the lemma, or 0 if fail. - */ - virtual LmaScoreType get_lemma_score(LemmaIdType lemma_id) = 0; - - /** - * Get the lemma score. - * - * @param lemma_str The Chinese string of the lemma. - * @param splids The spelling ids of the lemma. - * @param lemma_len The length of the lemma. - * @return The score of the lamm, or 0 if fail. - */ - virtual LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], - uint16 lemma_len) = 0; - - /** - * If the dictionary allowed, remove a lemma from it. - * - * @param lemma_id The id of the lemma to remove. - * @return True if succeed. - */ - virtual bool remove_lemma(LemmaIdType lemma_id) = 0; - - /** - * Get the total occuring count of this atom dictionary. - * - * @return The total occuring count of this atom dictionary. - */ - virtual size_t get_total_lemma_count() = 0; - - /** - * Set the total occuring count of other atom dictionaries. 
- * - * @param count The total occuring count of other atom dictionaies. - */ - virtual void set_total_lemma_count_of_others(size_t count) = 0; - - /** - * Notify this atom dictionary to flush the cached data to persistent storage - * if necessary. - */ - virtual void flush_cache() = 0; -}; -} - -#endif // PINYINIME_INCLUDE_ATOMDICTBASE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h deleted file mode 100644 index da0d6cd3..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__ -#define PINYINIME_INCLUDE_DICTBUILDER_H__ - -#include <stdlib.h> -#include "./utf16char.h" -#include "./dictdef.h" -#include "./dictlist.h" -#include "./spellingtable.h" -#include "./spellingtrie.h" -#include "./splparser.h" - -namespace ime_pinyin { - -#ifdef ___BUILD_MODEL___ - -#define ___DO_STATISTICS___ - -class DictTrie; - -class DictBuilder { - private: - // The raw lemma array buffer. - LemmaEntry *lemma_arr_; - size_t lemma_num_; - - // Used to store all possible single char items. - // Two items may have the same Hanzi while their spelling ids are different. - SingleCharItem *scis_; - size_t scis_num_; - - // In the tree, root's level is -1. 
- // Lemma nodes for root, and level 0 - LmaNodeLE0 *lma_nodes_le0_; - - // Lemma nodes for layers whose levels are deeper than 0 - LmaNodeGE1 *lma_nodes_ge1_; - - // Number of used lemma nodes - size_t lma_nds_used_num_le0_; - size_t lma_nds_used_num_ge1_; - - // Used to store homophonies' ids. - LemmaIdType *homo_idx_buf_; - // Number of homophonies each of which only contains one Chinese character. - size_t homo_idx_num_eq1_; - // Number of homophonies each of which contains more than one character. - size_t homo_idx_num_gt1_; - - // The items with highest scores. - LemmaEntry *top_lmas_; - size_t top_lmas_num_; - - SpellingTable *spl_table_; - SpellingParser *spl_parser_; - -#ifdef ___DO_STATISTICS___ - size_t max_sonbuf_len_[kMaxLemmaSize]; - size_t max_homobuf_len_[kMaxLemmaSize]; - - size_t total_son_num_[kMaxLemmaSize]; - size_t total_node_hasson_[kMaxLemmaSize]; - size_t total_sonbuf_num_[kMaxLemmaSize]; - size_t total_sonbuf_allnoson_[kMaxLemmaSize]; - size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize]; - size_t total_homo_num_[kMaxLemmaSize]; - - size_t sonbufs_num1_; // Number of son buffer with only 1 son - size_t sonbufs_numgt1_; // Number of son buffer with more 1 son; - - size_t total_lma_node_num_; - - void stat_init(); - void stat_print(); -#endif - - public: - - DictBuilder(); - ~DictBuilder(); - - // Build dictionary trie from the file fn_raw. File fn_validhzs provides - // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be - // included. - bool build_dict(const char* fn_raw, const char* fn_validhzs, - DictTrie *dict_trie); - - private: - // Fill in the buffer with id. The caller guarantees that the paramters are - // vaild. - void id_to_charbuf(unsigned char *buf, LemmaIdType id); - - // Update the offset of sons for a node. - void set_son_offset(LmaNodeGE1 *node, size_t offset); - - // Update the offset of homophonies' ids for a node. 
- void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset); - - // Format a speling string. - void format_spelling_str(char *spl_str); - - // Sort the lemma_arr by the hanzi string, and give each of unique items - // a id. Why we need to sort the lemma list according to their Hanzi string - // is to find items started by a given prefix string to do prediction. - // Actually, the single char items are be in other order, for example, - // in spelling id order, etc. - // Return value is next un-allocated idx available. - LemmaIdType sort_lemmas_by_hz(); - - // Build the SingleCharItem list, and fill the hanzi_scis_ids in the - // lemma buffer lemma_arr_. - // This function should be called after the lemma array is ready. - // Return the number of unique SingleCharItem elements. - size_t build_scis(); - - // Construct a subtree using a subset of the spelling array (from - // item_star to item_end) - // parent is the parent node to update the necessary information - // parent can be a member of LmaNodeLE0 or LmaNodeGE1 - bool construct_subset(void* parent, LemmaEntry* lemma_arr, - size_t item_start, size_t item_end, size_t level); - - - // Read valid Chinese Hanzis from the given file. - // num is used to return number of chars. - // The return buffer is sorted and caller needs to free the returned buffer. - char16* read_valid_hanzis(const char *fn_validhzs, size_t *num); - - - // Read a raw dictionary. max_item is the maximum number of items. If there - // are more items in the ditionary, only the first max_item will be read. - // Returned value is the number of items successfully read from the file. - size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs, - size_t max_item); - - // Try to find if a character is in hzs buffer. - bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz); - - // Try to find if all characters in str are in hzs buffer. 
- bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len, - const char16 *str, size_t str_len); - - // Get these lemmas with toppest scores. - void get_top_lemmas(); - - // Allocate resource to build dictionary. - // lma_num is the number of items to be loaded - bool alloc_resource(size_t lma_num); - - // Free resource. - void free_resource(); -}; -#endif // ___BUILD_MODEL___ -} - -#endif // PINYINIME_INCLUDE_DICTBUILDER_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h deleted file mode 100644 index 5e1d7818..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_DICTDEF_H__ -#define PINYINIME_INCLUDE_DICTDEF_H__ - -#include <stdlib.h> -#include "./utf16char.h" - -namespace ime_pinyin { - -// Enable the following line when building the binary dictionary model. -// #define ___BUILD_MODEL___ - -typedef unsigned char uint8; -typedef unsigned short uint16; -typedef unsigned int uint32; - -typedef signed char int8; -typedef short int16; -typedef int int32; -typedef long long int64; -typedef unsigned long long uint64; - -const bool kPrintDebug0 = false; -const bool kPrintDebug1 = false; -const bool kPrintDebug2 = false; - -// The max length of a lemma. 
-const size_t kMaxLemmaSize = 8; - -// The max length of a Pinyin (spelling). -const size_t kMaxPinyinSize = 6; - -// The number of half spelling ids. For Chinese Pinyin, there 30 half ids. -// See SpellingTrie.h for details. -const size_t kHalfSpellingIdNum = 29; - -// The maximum number of full spellings. For Chinese Pinyin, there are only -// about 410 spellings. -// If change this value is bigger(needs more bits), please also update -// other structures like SpellingNode, to make sure than a spelling id can be -// stored. -// -1 is because that 0 is never used. -const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1; -const size_t kMaxSearchSteps = 40; - -// One character predicts its following characters. -const size_t kMaxPredictSize = (kMaxLemmaSize - 1); - -// LemmaIdType must always be size_t. -typedef size_t LemmaIdType; -const size_t kLemmaIdSize = 3; // Actually, a Id occupies 3 bytes in storage. -const size_t kLemmaIdComposing = 0xffffff; - -typedef uint16 LmaScoreType; -typedef uint16 KeyScoreType; - -// Number of items with highest score are kept for prediction purpose. -const size_t kTopScoreLemmaNum = 10; - -const size_t kMaxPredictNumByGt3 = 1; -const size_t kMaxPredictNumBy3 = 2; -const size_t kMaxPredictNumBy2 = 2; - -// The last lemma id (included) for the system dictionary. The system -// dictionary's ids always start from 1. -const LemmaIdType kSysDictIdEnd = 500000; - -// The first lemma id for the user dictionary. -const LemmaIdType kUserDictIdStart = 500001; - -// The last lemma id (included) for the user dictionary. 
-const LemmaIdType kUserDictIdEnd = 600000; - -typedef struct { - uint16 half_splid:5; - uint16 full_splid:11; -} SpellingId, *PSpellingId; - - -/** - * We use different node types for different layers - * Statistical data of the building result for a testing dictionary: - * root, level 0, level 1, level 2, level 3 - * max son num of one node: 406 280 41 2 - - * max homo num of one node: 0 90 23 2 2 - * total node num of a layer: 1 406 31766 13516 993 - * total homo num of a layer: 9 5674 44609 12667 995 - * - * The node number for root and level 0 won't be larger than 500 - * According to the information above, two kinds of nodes can be used; one for - * root and level 0, the other for these layers deeper than 0. - * - * LE = less and equal, - * A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K - */ -struct LmaNodeLE0 { - uint32 son_1st_off; - uint32 homo_idx_buf_off; - uint16 spl_idx; - uint16 num_of_son; - uint16 num_of_homo; -}; - -/** - * GE = great and equal - * A node occupies 8 bytes. - */ -struct LmaNodeGE1 { - uint16 son_1st_off_l; // Low bits of the son_1st_off - uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1 - uint16 spl_idx; - unsigned char num_of_son; // number of son nodes - unsigned char num_of_homo; // number of homo words - unsigned char son_1st_off_h; // high bits of the son_1st_off - unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off -}; - -#ifdef ___BUILD_MODEL___ -struct SingleCharItem { - float freq; - char16 hz; - SpellingId splid; -}; - -struct LemmaEntry { - LemmaIdType idx_by_py; - LemmaIdType idx_by_hz; - char16 hanzi_str[kMaxLemmaSize + 1]; - - // The SingleCharItem id for each Hanzi. 
- uint16 hanzi_scis_ids[kMaxLemmaSize]; - - uint16 spl_idx_arr[kMaxLemmaSize + 1]; - char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1]; - unsigned char hz_str_len; - float freq; -}; -#endif // ___BUILD_MODEL___ - -} // namespace ime_pinyin - -#endif // PINYINIME_INCLUDE_DICTDEF_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h deleted file mode 100644 index 27fa6d8e..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_DICTLIST_H__ -#define PINYINIME_INCLUDE_DICTLIST_H__ - -#include <stdlib.h> -#include <stdio.h> -#include "./dictdef.h" -#include "./searchutility.h" -#include "./spellingtrie.h" -#include "./utf16char.h" - -namespace ime_pinyin { - -class DictList { - private: - bool initialized_; - - const SpellingTrie *spl_trie_; - - // Number of SingCharItem. The first is blank, because id 0 is invalid. - uint32 scis_num_; - char16 *scis_hz_; - SpellingId *scis_splid_; - - // The large memory block to store the word list. 
- char16 *buf_; - - // Starting position of those words whose lengths are i+1, counted in - // char16 - uint32 start_pos_[kMaxLemmaSize + 1]; - - uint32 start_id_[kMaxLemmaSize + 1]; - - int (*cmp_func_[kMaxLemmaSize])(const void *, const void *); - - bool alloc_resource(size_t buf_size, size_t scim_num); - - void free_resource(); - -#ifdef ___BUILD_MODEL___ - // Calculate the requsted memory, including the start_pos[] buffer. - size_t calculate_size(const LemmaEntry *lemma_arr, size_t lemma_num); - - void fill_scis(const SingleCharItem *scis, size_t scis_num); - - // Copy the related content to the inner buffer - // It should be called after calculate_size() - void fill_list(const LemmaEntry *lemma_arr, size_t lemma_num); - - // Find the starting position for the buffer of those 2-character Chinese word - // whose first character is the given Chinese character. - char16* find_pos2_startedbyhz(char16 hz_char); -#endif - - // Find the starting position for the buffer of those words whose lengths are - // word_len. The given parameter cmp_func decides how many characters from - // beginning will be used to compare. - char16* find_pos_startedbyhzs(const char16 last_hzs[], - size_t word_Len, - int (*cmp_func)(const void *, const void *)); - - public: - - DictList(); - ~DictList(); - - bool save_list(FILE *fp); - bool load_list(FILE *fp); - -#ifdef ___BUILD_MODEL___ - // Init the list from the LemmaEntry array. 
- // lemma_arr should have been sorted by the hanzi_str, and have been given - // ids from 1 - bool init_list(const SingleCharItem *scis, size_t scis_num, - const LemmaEntry *lemma_arr, size_t lemma_num); -#endif - - // Get the hanzi string for the given id - uint16 get_lemma_str(LemmaIdType id_hz, char16 *str_buf, uint16 str_max); - - void convert_to_hanzis(char16 *str, uint16 str_len); - - void convert_to_scis_ids(char16 *str, uint16 str_len); - - // last_hzs stores the last n Chinese characters history, its length should be - // less or equal than kMaxPredictSize. - // hzs_len specifies the length(<= kMaxPredictSize). - // predict_buf is used to store the result. - // buf_len specifies the buffer length. - // b4_used specifies how many items before predict_buf have been used. - // Returned value is the number of newly added items. - size_t predict(const char16 last_hzs[], uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used); - - // If half_splid is a valid half spelling id, return those full spelling - // ids which share this half id. - uint16 get_splids_for_hanzi(char16 hanzi, uint16 half_splid, - uint16 *splids, uint16 max_splids); - - LemmaIdType get_lemma_id(const char16 *str, uint16 str_len); -}; -} - -#endif // PINYINIME_INCLUDE_DICTLIST_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h b/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h deleted file mode 100644 index 75b7ee05..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_DICTTRIE_H__ -#define PINYINIME_INCLUDE_DICTTRIE_H__ - -#include <stdlib.h> -#include "./atomdictbase.h" -#include "./dictdef.h" -#include "./dictlist.h" -#include "./searchutility.h" - -namespace ime_pinyin { - -class DictTrie : AtomDictBase { - private: - struct ParsingMark { - size_t node_offset:24; - size_t node_num:8; // Number of nodes with this spelling id given - // by spl_id. If spl_id is a Shengmu, for nodes - // in the first layer of DictTrie, it equals to - // SpellingTrie::shm2full_num(); but for those - // nodes which are not in the first layer, - // node_num < SpellingTrie::shm2full_num(). - // For a full spelling id, node_num = 1; - }; - - // Used to indicate an extended mile stone. - // An extended mile stone is used to mark a partial match in the dictionary - // trie to speed up further potential extending. - // For example, when the user inputs "w", a mile stone is created to mark the - // partial match status, so that when user inputs another char 'm', it will be - // faster to extend search space based on this mile stone. - // - // For partial match status of "wm", there can be more than one sub mile - // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so - // there may be more one parsing mark used to mark these partial matchings. - // A mile stone records the starting position in the mark list and number of - // marks. 
- struct MileStone { - uint16 mark_start; - uint16 mark_num; - }; - - DictList* dict_list_; - - const SpellingTrie *spl_trie_; - - LmaNodeLE0* root_; // Nodes for root and the first layer. - LmaNodeGE1* nodes_ge1_; // Nodes for other layers. - - // An quick index from spelling id to the LmaNodeLE0 node buffer, or - // to the root_ buffer. - // Index length: - // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used - // to get the end. - // All Shengmu ids are not indexed because they will be converted into - // corresponding full ids. - // So, given an id splid, the son is: - // root_[splid_le0_index_[splid - kFullSplIdStart]] - uint16 *splid_le0_index_; - - uint32 lma_node_num_le0_; - uint32 lma_node_num_ge1_; - - // The first part is for homophnies, and the last top_lma_num_ items are - // lemmas with highest scores. - unsigned char *lma_idx_buf_; - uint32 lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte. - uint32 total_lma_num_; // Total number of lemmas in this dictionary. - uint32 top_lmas_num_; // Number of lemma with highest scores. - - // Parsing mark list used to mark the detailed extended statuses. - ParsingMark *parsing_marks_; - // The position for next available mark. - uint16 parsing_marks_pos_; - - // Mile stone list used to mark the extended status. - MileStone *mile_stones_; - // The position for the next available mile stone. We use positions (except 0) - // as handles. - MileStoneHandle mile_stones_pos_; - - // Get the offset of sons for a node. - inline size_t get_son_offset(const LmaNodeGE1 *node); - - // Get the offset of homonious ids for a node. - inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node); - - // Get the lemma id by the offset. - inline LemmaIdType get_lemma_id(size_t id_offset); - - void free_resource(bool free_dict_list); - - bool load_dict(FILE *fp); - - // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill - // them into the lpi_items buffer. 
- // This function is called by the search engine. - size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, - LmaNodeLE0 *node); - - // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill - // them into the lpi_items buffer. - // This function is called by inner functions extend_dict0(), extend_dict1() - // and extend_dict2(). - size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, - size_t homo_buf_off, LmaNodeGE1 *node, - uint16 lma_len); - - // Extend in the trie from level 0. - MileStoneHandle extend_dict0(MileStoneHandle from_handle, - const DictExtPara *dep, LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - // Extend in the trie from level 1. - MileStoneHandle extend_dict1(MileStoneHandle from_handle, - const DictExtPara *dep, LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - // Extend in the trie from level 2. - MileStoneHandle extend_dict2(MileStoneHandle from_handle, - const DictExtPara *dep, LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - // Try to extend the given spelling id buffer, and if the given id_lemma can - // be successfully gotten, return true; - // The given spelling ids are all valid full ids. - bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma); - -#ifdef ___BUILD_MODEL___ - bool save_dict(FILE *fp); -#endif // ___BUILD_MODEL___ - - static const int kMaxMileStone = 100; - static const int kMaxParsingMark = 600; - static const MileStoneHandle kFirstValidMileStoneHandle = 1; - - friend class DictParser; - friend class DictBuilder; - - public: - - DictTrie(); - ~DictTrie(); - -#ifdef ___BUILD_MODEL___ - // Construct the tree from the file fn_raw. - // fn_validhzs provide the valid hanzi list. If fn_validhzs is - // NULL, only chars in GB2312 will be included. - bool build_dict(const char *fn_raw, const char *fn_validhzs); - - // Save the binary dictionary - // Actually, the SpellingTrie/DictList instance will be also saved. 
- bool save_dict(const char *filename); -#endif // ___BUILD_MODEL___ - - void convert_to_hanzis(char16 *str, uint16 str_len); - - void convert_to_scis_ids(char16 *str, uint16 str_len); - - // Load a binary dictionary - // The SpellingTrie instance/DictList will be also loaded - bool load_dict(const char *filename, LemmaIdType start_id, - LemmaIdType end_id); - bool load_dict_fd(int sys_fd, long start_offset, long length, - LemmaIdType start_id, LemmaIdType end_id); - bool close_dict() {return true;} - size_t number_of_lemmas() {return 0;} - - void reset_milestones(uint16 from_step, MileStoneHandle from_handle); - - MileStoneHandle extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max); - - uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); - - uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid); - - size_t predict(const char16 *last_hzs, uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used); - - LemmaIdType put_lemma(char16 /*lemma_str*/[], uint16 /*splids*/[], - uint16 /*lemma_len*/, uint16 /*count*/) {return 0;} - - LemmaIdType update_lemma(LemmaIdType /*lemma_id*/, int16 /*delta_count*/, - bool /*selected*/) {return 0;} - - LemmaIdType get_lemma_id(char16 /*lemma_str*/[], uint16 /*splids*/[], - uint16 /*lemma_len*/) {return 0;} - - LmaScoreType get_lemma_score(LemmaIdType /*lemma_id*/) {return 0;} - - LmaScoreType get_lemma_score(char16 /*lemma_str*/[], uint16 /*splids*/[], - uint16 /*lemma_len*/) {return 0;} - - bool remove_lemma(LemmaIdType /*lemma_id*/) {return false;} - - size_t get_total_lemma_count() {return 0;} - void set_total_lemma_count_of_others(size_t count); - - void flush_cache() {} - - LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len); - - // Fill the 
lemmas with highest scores to the prediction buffer. - // his_len is the history length to fill in the prediction buffer. - size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items, - size_t npre_max, size_t b4_used); -}; -} - -#endif // PINYINIME_INCLUDE_DICTTRIE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h b/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h deleted file mode 100644 index 60735971..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_ANDPY_INCLUDE_LPICACHE_H__ -#define PINYINIME_ANDPY_INCLUDE_LPICACHE_H__ - -#include <stdlib.h> -#include "./searchutility.h" -#include "./spellingtrie.h" - -namespace ime_pinyin { - -// Used to cache LmaPsbItem list for half spelling ids. -class LpiCache { - private: - static LpiCache *instance_; - static const int kMaxLpiCachePerId = 15; - - LmaPsbItem *lpi_cache_; - uint16 *lpi_cache_len_; - - public: - LpiCache(); - ~LpiCache(); - - static LpiCache& get_instance(); - - // Test if the LPI list of the given splid has been cached. - // If splid is a full spelling id, it returns false, because we only cache - // list for half ids. - bool is_cached(uint16 splid); - - // Put LPI list to cahce. If the length of the list, lpi_num, is longer than - // the cache buffer. 
the list will be truncated, and function returns the - // maximum length of the cache buffer. - // Note: splid must be a half id, and lpi_items must be not NULL. The - // caller of this function should guarantee this. - size_t put_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_num); - - // Get the cached list for the given half id. - // Return the length of the cached buffer. - // Note: splid must be a half id, and lpi_items must be not NULL. The - // caller of this function should guarantee this. - size_t get_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_max); -}; - -} // namespace - -#endif // PINYINIME_ANDPY_INCLUDE_LPICACHE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h b/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h deleted file mode 100644 index 61e78aa6..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ -#define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ - -#include <stdlib.h> -#include "./atomdictbase.h" -#include "./dicttrie.h" -#include "./searchutility.h" -#include "./spellingtrie.h" -#include "./splparser.h" - -namespace ime_pinyin { - -static const size_t kMaxRowNum = kMaxSearchSteps; - -typedef struct { - // MileStoneHandle objects for the system and user dictionaries. 
- MileStoneHandle dict_handles[2]; - // From which DMI node. -1 means it's from root. - PoolPosType dmi_fr; - // The spelling id for the Pinyin string from the previous DMI to this node. - // If it is a half id like Shengmu, the node pointed by dict_node is the first - // node with this Shengmu, - uint16 spl_id; - // What's the level of the dict node. Level of root is 0, but root is never - // recorded by dict_node. - unsigned char dict_level:7; - // If this node is for composing phrase, this bit is 1. - unsigned char c_phrase:1; - // Whether the spl_id is parsed with a split character at the end. - unsigned char splid_end_split:1; - // What's the length of the spelling string for this match, for the whole - // word. - unsigned char splstr_len:7; - // Used to indicate whether all spelling ids from the root are full spelling - // ids. This information is useful for keymapping mode(not finished). Because - // in this mode, there is no clear boundaries, we prefer those results which - // have full spelling ids. - unsigned char all_full_id:1; -} DictMatchInfo, *PDictMatchInfo; - -typedef struct MatrixNode { - LemmaIdType id; - float score; - MatrixNode *from; - // From which DMI node. Used to trace the spelling segmentation. - PoolPosType dmi_fr; - uint16 step; -} MatrixNode, *PMatrixNode; - -typedef struct { - // The MatrixNode position in the matrix pool - PoolPosType mtrx_nd_pos; - // The DictMatchInfo position in the DictMatchInfo pool. - PoolPosType dmi_pos; - uint16 mtrx_nd_num; - uint16 dmi_num:15; - // Used to indicate whether there are dmi nodes in this step with full - // spelling id. This information is used to decide whether a substring of a - // valid Pinyin should be extended. 
- // - // Example1: shoudao - // When the last char 'o' is added, the parser will find "dao" is a valid - // Pinyin, and because all dmi nodes at location 'd' (including those for - // "shoud", and those for "d") have Shengmu id only, so it is not necessary - // to extend "ao", otherwise the result may be "shoud ao", that is not - // reasonable. - // - // Example2: hengao - // When the last 'o' is added, the parser finds "gao" is a valid Pinyin. - // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi - // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus - // "heng ao" can also be the result. - // - // Similarly, "ganga" is expanded to "gang a". - // - // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi - // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it - // is not valid either). If the parser uses break in the loop, the result - // always be "xian"; but if the parser uses continue in the loop, "xi an" will - // also be tried. This behaviour can be set via the function - // set_xi_an_switch(). - uint16 dmi_has_full_id:1; - // Points to a MatrixNode of the current step to indicate which choice the - // user selects. - MatrixNode *mtrx_nd_fixed; -} MatrixRow, *PMatrixRow; - -// When user inputs and selects candidates, the fixed lemma ids are stored in -// lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many -// lemmas from the beginning are fixed. If user deletes Pinyin characters one -// by one from the end, these fixed lemmas can be unlocked one by one when -// necessary. Whenever user deletes a Chinese character and its spelling string -// in these fixed lemmas, all fixed lemmas will be merged together into a unit -// named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing -// phrase will be the first lemma in the sentence. 
Because it contains some -// modified lemmas (by deleting a character), these merged lemmas are called -// sub lemmas (sublma), and each of them are represented individually, so that -// when user deletes Pinyin characters from the end, these sub lemmas can also -// be unlocked one by one. -typedef struct { - uint16 spl_ids[kMaxRowNum]; - uint16 spl_start[kMaxRowNum]; - char16 chn_str[kMaxRowNum]; // Chinese string. - uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters. - size_t sublma_num; - uint16 length; // Counted in Chinese characters. -} ComposingPhrase, *TComposingPhrase; - -class MatrixSearch { - private: - // If it is true, prediction list by string whose length is greater than 1 - // will be limited to a reasonable number. - static const bool kPredictLimitGt1 = false; - - // If it is true, the engine will prefer long history based prediction, - // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are - // based on the two-character history. - static const bool kPreferLongHistoryPredict = true; - - // If it is true, prediction will only be based on user dictionary. this flag - // is for debug purpose. - static const bool kOnlyUserDictPredict = false; - - // The maximum buffer to store LmaPsbItems. - static const size_t kMaxLmaPsbItems = 1450; - - // How many rows for each step. - static const size_t kMaxNodeARow = 5; - - // The maximum length of the sentence candidates counted in chinese - // characters - static const size_t kMaxSentenceLength = 16; - - // The size of the matrix node pool. - static const size_t kMtrxNdPoolSize = 200; - - // The size of the DMI node pool. - static const size_t kDmiPoolSize = 800; - - // Used to indicate whether this object has been initialized. - bool inited_; - - // Spelling trie. - const SpellingTrie *spl_trie_; - - // Used to indicate this switcher status: when "xian" is parseed, should - // "xi an" also be extended. Default is false. 
- // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string - // should be valid for a FULL spelling, or a combination of two spellings, - // first of which is a FULL id too. So even it is true, "da" will never be - // split into "d a", because "d" is not a full spelling id. - bool xi_an_enabled_; - - // System dictionary. - DictTrie* dict_trie_; - - // User dictionary. - AtomDictBase* user_dict_; - - // Spelling parser. - SpellingParser* spl_parser_; - - // The maximum allowed length of spelling string (such as a Pinyin string). - size_t max_sps_len_; - - // The maximum allowed length of a result Chinese string. - size_t max_hzs_len_; - - // Pinyin string. Max length: kMaxRowNum - 1 - char pys_[kMaxRowNum]; - - // The length of the string that has been decoded successfully. - size_t pys_decoded_len_; - - // Shared buffer for multiple purposes. - size_t *share_buf_; - - MatrixNode *mtrx_nd_pool_; - PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool - DictMatchInfo *dmi_pool_; - PoolPosType dmi_pool_used_; // How many items used in the pool - - MatrixRow *matrix_; // The first row is for starting - - DictExtPara *dep_; // Parameter used to extend DMI nodes. - - NPredictItem *npre_items_; // Used to do prediction - size_t npre_items_len_; - - // The starting positions and lemma ids for the full sentence candidate. - size_t lma_id_num_; - uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids. - LemmaIdType lma_id_[kMaxRowNum]; - size_t fixed_lmas_; - - // If fixed_lmas_ is bigger than i, Element i is used to indicate whether - // the i'th lemma id in lma_id_ is the first candidate for that step. - // If all candidates are the first one for that step, the whole string can be - // decoded by the engine automatically, so no need to add it to user - // dictionary. (We are considering to add it to user dictionary in the - // future). 
- uint8 fixed_lmas_no1_[kMaxRowNum]; - - // Composing phrase - ComposingPhrase c_phrase_; - - // If dmi_c_phrase_ is true, the decoder will try to match the - // composing phrase (And definitely it will match successfully). If it - // is false, the decoder will try to match lemmas items in dictionaries. - bool dmi_c_phrase_; - - // The starting positions and spelling ids for the first full sentence - // candidate. - size_t spl_id_num_; // Number of splling ids - uint16 spl_start_[kMaxRowNum]; // Starting positions - uint16 spl_id_[kMaxRowNum]; // Spelling ids - // Used to remember the last fixed position, counted in Hanzi. - size_t fixed_hzs_; - - // Lemma Items with possibility score, two purposes: - // 1. In Viterbi decoding, this buffer is used to get all possible candidates - // for current step; - // 2. When the search is done, this buffer is used to get candiates from the - // first un-fixed step and show them to the user. - LmaPsbItem lpi_items_[kMaxLmaPsbItems]; - size_t lpi_total_; - - // Assign the pointers with NULL. The caller makes sure that all pointers are - // not valid before calling it. This function only will be called in the - // construction function and free_resource(). - void reset_pointers_to_null(); - - bool alloc_resource(); - - void free_resource(); - - // Reset the search space totally. - bool reset_search0(); - - // Reset the search space from ch_pos step. For example, if the original - // input Pinyin is "an", reset_search(1) will reset the search space to the - // result of "a". If the given position is out of range, return false. - // if clear_fixed_this_step is true, and the ch_pos step is a fixed step, - // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes. - // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step. - // The DMI nodes will be kept. - // - // Note: this function should not destroy content of pys_. 
- bool reset_search(size_t ch_pos, bool clear_fixed_this_step, - bool clear_dmi_this_step, bool clear_mtrx_this_step); - - // Delete a part of the content in pys_. - void del_in_pys(size_t start, size_t len); - - // Delete a spelling id and its corresponding Chinese character, and merge - // the fixed lemmas into the composing phrase. - // del_spl_pos indicates which spelling id needs to be delete. - // This function will update the lemma and spelling segmentation information. - // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within - // the fixed lemmas. - void merge_fixed_lmas(size_t del_spl_pos); - - // Get spelling start posistions and ids. The result will be stored in - // spl_id_num_, spl_start_[], spl_id_[]. - // fixed_hzs_ will be also assigned. - void get_spl_start_id(); - - // Get all lemma ids with match the given spelling id stream(shorter than the - // maximum length of a word). - // If pfullsent is not NULL, means the full sentence candidate may be the - // same with the coming lemma string, if so, remove that lemma. - // The result is sorted in descendant order by the frequency score. - size_t get_lpis(const uint16* splid_str, size_t splid_str_len, - LmaPsbItem* lma_buf, size_t max_lma_buf, - const char16 *pfullsent, bool sort_by_psb); - - uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); - - uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid); - - - // Extend a DMI node with a spelling id. ext_len is the length of the rows - // to extend, actually, it is the size of the spelling string of splid. - // return value can be 1 or 0. - // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in - // the pool). - // 0 means either the dmi node can not be extended with splid, or the splid - // is a Shengmu id, which is only used to get lpi_items, or the result node - // in DictTrie has no son, it is not nccessary to keep the new DMI. 
- // - // This function modifies the content of lpi_items_ and lpi_total_. - // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size. - // The function's returned value has no relation with the value of lpi_num. - // - // If dmi == NULL, this function will extend the root node of DictTrie - // - // This function will not change dmi_nd_pool_used_. Please change it after - // calling this function if necessary. - // - // The caller should guarantees that NULL != dep. - size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s); - - // Extend dmi for the composing phrase. - size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s); - - // Extend a MatrixNode with the give LmaPsbItem list. - // res_row is the destination row number. - // This function does not change mtrx_nd_pool_used_. Please change it after - // calling this function if necessary. - // return 0 always. - size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], - size_t lpi_num, PoolPosType dmi_fr, size_t res_row); - - - // Try to find a dmi node at step_to position, and the found dmi node should - // match the given spelling id strings. - PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num); - - bool add_char(char ch); - bool prepare_add_char(char ch); - - // Called after prepare_add_char, so the input char has been saved. - bool add_char_qwerty(); - - // Prepare candidates from the last fixed hanzi position. - void prepare_candidates(); - - // Is the character in step pos a splitter character? - // The caller guarantees that the position is valid. 
- bool is_split_at(uint16 pos); - - void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, - PoolPosType dmi_fr, - uint16 spl_id, uint16 node_num, unsigned char dict_level, - bool splid_end_split, unsigned char splstr_len, - unsigned char all_full_id); - - size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num, - char16 predict_buf[][kMaxPredictSize + 1], - size_t buf_len); - - // Add the first candidate to the user dictionary. - bool try_add_cand0_to_userdict(); - - // Add a user lemma to the user dictionary. This lemma is a subset of - // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the - // number of lemmas to be combined together as a new lemma. The caller - // gurantees that the combined new lemma's length is less or equal to - // kMaxLemmaSize. - bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score); - - // Update dictionary frequencies. - void update_dict_freq(); - - void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level); - - public: - MatrixSearch(); - ~MatrixSearch(); - - bool init(const char *fn_sys_dict, const char *fn_usr_dict); - - bool init_fd(int sys_fd, long start_offset, long length, - const char *fn_usr_dict); - - void init_user_dictionary(const char *fn_usr_dict); - - bool is_user_dictionary_enabled() const; - - void set_max_lens(size_t max_sps_len, size_t max_hzs_len); - - void close(); - - void flush_cache(); - - void set_xi_an_switch(bool xi_an_enabled); - - bool get_xi_an_switch(); - - // Reset the search space. Equivalent to reset_search(0). - // If inited, always return true; - bool reset_search(); - - // Search a Pinyin string. - // Return value is the position successfully parsed. - size_t search(const char *py, size_t py_len); - - // Used to delete something in the Pinyin string kept by the engine, and do - // a re-search. - // Return value is the new length of Pinyin string kept by the engine which - // is parsed successfully. 
- // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin - // character needs to be deleted. If is_pos_in_splid is true, all Pinyin - // characters for pos-th spelling id needs to be deleted. - // If the deleted character(s) is just after a fixed lemma or sub lemma in - // composing phrase, clear_fixed_this_step indicates whether we needs to - // unlock the last fixed lemma or sub lemma. - // If is_pos_in_splid is false, and pos-th character is in the range for the - // fixed lemmas or composing string, this function will do nothing and just - // return the result of the previous search. - size_t delsearch(size_t pos, bool is_pos_in_splid, - bool clear_fixed_this_step); - - // Get the number of candiates, called after search(). - size_t get_candidate_num(); - - // Get the Pinyin string stored by the engine. - // *decoded_len returns the length of the successfully decoded string. - const char* get_pystr(size_t *decoded_len); - - // Get the spelling boundaries for the first sentence candidate. - // Number of spellings will be returned. The number of valid elements in - // spl_start is one more than the return value because the last one is used - // to indicate the beginning of the next un-input speling. - // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] . - size_t get_spl_start(const uint16 *&spl_start); - - // Get one candiate string. If full sentence candidate is available, it will - // be the first one. - char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len); - - // Get the first candiate, which is a "full sentence". - // retstr_len is not NULL, it will be used to return the string length. - // If only_unfixed is true, only unfixed part will be fetched. - char16* get_candidate0(char16* cand_str, size_t max_len, - uint16 *retstr_len, bool only_unfixed); - - // Choose a candidate. The decoder will do a search after the fixed position. 
- size_t choose(size_t cand_id); - - // Cancel the last choosing operation, and return the new number of choices. - size_t cancel_last_choice(); - - // Get the length of fixed Hanzis. - size_t get_fixedlen(); - - size_t get_predicts(const char16 fixed_buf[], - char16 predict_buf[][kMaxPredictSize + 1], - size_t buf_len); -}; -} - -#endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h b/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h deleted file mode 100644 index dfcf980b..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef PINYINIME_INCLUDE_MYSTDLIB_H__ -#define PINYINIME_INCLUDE_MYSTDLIB_H__ - -#include <stdlib.h> - -namespace ime_pinyin { - -void myqsort(void *p, size_t n, size_t es, - int (*cmp)(const void *, const void *)); - -void *mybsearch(const void *key, const void *base, - size_t nmemb, size_t size, - int (*compar)(const void *, const void *)); -} - -#endif // PINYINIME_INCLUDE_MYSTDLIB_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h b/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h deleted file mode 100644 index 7adb46d8..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_NGRAM_H__ -#define PINYINIME_INCLUDE_NGRAM_H__ - -#include <stdio.h> -#include <stdlib.h> -#include "./dictdef.h" - -namespace ime_pinyin { - -typedef unsigned char CODEBOOK_TYPE; - -static const size_t kCodeBookSize = 256; - -class NGram { - public: - // The maximum score of a lemma item. - static const LmaScoreType kMaxScore = 0x3fff; - - // In order to reduce the storage size, the original log value is amplified by - // kScoreAmplifier, and we use LmaScoreType to store. - // After this process, an item with a lower score has a higher frequency. - static const int kLogValueAmplifier = -800; - - // System words' total frequency. 
It is not the real total frequency, instead, - // It is only used to adjust system lemmas' scores when the user dictionary's - // total frequency changes. - // In this version, frequencies of system lemmas are fixed. We are considering - // to make them changable in next version. - static const size_t kSysDictTotalFreq = 100000000; - - private: - - static NGram* instance_; - - bool initialized_; - uint32 idx_num_; - - size_t total_freq_none_sys_; - - // Score compensation for system dictionary lemmas. - // Because after user adds some user lemmas, the total frequency changes, and - // we use this value to normalize the score. - float sys_score_compensation_; - -#ifdef ___BUILD_MODEL___ - double *freq_codes_df_; -#endif - LmaScoreType *freq_codes_; - CODEBOOK_TYPE *lma_freq_idx_; - - public: - NGram(); - ~NGram(); - - static NGram& get_instance(); - - bool save_ngram(FILE *fp); - bool load_ngram(FILE *fp); - - // Set the total frequency of all none system dictionaries. - void set_total_freq_none_sys(size_t freq_none_sys); - - float get_uni_psb(LemmaIdType lma_id); - - // Convert a probability to score. Actually, the score will be limited to - // kMaxScore, but at runtime, we also need float expression to get accurate - // value of the score. - // After the conversion, a lower score indicates a higher probability of the - // item. - static float convert_psb_to_score(double psb); - -#ifdef ___BUILD_MODEL___ - // For constructing the unigram mode model. 
- bool build_unigram(LemmaEntry *lemma_arr, size_t num, - LemmaIdType next_idx_unused); -#endif -}; -} - -#endif // PINYINIME_INCLUDE_NGRAM_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h b/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h deleted file mode 100644 index e376c20c..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_ANDPYIME_H__ -#define PINYINIME_INCLUDE_ANDPYIME_H__ - -#include <stdlib.h> -#include "./dictdef.h" - -#ifdef __cplusplus -extern "C" { -#endif - - namespace ime_pinyin { - - /** - * Open the decoder engine via the system and user dictionary file names. - * - * @param fn_sys_dict The file name of the system dictionary. - * @param fn_usr_dict The file name of the user dictionary. - * @return true if open the decoder engine successfully. - */ - bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict); - - /** - * Open the decoder engine via the system dictionary FD and user dictionary - * file name. Because on Android, the system dictionary is embedded in the - * whole application apk file. - * - * @param sys_fd The file in which the system dictionary is embedded. - * @param start_offset The starting position of the system dictionary in the - * file sys_fd. 
- * @param length The length of the system dictionary in the file sys_fd, - * counted in byte. - * @return true if succeed. - */ - bool im_open_decoder_fd(int sys_fd, long start_offset, long length, - const char *fn_usr_dict); - - /** - * Close the decoder engine. - */ - void im_close_decoder(); - - /** - * Set maximum limitations for decoding. If this function is not called, - * default values will be used. For example, due to screen size limitation, - * the UI engine of the IME can only show a certain number of letters(input) - * to decode, and a certain number of Chinese characters(output). If after - * user adds a new letter, the input or the output string is longer than the - * limitations, the engine will discard the recent letter. - * - * @param max_sps_len Maximum length of the spelling string(Pinyin string). - * @max_hzs_len Maximum length of the decoded Chinese character string. - */ - void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len); - - /** - * Flush cached data to persistent memory. Because at runtime, in order to - * achieve best performance, some data is only store in memory. - */ - void im_flush_cache(); - - /** - * Use a spelling string(Pinyin string) to search. The engine will try to do - * an incremental search based on its previous search result, so if the new - * string has the same prefix with the previous one stored in the decoder, - * the decoder will only continue the search from the end of the prefix. - * If the caller needs to do a brand new search, please call im_reset_search() - * first. Calling im_search() is equivalent to calling im_add_letter() one by - * one. - * - * @param sps_buf The spelling string buffer to decode. - * @param sps_len The length of the spelling string buffer. - * @return The number of candidates. - */ - size_t im_search(const char* sps_buf, size_t sps_len); - - /** - * Make a delete operation in the current search result, and make research if - * necessary. 
- * - * @param pos The posistion of char in spelling string to delete, or the - * position of spelling id in result string to delete. - * @param is_pos_in_splid Indicate whether the pos parameter is the position - * in the spelling string, or the position in the result spelling id string. - * @return The number of candidates. - */ - size_t im_delsearch(size_t pos, bool is_pos_in_splid, - bool clear_fixed_this_step); - - /** - * Reset the previous search result. - */ - void im_reset_search(); - - /** - * Add a Pinyin letter to the current spelling string kept by decoder. If the - * decoder fails in adding the letter, it will do nothing. im_get_sps_str() - * can be used to get the spelling string kept by decoder currently. - * - * @param ch The letter to add. - * @return The number of candidates. - */ - size_t im_add_letter(char ch); - - /** - * Get the spelling string kept by the decoder. - * - * @param decoded_len Used to return how many characters in the spelling - * string is successfully parsed. - * @return The spelling string kept by the decoder. - */ - const char *im_get_sps_str(size_t *decoded_len); - - /** - * Get a candidate(or choice) string. - * - * @param cand_id The id to get a candidate. Started from 0. Usually, id 0 - * is a sentence-level candidate. - * @param cand_str The buffer to store the candidate. - * @param max_len The maximum length of the buffer. - * @return cand_str if succeeds, otherwise NULL. - */ - char16* im_get_candidate(size_t cand_id, char16* cand_str, - size_t max_len); - - /** - * Get the segmentation information(the starting positions) of the spelling - * string. - * - * @param spl_start Used to return the starting posistions. - * @return The number of spelling ids. If it is L, there will be L+1 valid - * elements in spl_start, and spl_start[L] is the posistion after the end of - * the last spelling id. - */ - size_t im_get_spl_start_pos(const uint16 *&spl_start); - - /** - * Choose a candidate and make it fixed. 
If the candidate does not match - * the end of all spelling ids, new candidates will be provided from the - * first unfixed position. If the candidate matches the end of the all - * spelling ids, there will be only one new candidates, or the whole fixed - * sentence. - * - * @param cand_id The id of candidate to select and make it fixed. - * @return The number of candidates. If after the selection, the whole result - * string has been fixed, there will be only one candidate. - */ - size_t im_choose(size_t cand_id); - - /** - * Cancel the last selection, or revert the last operation of im_choose(). - * - * @return The number of candidates. - */ - size_t im_cancel_last_choice(); - - /** - * Get the number of fixed spelling ids, or Chinese characters. - * - * @return The number of fixed spelling ids, of Chinese characters. - */ - size_t im_get_fixed_len(); - - /** - * Cancel the input state and reset the search workspace. - */ - bool im_cancel_input(); - - /** - * Get prediction candiates based on the given fixed Chinese string as the - * history. - * - * @param his_buf The history buffer to do the prediction. It should be ended - * with '\0'. - * @param pre_buf Used to return prediction result list. - * @return The number of predicted result string. - */ - size_t im_get_predicts(const char16 *his_buf, - char16 (*&pre_buf)[kMaxPredictSize + 1]); - - /** - * Enable Shengmus in ShouZiMu mode. - */ - void im_enable_shm_as_szm(bool enable); - - /** - * Enable Yunmus in ShouZiMu mode. - */ - void im_enable_ym_as_szm(bool enable); - - /** - * Initializes or uninitializes the user dictionary. - * - * @param fn_usr_dict The file name of the user dictionary. - */ - void im_init_user_dictionary(const char *fn_usr_dict); - - /** - * Returns the current status of user dictinary. 
- */ - bool im_is_user_dictionary_enabled(void); -} - -#ifdef __cplusplus -} -#endif - -#endif // PINYINIME_INCLUDE_ANDPYIME_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h b/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h deleted file mode 100644 index f1357107..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ -#define PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ - -#include <stdlib.h> -#include "./spellingtrie.h" - -namespace ime_pinyin { - -// Type used to identify the size of a pool, such as id pool, etc. -typedef uint16 PoolPosType; - -// Type used to identify a parsing mile stone in an atom dictionary. -typedef uint16 MileStoneHandle; - -// Type used to express a lemma and its probability score. -typedef struct { - size_t id:(kLemmaIdSize * 8); - size_t lma_len:4; - uint16 psb; // The score, the lower psb, the higher possibility. - // For single character items, we may also need Hanzi. - // For multiple characer items, ignore it. - char16 hanzi; -} LmaPsbItem, *PLmaPsbItem; - -// LmaPsbItem extended with string. 
-typedef struct { - LmaPsbItem lpi; - char16 str[kMaxLemmaSize + 1]; -} LmaPsbStrItem, *PLmaPsbStrItem; - - -typedef struct { - float psb; - char16 pre_hzs[kMaxPredictSize]; - uint16 his_len; // The length of the history used to do the prediction. -} NPredictItem, *PNPredictItem; - -// Parameter structure used to extend in a dictionary. All dictionaries -// receives the same DictExtPara and a dictionary specific MileStoneHandle for -// extending. -// -// When the user inputs a new character, AtomDictBase::extend_dict() will be -// called at least once for each dictionary. -// -// For example, when the user inputs "wm", extend_dict() will be called twice, -// and the DictExtPara parameter are as follows respectively: -// 1. splids = {w, m}; splids_extended = 1; ext_len = 1; step_no = 1; -// splid_end_split = false; id_start = wa(the first id start with 'w'); -// id_num = number of ids starting with 'w'. -// 2. splids = {m}; splids_extended = 0; ext_len = 1; step_no = 1; -// splid_end_split = false; id_start = wa; id_num = number of ids starting with -// 'w'. -// -// For string "women", one of the cases of the DictExtPara parameter is: -// splids = {wo, men}, splids_extended = 1, ext_len = 3 (length of "men"), -// step_no = 4; splid_end_split = false; id_start = men, id_num = 1. -// -typedef struct { - // Spelling ids for extending, there are splids_extended + 1 ids in the - // buffer. - // For a normal lemma, there can only be kMaxLemmaSize spelling ids in max, - // but for a composing phrase, there can kMaxSearchSteps spelling ids. - uint16 splids[kMaxSearchSteps]; - - // Number of ids that have been used before. splids[splids_extended] is the - // newly added id for the current extension. - uint16 splids_extended; - - // The step span of the extension. It is also the size of the string for - // the newly added spelling id. - uint16 ext_len; - - // The step number for the current extension. 
It is also the ending position - // in the input Pinyin string for the substring of spelling ids in splids[]. - // For example, when the user inputs "women", step_no = 4. - // This parameter may useful to manage the MileStoneHandle list for each - // step. When the user deletes a character from the string, MileStoneHandle - // objects for the the steps after that character should be reset; when the - // user begins a new string, all MileStoneHandle objects should be reset. - uint16 step_no; - - // Indicate whether the newly added spelling ends with a splitting character - bool splid_end_split; - - // If the newly added id is a half id, id_start is the first id of the - // corresponding full ids; if the newly added id is a full id, id_start is - // that id. - uint16 id_start; - - // If the newly added id is a half id, id_num is the number of corresponding - // ids; if it is a full id, id_num == 1. - uint16 id_num; -}DictExtPara, *PDictExtPara; - -bool is_system_lemma(LemmaIdType lma_id); -bool is_user_lemma(LemmaIdType lma_id); -bool is_composing_lemma(LemmaIdType lma_id); - -int cmp_lpi_with_psb(const void *p1, const void *p2); -int cmp_lpi_with_unified_psb(const void *p1, const void *p2); -int cmp_lpi_with_id(const void *p1, const void *p2); -int cmp_lpi_with_hanzi(const void *p1, const void *p2); - -int cmp_lpsi_with_str(const void *p1, const void *p2); - -int cmp_hanzis_1(const void *p1, const void *p2); -int cmp_hanzis_2(const void *p1, const void *p2); -int cmp_hanzis_3(const void *p1, const void *p2); -int cmp_hanzis_4(const void *p1, const void *p2); -int cmp_hanzis_5(const void *p1, const void *p2); -int cmp_hanzis_6(const void *p1, const void *p2); -int cmp_hanzis_7(const void *p1, const void *p2); -int cmp_hanzis_8(const void *p1, const void *p2); - -int cmp_npre_by_score(const void *p1, const void *p2); -int cmp_npre_by_hislen_score(const void *p1, const void *p2); -int cmp_npre_by_hanzi_score(const void *p1, const void *p2); - - -size_t 
remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num); - -size_t align_to_size_t(size_t size); - -} // namespace - -#endif // PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h b/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h deleted file mode 100644 index fd79c6ef..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_SPELLINGTABLE_H__ -#define PINYINIME_INCLUDE_SPELLINGTABLE_H__ - -#include <stdlib.h> -#include "./dictdef.h" - -namespace ime_pinyin { - -#ifdef ___BUILD_MODEL___ - -const size_t kMaxSpellingSize = kMaxPinyinSize; - -typedef struct { - char str[kMaxSpellingSize + 1]; - double freq; -} RawSpelling, *PRawSpelling; - -// This class is used to store the spelling strings -// The length of the input spelling string should be less or equal to the -// spelling_size_ (set by init_table). If the input string is too long, -// we only keep its first spelling_size_ chars. -class SpellingTable { - private: - static const size_t kNotSupportNum = 3; - static const char kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1]; - - bool need_score_; - - size_t spelling_max_num_; - - RawSpelling *raw_spellings_; - - // Used to store spelling strings. 
If the spelling table needs to calculate - // score, an extra char after each spelling string is the score. - // An item with a lower score has a higher probability. - char *spelling_buf_; - size_t spelling_size_; - - double total_freq_; - - size_t spelling_num_; - - double score_amplifier_; - - unsigned char average_score_; - - // If frozen is true, put_spelling() and contain() are not allowed to call. - bool frozen_; - - size_t get_hash_pos(const char* spelling_str); - size_t hash_pos_next(size_t hash_pos); - void free_resource(); - public: - SpellingTable(); - ~SpellingTable(); - - // pure_spl_size is the pure maximum spelling string size. For example, - // "zhuang" is the longgest item in Pinyin, so pure_spl_size should be 6. - // spl_max_num is the maximum number of spelling strings to store. - // need_score is used to indicate whether the caller needs to calculate a - // score for each spelling. - bool init_table(size_t pure_spl_size, size_t spl_max_num, bool need_score); - - // Put a spelling string to the table. - // It always returns false if called after arrange() withtout a new - // init_table() operation. - // freq is the spelling's occuring count. - // If the spelling has been in the table, occuring count will accumulated. - bool put_spelling(const char* spelling_str, double spl_count); - - // Test whether a spelling string is in the table. - // It always returns false, when being called after arrange() withtout a new - // init_table() operation. - bool contain(const char* spelling_str); - - // Sort the spelling strings and put them from the begin of the buffer. - // Return the pointer of the sorted spelling strings. - // item_size and spl_num return the item size and number of spelling. - // Because each spelling uses a '\0' as terminator, the returned item_size is - // at least one char longer than the spl_size parameter specified by - // init_table(). 
If the table is initialized to calculate score, item_size - // will be increased by 1, and current_spl_str[item_size - 1] stores an - // unsinged char score. - // An item with a lower score has a higher probability. - // Do not call put_spelling() and contains() after arrange(). - const char* arrange(size_t *item_size, size_t *spl_num); - - float get_score_amplifier(); - - unsigned char get_average_score(); -}; -#endif // ___BUILD_MODEL___ -} - -#endif // PINYINIME_INCLUDE_SPELLINGTABLE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h b/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h deleted file mode 100644 index 03510ed3..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__ -#define PINYINIME_INCLUDE_SPELLINGTRIE_H__ - -#include <stdio.h> -#include <stdlib.h> -#include "./dictdef.h" - -namespace ime_pinyin { - -static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1; - -// Node used for the trie of spellings -struct SpellingNode { - SpellingNode *first_son; - // The spelling id for each node. If you need more bits to store - // spelling id, please adjust this structure. 
- uint16 spelling_idx:11; - uint16 num_of_son:5; - char char_this_node; - unsigned char score; -}; - -class SpellingTrie { - private: - static const int kMaxYmNum = 64; - static const size_t kValidSplCharNum = 26; - - static const uint16 kHalfIdShengmuMask = 0x01; - static const uint16 kHalfIdYunmuMask = 0x02; - static const uint16 kHalfIdSzmMask = 0x04; - - // Map from half spelling id to single char. - // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively. - // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ..., - // 28 to 'Z', 29 to 'z'. - // [0] is not used to achieve better efficiency. - static const char kHalfId2Sc_[kFullSplIdStart + 1]; - - static unsigned char char_flags_[]; - static SpellingTrie* instance_; - - // The spelling table - char *spelling_buf_; - - // The size of longest spelling string, includes '\0' and an extra char to - // store score. For example, "zhuang" is the longgest item in Pinyin list, - // so spelling_size_ is 8. - // Structure: The string ended with '\0' + score char. - // An item with a lower score has a higher probability. - uint32 spelling_size_; - - // Number of full spelling ids. - uint32 spelling_num_; - - float score_amplifier_; - unsigned char average_score_; - - // The Yunmu id list for the spelling ids (for half ids of Shengmu, - // the Yunmu id is 0). - // The length of the list is spelling_num_ + kFullSplIdStart, - // so that spl_ym_ids_[splid] is the Yunmu id of the splid. - uint8 *spl_ym_ids_; - - // The Yunmu table. - // Each Yunmu will be assigned with Yunmu id from 1. - char *ym_buf_; - size_t ym_size_; // The size of longest Yunmu string, '\0'included. 
- size_t ym_num_; - - // The spelling string just queried - char *splstr_queried_; - - // The spelling string just queried - char16 *splstr16_queried_; - - // The root node of the spelling tree - SpellingNode* root_; - - // If a none qwerty key such as a fnction key like ENTER is given, this node - // will be used to indicate that this is not a QWERTY node. - SpellingNode* dumb_node_; - - // If a splitter key is pressed, this node will be used to indicate that this - // is a splitter key. - SpellingNode* splitter_node_; - - // Used to get the first level sons. - SpellingNode* level1_sons_[kValidSplCharNum]; - - // The full spl_id range for specific half id. - // h2f means half to full. - // A half id can be a ShouZiMu id (id to represent the first char of a full - // spelling, including Shengmu and Yunmu), or id of zh/ch/sh. - // [1..kFullSplIdStart-1] is the arrange of half id. - uint16 h2f_start_[kFullSplIdStart]; - uint16 h2f_num_[kFullSplIdStart]; - - // Map from full id to half id. - uint16 *f2h_; - -#ifdef ___BUILD_MODEL___ - // How many node used to build the trie. - size_t node_num_; -#endif - - SpellingTrie(); - - void free_son_trie(SpellingNode* node); - - // Construct a subtree using a subset of the spelling array (from - // item_star to item_end). - // Member spelliing_buf_ and spelling_size_ should be valid. - // parent is used to update its num_of_son and score. - SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end, - size_t level, SpellingNode *parent); - bool build_f2h(); - - // The caller should guarantee ch >= 'A' && ch <= 'Z' - bool is_shengmu_char(char ch) const; - - // The caller should guarantee ch >= 'A' && ch <= 'Z' - bool is_yunmu_char(char ch) const; - -#ifdef ___BUILD_MODEL___ - // Given a spelling string, return its Yunmu string. - // The caller guaratees spl_str is valid. 
- const char* get_ym_str(const char *spl_str); - - // Build the Yunmu list, and the mapping relation between the full ids and the - // Yunmu ids. This functin is called after the spelling trie is built. - bool build_ym_info(); -#endif - - friend class SpellingParser; - friend class SmartSplParser; - friend class SmartSplParser2; - - public: - ~SpellingTrie(); - - inline static bool is_valid_spl_char(char ch) { - return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); - } - - // The caller guarantees that the two chars are valid spelling chars. - inline static bool is_same_spl_char(char ch1, char ch2) { - return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A'; - } - - // Construct the tree from the input pinyin array - // The given string list should have been sorted. - // score_amplifier is used to convert a possibility value into score. - // average_score is the average_score of all spellings. The dumb node is - // assigned with this score. - bool construct(const char* spelling_arr, size_t item_size, size_t item_num, - float score_amplifier, unsigned char average_score); - - // Test if the given id is a valid spelling id. - // If function returns true, the given splid may be updated like this: - // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is - // first given as a half id 1, but because 'A' is a one-char Yunmu and - // it is a valid id, it needs to updated to its corresponding full id. - bool if_valid_id_update(uint16 *splid) const; - - // Test if the given id is a half id. - bool is_half_id(uint16 splid) const; - - bool is_full_id(uint16 splid) const; - - // Test if the given id is a one-char Yunmu id (obviously, it is also a half - // id), such as 'A', 'E' and 'O'. - bool is_half_id_yunmu(uint16 splid) const; - - // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled. - // For Pinyin, only i/u/v is not a ShouZiMu char. 
- // The caller should guarantee that ch >= 'A' && ch <= 'Z' - bool is_szm_char(char ch) const; - - // Test If this char is enabled in ShouZiMu mode. - // The caller should guarantee that ch >= 'A' && ch <= 'Z' - bool szm_is_enabled(char ch) const; - - // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling - // to input). - void szm_enable_shm(bool enable); - - // Enable/disable Yunmus in ShouZiMu mode. - void szm_enable_ym(bool enable); - - // Test if this char is enabled in ShouZiMu mode. - // The caller should guarantee ch >= 'A' && ch <= 'Z' - bool is_szm_enabled(char ch) const; - - // Return the number of full ids for the given half id. - uint16 half2full_num(uint16 half_id) const; - - // Return the number of full ids for the given half id, and fill spl_id_start - // to return the first full id. - uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const; - - // Return the corresponding half id for the given full id. - // Not frequently used, low efficient. - // Return 0 if fails. - uint16 full_to_half(uint16 full_id) const; - - // To test whether a half id is compatible with a full id. - // Generally, when half_id == full_to_half(full_id), return true. - // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible - // with a full id like "Zhe". (Fussy mode is not ready). - bool half_full_compatible(uint16 half_id, uint16 full_id) const; - - static const SpellingTrie* get_cpinstance(); - - static SpellingTrie& get_instance(); - - // Save to the file stream - bool save_spl_trie(FILE *fp); - - // Load from the file stream - bool load_spl_trie(FILE *fp); - - // Get the number of spellings - size_t get_spelling_num(); - - // Return the Yunmu id for the given Yunmu string. 
- // If the string is not valid, return 0; - uint8 get_ym_id(const char* ym_str); - - // Get the readonly Pinyin string for a given spelling id - const char* get_spelling_str(uint16 splid); - - // Get the readonly Pinyin string for a given spelling id - const char16* get_spelling_str16(uint16 splid); - - // Get Pinyin string for a given spelling id. Return the length of the - // string, and fill-in '\0' at the end. - size_t get_spelling_str16(uint16 splid, char16 *splstr16, - size_t splstr16_len); -}; -} - -#endif // PINYINIME_INCLUDE_SPELLINGTRIE_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h b/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h deleted file mode 100644 index d783bd73..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_SPLPARSER_H__ -#define PINYINIME_INCLUDE_SPLPARSER_H__ - -#include "./dictdef.h" -#include "./spellingtrie.h" - -namespace ime_pinyin { - -class SpellingParser { - protected: - const SpellingTrie *spl_trie_; - - public: - SpellingParser(); - - // Given a string, parse it into a spelling id stream. 
- // If the whole string are sucessfully parsed, last_is_pre will be true; - // if the whole string is not fullly parsed, last_is_pre will return whether - // the last part of the string is a prefix of a full spelling string. For - // example, given string "zhengzhon", "zhon" is not a valid speling, but it is - // the prefix of "zhong". - // - // If splstr starts with a character not in ['a'-z'] (it is a split char), - // return 0. - // Split char can only appear in the middle of the string or at the end. - uint16 splstr_to_idxs(const char *splstr, uint16 str_len, uint16 splidx[], - uint16 start_pos[], uint16 max_size, bool &last_is_pre); - - // Similar to splstr_to_idxs(), the only difference is that splstr_to_idxs() - // convert single-character Yunmus into half ids, while this function converts - // them into full ids. - uint16 splstr_to_idxs_f(const char *splstr, uint16 str_len, uint16 splidx[], - uint16 start_pos[], uint16 max_size, bool &last_is_pre); - - // Similar to splstr_to_idxs(), the only difference is that this function - // uses char16 instead of char8. - uint16 splstr16_to_idxs(const char16 *splstr, uint16 str_len, uint16 splidx[], - uint16 start_pos[], uint16 max_size, bool &last_is_pre); - - // Similar to splstr_to_idxs_f(), the only difference is that this function - // uses char16 instead of char8. - uint16 splstr16_to_idxs_f(const char16 *splstr16, uint16 str_len, - uint16 splidx[], uint16 start_pos[], - uint16 max_size, bool &last_is_pre); - - // If the given string is a spelling, return the id, others, return 0. - // If the give string is a single char Yunmus like "A", and the char is - // enabled in ShouZiMu mode, the returned spelling id will be a half id. - // When the returned spelling id is a half id, *is_pre returns whether it - // is a prefix of a full spelling string. - uint16 get_splid_by_str(const char *splstr, uint16 str_len, bool *is_pre); - - // If the given string is a spelling, return the id, others, return 0. 
- // If the give string is a single char Yunmus like "a", no matter the char - // is enabled in ShouZiMu mode or not, the returned spelling id will be - // a full id. - // When the returned spelling id is a half id, *p_is_pre returns whether it - // is a prefix of a full spelling string. - uint16 get_splid_by_str_f(const char *splstr, uint16 str_len, bool *is_pre); - - // Splitter chars are not included. - bool is_valid_to_parse(char ch); - - // When auto-correction is not enabled, get_splid_by_str() will be called to - // return the single result. When auto-correction is enabled, this function - // will be called to get the results. Auto-correction is not ready. - // full_id_num returns number of full spelling ids. - // is_pre returns whether the given string is the prefix of a full spelling - // string. - // If splstr starts with a character not in [a-zA-Z] (it is a split char), - // return 0. - // Split char can only appear in the middle of the string or at the end. - // The caller should guarantee NULL != splstr && str_len > 0 && NULL != splidx - uint16 get_splids_parallel(const char *splstr, uint16 str_len, - uint16 splidx[], uint16 max_size, - uint16 &full_id_num, bool &is_pre); -}; -} - -#endif // PINYINIME_INCLUDE_SPLPARSER_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/sync.h b/src/virtualkeyboard/3rdparty/pinyin/include/sync.h deleted file mode 100644 index bf42d1f1..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/sync.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_SYNC_H__ -#define PINYINIME_INCLUDE_SYNC_H__ - -#define ___SYNC_ENABLED___ - -#ifdef ___SYNC_ENABLED___ - -#include "userdict.h" - -namespace ime_pinyin { - -// Class for user dictionary synchronization -// This class is not thread safe -// Normal invoking flow will be -// begin() -> -// put_lemmas() x N -> -// { -// get_lemmas() -> -// [ get_last_got_count() ] -> -// clear_last_got() -> -// } x N -> -// finish() -class Sync { - public: - Sync(); - ~Sync(); - - static const int kUserDictMaxLemmaCount = 5000; - static const int kUserDictMaxLemmaSize = 200000; - static const int kUserDictRatio = 20; - - bool begin(const char * filename); - - // Merge lemmas downloaded from sync server into local dictionary - // lemmas, lemmas string encoded in UTF16LE - // len, length of lemmas string - // Return how many lemmas merged successfully - int put_lemmas(char16 * lemmas, int len); - - // Get local new user lemmas into UTF16LE string - // str, buffer ptr to store new user lemmas - // size, size of buffer - // Return length of returned buffer in measure of UTF16LE - int get_lemmas(char16 * str, int size); - - // Return lemmas count in last get_lemmas() - int get_last_got_count(); - - // Return total lemmas count need get_lemmas() - int get_total_count(); - - // Clear lemmas got by recent get_lemmas() - void clear_last_got(); - - void finish(); - - int get_capacity(); - - private: - UserDict * userdict_; - char * dictfile_; - int last_count_; -}; - -} - -#endif - -#endif // PINYINIME_INCLUDE_SYNC_H__ diff --git 
a/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h b/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h deleted file mode 100644 index db010912..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_USERDICT_H__ -#define PINYINIME_INCLUDE_USERDICT_H__ - -#define ___CACHE_ENABLED___ -#define ___SYNC_ENABLED___ -#define ___PREDICT_ENABLED___ - -// Debug performance for operations -// #define ___DEBUG_PERF___ - -#ifdef _WIN32 -#include <time.h> -#include <winsock.h> // timeval -#else -#include <pthread.h> -#include <sys/time.h> -#endif -#include "atomdictbase.h" - -namespace ime_pinyin { - -class UserDict : public AtomDictBase { - public: - UserDict(); - ~UserDict(); - - bool load_dict(const char *file_name, LemmaIdType start_id, - LemmaIdType end_id); - - bool close_dict(); - - size_t number_of_lemmas(); - - void reset_milestones(uint16 from_step, MileStoneHandle from_handle); - - MileStoneHandle extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num); - - size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max); - - uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf, - uint16 str_max); - - uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 
*splids, - uint16 splids_max, bool arg_valid); - - size_t predict(const char16 last_hzs[], uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used); - - // Full spelling ids are required - LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count); - - LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, - bool selected); - - LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], - uint16 lemma_len); - - LmaScoreType get_lemma_score(LemmaIdType lemma_id); - - LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], - uint16 lemma_len); - - bool remove_lemma(LemmaIdType lemma_id); - - size_t get_total_lemma_count(); - void set_total_lemma_count_of_others(size_t count); - - void flush_cache(); - - void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, - uint32 reclaim_ratio); - - void reclaim(); - - void defragment(); - -#ifdef ___SYNC_ENABLED___ - void clear_sync_lemmas(unsigned int start, unsigned int end); - - int get_sync_count(); - - LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt); - /** - * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. - * - * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' - * @param len length of lemmas string in UTF-16LE - * @return newly added lemma count - */ - int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len); - - /** - * Get lemmas need sync to a UTF-16LE string of above format. - * Note: input buffer (str) must not be too small. If str is too small to - * contain single one lemma, there might be a dead loop. 
- * - * @param str buffer to write lemmas - * @param size buffer size in UTF-16LE - * @param count output value of lemma returned - * @return UTF-16LE string length - */ - int get_sync_lemmas_in_utf16le_string_from_beginning( - char16 * str, int size, int * count); - -#endif - - struct UserDictStat { - uint32 version; - const char * file_name; - struct timeval load_time; - struct timeval last_update; - uint32 disk_size; - uint32 lemma_count; - uint32 lemma_size; - uint32 delete_count; - uint32 delete_size; -#ifdef ___SYNC_ENABLED___ - uint32 sync_count; -#endif - uint32 reclaim_ratio; - uint32 limit_lemma_count; - uint32 limit_lemma_size; - }; - - bool state(UserDictStat * stat); - - private: - uint32 total_other_nfreq_; - struct timeval load_time_; - LemmaIdType start_id_; - uint32 version_; - uint8 * lemmas_; - - // In-Memory-Only flag for each lemma - static const uint8 kUserDictLemmaFlagRemove = 1; - // Inuse lemmas' offset - uint32 * offsets_; - // Highest bit in offset tells whether corresponding lemma is removed - static const uint32 kUserDictOffsetFlagRemove = (1 << 31); - // Maximum possible for the offset - static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); - // Bit width for last modified time, from 1 to 16 - static const uint32 kUserDictLMTBitWidth = 16; - // Granularity for last modified time in second - static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; - // Maximum frequency count - static const uint16 kUserDictMaxFrequency = 0xFFFF; - -#define COARSE_UTC(year, month, day, hour, minute, second) \ - ( \ - (year - 1970) * 365 * 24 * 60 * 60 + \ - (month - 1) * 30 * 24 * 60 * 60 + \ - (day - 1) * 24 * 60 * 60 + \ - (hour - 0) * 60 * 60 + \ - (minute - 0) * 60 + \ - (second - 0) \ - ) - static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); - - // Correspond to offsets_ - uint32 * scores_; - // Following two fields are only valid in memory - uint32 * ids_; -#ifdef ___PREDICT_ENABLED___ - uint32 * 
predicts_; -#endif -#ifdef ___SYNC_ENABLED___ - uint32 * syncs_; - size_t sync_count_size_; -#endif - uint32 * offsets_by_id_; - - size_t lemma_count_left_; - size_t lemma_size_left_; - - const char * dict_file_; - - // Be sure size is 4xN - struct UserDictInfo { - // When limitation reached, how much percentage will be reclaimed (1 ~ 100) - uint32 reclaim_ratio; - // maximum lemma count, 0 means no limitation - uint32 limit_lemma_count; - // Maximum lemma size, it's different from - // whole disk file size or in-mem dict size - // 0 means no limitation - uint32 limit_lemma_size; - // Total lemma count including deleted and inuse - // Also indicate offsets_ size - uint32 lemma_count; - // Total size of lemmas including used and freed - uint32 lemma_size; - // Freed lemma count - uint32 free_count; - // Freed lemma size in byte - uint32 free_size; -#ifdef ___SYNC_ENABLED___ - uint32 sync_count; -#endif - int32 total_nfreq; - } dict_info_; - - static const uint32 kUserDictVersion = 0x0ABCDEF0; - - static const uint32 kUserDictPreAlloc = 32; - static const uint32 kUserDictAverageNchar = 8; - - enum UserDictState { - // Keep in order - USER_DICT_NONE = 0, - USER_DICT_SYNC, -#ifdef ___SYNC_ENABLED___ - USER_DICT_SYNC_DIRTY, -#endif - USER_DICT_SCORE_DIRTY, - USER_DICT_OFFSET_DIRTY, - USER_DICT_LEMMA_DIRTY, - - USER_DICT_DEFRAGMENTED, - } state_; - - struct UserDictSearchable { - uint16 splids_len; - uint16 splid_start[kMaxLemmaSize]; - uint16 splid_count[kMaxLemmaSize]; - // Compact inital letters for both FuzzyCompareSpellId and cache system - uint32 signature[kMaxLemmaSize / 4]; - }; - -#ifdef ___CACHE_ENABLED___ - enum UserDictCacheType { - USER_DICT_CACHE, - USER_DICT_MISS_CACHE, - }; - - static const int kUserDictCacheSize = 4; - static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; - - struct UserDictMissCache { - uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; - uint16 head, tail; - } miss_caches_[kMaxLemmaSize]; - - struct UserDictCache { - 
uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; - uint32 offsets[kUserDictCacheSize]; - uint32 lengths[kUserDictCacheSize]; - // Ring buffer - uint16 head, tail; - } caches_[kMaxLemmaSize]; - - void cache_init(); - - void cache_push(UserDictCacheType type, - UserDictSearchable *searchable, - uint32 offset, uint32 length); - - bool cache_hit(UserDictSearchable *searchable, - uint32 *offset, uint32 *length); - - bool load_cache(UserDictSearchable *searchable, - uint32 *offset, uint32 *length); - - void save_cache(UserDictSearchable *searchable, - uint32 offset, uint32 length); - - void reset_cache(); - - bool load_miss_cache(UserDictSearchable *searchable); - - void save_miss_cache(UserDictSearchable *searchable); - - void reset_miss_cache(); -#endif - - LmaScoreType translate_score(int f); - - int extract_score_freq(int raw_score); - - uint64 extract_score_lmt(int raw_score); - - inline int build_score(uint64 lmt, int freq); - - inline int64 utf16le_atoll(uint16 *s, int len); - - inline int utf16le_lltoa(int64 v, uint16 *s, int size); - - LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt); - - size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend); - - int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); - - int _get_lemma_score(LemmaIdType lemma_id); - - int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1, - const UserDictSearchable *searchable); - - bool is_prefix_spell_id(const uint16 * fullids, - uint16 fulllen, const UserDictSearchable *searchable); - - uint32 get_dict_file_size(UserDictInfo * info); - - bool reset(const char *file); - - bool validate(const char *file); - - bool load(const char *file, LemmaIdType start_id); - - bool is_valid_state(); - - bool is_valid_lemma_id(LemmaIdType id); - - LemmaIdType get_max_lemma_id(); - - void set_lemma_flag(uint32 offset, uint8 flag); - - char 
get_lemma_flag(uint32 offset); - - char get_lemma_nchar(uint32 offset); - - uint16 * get_lemma_spell_ids(uint32 offset); - - uint16 * get_lemma_word(uint32 offset); - - // Prepare searchable to fasten locate process - void prepare_locate(UserDictSearchable *searchable, - const uint16 * splids, uint16 len); - - // Compare initial letters only - int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1, - const UserDictSearchable *searchable); - - // Compare exactly two spell ids - // First argument must be a full id spell id - bool equal_spell_id(const uint16 * fullids, - uint16 fulllen, const UserDictSearchable *searchable); - - // Find first item by initial letters - int32 locate_first_in_offsets(const UserDictSearchable *searchable); - - LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt); - - // Check if a lemma is in dictionary - int32 locate_in_offsets(char16 lemma_str[], - uint16 splid_str[], uint16 lemma_len); - - bool remove_lemma_by_offset_index(int offset_index); -#ifdef ___PREDICT_ENABLED___ - uint32 locate_where_to_insert_in_predicts(const uint16 * words, - int lemma_len); - - int32 locate_first_in_predicts(const uint16 * words, int lemma_len); - - void remove_lemma_from_predict_list(uint32 offset); -#endif -#ifdef ___SYNC_ENABLED___ - void queue_lemma_for_sync(LemmaIdType id); - - void remove_lemma_from_sync_list(uint32 offset); - - void write_back_sync(int fd); -#endif - void write_back_score(int fd); - void write_back_offset(int fd); - void write_back_lemma(int fd); - void write_back_all(int fd); - void write_back(); - - struct UserDictScoreOffsetPair { - int score; - uint32 offset_index; - }; - - inline void swap(UserDictScoreOffsetPair * sop, int i, int j); - - void shift_down(UserDictScoreOffsetPair * sop, int i, int n); - - // On-disk format for each lemma - // +-------------+ - // | Version (4) | - // +-------------+ - // 
+-----------+-----------+--------------------+-------------------+ - // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | - // +-----------+-----------+--------------------+-------------------+ - // ... - // +-----------------------+ +-------------+ <---Offset of offset - // | Offset1 by_splids (4) | ... | OffsetN (4) | - // +-----------------------+ +-------------+ -#ifdef ___PREDICT_ENABLED___ - // +----------------------+ +-------------+ - // | Offset1 by_lemma (4) | ... | OffsetN (4) | - // +----------------------+ +-------------+ -#endif - // +------------+ +------------+ - // | Score1 (4) | ... | ScoreN (4) | - // +------------+ +------------+ -#ifdef ___SYNC_ENABLED___ - // +-------------+ +-------------+ - // | NewAdd1 (4) | ... | NewAddN (4) | - // +-------------+ +-------------+ -#endif - // +----------------+ - // | Dict Info (4x) | - // +----------------+ -}; -} - -#endif diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h b/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h deleted file mode 100644 index 7e957db5..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef PINYINIME_INCLUDE_UTF16CHAR_H__ -#define PINYINIME_INCLUDE_UTF16CHAR_H__ - -#include <stdlib.h> - -namespace ime_pinyin { - -#ifdef __cplusplus -extern "C" { -#endif - - typedef unsigned short char16; - - // Get a token from utf16_str, - // Returned pointer is a '\0'-terminated utf16 string, or NULL - // *utf16_str_next returns the next part of the string for further tokenizing - char16* utf16_strtok(char16 *utf16_str, size_t *token_size, - char16 **utf16_str_next); - - int utf16_atoi(const char16 *utf16_str); - - float utf16_atof(const char16 *utf16_str); - - size_t utf16_strlen(const char16 *utf16_str); - - int utf16_strcmp(const char16 *str1, const char16 *str2); - int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size); - - char16* utf16_strcpy(char16 *dst, const char16 *src); - char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size); - - - char* utf16_strcpy_tochar(char *dst, const char16 *src); - -#ifdef __cplusplus -} -#endif -} - -#endif // PINYINIME_INCLUDE_UTF16CHAR_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h b/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h deleted file mode 100644 index b6d6719e..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef PINYINIME_INCLUDE_UTF16READER_H__ -#define PINYINIME_INCLUDE_UTF16READER_H__ - -#include <stdio.h> -#include "./utf16char.h" - -namespace ime_pinyin { - -class Utf16Reader { - private: - FILE *fp_; - char16 *buffer_; - size_t buffer_total_len_; - size_t buffer_next_pos_; - - // Always less than buffer_total_len_ - buffer_next_pos_ - size_t buffer_valid_len_; - - public: - Utf16Reader(); - ~Utf16Reader(); - - // filename is the name of the file to open. - // buffer_len specifies how long buffer should be allocated to speed up the - // future reading - bool open(const char* filename, size_t buffer_len); - char16* readline(char16* read_buf, size_t max_len); - bool close(); -}; -} - -#endif // PINYINIME_INCLUDE_UTF16READER_H__ diff --git a/src/virtualkeyboard/3rdparty/pinyin/pinyin.pro b/src/virtualkeyboard/3rdparty/pinyin/pinyin.pro deleted file mode 100644 index 9ad9a318..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/pinyin.pro +++ /dev/null @@ -1,59 +0,0 @@ -TARGET = qtpinyin - -VERSION = 1.0.0 -CONFIG += static -CONFIG += warn_off - -MODULE_INCLUDEPATH = $$PWD/include - -SOURCES += \ - share/dictbuilder.cpp \ - share/dictlist.cpp \ - share/dicttrie.cpp \ - share/lpicache.cpp \ - share/matrixsearch.cpp \ - share/mystdlib.cpp \ - share/ngram.cpp \ - share/pinyinime.cpp \ - share/searchutility.cpp \ - share/spellingtable.cpp \ - share/spellingtrie.cpp \ - share/splparser.cpp \ - share/sync.cpp \ - share/userdict.cpp \ - share/utf16char.cpp \ - share/utf16reader.cpp - -HEADERS += \ - include/atomdictbase.h \ - include/dictbuilder.h \ - include/dictdef.h \ - include/dictlist.h \ - include/dicttrie.h \ - include/lpicache.h \ - include/matrixsearch.h \ - include/mystdlib.h \ - include/ngram.h \ - include/pinyinime.h \ - include/searchutility.h \ - include/spellingtable.h \ - include/spellingtrie.h \ - include/splparser.h \ - include/sync.h \ - include/userdict.h \ - include/utf16char.h \ - include/utf16reader.h - -OTHER_FILES +=\ - 
data/rawdict_utf16_65105_freq.txt \ - data/valid_utf16.txt - -load(qt_helper_lib) - -# On Windows, the library uses Qt for platform abstraction. -win32 { - CONFIG += qt - QT = core -} else { - CONFIG *= thread -} diff --git a/src/virtualkeyboard/3rdparty/pinyin/qt_attribution.json b/src/virtualkeyboard/3rdparty/pinyin/qt_attribution.json deleted file mode 100644 index c739749f..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/qt_attribution.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "Id": "pinyin", - "Name": "PinyinIME", - "QDocModule": "qtvirtualkeyboard", - "Description": "PinyinIME is an input method engine for Pinyin (the official romanization system for Standard Chinese -in mainland China, Malaysia, Singapore, and Taiwan) from the Android Open Source Project.", - "QtUsage": "Optionally used in Qt Virtual Keyboard.", - - "License": "Apache License 2.0", - "LicenseId": "Apache-2.0", - "LicenseFile": "NOTICE", - "Copyright": "Copyright (C) 2009 The Android Open Source Project" -} diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/dictbuilder.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/dictbuilder.cpp deleted file mode 100644 index 6f0bd4f7..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/dictbuilder.cpp +++ /dev/null @@ -1,1070 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <assert.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "../include/dictbuilder.h" -#include "../include/dicttrie.h" -#include "../include/mystdlib.h" -#include "../include/ngram.h" -#include "../include/searchutility.h" -#include "../include/spellingtable.h" -#include "../include/spellingtrie.h" -#include "../include/splparser.h" -#include "../include/utf16reader.h" - -namespace ime_pinyin { - -#ifdef ___BUILD_MODEL___ - -static const size_t kReadBufLen = 512; -static const size_t kSplTableHashLen = 2000; - -// Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by -// frequencies. -int cmp_scis_hz_splid_freq(const void* p1, const void* p2) { - const SingleCharItem *s1, *s2; - s1 = static_cast<const SingleCharItem*>(p1); - s2 = static_cast<const SingleCharItem*>(p2); - - if (s1->hz < s2->hz) - return -1; - if (s1->hz > s2->hz) - return 1; - - if (s1->splid.half_splid < s2->splid.half_splid) - return -1; - if (s1->splid.half_splid > s2->splid.half_splid) - return 1; - - if (s1->splid.full_splid < s2->splid.full_splid) - return -1; - if (s1->splid.full_splid > s2->splid.full_splid) - return 1; - - if (s1->freq > s2->freq) - return -1; - if (s1->freq < s2->freq) - return 1; - return 0; -} - -int cmp_scis_hz_splid(const void* p1, const void* p2) { - const SingleCharItem *s1, *s2; - s1 = static_cast<const SingleCharItem*>(p1); - s2 = static_cast<const SingleCharItem*>(p2); - - if (s1->hz < s2->hz) - return -1; - if (s1->hz > s2->hz) - return 1; - - if (s1->splid.half_splid < s2->splid.half_splid) - return -1; - if (s1->splid.half_splid > s2->splid.half_splid) - return 1; - - if (s1->splid.full_splid < s2->splid.full_splid) - return -1; - if (s1->splid.full_splid > s2->splid.full_splid) - return 1; - - return 0; -} - -int cmp_lemma_entry_hzs(const void* p1, const void* p2) { - size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); - size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); 
- if (size1 < size2) - return -1; - else if (size1 > size2) - return 1; - - return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, - ((const LemmaEntry*)p2)->hanzi_str); -} - -int compare_char16(const void* p1, const void* p2) { - if (*((const char16*)p1) < *((const char16*)p2)) - return -1; - if (*((const char16*)p1) > *((const char16*)p2)) - return 1; - return 0; -} - -int compare_py(const void* p1, const void* p2) { - int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, - ((const LemmaEntry*)p2)->spl_idx_arr); - - if (0 != ret) - return ret; - - return static_cast<int>(((const LemmaEntry*)p2)->freq) - - static_cast<int>(((const LemmaEntry*)p1)->freq); -} - -// First hanzi, if the same, then Pinyin -int cmp_lemma_entry_hzspys(const void* p1, const void* p2) { - size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); - size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); - if (size1 < size2) - return -1; - else if (size1 > size2) - return 1; - int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, - ((const LemmaEntry*)p2)->hanzi_str); - - if (0 != ret) - return ret; - - ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, - ((const LemmaEntry*)p2)->spl_idx_arr); - return ret; -} - -int compare_splid2(const void* p1, const void* p2) { - int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, - ((const LemmaEntry*)p2)->spl_idx_arr); - return ret; -} - -DictBuilder::DictBuilder() { - lemma_arr_ = NULL; - lemma_num_ = 0; - - scis_ = NULL; - scis_num_ = 0; - - lma_nodes_le0_ = NULL; - lma_nodes_ge1_ = NULL; - - lma_nds_used_num_le0_ = 0; - lma_nds_used_num_ge1_ = 0; - - homo_idx_buf_ = NULL; - homo_idx_num_eq1_ = 0; - homo_idx_num_gt1_ = 0; - - top_lmas_ = NULL; - top_lmas_num_ = 0; - - spl_table_ = NULL; - spl_parser_ = NULL; -} - -DictBuilder::~DictBuilder() { - free_resource(); -} - -bool DictBuilder::alloc_resource(size_t lma_num) { - if (0 == lma_num) - return false; - - free_resource(); - - lemma_num_ = lma_num; - 
lemma_arr_ = new LemmaEntry[lemma_num_]; - - top_lmas_num_ = 0; - top_lmas_ = new LemmaEntry[kTopScoreLemmaNum]; - - // New the scis_ buffer to the possible maximum size. - scis_num_ = lemma_num_ * kMaxLemmaSize; - scis_ = new SingleCharItem[scis_num_]; - - // The root and first level nodes is less than kMaxSpellingNum + 1 - lma_nds_used_num_le0_ = 0; - lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1]; - - // Other nodes is less than lemma_num - lma_nds_used_num_ge1_ = 0; - lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_]; - - homo_idx_buf_ = new LemmaIdType[lemma_num_]; - spl_table_ = new SpellingTable(); - spl_parser_ = new SpellingParser(); - - if (NULL == lemma_arr_ || NULL == top_lmas_ || - NULL == scis_ || NULL == spl_table_ || - NULL == spl_parser_ || NULL == lma_nodes_le0_ || - NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) { - free_resource(); - return false; - } - - memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_); - memset(scis_, 0, sizeof(SingleCharItem) * scis_num_); - memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1)); - memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_); - memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_); - spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true); - - return true; -} - -char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) { - if (NULL == fn_validhzs || NULL == num) - return NULL; - - *num = 0; - FILE *fp = fopen(fn_validhzs, "rb"); - if (NULL == fp) - return NULL; - - char16 utf16header; - if (fread(&utf16header, sizeof(char16), 1, fp) != 1 || - 0xfeff != utf16header) { - fclose(fp); - return NULL; - } - - fseek(fp, 0, SEEK_END); - *num = ftell(fp) / sizeof(char16); - assert(*num >= 1); - *num -= 1; - - char16 *hzs = new char16[*num]; - if (NULL == hzs) { - fclose(fp); - return NULL; - } - - fseek(fp, 2, SEEK_SET); - - if (fread(hzs, sizeof(char16), *num, fp) != *num) { - fclose(fp); - delete [] hzs; - return NULL; - } - fclose(fp); - - 
myqsort(hzs, *num, sizeof(char16), compare_char16); - return hzs; -} - -bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, - char16 hz) { - if (NULL == hzs) - return false; - - char16 *found; - found = static_cast<char16*>( - mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16)); - if (NULL == found) - return false; - - assert(*found == hz); - return true; -} - -// The caller makes sure that the parameters are valid. -bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len, - const char16 *str, size_t str_len) { - if (NULL == hzs || NULL == str) - return false; - - for (size_t pos = 0; pos < str_len; pos++) { - if (!hz_in_hanzis_list(hzs, hzs_len, str[pos])) - return false; - } - return true; -} - -void DictBuilder::get_top_lemmas() { - top_lmas_num_ = 0; - if (NULL == lemma_arr_) - return; - - for (size_t pos = 0; pos < lemma_num_; pos++) { - if (0 == top_lmas_num_) { - top_lmas_[0] = lemma_arr_[pos]; - top_lmas_num_ = 1; - continue; - } - - if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) { - if (kTopScoreLemmaNum > top_lmas_num_) - top_lmas_num_ += 1; - - size_t move_pos; - for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) { - top_lmas_[move_pos] = top_lmas_[move_pos - 1]; - if (0 == move_pos - 1 || - (move_pos - 1 > 0 && - top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) { - break; - } - } - assert(move_pos > 0); - top_lmas_[move_pos - 1] = lemma_arr_[pos]; - } else if (kTopScoreLemmaNum > top_lmas_num_) { - top_lmas_[top_lmas_num_] = lemma_arr_[pos]; - top_lmas_num_ += 1; - } - } - - if (kPrintDebug0) { - printf("\n------Top Lemmas------------------\n"); - for (size_t pos = 0; pos < top_lmas_num_; pos++) { - printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz, - top_lmas_[pos].freq); - } - } -} - -void DictBuilder::free_resource() { - if (NULL != lemma_arr_) - delete [] lemma_arr_; - - if (NULL != scis_) - delete [] scis_; - - if (NULL != lma_nodes_le0_) - delete [] 
lma_nodes_le0_; - - if (NULL != lma_nodes_ge1_) - delete [] lma_nodes_ge1_; - - if (NULL != homo_idx_buf_) - delete [] homo_idx_buf_; - - if (NULL != spl_table_) - delete spl_table_; - - if (NULL != spl_parser_) - delete spl_parser_; - - lemma_arr_ = NULL; - scis_ = NULL; - lma_nodes_le0_ = NULL; - lma_nodes_ge1_ = NULL; - homo_idx_buf_ = NULL; - spl_table_ = NULL; - spl_parser_ = NULL; - - lemma_num_ = 0; - lma_nds_used_num_le0_ = 0; - lma_nds_used_num_ge1_ = 0; - homo_idx_num_eq1_ = 0; - homo_idx_num_gt1_ = 0; -} - -size_t DictBuilder::read_raw_dict(const char* fn_raw, - const char *fn_validhzs, - size_t max_item) { - if (NULL == fn_raw) return 0; - - Utf16Reader utf16_reader; - if (!utf16_reader.open(fn_raw, kReadBufLen * 10)) - return false; - - char16 read_buf[kReadBufLen]; - - // Read the number of lemmas in the file - size_t lemma_num = 240000; - - // allocate resource required - if (!alloc_resource(lemma_num)) { - utf16_reader.close(); - } - - // Read the valid Hanzi list. - char16 *valid_hzs = NULL; - size_t valid_hzs_num = 0; - valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num); - - // Begin reading the lemma entries - for (size_t i = 0; i < max_item; i++) { - // read next entry - if (!utf16_reader.readline(read_buf, kReadBufLen)) { - lemma_num = i; - break; - } - - size_t token_size; - char16 *token; - char16 *to_tokenize = read_buf; - - // Get the Hanzi string - token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); - if (NULL == token) { - free_resource(); - utf16_reader.close(); - return false; - } - - size_t lemma_size = utf16_strlen(token); - - if (lemma_size > kMaxLemmaSize) { - i--; - continue; - } - - if (lemma_size > 4) { - i--; - continue; - } - - // Copy to the lemma entry - utf16_strcpy(lemma_arr_[i].hanzi_str, token); - - lemma_arr_[i].hz_str_len = token_size; - - // Get the freq string - token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); - if (NULL == token) { - free_resource(); - utf16_reader.close(); - return 
false; - } - lemma_arr_[i].freq = utf16_atof(token); - - if (lemma_size > 1 && lemma_arr_[i].freq < 60) { - i--; - continue; - } - - // Get GBK mark, if no valid Hanzi list available, all items which contains - // GBK characters will be discarded. Otherwise, all items which contains - // characters outside of the valid Hanzi list will be discarded. - token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); - assert(NULL != token); - int gbk_flag = utf16_atoi(token); - if (NULL == valid_hzs || 0 == valid_hzs_num) { - if (0 != gbk_flag) { - i--; - continue; - } - } else { - if (!str_in_hanzis_list(valid_hzs, valid_hzs_num, - lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) { - i--; - continue; - } - } - - // Get spelling String - bool spelling_not_support = false; - for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; - hz_pos++) { - // Get a Pinyin - token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); - if (NULL == token) { - free_resource(); - utf16_reader.close(); - return false; - } - - assert(utf16_strlen(token) <= kMaxPinyinSize); - - utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token); - - format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]); - - // Put the pinyin to the spelling table - if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos], - lemma_arr_[i].freq)) { - spelling_not_support = true; - break; - } - } - - // The whole line must have been parsed fully, otherwise discard this one. 
- token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); - if (spelling_not_support || NULL != token) { - i--; - continue; - } - } - - delete [] valid_hzs; - utf16_reader.close(); - - printf("read succesfully, lemma num: %d\n", lemma_num); - - return lemma_num; -} - -bool DictBuilder::build_dict(const char *fn_raw, - const char *fn_validhzs, - DictTrie *dict_trie) { - if (NULL == fn_raw || NULL == dict_trie) - return false; - - lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000); - if (0 == lemma_num_) - return false; - - // Arrange the spelling table, and build a spelling tree - // The size of an spelling. '\0' is included. If the spelling table is - // initialized to calculate the spelling scores, the last char in the - // spelling string will be score, and it is also included in spl_item_size. - size_t spl_item_size; - size_t spl_num; - const char* spl_buf; - spl_buf = spl_table_->arrange(&spl_item_size, &spl_num); - if (NULL == spl_buf) { - free_resource(); - return false; - } - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - - if (!spl_trie.construct(spl_buf, spl_item_size, spl_num, - spl_table_->get_score_amplifier(), - spl_table_->get_average_score())) { - free_resource(); - return false; - } - - printf("spelling tree construct successfully.\n"); - - // Convert the spelling string to idxs - for (size_t i = 0; i < lemma_num_; i++) { - for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; - hz_pos++) { - uint16 spl_idxs[2]; - uint16 spl_start_pos[3]; - bool is_pre = true; - int spl_idx_num = - spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos], - strlen(lemma_arr_[i].pinyin_str[hz_pos]), - spl_idxs, spl_start_pos, 2, is_pre); - assert(1 == spl_idx_num); - - if (spl_trie.is_half_id(spl_idxs[0])) { - uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs); - assert(0 != num); - } - lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0]; - } - } - - // Sort the lemma items according to the hanzi, and give each unique item a 
- // id - sort_lemmas_by_hz(); - - scis_num_ = build_scis(); - - // Construct the dict list - dict_trie->dict_list_ = new DictList(); - bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_, - lemma_arr_, lemma_num_); - assert(dl_success); - - // Construct the NGram information - NGram& ngram = NGram::get_instance(); - ngram.build_unigram(lemma_arr_, lemma_num_, - lemma_arr_[lemma_num_ - 1].idx_by_hz + 1); - - // sort the lemma items according to the spelling idx string - myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py); - - get_top_lemmas(); - -#ifdef ___DO_STATISTICS___ - stat_init(); -#endif - - lma_nds_used_num_le0_ = 1; // The root node - bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_), - lemma_arr_, 0, lemma_num_, 0); - if (!dt_success) { - free_resource(); - return false; - } - -#ifdef ___DO_STATISTICS___ - stat_print(); -#endif - - // Move the node data and homo data to the DictTrie - dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_]; - dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_]; - size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_; - dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize]; - assert(NULL != dict_trie->root_); - assert(NULL != dict_trie->lma_idx_buf_); - dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_; - dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_; - dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize; - dict_trie->top_lmas_num_ = top_lmas_num_; - - memcpy(dict_trie->root_, lma_nodes_le0_, - sizeof(LmaNodeLE0) * lma_nds_used_num_le0_); - memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_, - sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_); - - for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) { - id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, - homo_idx_buf_[pos]); - } - - for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_; - pos < lma_idx_num; pos++) { - LemmaIdType idx = - 
top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz; - id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx); - } - - if (kPrintDebug0) { - printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_); - printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_); - printf("top_lmas_num_: %d\n", top_lmas_num_); - } - - free_resource(); - - if (kPrintDebug0) { - printf("Building dict succeds\n"); - } - return dt_success; -} - -void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) { - if (NULL == buf) return; - for (size_t pos = 0; pos < kLemmaIdSize; pos++) { - (buf)[pos] = (unsigned char)(id >> (pos * 8)); - } -} - -void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) { - node->son_1st_off_l = static_cast<uint16>(offset); - node->son_1st_off_h = static_cast<unsigned char>(offset >> 16); -} - -void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) { - node->homo_idx_buf_off_l = static_cast<uint16>(offset); - node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16); - -} - -// All spelling strings will be converted to upper case, except that -// spellings started with "ZH"/"CH"/"SH" will be converted to -// "Zh"/"Ch"/"Sh" -void DictBuilder::format_spelling_str(char *spl_str) { - if (NULL == spl_str) - return; - - uint16 pos = 0; - while ('\0' != spl_str[pos]) { - if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z') - spl_str[pos] = spl_str[pos] - 'a' + 'A'; - - if (1 == pos && 'H' == spl_str[pos]) { - if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) { - spl_str[pos] = 'h'; - } - } - pos++; - } -} - -LemmaIdType DictBuilder::sort_lemmas_by_hz() { - if (NULL == lemma_arr_ || 0 == lemma_num_) - return 0; - - myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs); - - lemma_arr_[0].idx_by_hz = 1; - LemmaIdType idx_max = 1; - for (size_t i = 1; i < lemma_num_; i++) { - if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) { - idx_max++; - lemma_arr_[i].idx_by_hz 
= idx_max; - } else { - idx_max++; - lemma_arr_[i].idx_by_hz = idx_max; - } - } - return idx_max + 1; -} - -size_t DictBuilder::build_scis() { - if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_) - return 0; - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - - // This first one is blank, because id 0 is invalid. - scis_[0].freq = 0; - scis_[0].hz = 0; - scis_[0].splid.full_splid = 0; - scis_[0].splid.half_splid = 0; - scis_num_ = 1; - - // Copy the hanzis to the buffer - for (size_t pos = 0; pos < lemma_num_; pos++) { - size_t hz_num = lemma_arr_[pos].hz_str_len; - for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { - scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos]; - scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; - scis_[scis_num_].splid.half_splid = - spl_trie.full_to_half(scis_[scis_num_].splid.full_splid); - if (1 == hz_num) - scis_[scis_num_].freq = lemma_arr_[pos].freq; - else - scis_[scis_num_].freq = 0.000001; - scis_num_++; - } - } - - myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq); - - // Remove repeated items - size_t unique_scis_num = 1; - for (size_t pos = 1; pos < scis_num_; pos++) { - if (scis_[pos].hz == scis_[pos - 1].hz && - scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid) - continue; - scis_[unique_scis_num] = scis_[pos]; - scis_[unique_scis_num].splid.half_splid = - spl_trie.full_to_half(scis_[pos].splid.full_splid); - unique_scis_num++; - } - - scis_num_ = unique_scis_num; - - // Update the lemma list. 
- for (size_t pos = 0; pos < lemma_num_; pos++) { - size_t hz_num = lemma_arr_[pos].hz_str_len; - for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { - SingleCharItem key; - key.hz = lemma_arr_[pos].hanzi_str[hzpos]; - key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; - key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid); - - SingleCharItem *found; - found = static_cast<SingleCharItem*>(mybsearch(&key, scis_, - unique_scis_num, - sizeof(SingleCharItem), - cmp_scis_hz_splid)); - - assert(found); - - lemma_arr_[pos].hanzi_scis_ids[hzpos] = - static_cast<uint16>(found - scis_); - lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid; - } - } - - return scis_num_; -} - -bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr, - size_t item_start, size_t item_end, - size_t level) { - if (level >= kMaxLemmaSize || item_end <= item_start) - return false; - - // 1. Scan for how many sons - size_t parent_son_num = 0; - // LemmaNode *son_1st = NULL; - // parent.num_of_son = 0; - - LemmaEntry *lma_last_start = lemma_arr_ + item_start; - uint16 spl_idx_node = lma_last_start->spl_idx_arr[level]; - - // Scan for how many sons to be allocaed - for (size_t i = item_start + 1; i< item_end; i++) { - LemmaEntry *lma_current = lemma_arr + i; - uint16 spl_idx_current = lma_current->spl_idx_arr[level]; - if (spl_idx_current != spl_idx_node) { - parent_son_num++; - spl_idx_node = spl_idx_current; - } - } - parent_son_num++; - -#ifdef ___DO_STATISTICS___ - // Use to indicate whether all nodes of this layer have no son. - bool allson_noson = true; - - assert(level < kMaxLemmaSize); - if (parent_son_num > max_sonbuf_len_[level]) - max_sonbuf_len_[level] = parent_son_num; - - total_son_num_[level] += parent_son_num; - total_sonbuf_num_[level] += 1; - - if (parent_son_num == 1) - sonbufs_num1_++; - else - sonbufs_numgt1_++; - total_lma_node_num_ += parent_son_num; -#endif - - // 2. 
Update the parent's information - // Update the parent's son list; - LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used - LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used. - if (0 == level) { // the parent is root - (static_cast<LmaNodeLE0*>(parent))->son_1st_off = - lma_nds_used_num_le0_; - son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_; - lma_nds_used_num_le0_ += parent_son_num; - - assert(parent_son_num <= 65535); - (static_cast<LmaNodeLE0*>(parent))->num_of_son = - static_cast<uint16>(parent_son_num); - } else if (1 == level) { // the parent is a son of root - (static_cast<LmaNodeLE0*>(parent))->son_1st_off = - lma_nds_used_num_ge1_; - son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; - lma_nds_used_num_ge1_ += parent_son_num; - - assert(parent_son_num <= 65535); - (static_cast<LmaNodeLE0*>(parent))->num_of_son = - static_cast<uint16>(parent_son_num); - } else { - set_son_offset((static_cast<LmaNodeGE1*>(parent)), - lma_nds_used_num_ge1_); - son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; - lma_nds_used_num_ge1_ += parent_son_num; - - assert(parent_son_num <= 255); - (static_cast<LmaNodeGE1*>(parent))->num_of_son = - (unsigned char)parent_son_num; - } - - // 3. 
Now begin to construct the son one by one - size_t son_pos = 0; - - lma_last_start = lemma_arr_ + item_start; - spl_idx_node = lma_last_start->spl_idx_arr[level]; - - size_t homo_num = 0; - if (lma_last_start->spl_idx_arr[level + 1] == 0) - homo_num = 1; - - size_t item_start_next = item_start; - - for (size_t i = item_start + 1; i < item_end; i++) { - LemmaEntry* lma_current = lemma_arr_ + i; - uint16 spl_idx_current = lma_current->spl_idx_arr[level]; - - if (spl_idx_current == spl_idx_node) { - if (lma_current->spl_idx_arr[level + 1] == 0) - homo_num++; - } else { - // Construct a node - LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid - LmaNodeGE1 *node_cur_ge1 = NULL; - if (0 == level) { - node_cur_le0 = son_1st_le0 + son_pos; - node_cur_le0->spl_idx = spl_idx_node; - node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; - node_cur_le0->son_1st_off = 0; - homo_idx_num_eq1_ += homo_num; - } else { - node_cur_ge1 = son_1st_ge1 + son_pos; - node_cur_ge1->spl_idx = spl_idx_node; - - set_homo_id_buf_offset(node_cur_ge1, - (homo_idx_num_eq1_ + homo_idx_num_gt1_)); - set_son_offset(node_cur_ge1, 0); - homo_idx_num_gt1_ += homo_num; - } - - if (homo_num > 0) { - LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + - homo_idx_num_gt1_ - homo_num; - if (0 == level) { - assert(homo_num <= 65535); - node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); - } else { - assert(homo_num <= 255); - node_cur_ge1->num_of_homo = (unsigned char)homo_num; - } - - for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { - idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz; - } - -#ifdef ___DO_STATISTICS___ - if (homo_num > max_homobuf_len_[level]) - max_homobuf_len_[level] = homo_num; - - total_homo_num_[level] += homo_num; -#endif - } - - if (i - item_start_next > homo_num) { - void *next_parent; - if (0 == level) - next_parent = static_cast<void*>(node_cur_le0); - else - next_parent = static_cast<void*>(node_cur_ge1); - 
construct_subset(next_parent, lemma_arr, - item_start_next + homo_num, i, level + 1); -#ifdef ___DO_STATISTICS___ - - total_node_hasson_[level] += 1; - allson_noson = false; -#endif - } - - // for the next son - lma_last_start = lma_current; - spl_idx_node = spl_idx_current; - item_start_next = i; - homo_num = 0; - if (lma_current->spl_idx_arr[level + 1] == 0) - homo_num = 1; - - son_pos++; - } - } - - // 4. The last one to construct - LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid - LmaNodeGE1 *node_cur_ge1 = NULL; - if (0 == level) { - node_cur_le0 = son_1st_le0 + son_pos; - node_cur_le0->spl_idx = spl_idx_node; - node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; - node_cur_le0->son_1st_off = 0; - homo_idx_num_eq1_ += homo_num; - } else { - node_cur_ge1 = son_1st_ge1 + son_pos; - node_cur_ge1->spl_idx = spl_idx_node; - - set_homo_id_buf_offset(node_cur_ge1, - (homo_idx_num_eq1_ + homo_idx_num_gt1_)); - set_son_offset(node_cur_ge1, 0); - homo_idx_num_gt1_ += homo_num; - } - - if (homo_num > 0) { - LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + - homo_idx_num_gt1_ - homo_num; - if (0 == level) { - assert(homo_num <= 65535); - node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); - } else { - assert(homo_num <= 255); - node_cur_ge1->num_of_homo = (unsigned char)homo_num; - } - - for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { - idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz; - } - -#ifdef ___DO_STATISTICS___ - if (homo_num > max_homobuf_len_[level]) - max_homobuf_len_[level] = homo_num; - - total_homo_num_[level] += homo_num; -#endif - } - - if (item_end - item_start_next > homo_num) { - void *next_parent; - if (0 == level) - next_parent = static_cast<void*>(node_cur_le0); - else - next_parent = static_cast<void*>(node_cur_ge1); - construct_subset(next_parent, lemma_arr, - item_start_next + homo_num, item_end, level + 1); -#ifdef ___DO_STATISTICS___ - - total_node_hasson_[level] 
+= 1; - allson_noson = false; -#endif - } - -#ifdef ___DO_STATISTICS___ - if (allson_noson) { - total_sonbuf_allnoson_[level] += 1; - total_node_in_sonbuf_allnoson_[level] += parent_son_num; - } -#endif - - assert(son_pos + 1 == parent_son_num); - return true; -} - -#ifdef ___DO_STATISTICS___ -void DictBuilder::stat_init() { - memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); - memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); - memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize); - memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize); - memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize); - memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); - memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); - memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize); - - sonbufs_num1_ = 0; - sonbufs_numgt1_ = 0; - total_lma_node_num_ = 0; -} - -void DictBuilder::stat_print() { - printf("\n------------STAT INFO-------------\n"); - printf("[root is layer -1]\n"); - printf(".. max_sonbuf_len per layer(from layer 0):\n "); - for (size_t i = 0; i < kMaxLemmaSize; i++) - printf("%d, ", max_sonbuf_len_[i]); - printf("-, \n"); - - printf(".. max_homobuf_len per layer:\n -, "); - for (size_t i = 0; i < kMaxLemmaSize; i++) - printf("%d, ", max_homobuf_len_[i]); - printf("\n"); - - printf(".. total_son_num per layer:\n "); - for (size_t i = 0; i < kMaxLemmaSize; i++) - printf("%d, ", total_son_num_[i]); - printf("-, \n"); - - printf(".. total_node_hasson per layer:\n 1, "); - for (size_t i = 0; i < kMaxLemmaSize; i++) - printf("%d, ", total_node_hasson_[i]); - printf("\n"); - - printf(".. total_sonbuf_num per layer:\n "); - for (size_t i = 0; i < kMaxLemmaSize; i++) - printf("%d, ", total_sonbuf_num_[i]); - printf("-, \n"); - - printf(".. total_sonbuf_allnoson per layer:\n "); - for (size_t i = 0; i < kMaxLemmaSize; i++) - printf("%d, ", total_sonbuf_allnoson_[i]); - printf("-, \n"); - - printf(".. 
total_node_in_sonbuf_allnoson per layer:\n "); - for (size_t i = 0; i < kMaxLemmaSize; i++) - printf("%d, ", total_node_in_sonbuf_allnoson_[i]); - printf("-, \n"); - - printf(".. total_homo_num per layer:\n 0, "); - for (size_t i = 0; i < kMaxLemmaSize; i++) - printf("%d, ", total_homo_num_[i]); - printf("\n"); - - printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_); - printf(".. son buf allocation number with more than 1 son: %d\n", - sonbufs_numgt1_); - printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1); -} -#endif // ___DO_STATISTICS___ - -#endif // ___BUILD_MODEL___ -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/dictlist.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/dictlist.cpp deleted file mode 100644 index 64d8d085..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/dictlist.cpp +++ /dev/null @@ -1,446 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <assert.h> -#include <stdlib.h> -#include <string.h> -#include "../include/dictlist.h" -#include "../include/mystdlib.h" -#include "../include/ngram.h" -#include "../include/searchutility.h" - -namespace ime_pinyin { - -DictList::DictList() { - initialized_ = false; - scis_num_ = 0; - scis_hz_ = NULL; - scis_splid_ = NULL; - buf_ = NULL; - spl_trie_ = SpellingTrie::get_cpinstance(); - - assert(kMaxLemmaSize == 8); - cmp_func_[0] = cmp_hanzis_1; - cmp_func_[1] = cmp_hanzis_2; - cmp_func_[2] = cmp_hanzis_3; - cmp_func_[3] = cmp_hanzis_4; - cmp_func_[4] = cmp_hanzis_5; - cmp_func_[5] = cmp_hanzis_6; - cmp_func_[6] = cmp_hanzis_7; - cmp_func_[7] = cmp_hanzis_8; -} - -DictList::~DictList() { - free_resource(); -} - -bool DictList::alloc_resource(size_t buf_size, size_t scis_num) { - // Allocate memory - buf_ = static_cast<char16*>(malloc(buf_size * sizeof(char16))); - if (NULL == buf_) - return false; - - scis_num_ = scis_num; - - scis_hz_ = static_cast<char16*>(malloc(scis_num_ * sizeof(char16))); - if (NULL == scis_hz_) - return false; - - scis_splid_ = static_cast<SpellingId*> - (malloc(scis_num_ * sizeof(SpellingId))); - - if (NULL == scis_splid_) - return false; - - return true; -} - -void DictList::free_resource() { - if (NULL != buf_) - free(buf_); - buf_ = NULL; - - if (NULL != scis_hz_) - free(scis_hz_); - scis_hz_ = NULL; - - if (NULL != scis_splid_) - free(scis_splid_); - scis_splid_ = NULL; -} - -#ifdef ___BUILD_MODEL___ -bool DictList::init_list(const SingleCharItem *scis, size_t scis_num, - const LemmaEntry *lemma_arr, size_t lemma_num) { - if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num) - return false; - - initialized_ = false; - - if (NULL != buf_) - free(buf_); - - // calculate the size - size_t buf_size = calculate_size(lemma_arr, lemma_num); - if (0 == buf_size) - return false; - - if (!alloc_resource(buf_size, scis_num)) - return false; - - fill_scis(scis, scis_num); - - // Copy the related content from the 
array to inner buffer - fill_list(lemma_arr, lemma_num); - - initialized_ = true; - return true; -} - -size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) { - size_t last_hz_len = 0; - size_t list_size = 0; - size_t id_num = 0; - - for (size_t i = 0; i < lemma_num; i++) { - if (0 == i) { - last_hz_len = lemma_arr[i].hz_str_len; - - assert(last_hz_len > 0); - assert(lemma_arr[0].idx_by_hz == 1); - - id_num++; - start_pos_[0] = 0; - start_id_[0] = id_num; - - last_hz_len = 1; - list_size += last_hz_len; - } else { - size_t current_hz_len = lemma_arr[i].hz_str_len; - - assert(current_hz_len >= last_hz_len); - - if (current_hz_len == last_hz_len) { - list_size += current_hz_len; - id_num++; - } else { - for (size_t len = last_hz_len; len < current_hz_len - 1; len++) { - start_pos_[len] = start_pos_[len - 1]; - start_id_[len] = start_id_[len - 1]; - } - - start_pos_[current_hz_len - 1] = list_size; - - id_num++; - start_id_[current_hz_len - 1] = id_num; - - last_hz_len = current_hz_len; - list_size += current_hz_len; - } - } - } - - for (size_t i = last_hz_len; i <= kMaxLemmaSize; i++) { - if (0 == i) { - start_pos_[0] = 0; - start_id_[0] = 1; - } else { - start_pos_[i] = list_size; - start_id_[i] = id_num; - } - } - - return start_pos_[kMaxLemmaSize]; -} - -void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) { - assert(scis_num_ == scis_num); - - for (size_t pos = 0; pos < scis_num_; pos++) { - scis_hz_[pos] = scis[pos].hz; - scis_splid_[pos] = scis[pos].splid; - } -} - -void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) { - size_t current_pos = 0; - - utf16_strncpy(buf_, lemma_arr[0].hanzi_str, - lemma_arr[0].hz_str_len); - - current_pos = lemma_arr[0].hz_str_len; - - size_t id_num = 1; - - for (size_t i = 1; i < lemma_num; i++) { - utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str, - lemma_arr[i].hz_str_len); - - id_num++; - current_pos += lemma_arr[i].hz_str_len; - } - - assert(current_pos == 
start_pos_[kMaxLemmaSize]); - assert(id_num == start_id_[kMaxLemmaSize]); -} - -char16* DictList::find_pos2_startedbyhz(char16 hz_char) { - char16 *found_2w = static_cast<char16*> - (mybsearch(&hz_char, buf_ + start_pos_[1], - (start_pos_[2] - start_pos_[1]) / 2, - sizeof(char16) * 2, cmp_hanzis_1)); - if (NULL == found_2w) - return NULL; - - while (found_2w > buf_ + start_pos_[1] && *found_2w == *(found_2w - 1)) - found_2w -= 2; - - return found_2w; -} -#endif // ___BUILD_MODEL___ - -char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[], - size_t word_len, int (*cmp_func)(const void *, const void *)) { - char16 *found_w = static_cast<char16*> - (mybsearch(last_hzs, buf_ + start_pos_[word_len - 1], - (start_pos_[word_len] - start_pos_[word_len - 1]) - / word_len, - sizeof(char16) * word_len, cmp_func)); - - if (NULL == found_w) - return NULL; - - while (found_w > buf_ + start_pos_[word_len -1] && - cmp_func(found_w, found_w - word_len) == 0) - found_w -= word_len; - - return found_w; -} - -size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used) { - assert(hzs_len <= kMaxPredictSize && hzs_len > 0); - - // 1. Prepare work - int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1]; - - NGram& ngram = NGram::get_instance(); - - size_t item_num = 0; - - // 2. 
Do prediction - for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len; - pre_len++) { - uint16 word_len = hzs_len + pre_len; - char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func); - if (NULL == w_buf) - continue; - while (w_buf < buf_ + start_pos_[word_len] && - cmp_func(w_buf, last_hzs) == 0 && - item_num < npre_max) { - memset(npre_items + item_num, 0, sizeof(NPredictItem)); - utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len); - npre_items[item_num].psb = - ngram.get_uni_psb((size_t)(w_buf - buf_ - start_pos_[word_len - 1]) - / word_len + start_id_[word_len - 1]); - npre_items[item_num].his_len = hzs_len; - item_num++; - w_buf += word_len; - } - } - - size_t new_num = 0; - for (size_t i = 0; i < item_num; i++) { - // Try to find it in the existing items - size_t e_pos; - for (e_pos = 1; e_pos <= b4_used; e_pos++) { - if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs, - kMaxPredictSize) == 0) - break; - } - if (e_pos <= b4_used) - continue; - - // If not found, append it to the buffer - npre_items[new_num] = npre_items[i]; - new_num++; - } - - return new_num; -} - -uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, - uint16 str_max) { - if (!initialized_ || id_lemma >= start_id_[kMaxLemmaSize] || NULL == str_buf - || str_max <= 1) - return 0; - - // Find the range - for (uint16 i = 0; i < kMaxLemmaSize; i++) { - if (i + 1 > str_max - 1) - return 0; - if (start_id_[i] <= id_lemma && start_id_[i + 1] > id_lemma) { - size_t id_span = id_lemma - start_id_[i]; - - uint16 *buf = buf_ + start_pos_[i] + id_span * (i + 1); - for (uint16 len = 0; len <= i; len++) { - str_buf[len] = buf[len]; - } - str_buf[i+1] = (char16)'\0'; - return i + 1; - } - } - return 0; -} - -uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid, - uint16 *splids, uint16 max_splids) { - char16 *hz_found = static_cast<char16*> - (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), 
cmp_hanzis_1)); - assert(NULL != hz_found && hanzi == *hz_found); - - // Move to the first one. - while (hz_found > scis_hz_ && hanzi == *(hz_found - 1)) - hz_found--; - - // First try to found if strict comparison result is not zero. - char16 *hz_f = hz_found; - bool strict = false; - while (hz_f < scis_hz_ + scis_num_ && hanzi == *hz_f) { - uint16 pos = hz_f - scis_hz_; - if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) { - strict = true; - } - hz_f++; - } - - uint16 found_num = 0; - while (hz_found < scis_hz_ + scis_num_ && hanzi == *hz_found) { - uint16 pos = hz_found - scis_hz_; - if (0 == half_splid || - (strict && scis_splid_[pos].half_splid == half_splid) || - (!strict && spl_trie_->half_full_compatible(half_splid, - scis_splid_[pos].full_splid))) { - assert(found_num + 1 < max_splids); - splids[found_num] = scis_splid_[pos].full_splid; - found_num++; - } - hz_found++; - } - - return found_num; -} - -LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) { - if (NULL == str || str_len > kMaxLemmaSize) - return 0; - - char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]); - if (NULL == found) - return 0; - - assert(found > buf_); - assert(static_cast<size_t>(found - buf_) >= start_pos_[str_len - 1]); - return static_cast<LemmaIdType> - (start_id_[str_len - 1] + - (found - buf_ - start_pos_[str_len - 1]) / str_len); -} - -void DictList::convert_to_hanzis(char16 *str, uint16 str_len) { - assert(NULL != str); - - for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { - str[str_pos] = scis_hz_[str[str_pos]]; - } -} - -void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) { - assert(NULL != str); - - for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { - str[str_pos] = 0x100; - } -} - -bool DictList::save_list(FILE *fp) { - if (!initialized_ || NULL == fp) - return false; - - if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] || - NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_) - 
return false; - - if (fwrite(&scis_num_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) != - kMaxLemmaSize + 1) - return false; - - if (fwrite(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) != - kMaxLemmaSize + 1) - return false; - - if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) - return false; - - if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) - return false; - - if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != - start_pos_[kMaxLemmaSize]) - return false; - - return true; -} - -bool DictList::load_list(FILE *fp) { - if (NULL == fp) - return false; - - initialized_ = false; - - if (fread(&scis_num_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) != - kMaxLemmaSize + 1) - return false; - - if (fread(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) != - kMaxLemmaSize + 1) - return false; - - free_resource(); - - if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_)) - return false; - - if (fread(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) - return false; - - if (fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) - return false; - - if (fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != - start_pos_[kMaxLemmaSize]) - return false; - - initialized_ = true; - return true; -} -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp deleted file mode 100644 index 0cdd0982..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp +++ /dev/null @@ -1,941 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <assert.h> -#include <stdio.h> -#include <string.h> -#include "../include/dicttrie.h" -#include "../include/dictbuilder.h" -#include "../include/lpicache.h" -#include "../include/mystdlib.h" -#include "../include/ngram.h" - -namespace ime_pinyin { - -DictTrie::DictTrie() { - spl_trie_ = SpellingTrie::get_cpinstance(); - - root_ = NULL; - splid_le0_index_ = NULL; - lma_node_num_le0_ = 0; - nodes_ge1_ = NULL; - lma_node_num_ge1_ = 0; - lma_idx_buf_ = NULL; - lma_idx_buf_len_ = 0; - total_lma_num_ = 0; - top_lmas_num_ = 0; - dict_list_ = NULL; - - parsing_marks_ = NULL; - mile_stones_ = NULL; - reset_milestones(0, kFirstValidMileStoneHandle); -} - -DictTrie::~DictTrie() { - free_resource(true); -} - -void DictTrie::free_resource(bool free_dict_list) { - if (NULL != root_) - free(root_); - root_ = NULL; - - if (NULL != splid_le0_index_) - free(splid_le0_index_); - splid_le0_index_ = NULL; - - if (NULL != nodes_ge1_) - free(nodes_ge1_); - nodes_ge1_ = NULL; - - if (NULL != lma_idx_buf_) - free(lma_idx_buf_); - lma_idx_buf_ = NULL; - - if (free_dict_list) { - if (NULL != dict_list_) { - delete dict_list_; - } - dict_list_ = NULL; - } - - if (parsing_marks_) - delete [] parsing_marks_; - parsing_marks_ = NULL; - - if (mile_stones_) - delete [] mile_stones_; - mile_stones_ = NULL; - - reset_milestones(0, kFirstValidMileStoneHandle); -} - -inline size_t DictTrie::get_son_offset(const LmaNodeGE1 *node) { - return ((size_t)node->son_1st_off_l + ((size_t)node->son_1st_off_h << 16)); -} - -inline size_t DictTrie::get_homo_idx_buf_offset(const 
LmaNodeGE1 *node) { - return ((size_t)node->homo_idx_buf_off_l + - ((size_t)node->homo_idx_buf_off_h << 16)); -} - -inline LemmaIdType DictTrie::get_lemma_id(size_t id_offset) { - LemmaIdType id = 0; - for (uint16 pos = kLemmaIdSize - 1; pos > 0; pos--) - id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + pos]; - id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize]; - return id; -} - -#ifdef ___BUILD_MODEL___ -bool DictTrie::build_dict(const char* fn_raw, const char* fn_validhzs) { - DictBuilder* dict_builder = new DictBuilder(); - - free_resource(true); - - return dict_builder->build_dict(fn_raw, fn_validhzs, this); -} - -bool DictTrie::save_dict(FILE *fp) { - if (NULL == fp) - return false; - - if (fwrite(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(&top_lmas_num_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp) - != lma_node_num_le0_) - return false; - - if (fwrite(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp) - != lma_node_num_ge1_) - return false; - - if (fwrite(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) != - lma_idx_buf_len_) - return false; - - return true; -} - -bool DictTrie::save_dict(const char *filename) { - if (NULL == filename) - return false; - - if (NULL == root_ || NULL == dict_list_) - return false; - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - NGram &ngram = NGram::get_instance(); - - FILE *fp = fopen(filename, "wb"); - if (NULL == fp) - return false; - - if (!spl_trie.save_spl_trie(fp) || !dict_list_->save_list(fp) || - !save_dict(fp) || !ngram.save_ngram(fp)) { - fclose(fp); - return false; - } - - fclose(fp); - return true; -} -#endif // ___BUILD_MODEL___ - -bool DictTrie::load_dict(FILE *fp) { - if (NULL == fp) - return false; - if 
(fread(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(&top_lmas_num_, sizeof(uint32), 1, fp) != 1 || - top_lmas_num_ >= lma_idx_buf_len_) - return false; - - free_resource(false); - - root_ = static_cast<LmaNodeLE0*> - (malloc(lma_node_num_le0_ * sizeof(LmaNodeLE0))); - nodes_ge1_ = static_cast<LmaNodeGE1*> - (malloc(lma_node_num_ge1_ * sizeof(LmaNodeGE1))); - lma_idx_buf_ = (unsigned char*)malloc(lma_idx_buf_len_); - total_lma_num_ = lma_idx_buf_len_ / kLemmaIdSize; - - size_t buf_size = SpellingTrie::get_instance().get_spelling_num() + 1; - assert(lma_node_num_le0_ <= buf_size); - splid_le0_index_ = static_cast<uint16*>(malloc(buf_size * sizeof(uint16))); - - // Init the space for parsing. - parsing_marks_ = new ParsingMark[kMaxParsingMark]; - mile_stones_ = new MileStone[kMaxMileStone]; - reset_milestones(0, kFirstValidMileStoneHandle); - - if (NULL == root_ || NULL == nodes_ge1_ || NULL == lma_idx_buf_ || - NULL == splid_le0_index_ || NULL == parsing_marks_ || - NULL == mile_stones_) { - free_resource(false); - return false; - } - - if (fread(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp) - != lma_node_num_le0_) - return false; - - if (fread(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp) - != lma_node_num_ge1_) - return false; - - if (fread(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) != - lma_idx_buf_len_) - return false; - - // The quick index for the first level sons - uint16 last_splid = kFullSplIdStart; - size_t last_pos = 0; - for (size_t i = 1; i < lma_node_num_le0_; i++) { - for (uint16 splid = last_splid; splid < root_[i].spl_idx; splid++) - splid_le0_index_[splid - kFullSplIdStart] = last_pos; - - splid_le0_index_[root_[i].spl_idx - kFullSplIdStart] = - static_cast<uint16>(i); - last_splid = root_[i].spl_idx; - last_pos = i; - } - - for 
(uint16 splid = last_splid + 1; - splid < buf_size + kFullSplIdStart; splid++) { - assert(static_cast<size_t>(splid - kFullSplIdStart) < buf_size); - splid_le0_index_[splid - kFullSplIdStart] = last_pos + 1; - } - - return true; -} - -bool DictTrie::load_dict(const char *filename, LemmaIdType start_id, - LemmaIdType end_id) { - if (NULL == filename || end_id <= start_id) - return false; - - FILE *fp = fopen(filename, "rb"); - if (NULL == fp) - return false; - - free_resource(true); - - dict_list_ = new DictList(); - if (NULL == dict_list_) { - fclose(fp); - return false; - } - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - NGram &ngram = NGram::get_instance(); - - if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) || - !load_dict(fp) || !ngram.load_ngram(fp) || - total_lma_num_ > end_id - start_id + 1) { - free_resource(true); - fclose(fp); - return false; - } - - fclose(fp); - return true; -} - -bool DictTrie::load_dict_fd(int sys_fd, long start_offset, - long length, LemmaIdType start_id, - LemmaIdType end_id) { - if (start_offset < 0 || length <= 0 || end_id <= start_id) - return false; - - FILE *fp = fdopen(sys_fd, "rb"); - if (NULL == fp) - return false; - - if (-1 == fseek(fp, start_offset, SEEK_SET)) { - fclose(fp); - return false; - } - - free_resource(true); - - dict_list_ = new DictList(); - if (NULL == dict_list_) { - fclose(fp); - return false; - } - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - NGram &ngram = NGram::get_instance(); - - if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) || - !load_dict(fp) || !ngram.load_ngram(fp) || - ftell(fp) < start_offset + length || - total_lma_num_ > end_id - start_id + 1) { - free_resource(true); - fclose(fp); - return false; - } - - fclose(fp); - return true; -} - -size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max, - LmaNodeLE0 *node) { - size_t lpi_num = 0; - NGram& ngram = NGram::get_instance(); - for (size_t homo = 0; homo < 
(size_t)node->num_of_homo; homo++) { - lpi_items[lpi_num].id = get_lemma_id(node->homo_idx_buf_off + - homo); - lpi_items[lpi_num].lma_len = 1; - lpi_items[lpi_num].psb = - static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id)); - lpi_num++; - if (lpi_num >= lpi_max) - break; - } - - return lpi_num; -} - -size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max, - size_t homo_buf_off, LmaNodeGE1 *node, - uint16 lma_len) { - size_t lpi_num = 0; - NGram& ngram = NGram::get_instance(); - for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) { - lpi_items[lpi_num].id = get_lemma_id(homo_buf_off + homo); - lpi_items[lpi_num].lma_len = lma_len; - lpi_items[lpi_num].psb = - static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id)); - lpi_num++; - if (lpi_num >= lpi_max) - break; - } - - return lpi_num; -} - -void DictTrie::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { - if (0 == from_step) { - parsing_marks_pos_ = 0; - mile_stones_pos_ = kFirstValidMileStoneHandle; - } else { - if (from_handle > 0 && from_handle < mile_stones_pos_) { - mile_stones_pos_ = from_handle; - - MileStone *mile_stone = mile_stones_ + from_handle; - parsing_marks_pos_ = mile_stone->mark_start; - } - } -} - -MileStoneHandle DictTrie::extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, size_t lpi_max, - size_t *lpi_num) { - if (NULL == dep) - return 0; - - // from LmaNodeLE0 (root) to LmaNodeLE0 - if (0 == from_handle) { - assert(0 == dep->splids_extended); - return extend_dict0(from_handle, dep, lpi_items, lpi_max, lpi_num); - } - - // from LmaNodeLE0 to LmaNodeGE1 - if (1 == dep->splids_extended) - return extend_dict1(from_handle, dep, lpi_items, lpi_max, lpi_num); - - // From LmaNodeGE1 to LmaNodeGE1 - return extend_dict2(from_handle, dep, lpi_items, lpi_max, lpi_num); -} - -MileStoneHandle DictTrie::extend_dict0(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem 
*lpi_items, - size_t lpi_max, size_t *lpi_num) { - assert(NULL != dep && 0 == from_handle); - *lpi_num = 0; - MileStoneHandle ret_handle = 0; - - uint16 splid = dep->splids[dep->splids_extended]; - uint16 id_start = dep->id_start; - uint16 id_num = dep->id_num; - - LpiCache& lpi_cache = LpiCache::get_instance(); - bool cached = lpi_cache.is_cached(splid); - - // 2. Begin exgtending - // 2.1 Get the LmaPsbItem list - LmaNodeLE0 *node = root_; - size_t son_start = splid_le0_index_[id_start - kFullSplIdStart]; - size_t son_end = splid_le0_index_[id_start + id_num - kFullSplIdStart]; - for (size_t son_pos = son_start; son_pos < son_end; son_pos++) { - assert(1 == node->son_1st_off); - LmaNodeLE0 *son = root_ + son_pos; - assert(son->spl_idx >= id_start && son->spl_idx < id_start + id_num); - - if (!cached && *lpi_num < lpi_max) { - bool need_lpi = true; - if (spl_trie_->is_half_id_yunmu(splid) && son_pos != son_start) - need_lpi = false; - - if (need_lpi) - *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), - lpi_max - *lpi_num, son); - } - - // If necessary, fill in a new mile stone. 
- if (son->spl_idx == id_start) { - if (mile_stones_pos_ < kMaxMileStone && - parsing_marks_pos_ < kMaxParsingMark) { - parsing_marks_[parsing_marks_pos_].node_offset = son_pos; - parsing_marks_[parsing_marks_pos_].node_num = id_num; - mile_stones_[mile_stones_pos_].mark_start = parsing_marks_pos_; - mile_stones_[mile_stones_pos_].mark_num = 1; - ret_handle = mile_stones_pos_; - parsing_marks_pos_++; - mile_stones_pos_++; - } - } - - if (son->spl_idx >= id_start + id_num -1) - break; - } - - // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, - // mile_stones_pos_); - return ret_handle; -} - -MileStoneHandle DictTrie::extend_dict1(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num) { - assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_); - - MileStoneHandle ret_handle = 0; - - // 1. If this is a half Id, get its corresponding full starting Id and - // number of full Id. - size_t ret_val = 0; - - uint16 id_start = dep->id_start; - uint16 id_num = dep->id_num; - - // 2. Begin extending. 
- MileStone *mile_stone = mile_stones_ + from_handle; - - for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) { - ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos]; - uint16 ext_num = p_mark.node_num; - for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) { - LmaNodeLE0 *node = root_ + p_mark.node_offset + ext_pos; - size_t found_start = 0; - size_t found_num = 0; - for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) { - assert(node->son_1st_off <= lma_node_num_ge1_); - LmaNodeGE1 *son = nodes_ge1_ + node->son_1st_off + son_pos; - if (son->spl_idx >= id_start - && son->spl_idx < id_start + id_num) { - if (*lpi_num < lpi_max) { - size_t homo_buf_off = get_homo_idx_buf_offset(son); - *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), - lpi_max - *lpi_num, homo_buf_off, son, - 2); - } - - // If necessary, fill in the new DTMI - if (0 == found_num) { - found_start = son_pos; - } - found_num++; - } - if (son->spl_idx >= id_start + id_num - 1 || son_pos == - (size_t)node->num_of_son - 1) { - if (found_num > 0) { - if (mile_stones_pos_ < kMaxMileStone && - parsing_marks_pos_ < kMaxParsingMark) { - parsing_marks_[parsing_marks_pos_].node_offset = - node->son_1st_off + found_start; - parsing_marks_[parsing_marks_pos_].node_num = found_num; - if (0 == ret_val) - mile_stones_[mile_stones_pos_].mark_start = - parsing_marks_pos_; - parsing_marks_pos_++; - } - - ret_val++; - } - break; - } // for son_pos - } // for ext_pos - } // for h_pos - } - - if (ret_val > 0) { - mile_stones_[mile_stones_pos_].mark_num = ret_val; - ret_handle = mile_stones_pos_; - mile_stones_pos_++; - ret_val = 1; - } - - // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, - // mile_stones_pos_); - return ret_handle; -} - -MileStoneHandle DictTrie::extend_dict2(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num) { - assert(NULL != dep && from_handle > 0 && from_handle < 
mile_stones_pos_); - - MileStoneHandle ret_handle = 0; - - // 1. If this is a half Id, get its corresponding full starting Id and - // number of full Id. - size_t ret_val = 0; - - uint16 id_start = dep->id_start; - uint16 id_num = dep->id_num; - - // 2. Begin extending. - MileStone *mile_stone = mile_stones_ + from_handle; - - for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) { - ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos]; - uint16 ext_num = p_mark.node_num; - for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) { - LmaNodeGE1 *node = nodes_ge1_ + p_mark.node_offset + ext_pos; - size_t found_start = 0; - size_t found_num = 0; - - for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) { - assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0); - LmaNodeGE1 *son = nodes_ge1_ + get_son_offset(node) + son_pos; - if (son->spl_idx >= id_start - && son->spl_idx < id_start + id_num) { - if (*lpi_num < lpi_max) { - size_t homo_buf_off = get_homo_idx_buf_offset(son); - *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), - lpi_max - *lpi_num, homo_buf_off, son, - dep->splids_extended + 1); - } - - // If necessary, fill in the new DTMI - if (0 == found_num) { - found_start = son_pos; - } - found_num++; - } - if (son->spl_idx >= id_start + id_num - 1 || son_pos == - (size_t)node->num_of_son - 1) { - if (found_num > 0) { - if (mile_stones_pos_ < kMaxMileStone && - parsing_marks_pos_ < kMaxParsingMark) { - parsing_marks_[parsing_marks_pos_].node_offset = - get_son_offset(node) + found_start; - parsing_marks_[parsing_marks_pos_].node_num = found_num; - if (0 == ret_val) - mile_stones_[mile_stones_pos_].mark_start = - parsing_marks_pos_; - parsing_marks_pos_++; - } - - ret_val++; - } - break; - } - } // for son_pos - } // for ext_pos - } // for h_pos - - if (ret_val > 0) { - mile_stones_[mile_stones_pos_].mark_num = ret_val; - ret_handle = mile_stones_pos_; - mile_stones_pos_++; - } - - // printf("----- parsing marks: %d, 
mile stone: %d \n", parsing_marks_pos_, - // mile_stones_pos_); - return ret_handle; -} - -bool DictTrie::try_extend(const uint16 *splids, uint16 splid_num, - LemmaIdType id_lemma) { - if (0 == splid_num || NULL == splids) - return false; - - void *node = root_ + splid_le0_index_[splids[0] - kFullSplIdStart]; - - for (uint16 pos = 1; pos < splid_num; pos++) { - if (1 == pos) { - LmaNodeLE0 *node_le0 = reinterpret_cast<LmaNodeLE0*>(node); - LmaNodeGE1 *node_son; - uint16 son_pos; - for (son_pos = 0; son_pos < static_cast<uint16>(node_le0->num_of_son); - son_pos++) { - assert(node_le0->son_1st_off <= lma_node_num_ge1_); - node_son = nodes_ge1_ + node_le0->son_1st_off - + son_pos; - if (node_son->spl_idx == splids[pos]) - break; - } - if (son_pos < node_le0->num_of_son) - node = reinterpret_cast<void*>(node_son); - else - return false; - } else { - LmaNodeGE1 *node_ge1 = reinterpret_cast<LmaNodeGE1*>(node); - LmaNodeGE1 *node_son; - uint16 son_pos; - for (son_pos = 0; son_pos < static_cast<uint16>(node_ge1->num_of_son); - son_pos++) { - assert(node_ge1->son_1st_off_l > 0 || node_ge1->son_1st_off_h > 0); - node_son = nodes_ge1_ + get_son_offset(node_ge1) + son_pos; - if (node_son->spl_idx == splids[pos]) - break; - } - if (son_pos < node_ge1->num_of_son) - node = reinterpret_cast<void*>(node_son); - else - return false; - } - } - - if (1 == splid_num) { - LmaNodeLE0* node_le0 = reinterpret_cast<LmaNodeLE0*>(node); - size_t num_of_homo = (size_t)node_le0->num_of_homo; - for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { - LemmaIdType id_this = get_lemma_id(node_le0->homo_idx_buf_off + homo_pos); - char16 str[2]; - get_lemma_str(id_this, str, 2); - if (id_this == id_lemma) - return true; - } - } else { - LmaNodeGE1* node_ge1 = reinterpret_cast<LmaNodeGE1*>(node); - size_t num_of_homo = (size_t)node_ge1->num_of_homo; - for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { - size_t node_homo_off = get_homo_idx_buf_offset(node_ge1); - if 
(get_lemma_id(node_homo_off + homo_pos) == id_lemma) - return true; - } - } - - return false; -} - -size_t DictTrie::get_lpis(const uint16* splid_str, uint16 splid_str_len, - LmaPsbItem* lma_buf, size_t max_lma_buf) { - if (splid_str_len > kMaxLemmaSize) - return 0; - -#define MAX_EXTENDBUF_LEN 200 - - size_t* node_buf1[MAX_EXTENDBUF_LEN]; // use size_t for data alignment - size_t* node_buf2[MAX_EXTENDBUF_LEN]; - LmaNodeLE0** node_fr_le0 = - reinterpret_cast<LmaNodeLE0**>(node_buf1); // Nodes from. - LmaNodeLE0** node_to_le0 = - reinterpret_cast<LmaNodeLE0**>(node_buf2); // Nodes to. - LmaNodeGE1** node_fr_ge1 = NULL; - LmaNodeGE1** node_to_ge1 = NULL; - size_t node_fr_num = 1; - size_t node_to_num = 0; - node_fr_le0[0] = root_; - if (NULL == node_fr_le0[0]) - return 0; - - size_t spl_pos = 0; - - while (spl_pos < splid_str_len) { - uint16 id_num = 1; - uint16 id_start = splid_str[spl_pos]; - // If it is a half id - if (spl_trie_->is_half_id(splid_str[spl_pos])) { - id_num = spl_trie_->half_to_full(splid_str[spl_pos], &id_start); - assert(id_num > 0); - } - - // Extend the nodes - if (0 == spl_pos) { // From LmaNodeLE0 (root) to LmaNodeLE0 nodes - for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { - LmaNodeLE0 *node = node_fr_le0[node_fr_pos]; - assert(node == root_ && 1 == node_fr_num); - size_t son_start = splid_le0_index_[id_start - kFullSplIdStart]; - size_t son_end = - splid_le0_index_[id_start + id_num - kFullSplIdStart]; - for (size_t son_pos = son_start; son_pos < son_end; son_pos++) { - assert(1 == node->son_1st_off); - LmaNodeLE0 *node_son = root_ + son_pos; - assert(node_son->spl_idx >= id_start - && node_son->spl_idx < id_start + id_num); - if (node_to_num < MAX_EXTENDBUF_LEN) { - node_to_le0[node_to_num] = node_son; - node_to_num++; - } - // id_start + id_num - 1 is the last one, which has just been - // recorded. 
- if (node_son->spl_idx >= id_start + id_num - 1) - break; - } - } - - spl_pos++; - if (spl_pos >= splid_str_len || node_to_num == 0) - break; - // Prepare the nodes for next extending - // next time, from LmaNodeLE0 to LmaNodeGE1 - LmaNodeLE0** node_tmp = node_fr_le0; - node_fr_le0 = node_to_le0; - node_to_le0 = NULL; - node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_tmp); - } else if (1 == spl_pos) { // From LmaNodeLE0 to LmaNodeGE1 nodes - for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { - LmaNodeLE0 *node = node_fr_le0[node_fr_pos]; - for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; - son_pos++) { - assert(node->son_1st_off <= lma_node_num_ge1_); - LmaNodeGE1 *node_son = nodes_ge1_ + node->son_1st_off - + son_pos; - if (node_son->spl_idx >= id_start - && node_son->spl_idx < id_start + id_num) { - if (node_to_num < MAX_EXTENDBUF_LEN) { - node_to_ge1[node_to_num] = node_son; - node_to_num++; - } - } - // id_start + id_num - 1 is the last one, which has just been - // recorded. 
- if (node_son->spl_idx >= id_start + id_num - 1) - break; - } - } - - spl_pos++; - if (spl_pos >= splid_str_len || node_to_num == 0) - break; - // Prepare the nodes for next extending - // next time, from LmaNodeGE1 to LmaNodeGE1 - node_fr_ge1 = node_to_ge1; - node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_fr_le0); - node_fr_le0 = NULL; - node_to_le0 = NULL; - } else { // From LmaNodeGE1 to LmaNodeGE1 nodes - for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { - LmaNodeGE1 *node = node_fr_ge1[node_fr_pos]; - for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; - son_pos++) { - assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0); - LmaNodeGE1 *node_son = nodes_ge1_ - + get_son_offset(node) + son_pos; - if (node_son->spl_idx >= id_start - && node_son->spl_idx < id_start + id_num) { - if (node_to_num < MAX_EXTENDBUF_LEN) { - node_to_ge1[node_to_num] = node_son; - node_to_num++; - } - } - // id_start + id_num - 1 is the last one, which has just been - // recorded. - if (node_son->spl_idx >= id_start + id_num - 1) - break; - } - } - - spl_pos++; - if (spl_pos >= splid_str_len || node_to_num == 0) - break; - // Prepare the nodes for next extending - // next time, from LmaNodeGE1 to LmaNodeGE1 - LmaNodeGE1 **node_tmp = node_fr_ge1; - node_fr_ge1 = node_to_ge1; - node_to_ge1 = node_tmp; - } - - // The number of node for next extending - node_fr_num = node_to_num; - node_to_num = 0; - } // while - - if (0 == node_to_num) - return 0; - - NGram &ngram = NGram::get_instance(); - size_t lma_num = 0; - - // If the length is 1, and the splid is a one-char Yunmu like 'a', 'o', 'e', - // only those candidates for the full matched one-char id will be returned. - if (1 == splid_str_len && spl_trie_->is_half_id_yunmu(splid_str[0])) - node_to_num = node_to_num > 0 ? 
1 : 0; - - for (size_t node_pos = 0; node_pos < node_to_num; node_pos++) { - size_t num_of_homo = 0; - if (spl_pos <= 1) { // Get from LmaNodeLE0 nodes - LmaNodeLE0* node_le0 = node_to_le0[node_pos]; - num_of_homo = (size_t)node_le0->num_of_homo; - for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { - size_t ch_pos = lma_num + homo_pos; - lma_buf[ch_pos].id = - get_lemma_id(node_le0->homo_idx_buf_off + homo_pos); - lma_buf[ch_pos].lma_len = 1; - lma_buf[ch_pos].psb = - static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id)); - - if (lma_num + homo_pos >= max_lma_buf - 1) - break; - } - } else { // Get from LmaNodeGE1 nodes - LmaNodeGE1* node_ge1 = node_to_ge1[node_pos]; - num_of_homo = (size_t)node_ge1->num_of_homo; - for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { - size_t ch_pos = lma_num + homo_pos; - size_t node_homo_off = get_homo_idx_buf_offset(node_ge1); - lma_buf[ch_pos].id = get_lemma_id(node_homo_off + homo_pos); - lma_buf[ch_pos].lma_len = splid_str_len; - lma_buf[ch_pos].psb = - static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id)); - - if (lma_num + homo_pos >= max_lma_buf - 1) - break; - } - } - - lma_num += num_of_homo; - if (lma_num >= max_lma_buf) { - lma_num = max_lma_buf; - break; - } - } - return lma_num; -} - -uint16 DictTrie::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, - uint16 str_max) { - return dict_list_->get_lemma_str(id_lemma, str_buf, str_max); -} - -uint16 DictTrie::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid) { - char16 lma_str[kMaxLemmaSize + 1]; - uint16 lma_len = get_lemma_str(id_lemma, lma_str, kMaxLemmaSize + 1); - assert((!arg_valid && splids_max >= lma_len) || lma_len == splids_max); - - uint16 spl_mtrx[kMaxLemmaSize * 5]; - uint16 spl_start[kMaxLemmaSize + 1]; - spl_start[0] = 0; - uint16 try_num = 1; - - for (uint16 pos = 0; pos < lma_len; pos++) { - uint16 cand_splids_this = 0; - if (arg_valid && 
spl_trie_->is_full_id(splids[pos])) { - spl_mtrx[spl_start[pos]] = splids[pos]; - cand_splids_this = 1; - } else { - cand_splids_this = dict_list_->get_splids_for_hanzi(lma_str[pos], - arg_valid ? splids[pos] : 0, spl_mtrx + spl_start[pos], - kMaxLemmaSize * 5 - spl_start[pos]); - assert(cand_splids_this > 0); - } - spl_start[pos + 1] = spl_start[pos] + cand_splids_this; - try_num *= cand_splids_this; - } - - for (uint16 try_pos = 0; try_pos < try_num; try_pos++) { - uint16 mod = 1; - for (uint16 pos = 0; pos < lma_len; pos++) { - uint16 radix = spl_start[pos + 1] - spl_start[pos]; - splids[pos] = spl_mtrx[ spl_start[pos] + try_pos / mod % radix]; - mod *= radix; - } - - if (try_extend(splids, lma_len, id_lemma)) - return lma_len; - } - - return 0; -} - -void DictTrie::set_total_lemma_count_of_others(size_t count) { - NGram& ngram = NGram::get_instance(); - ngram.set_total_freq_none_sys(count); -} - -void DictTrie::convert_to_hanzis(char16 *str, uint16 str_len) { - return dict_list_->convert_to_hanzis(str, str_len); -} - -void DictTrie::convert_to_scis_ids(char16 *str, uint16 str_len) { - return dict_list_->convert_to_scis_ids(str, str_len); -} - -LemmaIdType DictTrie::get_lemma_id(const char16 lemma_str[], uint16 lemma_len) { - if (NULL == lemma_str || lemma_len > kMaxLemmaSize) - return 0; - - return dict_list_->get_lemma_id(lemma_str, lemma_len); -} - -size_t DictTrie::predict_top_lmas(size_t his_len, NPredictItem *npre_items, - size_t npre_max, size_t b4_used) { - NGram &ngram = NGram::get_instance(); - - size_t item_num = 0; - size_t top_lmas_id_offset = lma_idx_buf_len_ / kLemmaIdSize - top_lmas_num_; - size_t top_lmas_pos = 0; - while (item_num < npre_max && top_lmas_pos < top_lmas_num_) { - memset(npre_items + item_num, 0, sizeof(NPredictItem)); - LemmaIdType top_lma_id = get_lemma_id(top_lmas_id_offset + top_lmas_pos); - top_lmas_pos += 1; - if (dict_list_->get_lemma_str(top_lma_id, - npre_items[item_num].pre_hzs, - kMaxLemmaSize - 1) == 0) { - continue; - 
} - npre_items[item_num].psb = ngram.get_uni_psb(top_lma_id); - npre_items[item_num].his_len = his_len; - item_num++; - } - return item_num; -} - -size_t DictTrie::predict(const char16 *last_hzs, uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used) { - return dict_list_->predict(last_hzs, hzs_len, npre_items, npre_max, b4_used); -} -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/lpicache.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/lpicache.cpp deleted file mode 100644 index 4bb4ca26..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/lpicache.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <assert.h> -#include "../include/lpicache.h" - -namespace ime_pinyin { - -LpiCache* LpiCache::instance_ = NULL; - -LpiCache::LpiCache() { - lpi_cache_ = new LmaPsbItem[kFullSplIdStart * kMaxLpiCachePerId]; - lpi_cache_len_ = new uint16[kFullSplIdStart]; - assert(NULL != lpi_cache_); - assert(NULL != lpi_cache_len_); - for (uint16 id = 0; id < kFullSplIdStart; id++) - lpi_cache_len_[id] = 0; -} - -LpiCache::~LpiCache() { - if (NULL != lpi_cache_) - delete [] lpi_cache_; - - if (NULL != lpi_cache_len_) - delete [] lpi_cache_len_; -} - -LpiCache& LpiCache::get_instance() { - if (NULL == instance_) { - instance_ = new LpiCache(); - assert(NULL != instance_); - } - return *instance_; -} - -bool LpiCache::is_cached(uint16 splid) { - if (splid >= kFullSplIdStart) - return false; - return lpi_cache_len_[splid] != 0; -} - -size_t LpiCache::put_cache(uint16 splid, LmaPsbItem lpi_items[], - size_t lpi_num) { - uint16 num = kMaxLpiCachePerId; - if (num > lpi_num) - num = static_cast<uint16>(lpi_num); - - LmaPsbItem *lpi_cache_this = lpi_cache_ + splid * kMaxLpiCachePerId; - for (uint16 pos = 0; pos < num; pos++) - lpi_cache_this[pos] = lpi_items[pos]; - - lpi_cache_len_[splid] = num; - return num; -} - -size_t LpiCache::get_cache(uint16 splid, LmaPsbItem lpi_items[], - size_t lpi_max) { - if (lpi_max > lpi_cache_len_[splid]) - lpi_max = lpi_cache_len_[splid]; - - LmaPsbItem *lpi_cache_this = lpi_cache_ + splid * kMaxLpiCachePerId; - for (uint16 pos = 0; pos < lpi_max; pos++) { - lpi_items[pos] = lpi_cache_this[pos]; - } - return lpi_max; -} - -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/matrixsearch.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/matrixsearch.cpp deleted file mode 100644 index 41e11433..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/matrixsearch.cpp +++ /dev/null @@ -1,1981 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 
(the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <assert.h> -#include <math.h> -#include <stdio.h> -#include <string.h> -#include "../include/lpicache.h" -#include "../include/matrixsearch.h" -#include "../include/mystdlib.h" -#include "../include/ngram.h" -#include "../include/userdict.h" - -namespace ime_pinyin { - -#define PRUMING_SCORE 8000.0 - -MatrixSearch::MatrixSearch() { - inited_ = false; - spl_trie_ = SpellingTrie::get_cpinstance(); - - reset_pointers_to_null(); - - pys_decoded_len_ = 0; - mtrx_nd_pool_used_ = 0; - dmi_pool_used_ = 0; - xi_an_enabled_ = false; - dmi_c_phrase_ = false; - - assert(kMaxSearchSteps > 0); - max_sps_len_ = kMaxSearchSteps - 1; - max_hzs_len_ = kMaxSearchSteps; -} - -MatrixSearch::~MatrixSearch() { - free_resource(); -} - -void MatrixSearch::reset_pointers_to_null() { - dict_trie_ = NULL; - user_dict_ = NULL; - spl_parser_ = NULL; - - share_buf_ = NULL; - - // The following four buffers are used for decoding, and they are based on - // share_buf_, no need to delete them. - mtrx_nd_pool_ = NULL; - dmi_pool_ = NULL; - matrix_ = NULL; - dep_ = NULL; - - // Based on share_buf_, no need to delete them. 
- npre_items_ = NULL; -} - -bool MatrixSearch::alloc_resource() { - free_resource(); - - dict_trie_ = new DictTrie(); - user_dict_ = static_cast<AtomDictBase*>(new UserDict()); - spl_parser_ = new SpellingParser(); - - size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize; - mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t); - size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize; - dmi_size = align_to_size_t(dmi_size) / sizeof(size_t); - size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum; - matrix_size = align_to_size_t(matrix_size) / sizeof(size_t); - size_t dep_size = sizeof(DictExtPara); - dep_size = align_to_size_t(dep_size) / sizeof(size_t); - - // share_buf's size is determined by the buffers for search. - share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size]; - - if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ || - NULL == share_buf_) - return false; - - // The buffers for search are based on the share buffer - mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_); - dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size); - matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size); - dep_ = reinterpret_cast<DictExtPara*> - (share_buf_ + mtrx_nd_size + dmi_size + matrix_size); - - // The prediction buffer is also based on the share buffer. 
- npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_); - npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) * - sizeof(size_t) / sizeof(NPredictItem); - return true; -} - -void MatrixSearch::free_resource() { - if (NULL != dict_trie_) - delete dict_trie_; - - if (NULL != user_dict_) - delete user_dict_; - - if (NULL != spl_parser_) - delete spl_parser_; - - if (NULL != share_buf_) - delete [] share_buf_; - - reset_pointers_to_null(); -} - -bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) { - if (NULL == fn_sys_dict || NULL == fn_usr_dict) - return false; - - if (!alloc_resource()) - return false; - - if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd)) - return false; - - // If engine fails to load the user dictionary, reset the user dictionary - // to NULL. - if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { - delete user_dict_; - user_dict_ = NULL; - } else{ - user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq); - } - - reset_search0(); - - inited_ = true; - return true; -} - -bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length, - const char *fn_usr_dict) { - if (NULL == fn_usr_dict) - return false; - - if (!alloc_resource()) - return false; - - if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd)) - return false; - - if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { - delete user_dict_; - user_dict_ = NULL; - } else { - user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq); - } - - reset_search0(); - - inited_ = true; - return true; -} - -void MatrixSearch::init_user_dictionary(const char *fn_usr_dict) { - assert(inited_); - - if (NULL != user_dict_) { - delete user_dict_; - user_dict_ = NULL; - } - - if (NULL != fn_usr_dict) { - user_dict_ = static_cast<AtomDictBase*>(new UserDict()); - if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { - delete 
user_dict_; - user_dict_ = NULL; - } - } - - reset_search0(); -} - -bool MatrixSearch::is_user_dictionary_enabled() const { - return NULL != user_dict_; -} - -void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) { - if (0 != max_sps_len) - max_sps_len_ = max_sps_len; - if (0 != max_hzs_len) - max_hzs_len_ = max_hzs_len; -} - -void MatrixSearch::close() { - flush_cache(); - free_resource(); - inited_ = false; -} - -void MatrixSearch::flush_cache() { - if (NULL != user_dict_) - user_dict_->flush_cache(); -} - -void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) { - xi_an_enabled_ = xi_an_enabled; -} - -bool MatrixSearch::get_xi_an_switch() { - return xi_an_enabled_; -} - -bool MatrixSearch::reset_search() { - if (!inited_) - return false; - return reset_search0(); -} - -bool MatrixSearch::reset_search0() { - if (!inited_) - return false; - - pys_decoded_len_ = 0; - mtrx_nd_pool_used_ = 0; - dmi_pool_used_ = 0; - - // Get a MatrixNode from the pool - matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_; - matrix_[0].mtrx_nd_num = 1; - mtrx_nd_pool_used_ += 1; - - // Update the node, and make it to be a starting node - MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos; - node->id = 0; - node->score = 0; - node->from = NULL; - node->step = 0; - node->dmi_fr = (PoolPosType)-1; - - matrix_[0].dmi_pos = 0; - matrix_[0].dmi_num = 0; - matrix_[0].dmi_has_full_id = 1; - matrix_[0].mtrx_nd_fixed = node; - - lma_start_[0] = 0; - fixed_lmas_ = 0; - spl_start_[0] = 0; - fixed_hzs_ = 0; - - dict_trie_->reset_milestones(0, 0); - if (NULL != user_dict_) - user_dict_->reset_milestones(0, 0); - - return true; -} - -bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step, - bool clear_dmi_this_step, - bool clear_mtrx_this_step) { - if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum) - return false; - - if (0 == ch_pos) { - reset_search0(); - } else { - // Prepare mile stones of this step to clear. 
- MileStoneHandle *dict_handles_to_clear = NULL; - if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) { - dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles; - } - - // If there are more steps, and this step is not allowed to clear, find - // milestones of next step. - if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) { - dict_handles_to_clear = NULL; - if (matrix_[ch_pos + 1].dmi_num > 0) { - dict_handles_to_clear = - dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles; - } - } - - if (NULL != dict_handles_to_clear) { - dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]); - if (NULL != user_dict_) - user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]); - } - - pys_decoded_len_ = ch_pos; - - if (clear_dmi_this_step) { - dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos - + matrix_[ch_pos - 1].dmi_num; - matrix_[ch_pos].dmi_num = 0; - } else { - dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num; - } - - if (clear_mtrx_this_step) { - mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos - + matrix_[ch_pos - 1].mtrx_nd_num; - matrix_[ch_pos].mtrx_nd_num = 0; - } else { - mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos - + matrix_[ch_pos].mtrx_nd_num; - } - - // Modify fixed_hzs_ - if (fixed_hzs_ > 0 && - ((kLemmaIdComposing != lma_id_[0]) || - (kLemmaIdComposing == lma_id_[0] && - spl_start_[c_phrase_.length] <= ch_pos))) { - size_t fixed_ch_pos = ch_pos; - if (clear_fixed_this_step) - fixed_ch_pos = fixed_ch_pos > 0 ? 
fixed_ch_pos - 1 : 0; - while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0) - fixed_ch_pos--; - - fixed_lmas_ = 0; - fixed_hzs_ = 0; - if (fixed_ch_pos > 0) { - while (spl_start_[fixed_hzs_] < fixed_ch_pos) - fixed_hzs_++; - assert(spl_start_[fixed_hzs_] == fixed_ch_pos); - - while (lma_start_[fixed_lmas_] < fixed_hzs_) - fixed_lmas_++; - assert(lma_start_[fixed_lmas_] == fixed_hzs_); - } - - // Re-search the Pinyin string for the unlocked lemma - // which was previously fixed. - // - // Prepare mile stones of this step to clear. - MileStoneHandle *dict_handles_to_clear = NULL; - if (clear_dmi_this_step && ch_pos == fixed_ch_pos && - matrix_[fixed_ch_pos].dmi_num > 0) { - dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles; - } - - // If there are more steps, and this step is not allowed to clear, find - // milestones of next step. - if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) { - dict_handles_to_clear = NULL; - if (matrix_[fixed_ch_pos + 1].dmi_num > 0) { - dict_handles_to_clear = - dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles; - } - } - - if (NULL != dict_handles_to_clear) { - dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]); - if (NULL != user_dict_) - user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]); - } - - - pys_decoded_len_ = fixed_ch_pos; - - if (clear_dmi_this_step && ch_pos == fixed_ch_pos) { - dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos - + matrix_[fixed_ch_pos - 1].dmi_num; - matrix_[fixed_ch_pos].dmi_num = 0; - } else { - dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos + - matrix_[fixed_ch_pos].dmi_num; - } - - if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) { - mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos - + matrix_[fixed_ch_pos - 1].mtrx_nd_num; - matrix_[fixed_ch_pos].mtrx_nd_num = 0; - } else { - mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos - + matrix_[fixed_ch_pos].mtrx_nd_num; - } - - for (uint16 
re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) { - add_char(pys_[re_pos]); - } - } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) { - for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) { - uint16 splpos_begin = c_phrase_.sublma_start[subpos]; - uint16 splpos_end = c_phrase_.sublma_start[subpos + 1]; - for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) { - // If ch_pos is in this spelling - uint16 spl_start = c_phrase_.spl_start[splpos]; - uint16 spl_end = c_phrase_.spl_start[splpos + 1]; - if (ch_pos >= spl_start && ch_pos < spl_end) { - // Clear everything after this position - c_phrase_.chn_str[splpos] = static_cast<char16>('\0'); - c_phrase_.sublma_start[subpos + 1] = splpos; - c_phrase_.sublma_num = subpos + 1; - c_phrase_.length = splpos; - - if (splpos == splpos_begin) { - c_phrase_.sublma_num = subpos; - } - } - } - } - - // Extend the composing phrase. - reset_search0(); - dmi_c_phrase_ = true; - uint16 c_py_pos = 0; - while (c_py_pos < spl_start_[c_phrase_.length]) { - bool b_ac_tmp = add_char(pys_[c_py_pos]); - assert(b_ac_tmp); - c_py_pos++; - } - dmi_c_phrase_ = false; - - lma_id_num_ = 1; - fixed_lmas_ = 1; - fixed_lmas_no1_[0] = 0; // A composing string is always modified. - fixed_hzs_ = c_phrase_.length; - lma_start_[1] = fixed_hzs_; - lma_id_[0] = kLemmaIdComposing; - matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + - matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; - } - } - - return true; -} - -void MatrixSearch::del_in_pys(size_t start, size_t len) { - while (start < kMaxRowNum - len && '\0' != pys_[start]) { - pys_[start] = pys_[start + len]; - start++; - } -} - -size_t MatrixSearch::search(const char *py, size_t py_len) { - if (!inited_ || NULL == py) - return 0; - - // If the search Pinyin string is too long, it will be truncated. - if (py_len > kMaxRowNum - 1) - py_len = kMaxRowNum - 1; - - // Compare the new string with the previous one. 
Find their prefix to - // increase search efficiency. - size_t ch_pos = 0; - for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) { - if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos]) - break; - } - - bool clear_fix = true; - if (ch_pos == pys_decoded_len_) - clear_fix = false; - - reset_search(ch_pos, clear_fix, false, false); - - memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos); - pys_[py_len] = '\0'; - - while ('\0' != pys_[ch_pos]) { - if (!add_char(py[ch_pos])) { - pys_decoded_len_ = ch_pos; - break; - } - ch_pos++; - } - - // Get spelling ids and starting positions. - get_spl_start_id(); - - // If there are too many spellings, remove the last letter until the spelling - // number is acceptable. - while (spl_id_num_ > 9) { - py_len--; - reset_search(py_len, false, false, false); - pys_[py_len] = '\0'; - get_spl_start_id(); - } - - prepare_candidates(); - - if (kPrintDebug0) { - printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_); - printf("--DMI Pool Used: %d\n", dmi_pool_used_); - - if (kPrintDebug1) { - for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) { - debug_print_dmi(pos, 1); - } - } - } - - return ch_pos; -} - -size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid, - bool clear_fixed_this_step) { - if (!inited_) - return 0; - - size_t reset_pos = pos; - - // Out of range for both Pinyin mode and Spelling id mode. - if (pys_decoded_len_ <= pos) { - del_in_pys(pos, 1); - - reset_pos = pys_decoded_len_; - // Decode the string after the un-decoded position - while ('\0' != pys_[reset_pos]) { - if (!add_char(pys_[reset_pos])) { - pys_decoded_len_ = reset_pos; - break; - } - reset_pos++; - } - get_spl_start_id(); - prepare_candidates(); - return pys_decoded_len_; - } - - // Spelling id mode, but out of range. - if (is_pos_in_splid && spl_id_num_ <= pos) - return pys_decoded_len_; - - // Begin to handle two modes respectively. 
- // Pinyin mode by default - size_t c_py_len = 0; // The length of composing phrase's Pinyin - size_t del_py_len = 1; - if (!is_pos_in_splid) { - // Pinyin mode is only allowed to delete beyond the fixed lemmas. - if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]]) - return pys_decoded_len_; - - del_in_pys(pos, 1); - - // If the deleted character is just the one after the last fixed lemma - if (pos == spl_start_[lma_start_[fixed_lmas_]]) { - // If all fixed lemmas have been merged, and the caller of the function - // request to unlock the last fixed lemma. - if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) { - // Unlock the last sub lemma in the composing phrase. Because it is not - // easy to unlock it directly. Instead, we re-decode the modified - // composing phrase. - c_phrase_.sublma_num--; - c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num]; - reset_pos = spl_start_[c_phrase_.length]; - c_py_len = reset_pos; - } - } - } else { - del_py_len = spl_start_[pos + 1] - spl_start_[pos]; - - del_in_pys(spl_start_[pos], del_py_len); - - if (pos >= lma_start_[fixed_lmas_]) { - c_py_len = 0; - reset_pos = spl_start_[pos + 1] - del_py_len; - } else { - c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len; - reset_pos = c_py_len; - if (c_py_len > 0) - merge_fixed_lmas(pos); - } - } - - if (c_py_len > 0) { - assert(c_phrase_.length > 0 && c_py_len == - c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]); - // The composing phrase is valid, reset all search space, - // and begin a new search which will only extend the composing - // phrase. - reset_search0(); - - dmi_c_phrase_ = true; - // Extend the composing phrase. - uint16 c_py_pos = 0; - while (c_py_pos < c_py_len) { - bool b_ac_tmp = add_char(pys_[c_py_pos]); - assert(b_ac_tmp); - c_py_pos++; - } - dmi_c_phrase_ = false; - - // Fixd the composing phrase as the first choice. 
- lma_id_num_ = 1; - fixed_lmas_ = 1; - fixed_lmas_no1_[0] = 0; // A composing string is always modified. - fixed_hzs_ = c_phrase_.length; - lma_start_[1] = fixed_hzs_; - lma_id_[0] = kLemmaIdComposing; - matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + - matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; - } else { - // Reseting search only clear pys_decoded_len_, but the string is kept. - reset_search(reset_pos, clear_fixed_this_step, false, false); - } - - // Decode the string after the delete position. - while ('\0' != pys_[reset_pos]) { - if (!add_char(pys_[reset_pos])) { - pys_decoded_len_ = reset_pos; - break; - } - reset_pos++; - } - - get_spl_start_id(); - prepare_candidates(); - return pys_decoded_len_; -} - -size_t MatrixSearch::get_candidate_num() { - if (!inited_ || 0 == pys_decoded_len_ || - 0 == matrix_[pys_decoded_len_].mtrx_nd_num) - return 0; - - return 1 + lpi_total_; -} - -char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str, - size_t max_len) { - if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str) - return NULL; - - if (0 == cand_id) { - return get_candidate0(cand_str, max_len, NULL, false); - } else { - cand_id--; - } - - // For this case: the current sentence is a word only, and the user fixed it, - // so the result will be fixed to the sentence space, and - // lpi_total_ will be set to 0. - if (0 == lpi_total_) { - return get_candidate0(cand_str, max_len, NULL, false); - } - - LemmaIdType id = lpi_items_[cand_id].id; - char16 s[kMaxLemmaSize + 1]; - - uint16 s_len = lpi_items_[cand_id].lma_len; - if (s_len > 1) { - s_len = get_lemma_str(id, s, kMaxLemmaSize + 1); - } else { - // For a single character, Hanzi is ready. 
- s[0] = lpi_items_[cand_id].hanzi; - s[1] = static_cast<char16>(0); - } - - if (s_len > 0 && max_len > s_len) { - utf16_strncpy(cand_str, s, s_len); - cand_str[s_len] = (char16)'\0'; - return cand_str; - } - - return NULL; -} - -void MatrixSearch::update_dict_freq() { - if (NULL != user_dict_) { - // Update the total frequency of all lemmas, including system lemmas and - // user dictionary lemmas. - size_t total_freq = user_dict_->get_total_lemma_count(); - dict_trie_->set_total_lemma_count_of_others(total_freq); - } -} - -bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to, - float score) { - if (lma_to - lma_fr <= 1 || NULL == user_dict_) - return false; - - char16 word_str[kMaxLemmaSize + 1]; - uint16 spl_ids[kMaxLemmaSize]; - - uint16 spl_id_fr = 0; - - for (uint16 pos = lma_fr; pos < lma_to; pos++) { - LemmaIdType lma_id = lma_id_[pos]; - if (is_user_lemma(lma_id)) { - user_dict_->update_lemma(lma_id, 1, true); - } - uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos]; - utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len); - - uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr, - kMaxLemmaSize + 1 - spl_id_fr); - assert(tmp == lma_len); - - tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true); - if (tmp != lma_len) { - return false; - } - - spl_id_fr += lma_len; - } - - assert(spl_id_fr <= kMaxLemmaSize); - - return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids, - spl_id_fr, 1); -} - -void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) { - if (dmi_pos >= dmi_pool_used_) return; - - DictMatchInfo *dmi = dmi_pool_ + dmi_pos; - - if (1 == nest_level) { - printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos); - } - if (dmi->dict_level > 1) { - debug_print_dmi(dmi->dmi_fr, nest_level + 1); - } - printf("---%d\n", dmi->dict_level); - printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]); - printf(" Spelling : %s, %d\n", 
SpellingTrie::get_instance(). - get_spelling_str(dmi->spl_id), dmi->spl_id); - printf(" Total Pinyin Len: %d\n", dmi->splstr_len); - if (1 == nest_level) { - printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos); - } -} - -bool MatrixSearch::try_add_cand0_to_userdict() { - size_t new_cand_num = get_candidate_num(); - if (fixed_hzs_ > 0 && 1 == new_cand_num) { - float score_from = 0; - uint16 lma_id_from = 0; - uint16 pos = 0; - bool modified = false; - while (pos < fixed_lmas_) { - if (lma_start_[pos + 1] - lma_start_[lma_id_from] > - static_cast<uint16>(kMaxLemmaSize)) { - float score_to_add = - mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]] - .mtrx_nd_pos].score - score_from; - if (modified) { - score_to_add += 1.0; - if (score_to_add > NGram::kMaxScore) { - score_to_add = NGram::kMaxScore; - } - add_lma_to_userdict(lma_id_from, pos, score_to_add); - } - lma_id_from = pos; - score_from += score_to_add; - - // Clear the flag for next user lemma. - modified = false; - } - - if (0 == fixed_lmas_no1_[pos]) { - modified = true; - } - pos++; - } - - // Single-char word is not allowed to add to userdict. - if (lma_start_[pos] - lma_start_[lma_id_from] > 1) { - float score_to_add = - mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]] - .mtrx_nd_pos].score - score_from; - if (modified) { - score_to_add += 1.0; - if (score_to_add > NGram::kMaxScore) { - score_to_add = NGram::kMaxScore; - } - add_lma_to_userdict(lma_id_from, pos, score_to_add); - } - } - } - return true; -} - -// Choose a candidate, and give new candidates for next step. -// If user finishes selection, we will try to communicate with user dictionary -// to add new items or update score of some existing items. -// -// Basic rule: -// 1. If user selects the first choice: -// 1.1. If the first choice is not a sentence, instead, it is a lemma: -// 1.1.1. If the first choice is a user lemma, notify the user -// dictionary that a user lemma is hit, and add occuring count -// by 1. 
-// 1.1.2. If the first choice is a system lemma, do nothing. -// 1.2. If the first choice is a sentence containing more than one lemma: -// 1.2.1. The whole sentence will be added as a user lemma. If the -// sentence contains user lemmas, -> hit, and add occuring count -// by 1. -size_t MatrixSearch::choose(size_t cand_id) { - if (!inited_ || 0 == pys_decoded_len_) - return 0; - - if (0 == cand_id) { - fixed_hzs_ = spl_id_num_; - matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + - matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; - for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) { - fixed_lmas_no1_[pos] = 1; - } - fixed_lmas_ = lma_id_num_; - lpi_total_ = 0; // Clean all other candidates. - - // 1. It is the first choice - if (1 == lma_id_num_) { - // 1.1. The first choice is not a sentence but a lemma - if (is_user_lemma(lma_id_[0])) { - // 1.1.1. The first choice is a user lemma, notify the user dictionary - // that it is hit. - if (NULL != user_dict_) - user_dict_->update_lemma(lma_id_[0], 1, true); - } else { - // 1.1.2. do thing for a system lemma. - } - } else { - // 1.2. The first choice is a sentence. - // 1.2.1 Try to add the whole sentence to user dictionary, the whole - // sentence may be splitted into many items. - if (NULL != user_dict_) { - try_add_cand0_to_userdict(); - } - } - update_dict_freq(); - return 1; - } else { - cand_id--; - } - - // 2. It is not the full sentence candidate. - // Find the length of the candidate. - LemmaIdType id_chosen = lpi_items_[cand_id].id; - LmaScoreType score_chosen = lpi_items_[cand_id].psb; - size_t cand_len = lpi_items_[cand_id].lma_len; - - assert(cand_len > 0); - - // Notify the atom dictionary that this item is hit. - if (is_user_lemma(id_chosen)) { - if (NULL != user_dict_) { - user_dict_->update_lemma(id_chosen, 1, true); - } - update_dict_freq(); - } - - // 3. Fixed the chosen item. - // 3.1 Get the steps number. 
- size_t step_fr = spl_start_[fixed_hzs_]; - size_t step_to = spl_start_[fixed_hzs_ + cand_len]; - - // 3.2 Save the length of the original string. - size_t pys_decoded_len = pys_decoded_len_; - - // 3.2 Reset the space of the fixed part. - reset_search(step_to, false, false, true); - - // 3.3 For the last character of the fixed part, the previous DMI - // information will be kept, while the MTRX information will be re-extended, - // and only one node will be extended. - matrix_[step_to].mtrx_nd_num = 0; - - LmaPsbItem lpi_item; - lpi_item.psb = score_chosen; - lpi_item.id = id_chosen; - - PoolPosType step_to_dmi_fr = match_dmi(step_to, - spl_id_ + fixed_hzs_, cand_len); - //assert(step_to_dmi_fr != static_cast<PoolPosType>(-1)); - - extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1, - step_to_dmi_fr, step_to); - - matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos; - mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos + - matrix_[step_to].mtrx_nd_num; - - if (id_chosen == lma_id_[fixed_lmas_]) - fixed_lmas_no1_[fixed_lmas_] = 1; - else - fixed_lmas_no1_[fixed_lmas_] = 0; - lma_id_[fixed_lmas_] = id_chosen; - lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len; - fixed_lmas_++; - fixed_hzs_ = fixed_hzs_ + cand_len; - - while (step_to != pys_decoded_len) { - bool b = add_char(pys_[step_to]); - assert(b); - step_to++; - } - - if (fixed_hzs_ < spl_id_num_) { - prepare_candidates(); - } else { - lpi_total_ = 0; - if (NULL != user_dict_) { - try_add_cand0_to_userdict(); - } - } - - return get_candidate_num(); -} - -size_t MatrixSearch::cancel_last_choice() { - if (!inited_ || 0 == pys_decoded_len_) - return 0; - - size_t step_start = 0; - if (fixed_hzs_ > 0) { - size_t step_end = spl_start_[fixed_hzs_]; - MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed; - assert(NULL != end_node); - - step_start = end_node->from->step; - - if (step_start > 0) { - DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr; - fixed_hzs_ -= 
dmi->dict_level; - } else { - fixed_hzs_ = 0; - } - - reset_search(step_start, false, false, false); - - while (pys_[step_start] != '\0') { - bool b = add_char(pys_[step_start]); - assert(b); - step_start++; - } - - prepare_candidates(); - } - return get_candidate_num(); -} - -size_t MatrixSearch::get_fixedlen() { - if (!inited_ || 0 == pys_decoded_len_) - return 0; - return fixed_hzs_; -} - -bool MatrixSearch::prepare_add_char(char ch) { - if (pys_decoded_len_ >= kMaxRowNum - 1 || - (!spl_parser_->is_valid_to_parse(ch) && ch != '\'')) - return false; - - if (dmi_pool_used_ >= kDmiPoolSize) return false; - - pys_[pys_decoded_len_] = ch; - pys_decoded_len_++; - - MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_; - mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_; - mtrx_this_row->mtrx_nd_num = 0; - mtrx_this_row->dmi_pos = dmi_pool_used_; - mtrx_this_row->dmi_num = 0; - mtrx_this_row->dmi_has_full_id = 0; - - return true; -} - -bool MatrixSearch::is_split_at(uint16 pos) { - return !spl_parser_->is_valid_to_parse(pys_[pos - 1]); -} - -void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, - PoolPosType dmi_fr, uint16 spl_id, - uint16 node_num, unsigned char dict_level, - bool splid_end_split, unsigned char splstr_len, - unsigned char all_full_id) { - dmi->dict_handles[0] = handles[0]; - dmi->dict_handles[1] = handles[1]; - dmi->dmi_fr = dmi_fr; - dmi->spl_id = spl_id; - dmi->dict_level = dict_level; - dmi->splid_end_split = splid_end_split ? 1 : 0; - dmi->splstr_len = splstr_len; - dmi->all_full_id = all_full_id; - dmi->c_phrase = 0; -} - -bool MatrixSearch::add_char(char ch) { - if (!prepare_add_char(ch)) - return false; - return add_char_qwerty(); -} - -bool MatrixSearch::add_char_qwerty() { - matrix_[pys_decoded_len_].mtrx_nd_num = 0; - - bool spl_matched = false; - uint16 longest_ext = 0; - // Extend the search matrix, from the oldest unfixed row. ext_len means - // extending length. 
- for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) { - if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_]) - continue; - - // Refer to the declaration of the variable dmi_has_full_id for the - // explanation of this piece of code. In one word, it is used to prevent - // from the unwise extending of "shoud ou" but allow the reasonable - // extending of "heng ao", "lang a", etc. - if (ext_len > 1 && 0 != longest_ext && - 0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) { - if (xi_an_enabled_) - continue; - else - break; - } - - uint16 oldrow = pys_decoded_len_ - ext_len; - - // 0. If that row is before the last fixed step, ignore. - if (spl_start_[fixed_hzs_] > oldrow) - continue; - - // 1. Check if that old row has valid MatrixNode. If no, means that row is - // not a boundary, either a word boundary or a spelling boundary. - // If it is for extending composing phrase, it's OK to ignore the 0. - if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_) - continue; - - // 2. Get spelling id(s) for the last ext_len chars. - uint16 spl_idx; - bool is_pre = false; - spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow, - ext_len, &is_pre); - if (is_pre) - spl_matched = true; - - if (0 == spl_idx) - continue; - - bool splid_end_split = is_split_at(oldrow + ext_len); - - // 3. Extend the DMI nodes of that old row - // + 1 is to extend an extra node from the root - for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos; - dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1; - dmi_pos++) { - DictMatchInfo *dmi = dmi_pool_ + dmi_pos; - if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) { - dmi = NULL; // The last one, NULL means extending from the root. - } else { - // If the dmi is covered by the fixed arrange, ignore it. 
- if (fixed_hzs_ > 0 && - pys_decoded_len_ - ext_len - dmi->splstr_len < - spl_start_[fixed_hzs_]) { - continue; - } - // If it is not in mode for composing phrase, and the source DMI node - // is marked for composing phrase, ignore this node. - if (dmi->c_phrase != 0 && !dmi_c_phrase_) { - continue; - } - } - - // For example, if "gao" is extended, "g ao" is not allowed. - // or "zh" has been passed, "z h" is not allowed. - // Both word and word-connection will be prevented. - if (longest_ext > ext_len) { - if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) { - continue; - } - - // "z h" is not allowed. - if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) { - continue; - } - } - - dep_->splids_extended = 0; - if (NULL != dmi) { - uint16 prev_ids_num = dmi->dict_level; - if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) || - (dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) { - continue; - } - - DictMatchInfo *d = dmi; - while (d) { - dep_->splids[--prev_ids_num] = d->spl_id; - if ((PoolPosType)-1 == d->dmi_fr) - break; - d = dmi_pool_ + d->dmi_fr; - } - assert(0 == prev_ids_num); - dep_->splids_extended = dmi->dict_level; - } - dep_->splids[dep_->splids_extended] = spl_idx; - dep_->ext_len = ext_len; - dep_->splid_end_split = splid_end_split; - - dep_->id_num = 1; - dep_->id_start = spl_idx; - if (spl_trie_->is_half_id(spl_idx)) { - // Get the full id list - dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start)); - assert(dep_->id_num > 0); - } - - uint16 new_dmi_num; - - new_dmi_num = extend_dmi(dep_, dmi); - - if (new_dmi_num > 0) { - if (dmi_c_phrase_) { - dmi_pool_[dmi_pool_used_].c_phrase = 1; - } - matrix_[pys_decoded_len_].dmi_num += new_dmi_num; - dmi_pool_used_ += new_dmi_num; - - if (!spl_trie_->is_half_id(spl_idx)) - matrix_[pys_decoded_len_].dmi_has_full_id = 1; - } - - // If get candiate lemmas, try to extend the path - if (lpi_total_ > 0) { - uint16 fr_row; - if (NULL == dmi) { - fr_row = oldrow; - } else { - assert(oldrow >= 
dmi->splstr_len); - fr_row = oldrow - dmi->splstr_len; - } - for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos; - mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos + - matrix_[fr_row].mtrx_nd_num; - mtrx_nd_pos++) { - MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos; - - extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_, - dmi_pool_used_ - new_dmi_num, pys_decoded_len_); - if (longest_ext == 0) - longest_ext = ext_len; - } - } - } // for dmi_pos - } // for ext_len - mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num; - - if (dmi_c_phrase_) - return true; - - return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched); -} - -void MatrixSearch::prepare_candidates() { - // Get candiates from the first un-fixed step. - uint16 lma_size_max = kMaxLemmaSize; - if (lma_size_max > spl_id_num_ - fixed_hzs_) - lma_size_max = spl_id_num_ - fixed_hzs_; - - uint16 lma_size = lma_size_max; - - // If the full sentense candidate's unfixed part may be the same with a normal - // lemma. Remove the lemma candidate in this case. - char16 fullsent[kMaxLemmaSize + 1]; - char16 *pfullsent = NULL; - uint16 sent_len; - pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true); - - // If the unfixed part contains more than one ids, it is not necessary to - // check whether a lemma's string is the same to the unfixed part of the full - // sentence candidate, so, set it to NULL; - if (sent_len > kMaxLemmaSize) - pfullsent = NULL; - - lpi_total_ = 0; - size_t lpi_num_full_match = 0; // Number of items which are fully-matched. - while (lma_size > 0) { - size_t lma_num; - lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size, - lpi_items_ + lpi_total_, - size_t(kMaxLmaPsbItems - lpi_total_), - pfullsent, lma_size == lma_size_max); - - if (lma_num > 0) { - lpi_total_ += lma_num; - // For next lemma candidates which are not the longest, it is not - // necessary to compare with the full sentence candiate. 
- pfullsent = NULL; - } - if (lma_size == lma_size_max) { - lpi_num_full_match = lpi_total_; - } - lma_size--; - } - - // Sort those partially-matched items by their unified scores. - myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match, - sizeof(LmaPsbItem), cmp_lpi_with_unified_psb); - - if (kPrintDebug0) { - printf("-----Prepare candidates, score:\n"); - for (size_t a = 0; a < lpi_total_; a++) { - printf("[%03d]%d ", a, lpi_items_[a].psb); - if ((a + 1) % 6 == 0) printf("\n"); - } - printf("\n"); - } - - if (kPrintDebug0) { - printf("--- lpi_total_ = %d\n", lpi_total_); - } -} - -const char* MatrixSearch::get_pystr(size_t *decoded_len) { - if (!inited_ || NULL == decoded_len) - return NULL; - - *decoded_len = pys_decoded_len_; - return pys_; -} - -void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) { - if (fixed_lmas_ == 0) - return; - // Update spelling segmentation information first. - spl_id_num_ -= 1; - uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos]; - for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) { - spl_start_[pos] = spl_start_[pos + 1] - del_py_len; - if (pos == spl_id_num_) - break; - spl_id_[pos] = spl_id_[pos + 1]; - } - - // Begin to merge. - uint16 phrase_len = 0; - - // Update the spelling ids to the composing phrase. - // We need to convert these ids into full id in the future. - memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16)); - memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16)); - - // If composing phrase has not been created, first merge all fixed - // lemmas into a composing phrase without deletion. - if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) { - uint16 bp = 1; // Begin position of real fixed lemmas. - // There is no existing composing phrase. 
- if (kLemmaIdComposing != lma_id_[0]) { - c_phrase_.sublma_num = 0; - bp = 0; - } - - uint16 sub_num = c_phrase_.sublma_num; - for (uint16 pos = bp; pos <= fixed_lmas_; pos++) { - c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos]; - if (lma_start_[pos] > del_spl_pos) { - c_phrase_.sublma_start[sub_num + pos - bp] -= 1; - } - - if (pos == fixed_lmas_) - break; - - uint16 lma_len; - char16 *lma_str = c_phrase_.chn_str + - c_phrase_.sublma_start[sub_num] + phrase_len; - - lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len); - assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]); - phrase_len += lma_len; - } - assert(phrase_len == lma_start_[fixed_lmas_]); - c_phrase_.length = phrase_len; // will be deleted by 1 - c_phrase_.sublma_num += fixed_lmas_ - bp; - } else { - for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) { - if (c_phrase_.sublma_start[pos] > del_spl_pos) { - c_phrase_.sublma_start[pos] -= 1; - } - } - phrase_len = c_phrase_.length; - } - - assert(phrase_len > 0); - if (1 == phrase_len) { - // After the only one is deleted, nothing will be left. - fixed_lmas_ = 0; - return; - } - - // Delete the Chinese character in the merged phrase. - // The corresponding elements in spl_ids and spl_start of the - // phrase have been deleted. - char16 *chn_str = c_phrase_.chn_str + del_spl_pos; - for (uint16 pos = 0; - pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos; - pos++) { - chn_str[pos] = chn_str[pos + 1]; - } - c_phrase_.length -= 1; - - // If the deleted spelling id is in a sub lemma which contains more than - // one id, del_a_sub will be false; but if the deleted id is in a sub lemma - // which only contains 1 id, the whole sub lemma needs to be deleted, so - // del_a_sub will be true. 
- bool del_a_sub = false; - for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) { - if (c_phrase_.sublma_start[pos - 1] == - c_phrase_.sublma_start[pos]) { - del_a_sub = true; - } - if (del_a_sub) { - c_phrase_.sublma_start[pos - 1] = - c_phrase_.sublma_start[pos]; - } - } - if (del_a_sub) - c_phrase_.sublma_num -= 1; - - return; -} - -void MatrixSearch::get_spl_start_id() { - lma_id_num_ = 0; - lma_start_[0] = 0; - - spl_id_num_ = 0; - spl_start_[0] = 0; - if (!inited_ || 0 == pys_decoded_len_ || - 0 == matrix_[pys_decoded_len_].mtrx_nd_num) - return; - - // Calculate number of lemmas and spellings - // Only scan those part which is not fixed. - lma_id_num_ = fixed_lmas_; - spl_id_num_ = fixed_hzs_; - - MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos; - while (mtrx_nd != mtrx_nd_pool_) { - if (fixed_hzs_ > 0) { - if (mtrx_nd->step <= spl_start_[fixed_hzs_]) - break; - } - - // Update the spelling segamentation information - unsigned char word_splstr_len = 0; - PoolPosType dmi_fr = mtrx_nd->dmi_fr; - if ((PoolPosType)-1 != dmi_fr) - word_splstr_len = dmi_pool_[dmi_fr].splstr_len; - - while ((PoolPosType)-1 != dmi_fr) { - spl_start_[spl_id_num_ + 1] = mtrx_nd->step - - (word_splstr_len - dmi_pool_[dmi_fr].splstr_len); - spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id; - spl_id_num_++; - dmi_fr = dmi_pool_[dmi_fr].dmi_fr; - } - - // Update the lemma segmentation information - lma_start_[lma_id_num_ + 1] = spl_id_num_; - lma_id_[lma_id_num_] = mtrx_nd->id; - lma_id_num_++; - - mtrx_nd = mtrx_nd->from; - } - - // Reverse the result of spelling info - for (size_t pos = fixed_hzs_; - pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) { - if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) { - spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_]; - spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1]; - spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_]; - - spl_id_[pos] ^= spl_id_[spl_id_num_ 
+ fixed_hzs_ - pos - 1]; - spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos]; - spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1]; - } - } - - // Reverse the result of lemma info - for (size_t pos = fixed_lmas_; - pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) { - assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos); - - if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) { - lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_]; - lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1]; - lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_]; - - lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_]; - lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos]; - lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_]; - } - } - - for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) { - if (pos < lma_id_num_) - lma_start_[pos] = lma_start_[pos - 1] + - (lma_start_[pos] - lma_start_[pos + 1]); - else - lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] - - lma_start_[fixed_lmas_]; - } - - // Find the last fixed position - fixed_hzs_ = 0; - for (size_t pos = spl_id_num_; pos > 0; pos--) { - if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) { - fixed_hzs_ = pos; - break; - } - } - - return; -} - -size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) { - get_spl_start_id(); - spl_start = spl_start_; - return spl_id_num_; -} - -size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) { - if (dmi_pool_used_ >= kDmiPoolSize) return 0; - - if (dmi_c_phrase_) - return extend_dmi_c(dep, dmi_s); - - LpiCache& lpi_cache = LpiCache::get_instance(); - uint16 splid = dep->splids[dep->splids_extended]; - - bool cached = false; - if (0 == dep->splids_extended) - cached = lpi_cache.is_cached(splid); - - // 1. If this is a half Id, get its corresponding full starting Id and - // number of full Id. 
- size_t ret_val = 0; - PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node - - lpi_total_ = 0; - - MileStoneHandle from_h[3]; - from_h[0] = 0; - from_h[1] = 0; - - if (0 != dep->splids_extended) { - from_h[0] = dmi_s->dict_handles[0]; - from_h[1] = dmi_s->dict_handles[1]; - } - - // 2. Begin exgtending in the system dictionary - size_t lpi_num = 0; - MileStoneHandle handles[2]; - handles[0] = handles[1] = 0; - if (from_h[0] > 0 || NULL == dmi_s) { - handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_, - kMaxLmaPsbItems, &lpi_num); - } - if (handles[0] > 0) - lpi_total_ = lpi_num; - - if (NULL == dmi_s) { // from root - assert(0 != handles[0]); - mtrx_dmi_fr = dmi_pool_used_; - } - - // 3. Begin extending in the user dictionary - if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) { - handles[1] = user_dict_->extend_dict(from_h[1], dep, - lpi_items_ + lpi_total_, - kMaxLmaPsbItems - lpi_total_, - &lpi_num); - if (handles[1] > 0) { - if (kPrintDebug0) { - for (size_t t = 0; t < lpi_num; t++) { - printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id, - lpi_items_[lpi_total_ + t].psb); - } - } - lpi_total_ += lpi_num; - } - } - - if (0 != handles[0] || 0 != handles[1]) { - if (dmi_pool_used_ >= kDmiPoolSize) return 0; - - DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_; - if (NULL == dmi_s) { - fill_dmi(dmi_add, handles, - (PoolPosType)-1, splid, - 1, 1, dep->splid_end_split, dep->ext_len, - spl_trie_->is_half_id(splid) ? 0 : 1); - } else { - fill_dmi(dmi_add, handles, - dmi_s - dmi_pool_, splid, 1, - dmi_s->dict_level + 1, dep->splid_end_split, - dmi_s->splstr_len + dep->ext_len, - spl_trie_->is_half_id(splid) ? 
0 : dmi_s->all_full_id); - } - - ret_val = 1; - } - - if (!cached) { - if (0 == lpi_total_) - return ret_val; - - if (kPrintDebug0) { - printf("--- lpi_total_ = %d\n", lpi_total_); - } - - myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb); - if (NULL == dmi_s && spl_trie_->is_half_id(splid)) - lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_); - } else { - assert(spl_trie_->is_half_id(splid)); - lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems); - } - - return ret_val; -} - -size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) { - lpi_total_ = 0; - - uint16 pos = dep->splids_extended; - assert(dmi_c_phrase_); - if (pos >= c_phrase_.length) - return 0; - - uint16 splid = dep->splids[pos]; - if (splid == c_phrase_.spl_ids[pos]) { - DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_; - MileStoneHandle handles[2]; // Actually never used. - if (NULL == dmi_s) - fill_dmi(dmi_add, handles, - (PoolPosType)-1, splid, - 1, 1, dep->splid_end_split, dep->ext_len, - spl_trie_->is_half_id(splid) ? 0 : 1); - else - fill_dmi(dmi_add, handles, - dmi_s - dmi_pool_, splid, 1, - dmi_s->dict_level + 1, dep->splid_end_split, - dmi_s->splstr_len + dep->ext_len, - spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id); - - if (pos == c_phrase_.length - 1) { - lpi_items_[0].id = kLemmaIdComposing; - lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score. - lpi_total_ = 1; - } - return 1; - } - return 0; -} - -size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], - size_t lpi_num, PoolPosType dmi_fr, - size_t res_row) { - assert(NULL != mtrx_nd); - matrix_[res_row].mtrx_nd_fixed = NULL; - - if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow) - return 0; - - if (0 == mtrx_nd->step) { - // Because the list is sorted, if the source step is 0, it is only - // necessary to pick up the first kMaxNodeARow items. 
- if (lpi_num > kMaxNodeARow) - lpi_num = kMaxNodeARow; - } - - MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos; - for (size_t pos = 0; pos < lpi_num; pos++) { - float score = mtrx_nd->score + lpi_items[pos].psb; - if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score) - break; - - // Try to add a new node - size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num; - MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num; - bool replace = false; - // Find its position - while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) { - if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow) - *mtrx_nd_res = *(mtrx_nd_res - 1); - mtrx_nd_res--; - replace = true; - } - if (replace || (mtrx_nd_num < kMaxNodeARow && - matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) { - mtrx_nd_res->id = lpi_items[pos].id; - mtrx_nd_res->score = score; - mtrx_nd_res->from = mtrx_nd; - mtrx_nd_res->dmi_fr = dmi_fr; - mtrx_nd_res->step = res_row; - if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow) - matrix_[res_row].mtrx_nd_num++; - } - } - return matrix_[res_row].mtrx_nd_num; -} - -PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[], - uint16 spl_id_num) { - if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) { - return static_cast<PoolPosType>(-1); - } - - for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) { - DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos; - - if (dmi->dict_level != spl_id_num) - continue; - - bool matched = true; - for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) { - if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) { - matched = false; - break; - } - - dmi = dmi_pool_ + dmi->dmi_fr; - } - if (matched) { - return matrix_[step_to].dmi_pos + dmi_pos; - } - } - - return static_cast<PoolPosType>(-1); -} - -char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len, - uint16 *retstr_len, - bool 
only_unfixed) { - if (pys_decoded_len_ == 0 || - matrix_[pys_decoded_len_].mtrx_nd_num == 0) - return NULL; - - LemmaIdType idxs[kMaxRowNum]; - size_t id_num = 0; - - MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos; - - if (kPrintDebug0) { - printf("--- sentence score: %f\n", mtrx_nd->score); - } - - if (kPrintDebug1) { - printf("==============Sentence DMI (reverse order) begin===========>>\n"); - } - - while (mtrx_nd != NULL) { - idxs[id_num] = mtrx_nd->id; - id_num++; - - if (kPrintDebug1) { - printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n", - mtrx_nd->step, mtrx_nd->id, mtrx_nd->score); - debug_print_dmi(mtrx_nd->dmi_fr, 1); - } - - mtrx_nd = mtrx_nd->from; - } - - if (kPrintDebug1) { - printf("<<==============Sentence DMI (reverse order) end=============\n"); - } - - size_t ret_pos = 0; - do { - id_num--; - if (0 == idxs[id_num]) - continue; - - char16 str[kMaxLemmaSize + 1]; - uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1); - if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) || - (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) { - if (!only_unfixed) - utf16_strncpy(cand_str + ret_pos, str, str_len); - else if (ret_pos >= fixed_hzs_) - utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len); - - ret_pos += str_len; - } else { - return NULL; - } - } while (id_num != 0); - - if (!only_unfixed) { - if (NULL != retstr_len) - *retstr_len = ret_pos; - cand_str[ret_pos] = (char16)'\0'; - } else { - if (NULL != retstr_len) - *retstr_len = ret_pos - fixed_hzs_; - cand_str[ret_pos - fixed_hzs_] = (char16)'\0'; - } - return cand_str; -} - -size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len, - LmaPsbItem* lma_buf, size_t max_lma_buf, - const char16 *pfullsent, bool sort_by_psb) { - if (splid_str_len > kMaxLemmaSize) - return 0; - - size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len, - lma_buf, max_lma_buf); - size_t num2 = 0; - if (NULL 
!= user_dict_) { - num2 = user_dict_->get_lpis(splid_str, splid_str_len, - lma_buf + num1, max_lma_buf - num1); - } - - size_t num = num1 + num2; - - if (0 == num) - return 0; - - // Remove repeated items. - if (splid_str_len > 1) { - LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num); - size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) / - sizeof(LmaPsbStrItem); - //assert(lpsi_num > num); - if (num > lpsi_num) num = lpsi_num; - lpsi_num = num; - - for (size_t pos = 0; pos < lpsi_num; pos++) { - lpsis[pos].lpi = lma_buf[pos]; - get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1); - } - - myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str); - - size_t remain_num = 0; - for (size_t pos = 0; pos < lpsi_num; pos++) { - if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) { - if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) { - assert(remain_num > 0); - lma_buf[remain_num - 1] = lpsis[pos].lpi; - } - continue; - } - if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0) - continue; - - lma_buf[remain_num] = lpsis[pos].lpi; - remain_num++; - } - - // Update the result number - num = remain_num; - } else { - // For single character, some characters have more than one spelling, for - // example, "de" and "di" are all valid for a Chinese character, so when - // the user input "d", repeated items are generated. 
- // For single character lemmas, Hanzis will be gotten - for (size_t pos = 0; pos < num; pos++) { - char16 hanzis[2]; - get_lemma_str(lma_buf[pos].id, hanzis, 2); - lma_buf[pos].hanzi = hanzis[0]; - } - - myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi); - - size_t remain_num = 0; - for (size_t pos = 0; pos < num; pos++) { - if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) { - if (NULL != pfullsent && - static_cast<char16>(0) == pfullsent[1] && - lma_buf[pos].hanzi == pfullsent[0]) - continue; - - if (lma_buf[pos].psb < lma_buf[pos - 1].psb) { - assert(remain_num > 0); - assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi); - lma_buf[remain_num - 1] = lma_buf[pos]; - } - continue; - } - if (NULL != pfullsent && - static_cast<char16>(0) == pfullsent[1] && - lma_buf[pos].hanzi == pfullsent[0]) - continue; - - lma_buf[remain_num] = lma_buf[pos]; - remain_num++; - } - - num = remain_num; - } - - if (sort_by_psb) { - myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb); - } - return num; -} - -uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, - uint16 str_max) { - uint16 str_len = 0; - - if (is_system_lemma(id_lemma)) { - str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max); - } else if (is_user_lemma(id_lemma)) { - if (NULL != user_dict_) { - str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max); - } else { - str_len = 0; - str_buf[0] = static_cast<char16>('\0'); - } - } else if (is_composing_lemma(id_lemma)) { - if (str_max <= 1) - return 0; - str_len = c_phrase_.sublma_start[c_phrase_.sublma_num]; - if (str_len > str_max - 1) - str_len = str_max - 1; - utf16_strncpy(str_buf, c_phrase_.chn_str, str_len); - str_buf[str_len] = (char16)'\0'; - return str_len; - } - - return str_len; -} - -uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid) { - uint16 splid_num = 0; - - if (arg_valid) { - for (splid_num = 0; splid_num < 
splids_max; splid_num++) { - if (spl_trie_->is_half_id(splids[splid_num])) - break; - } - if (splid_num == splids_max) - return splid_num; - } - - if (is_system_lemma(id_lemma)) { - splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max, - arg_valid); - } else if (is_user_lemma(id_lemma)) { - if (NULL != user_dict_) { - splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max, - arg_valid); - } else { - splid_num = 0; - } - } else if (is_composing_lemma(id_lemma)) { - if (c_phrase_.length > splids_max) { - return 0; - } - for (uint16 pos = 0; pos < c_phrase_.length; pos++) { - splids[pos] = c_phrase_.spl_ids[pos]; - if (spl_trie_->is_half_id(splids[pos])) { - return 0; - } - } - } - return splid_num; -} - -size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len, - char16 predict_buf[][kMaxPredictSize + 1], - size_t buf_len) { - size_t res_total = 0; - memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_); - // In order to shorten the comments, j-character candidates predicted by - // i-character prefix are called P(i,j). All candiates predicted by - // i-character prefix are called P(i,*) - // Step 1. Get P(kMaxPredictSize, *) and sort them, here - // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1) - for (size_t len = fixed_len; len >0; len--) { - // How many blank items are available - size_t this_max = npre_items_len_ - res_total; - size_t res_this; - // If the history is longer than 1, and we can not get prediction from - // lemmas longer than 2, in this case, we will add lemmas with - // highest scores as the prediction result. - if (fixed_len > 1 && 1 == len && 0 == res_total) { - // Try to find if recent n (n>1) characters can be a valid lemma in system - // dictionary. 
- bool nearest_n_word = false; - for (size_t nlen = 2; nlen <= fixed_len; nlen++) { - if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) { - nearest_n_word = true; - break; - } - } - res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0, - npre_items_ + res_total, - this_max, res_total); - res_total += res_this; - } - - // How many blank items are available - this_max = npre_items_len_ - res_total; - res_this = 0; - if (!kOnlyUserDictPredict) { - res_this = - dict_trie_->predict(fixed_buf + fixed_len - len, len, - npre_items_ + res_total, this_max, - res_total); - } - - if (NULL != user_dict_) { - res_this = res_this + - user_dict_->predict(fixed_buf + fixed_len - len, len, - npre_items_ + res_total + res_this, - this_max - res_this, res_total + res_this); - } - - if (kPredictLimitGt1) { - myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem), - cmp_npre_by_score); - - if (len > 3) { - if (res_this > kMaxPredictNumByGt3) - res_this = kMaxPredictNumByGt3; - } else if (3 == len) { - if (res_this > kMaxPredictNumBy3) - res_this = kMaxPredictNumBy3; - } else if (2 == len) { - if (res_this > kMaxPredictNumBy2) - res_this = kMaxPredictNumBy2; - } - } - - res_total += res_this; - } - - res_total = remove_duplicate_npre(npre_items_, res_total); - - if (kPreferLongHistoryPredict) { - myqsort(npre_items_, res_total, sizeof(NPredictItem), - cmp_npre_by_hislen_score); - } else { - myqsort(npre_items_, res_total, sizeof(NPredictItem), - cmp_npre_by_score); - } - - if (buf_len < res_total) { - res_total = buf_len; - } - - if (kPrintDebug2) { - printf("/////////////////Predicted Items Begin////////////////////>>\n"); - for (size_t i = 0; i < res_total; i++) { - printf("---"); - for (size_t j = 0; j < kMaxPredictSize; j++) { - printf("%d ", npre_items_[i].pre_hzs[j]); - } - printf("\n"); - } - printf("<<///////////////Predicted Items End////////////////////////\n"); - } - - for (size_t i = 0; i < res_total; i++) { - 
utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs, - kMaxPredictSize); - predict_buf[i][kMaxPredictSize] = '\0'; - } - - return res_total; -} - -size_t MatrixSearch::get_predicts(const char16 fixed_buf[], - char16 predict_buf[][kMaxPredictSize + 1], - size_t buf_len) { - size_t fixed_len = utf16_strlen(fixed_buf); - if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len) - return 0; - - return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len); -} - -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/mystdlib.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/mystdlib.cpp deleted file mode 100644 index 93bbcc9f..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/mystdlib.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> - -namespace ime_pinyin { - -// For debug purpose. You can add a fixed version of qsort and bsearch functions -// here so that the output will be totally the same under different platforms. 
- -void myqsort(void *p, size_t n, size_t es, - int (*cmp)(const void *, const void *)) { - qsort(p,n, es, cmp); -} - -void *mybsearch(const void *k, const void *b, - size_t n, size_t es, - int (*cmp)(const void *, const void *)) { - return bsearch(k, b, n, es, cmp); -} -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/ngram.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/ngram.cpp deleted file mode 100644 index 6aec850b..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/ngram.cpp +++ /dev/null @@ -1,342 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <assert.h> -#include <math.h> -#include <stdio.h> -#include <string.h> -#include <time.h> -#include "../include/mystdlib.h" -#include "../include/ngram.h" - -namespace ime_pinyin { - -#define ADD_COUNT 0.3 - -int comp_double(const void *p1, const void *p2) { - if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2)) - return -1; - if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2)) - return 1; - return 0; -} - -inline double distance(double freq, double code) { - // return fabs(freq - code); - return freq * fabs(log(freq) - log(code)); -} - -// Find the index of the code value which is nearest to the given freq -int qsearch_nearest(double code_book[], double freq, int start, int end) { - if (start == end) - return start; - - if (start + 1 == end) { - if (distance(freq, code_book[end]) > distance(freq, code_book[start])) - return start; - return end; - } - - int mid = (start + end) / 2; - - if (code_book[mid] > freq) - return qsearch_nearest(code_book, freq, start, mid); - else - return qsearch_nearest(code_book, freq, mid, end); -} - -size_t update_code_idx(double freqs[], size_t num, double code_book[], - CODEBOOK_TYPE *code_idx) { - size_t changed = 0; - for (size_t pos = 0; pos < num; pos++) { - CODEBOOK_TYPE idx; - idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1); - if (idx != code_idx[pos]) - changed++; - code_idx[pos] = idx; - } - return changed; -} - -double recalculate_kernel(double freqs[], size_t num, double code_book[], - CODEBOOK_TYPE *code_idx) { - double ret = 0; - - size_t *item_num = new size_t[kCodeBookSize]; - assert(item_num); - memset(item_num, 0, sizeof(size_t) * kCodeBookSize); - - double *cb_new = new double[kCodeBookSize]; - assert(cb_new); - memset(cb_new, 0, sizeof(double) * kCodeBookSize); - - for (size_t pos = 0; pos < num; pos++) { - ret += distance(freqs[pos], code_book[code_idx[pos]]); - - cb_new[code_idx[pos]] += freqs[pos]; - item_num[code_idx[pos]] += 1; - } - - 
for (size_t code = 0; code < kCodeBookSize; code++) { - assert(item_num[code] > 0); - code_book[code] = cb_new[code] / item_num[code]; - } - - delete [] item_num; - delete [] cb_new; - - return ret; -} - -void iterate_codes(double freqs[], size_t num, double code_book[], - CODEBOOK_TYPE *code_idx) { - size_t iter_num = 0; - double delta_last = 0; - do { - size_t changed = update_code_idx(freqs, num, code_book, code_idx); - - double delta = recalculate_kernel(freqs, num, code_book, code_idx); - - if (kPrintDebug0) { - printf("---Unigram codebook iteration: %d : %d, %.9f\n", - iter_num, changed, delta); - } - iter_num++; - - if (iter_num > 1 && - (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001)) - break; - delta_last = delta; - } while (true); -} - - -NGram* NGram::instance_ = NULL; - -NGram::NGram() { - initialized_ = false; - idx_num_ = 0; - lma_freq_idx_ = NULL; - sys_score_compensation_ = 0; - -#ifdef ___BUILD_MODEL___ - freq_codes_df_ = NULL; -#endif - freq_codes_ = NULL; -} - -NGram::~NGram() { - if (NULL != lma_freq_idx_) - free(lma_freq_idx_); - -#ifdef ___BUILD_MODEL___ - if (NULL != freq_codes_df_) - free(freq_codes_df_); -#endif - - if (NULL != freq_codes_) - free(freq_codes_); -} - -NGram& NGram::get_instance() { - if (NULL == instance_) - instance_ = new NGram(); - return *instance_; -} - -bool NGram::save_ngram(FILE *fp) { - if (!initialized_ || NULL == fp) - return false; - - if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_) - return false; - - if (fwrite(&idx_num_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != - kCodeBookSize) - return false; - - if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) - return false; - - return true; -} - -bool NGram::load_ngram(FILE *fp) { - if (NULL == fp) - return false; - - initialized_ = false; - - if (fread(&idx_num_, sizeof(uint32), 1, fp) != 1 ) - return false; - - if (NULL != 
lma_freq_idx_) - free(lma_freq_idx_); - - if (NULL != freq_codes_) - free(freq_codes_); - - lma_freq_idx_ = static_cast<CODEBOOK_TYPE*> - (malloc(idx_num_ * sizeof(CODEBOOK_TYPE))); - freq_codes_ = static_cast<LmaScoreType*> - (malloc(kCodeBookSize * sizeof(LmaScoreType))); - - if (NULL == lma_freq_idx_ || NULL == freq_codes_) - return false; - - if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != - kCodeBookSize) - return false; - - if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) - return false; - - initialized_ = true; - - total_freq_none_sys_ = 0; - return true; -} - -void NGram::set_total_freq_none_sys(size_t freq_none_sys) { - total_freq_none_sys_ = freq_none_sys; - if (0 == total_freq_none_sys_) { - sys_score_compensation_ = 0; - } else { - double factor = static_cast<double>(kSysDictTotalFreq) / ( - kSysDictTotalFreq + total_freq_none_sys_); - sys_score_compensation_ = static_cast<float>( - log(factor) * kLogValueAmplifier); - } -} - -// The caller makes sure this oject is initialized. 
-float NGram::get_uni_psb(LemmaIdType lma_id) { - return static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) + - sys_score_compensation_; -} - -float NGram::convert_psb_to_score(double psb) { - float score = static_cast<float>( - log(psb) * static_cast<double>(kLogValueAmplifier)); - if (score > static_cast<float>(kMaxScore)) { - score = static_cast<float>(kMaxScore); - } - return score; -} - -#ifdef ___BUILD_MODEL___ -bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num, - LemmaIdType next_idx_unused) { - if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) - return false; - - double total_freq = 0; - double *freqs = new double[next_idx_unused]; - if (NULL == freqs) - return false; - - freqs[0] = ADD_COUNT; - total_freq += freqs[0]; - LemmaIdType idx_now = 0; - for (size_t pos = 0; pos < lemma_num; pos++) { - if (lemma_arr[pos].idx_by_hz == idx_now) - continue; - idx_now++; - - assert(lemma_arr[pos].idx_by_hz == idx_now); - - freqs[idx_now] = lemma_arr[pos].freq; - if (freqs[idx_now] <= 0) - freqs[idx_now] = 0.3; - - total_freq += freqs[idx_now]; - } - - double max_freq = 0; - idx_num_ = idx_now + 1; - assert(idx_now + 1 == next_idx_unused); - - for (size_t pos = 0; pos < idx_num_; pos++) { - freqs[pos] = freqs[pos] / total_freq; - assert(freqs[pos] > 0); - if (freqs[pos] > max_freq) - max_freq = freqs[pos]; - } - - // calculate the code book - if (NULL == freq_codes_df_) - freq_codes_df_ = new double[kCodeBookSize]; - assert(freq_codes_df_); - memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize); - - if (NULL == freq_codes_) - freq_codes_ = new LmaScoreType[kCodeBookSize]; - assert(freq_codes_); - memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize); - - size_t freq_pos = 0; - for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { - bool found = true; - - while (found) { - found = false; - double cand = freqs[freq_pos]; - for (size_t i = 0; i < code_pos; i++) - if (freq_codes_df_[i] == cand) { - found = true; - 
break; - } - if (found) - freq_pos++; - } - - freq_codes_df_[code_pos] = freqs[freq_pos]; - freq_pos++; - } - - myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double); - - if (NULL == lma_freq_idx_) - lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_]; - assert(lma_freq_idx_); - - iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_); - - delete [] freqs; - - if (kPrintDebug0) { - printf("\n------Language Model Unigram Codebook------\n"); - } - - for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { - double log_score = log(freq_codes_df_[code_pos]); - float final_score = convert_psb_to_score(freq_codes_df_[code_pos]); - if (kPrintDebug0) { - printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n", - code_pos, freq_codes_df_[code_pos], log_score, final_score); - } - freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score); - } - - initialized_ = true; - return true; -} -#endif - -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/pinyinime.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/pinyinime.cpp deleted file mode 100644 index 4d206a76..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/pinyinime.cpp +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <stdlib.h> -#include "../include/pinyinime.h" -#include "../include/dicttrie.h" -#include "../include/matrixsearch.h" -#include "../include/spellingtrie.h" - -#ifdef __cplusplus -extern "C" { -#endif - - using namespace ime_pinyin; - - // The maximum number of the prediction items. - static const size_t kMaxPredictNum = 500; - - // Used to search Pinyin string and give the best candidate. - MatrixSearch* matrix_search = NULL; - - char16 predict_buf[kMaxPredictNum][kMaxPredictSize + 1]; - - bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict) { - if (NULL != matrix_search) - delete matrix_search; - - matrix_search = new MatrixSearch(); - if (NULL == matrix_search) { - return false; - } - - return matrix_search->init(fn_sys_dict, fn_usr_dict); - } - - bool im_open_decoder_fd(int sys_fd, long start_offset, long length, - const char *fn_usr_dict) { - if (NULL != matrix_search) - delete matrix_search; - - matrix_search = new MatrixSearch(); - if (NULL == matrix_search) - return false; - - return matrix_search->init_fd(sys_fd, start_offset, length, fn_usr_dict); - } - - void im_close_decoder() { - if (NULL != matrix_search) { - matrix_search->close(); - delete matrix_search; - } - matrix_search = NULL; - } - - void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len) { - if (NULL != matrix_search) { - matrix_search->set_max_lens(max_sps_len, max_hzs_len); - } - } - - void im_flush_cache() { - if (NULL != matrix_search) - matrix_search->flush_cache(); - } - - // To be updated. 
- size_t im_search(const char* pybuf, size_t pylen) { - if (NULL == matrix_search) - return 0; - - matrix_search->search(pybuf, pylen); - return matrix_search->get_candidate_num(); - } - - size_t im_delsearch(size_t pos, bool is_pos_in_splid, - bool clear_fixed_this_step) { - if (NULL == matrix_search) - return 0; - matrix_search->delsearch(pos, is_pos_in_splid, clear_fixed_this_step); - return matrix_search->get_candidate_num(); - } - - void im_reset_search() { - if (NULL == matrix_search) - return; - - matrix_search->reset_search(); - } - - // To be removed - size_t im_add_letter(char ch) { - return 0; - } - - const char* im_get_sps_str(size_t *decoded_len) { - if (NULL == matrix_search) - return NULL; - - return matrix_search->get_pystr(decoded_len); - } - - char16* im_get_candidate(size_t cand_id, char16* cand_str, - size_t max_len) { - if (NULL == matrix_search) - return NULL; - - return matrix_search->get_candidate(cand_id, cand_str, max_len); - } - - size_t im_get_spl_start_pos(const uint16 *&spl_start) { - if (NULL == matrix_search) - return 0; - - return matrix_search->get_spl_start(spl_start); - } - - size_t im_choose(size_t choice_id) { - if (NULL == matrix_search) - return 0; - - return matrix_search->choose(choice_id); - } - - size_t im_cancel_last_choice() { - if (NULL == matrix_search) - return 0; - - return matrix_search->cancel_last_choice(); - } - - size_t im_get_fixed_len() { - if (NULL == matrix_search) - return 0; - - return matrix_search->get_fixedlen(); - } - - // To be removed - bool im_cancel_input() { - return true; - } - - - size_t im_get_predicts(const char16 *his_buf, - char16 (*&pre_buf)[kMaxPredictSize + 1]) { - if (NULL == his_buf) - return 0; - - size_t fixed_len = utf16_strlen(his_buf); - const char16 *fixed_ptr = his_buf; - if (fixed_len > kMaxPredictSize) { - fixed_ptr += fixed_len - kMaxPredictSize; - fixed_len = kMaxPredictSize; - } - - pre_buf = predict_buf; - return matrix_search->get_predicts(his_buf, pre_buf, 
kMaxPredictNum); - } - - void im_enable_shm_as_szm(bool enable) { - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - spl_trie.szm_enable_shm(enable); - } - - void im_enable_ym_as_szm(bool enable) { - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - spl_trie.szm_enable_ym(enable); - } - - void im_init_user_dictionary(const char *fn_usr_dict) { - if (!matrix_search) - return; - matrix_search->flush_cache(); - matrix_search->init_user_dictionary(fn_usr_dict); - } - - bool im_is_user_dictionary_enabled(void) { - return NULL != matrix_search ? matrix_search->is_user_dictionary_enabled() : false; - } - -#ifdef __cplusplus -} -#endif diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/searchutility.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/searchutility.cpp deleted file mode 100644 index 281da388..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/searchutility.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <assert.h> -#include "../include/mystdlib.h" -#include "../include/searchutility.h" - -namespace ime_pinyin { - -bool is_system_lemma(LemmaIdType lma_id) { - return (0 < lma_id && lma_id <= kSysDictIdEnd); -} - -bool is_user_lemma(LemmaIdType lma_id) { - return (kUserDictIdStart <= lma_id && lma_id <= kUserDictIdEnd); -} - -bool is_composing_lemma(LemmaIdType lma_id) { - return (kLemmaIdComposing == lma_id); -} - -int cmp_lpi_with_psb(const void *p1, const void *p2) { - if ((static_cast<const LmaPsbItem*>(p1))->psb > - (static_cast<const LmaPsbItem*>(p2))->psb) - return 1; - if ((static_cast<const LmaPsbItem*>(p1))->psb < - (static_cast<const LmaPsbItem*>(p2))->psb) - return -1; - return 0; -} - -int cmp_lpi_with_unified_psb(const void *p1, const void *p2) { - const LmaPsbItem *item1 = static_cast<const LmaPsbItem*>(p1); - const LmaPsbItem *item2 = static_cast<const LmaPsbItem*>(p2); - - // The real unified psb is psb1 / lma_len1 and psb2 * lma_len2 - // But we use psb1 * lma_len2 and psb2 * lma_len1 to get better - // precision. 
- size_t up1 = item1->psb * (item2->lma_len); - size_t up2 = item2->psb * (item1->lma_len); - if (up1 < up2) { - return -1; - } - if (up1 > up2) { - return 1; - } - return 0; -} - -int cmp_lpi_with_id(const void *p1, const void *p2) { - if ((static_cast<const LmaPsbItem*>(p1))->id < - (static_cast<const LmaPsbItem*>(p2))->id) - return -1; - if ((static_cast<const LmaPsbItem*>(p1))->id > - (static_cast<const LmaPsbItem*>(p2))->id) - return 1; - return 0; -} - -int cmp_lpi_with_hanzi(const void *p1, const void *p2) { - if ((static_cast<const LmaPsbItem*>(p1))->hanzi < - (static_cast<const LmaPsbItem*>(p2))->hanzi) - return -1; - if ((static_cast<const LmaPsbItem*>(p1))->hanzi > - (static_cast<const LmaPsbItem*>(p2))->hanzi) - return 1; - - return 0; -} - -int cmp_lpsi_with_str(const void *p1, const void *p2) { - return utf16_strcmp((static_cast<const LmaPsbStrItem*>(p1))->str, - (static_cast<const LmaPsbStrItem*>(p2))->str); -} - - -int cmp_hanzis_1(const void *p1, const void *p2) { - if (*static_cast<const char16*>(p1) < - *static_cast<const char16*>(p2)) - return -1; - - if (*static_cast<const char16*>(p1) > - *static_cast<const char16*>(p2)) - return 1; - return 0; -} - -int cmp_hanzis_2(const void *p1, const void *p2) { - return utf16_strncmp(static_cast<const char16*>(p1), - static_cast<const char16*>(p2), 2); -} - -int cmp_hanzis_3(const void *p1, const void *p2) { - return utf16_strncmp(static_cast<const char16*>(p1), - static_cast<const char16*>(p2), 3); -} - -int cmp_hanzis_4(const void *p1, const void *p2) { - return utf16_strncmp(static_cast<const char16*>(p1), - static_cast<const char16*>(p2), 4); -} - -int cmp_hanzis_5(const void *p1, const void *p2) { - return utf16_strncmp(static_cast<const char16*>(p1), - static_cast<const char16*>(p2), 5); -} - -int cmp_hanzis_6(const void *p1, const void *p2) { - return utf16_strncmp(static_cast<const char16*>(p1), - static_cast<const char16*>(p2), 6); -} - -int cmp_hanzis_7(const void *p1, const void *p2) { - 
return utf16_strncmp(static_cast<const char16*>(p1), - static_cast<const char16*>(p2), 7); -} - -int cmp_hanzis_8(const void *p1, const void *p2) { - return utf16_strncmp(static_cast<const char16*>(p1), - static_cast<const char16*>(p2), 8); -} - -int cmp_npre_by_score(const void *p1, const void *p2) { - if ((static_cast<const NPredictItem*>(p1))->psb > - (static_cast<const NPredictItem*>(p2))->psb) - return 1; - - if ((static_cast<const NPredictItem*>(p1))->psb < - (static_cast<const NPredictItem*>(p2))->psb) - return -1; - - return 0; -} - -int cmp_npre_by_hislen_score(const void *p1, const void *p2) { - if ((static_cast<const NPredictItem*>(p1))->his_len < - (static_cast<const NPredictItem*>(p2))->his_len) - return 1; - - if ((static_cast<const NPredictItem*>(p1))->his_len > - (static_cast<const NPredictItem*>(p2))->his_len) - return -1; - - if ((static_cast<const NPredictItem*>(p1))->psb > - (static_cast<const NPredictItem*>(p2))->psb) - return 1; - - if ((static_cast<const NPredictItem*>(p1))->psb < - (static_cast<const NPredictItem*>(p2))->psb) - return -1; - - return 0; -} - -int cmp_npre_by_hanzi_score(const void *p1, const void *p2) { - int ret_v = (utf16_strncmp((static_cast<const NPredictItem*>(p1))->pre_hzs, - (static_cast<const NPredictItem*>(p2))->pre_hzs, kMaxPredictSize)); - if (0 != ret_v) - return ret_v; - - if ((static_cast<const NPredictItem*>(p1))->psb > - (static_cast<const NPredictItem*>(p2))->psb) - return 1; - - if ((static_cast<const NPredictItem*>(p1))->psb < - (static_cast<const NPredictItem*>(p2))->psb) - return -1; - - return 0; -} - -size_t remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num) { - if (NULL == npre_items || 0 == npre_num) - return 0; - - myqsort(npre_items, npre_num, sizeof(NPredictItem), cmp_npre_by_hanzi_score); - - size_t remain_num = 1; // The first one is reserved. 
- for (size_t pos = 1; pos < npre_num; pos++) { - if (utf16_strncmp(npre_items[pos].pre_hzs, - npre_items[remain_num - 1].pre_hzs, - kMaxPredictSize) != 0) { - if (remain_num != pos) { - npre_items[remain_num] = npre_items[pos]; - } - remain_num++; - } - } - return remain_num; -} - -size_t align_to_size_t(size_t size) { - size_t s = sizeof(size_t); - return (size + s -1) / s * s; -} - -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/spellingtable.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/spellingtable.cpp deleted file mode 100644 index 6005e20d..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/spellingtable.cpp +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <assert.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <math.h> -#include "../include/spellingtable.h" - -namespace ime_pinyin { - -#ifdef ___BUILD_MODEL___ - -const char SpellingTable:: - kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"}; - -// "" is the biggest, so that all empty strings will be moved to the end -// _eb mean empty is biggest -int compare_raw_spl_eb(const void* p1, const void* p2) { - if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0]) - return 1; - - if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0]) - return -1; - - return strcmp((static_cast<const RawSpelling*>(p1))->str, - (static_cast<const RawSpelling*>(p2))->str); -} - -size_t get_odd_next(size_t value) { - size_t v_next = value; - while (true) { - size_t v_next_sqrt = (size_t)sqrt(v_next); - - bool is_odd = true; - for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) { - if (v_next % v_dv == 0) { - is_odd = false; - break; - } - } - - if (is_odd) - return v_next; - - v_next++; - } - - // never reach here - return 0; -} - -SpellingTable::SpellingTable() { - need_score_ = false; - raw_spellings_ = NULL; - spelling_buf_ = NULL; - spelling_num_ = 0; - total_freq_ = 0; - frozen_ = true; -} - -SpellingTable::~SpellingTable() { - free_resource(); -} - -size_t SpellingTable::get_hash_pos(const char* spelling_str) { - size_t hash_pos = 0; - for (size_t pos = 0; pos < spelling_size_; pos++) { - if ('\0' == spelling_str[pos]) - break; - hash_pos += (size_t)spelling_str[pos]; - } - - hash_pos = hash_pos % spelling_max_num_; - return hash_pos; -} - -size_t SpellingTable::hash_pos_next(size_t hash_pos) { - hash_pos += 123; - hash_pos = hash_pos % spelling_max_num_; - return hash_pos; -} - -void SpellingTable::free_resource() { - if (NULL != raw_spellings_) - delete [] raw_spellings_; - raw_spellings_ = NULL; - - if (NULL != spelling_buf_) - delete [] spelling_buf_; - spelling_buf_ = NULL; -} - -bool 
SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num, - bool need_score) { - if (pure_spl_size == 0 || spl_max_num ==0) - return false; - - need_score_ = need_score; - - free_resource(); - - spelling_size_ = pure_spl_size + 1; - if (need_score) - spelling_size_ += 1; - spelling_max_num_ = get_odd_next(spl_max_num); - spelling_num_ = 0; - - raw_spellings_ = new RawSpelling[spelling_max_num_]; - spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)]; - if (NULL == raw_spellings_ || NULL == spelling_buf_) { - free_resource(); - return false; - } - - memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling)); - memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_)); - frozen_ = false; - total_freq_ = 0; - return true; -} - -bool SpellingTable::put_spelling(const char* spelling_str, double freq) { - if (frozen_ || NULL == spelling_str) - return false; - - for (size_t pos = 0; pos < kNotSupportNum; pos++) { - if (strcmp(spelling_str, kNotSupportList[pos]) == 0) { - return false; - } - } - - total_freq_ += freq; - - size_t hash_pos = get_hash_pos(spelling_str); - - raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; - - if (strncmp(raw_spellings_[hash_pos].str, spelling_str, - spelling_size_ - 1) == 0) { - raw_spellings_[hash_pos].freq += freq; - return true; - } - - size_t hash_pos_ori = hash_pos; - - while (true) { - if (strncmp(raw_spellings_[hash_pos].str, - spelling_str, spelling_size_ - 1) == 0) { - raw_spellings_[hash_pos].freq += freq; - return true; - } - - if ('\0' == raw_spellings_[hash_pos].str[0]) { - raw_spellings_[hash_pos].freq += freq; - strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1); - raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; - spelling_num_++; - return true; - } - - hash_pos = hash_pos_next(hash_pos); - if (hash_pos_ori == hash_pos) - return false; - } - - // never reach here - return false; -} - -bool SpellingTable::contain(const char* spelling_str) { - if (NULL == 
spelling_str || NULL == spelling_buf_ || frozen_) - return false; - - size_t hash_pos = get_hash_pos(spelling_str); - - if ('\0' == raw_spellings_[hash_pos].str[0]) - return false; - - if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) - == 0) - return true; - - size_t hash_pos_ori = hash_pos; - - while (true) { - hash_pos = hash_pos_next(hash_pos); - if (hash_pos_ori == hash_pos) - return false; - - if ('\0' == raw_spellings_[hash_pos].str[0]) - return false; - - if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) - == 0) - return true; - } - - // never reach here - return false; -} - -const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) { - if (NULL == raw_spellings_ || NULL == spelling_buf_ || - NULL == item_size || NULL == spl_num) - return NULL; - - qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling), - compare_raw_spl_eb); - - // After sorting, only the first spelling_num_ items are valid. - // Copy them to the destination buffer. - for (size_t pos = 0; pos < spelling_num_; pos++) { - strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str, - spelling_size_); - } - - if (need_score_) { - if (kPrintDebug0) - printf("------------Spelling Possiblities--------------\n"); - - double max_score = 0; - double min_score = 0; - - // After sorting, only the first spelling_num_ items are valid. 
- for (size_t pos = 0; pos < spelling_num_; pos++) { - raw_spellings_[pos].freq /= total_freq_; - if (need_score_) { - if (0 == pos) { - max_score = raw_spellings_[0].freq; - min_score = max_score; - } else { - if (raw_spellings_[pos].freq > max_score) - max_score = raw_spellings_[pos].freq; - if (raw_spellings_[pos].freq < min_score) - min_score = raw_spellings_[pos].freq; - } - } - } - - if (kPrintDebug0) - printf("-----max psb: %f, min psb: %f\n", max_score, min_score); - - max_score = log(max_score); - min_score = log(min_score); - - if (kPrintDebug0) - printf("-----max log value: %f, min log value: %f\n", - max_score, min_score); - - // The absolute value of min_score is bigger than that of max_score because - // both of them are negative after log function. - score_amplifier_ = 1.0 * 255 / min_score; - - double average_score = 0; - for (size_t pos = 0; pos < spelling_num_; pos++) { - double score = log(raw_spellings_[pos].freq) * score_amplifier_; - assert(score >= 0); - - average_score += score; - - // Because of calculation precision issue, score might be a little bigger - // than 255 after being amplified. 
- if (score > 255) - score = 255; - char *this_spl_buf = spelling_buf_ + pos * spelling_size_; - this_spl_buf[spelling_size_ - 1] = - static_cast<char>((unsigned char)score); - - if (kPrintDebug0) { - printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf, - (unsigned char)this_spl_buf[spelling_size_ -1]); - } - } - average_score /= spelling_num_; - assert(average_score <= 255); - average_score_ = static_cast<uint8>(average_score); - - if (kPrintDebug0) - printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_, - average_score_); - } - - *item_size = spelling_size_; - *spl_num = spelling_num_; - frozen_ = true; - return spelling_buf_; -} - -float SpellingTable::get_score_amplifier() { - return static_cast<float>(score_amplifier_); -} - -unsigned char SpellingTable::get_average_score() { - return average_score_; -} - -#endif // ___BUILD_MODEL___ -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/spellingtrie.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/spellingtrie.cpp deleted file mode 100644 index e01c89a5..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/spellingtrie.cpp +++ /dev/null @@ -1,832 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include "../include/dictdef.h" - -#ifdef _WIN32 -#define snprintf _snprintf -#endif - -#ifdef ___BUILD_MODEL___ -#include "../include/spellingtable.h" -#endif - -#include "../include/spellingtrie.h" - -namespace ime_pinyin { - -SpellingTrie* SpellingTrie::instance_ = NULL; - -// z/c/s is for Zh/Ch/Sh -const char SpellingTrie::kHalfId2Sc_[kFullSplIdStart + 1] = - "0ABCcDEFGHIJKLMNOPQRSsTUVWXYZz"; - -// Bit 0 : is it a Shengmu char? -// Bit 1 : is it a Yunmu char? (one char is a Yunmu) -// Bit 2 : is it enabled in ShouZiMu(first char) mode? -unsigned char SpellingTrie::char_flags_[] = { - // a b c d e f g - 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, - // h i j k l m n - 0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, - // o p q r s t - 0x02, 0x01, 0x01, 0x01, 0x01, 0x01, - // u v w x y z - 0x00, 0x00, 0x01, 0x01, 0x01, 0x01 -}; - -int compare_spl(const void* p1, const void* p2) { - return strcmp((const char*)(p1), (const char*)(p2)); -} - -SpellingTrie::SpellingTrie() { - spelling_buf_ = NULL; - spelling_size_ = 0; - spelling_num_ = 0; - spl_ym_ids_ = NULL; - splstr_queried_ = NULL; - splstr16_queried_ = NULL; - root_ = NULL; - dumb_node_ = NULL; - splitter_node_ = NULL; - instance_ = NULL; - ym_buf_ = NULL; - f2h_ = NULL; - - szm_enable_shm(true); - szm_enable_ym(true); - -#ifdef ___BUILD_MODEL___ - node_num_ = 0; -#endif -} - -SpellingTrie::~SpellingTrie() { - if (NULL != spelling_buf_) - delete [] spelling_buf_; - - if (NULL != splstr_queried_) - delete [] splstr_queried_; - - if (NULL != splstr16_queried_) - delete [] splstr16_queried_; - - if (NULL != spl_ym_ids_) - delete [] spl_ym_ids_; - - if (NULL != root_) { - free_son_trie(root_); - delete root_; - } - - if (NULL != dumb_node_) { - delete [] dumb_node_; - } - - if (NULL != splitter_node_) { - delete [] splitter_node_; - } - - if (NULL != instance_) { - delete instance_; - instance_ = NULL; - } - - if (NULL != ym_buf_) - delete [] ym_buf_; - - 
if (NULL != f2h_) - delete [] f2h_; -} - -bool SpellingTrie::if_valid_id_update(uint16 *splid) const { - if (NULL == splid || 0 == *splid) - return false; - - if (*splid >= kFullSplIdStart) - return true; - if (*splid < kFullSplIdStart) { - char ch = kHalfId2Sc_[*splid]; - if (ch > 'Z') { - return true; - } else { - if (szm_is_enabled(ch)) { - return true; - } else if (is_yunmu_char(ch)) { - assert(h2f_num_[*splid] > 0); - *splid = h2f_start_[*splid]; - return true; - } - } - } - return false; -} - -bool SpellingTrie::is_half_id(uint16 splid) const { - if (0 == splid || splid >= kFullSplIdStart) - return false; - - return true; -} - -bool SpellingTrie::is_full_id(uint16 splid) const { - if (splid < kFullSplIdStart || splid >= kFullSplIdStart + spelling_num_) - return false; - return true; -} - -bool SpellingTrie::half_full_compatible(uint16 half_id, uint16 full_id) const { - uint16 half_fr_full = full_to_half(full_id); - - if (half_fr_full == half_id) - return true; - - // &~0x20 is used to conver the char to upper case. - // So that Zh/Ch/Sh(whose char is z/c/s) can be matched with Z/C/S. 
- char ch_f = (kHalfId2Sc_[half_fr_full] & (~0x20)); - char ch_h = kHalfId2Sc_[half_id]; - if (ch_f == ch_h) - return true; - - return false; -} - -bool SpellingTrie::is_half_id_yunmu(uint16 splid) const { - if (0 == splid || splid >= kFullSplIdStart) - return false; - - char ch = kHalfId2Sc_[splid]; - // If ch >= 'a', that means the half id is one of Zh/Ch/Sh - if (ch >= 'a') { - return false; - } - - return char_flags_[ch - 'A'] & kHalfIdYunmuMask; -} - -bool SpellingTrie::is_shengmu_char(char ch) const { - return char_flags_[ch - 'A'] & kHalfIdShengmuMask; -} - -bool SpellingTrie::is_yunmu_char(char ch) const { - return char_flags_[ch - 'A'] & kHalfIdYunmuMask; -} - -bool SpellingTrie::is_szm_char(char ch) const { - return is_shengmu_char(ch) || is_yunmu_char(ch); -} - -bool SpellingTrie::szm_is_enabled(char ch) const { - return char_flags_[ch - 'A'] & kHalfIdSzmMask; -} - -void SpellingTrie::szm_enable_shm(bool enable) { - if (enable) { - for (char ch = 'A'; ch <= 'Z'; ch++) { - if (is_shengmu_char(ch)) - char_flags_[ch - 'A'] = char_flags_[ch - 'A'] | kHalfIdSzmMask; - } - } else { - for (char ch = 'A'; ch <= 'Z'; ch++) { - if (is_shengmu_char(ch)) - char_flags_[ch - 'A'] = char_flags_[ch - 'A'] & (kHalfIdSzmMask ^ 0xff); - } - } -} - -void SpellingTrie::szm_enable_ym(bool enable) { - if (enable) { - for (char ch = 'A'; ch <= 'Z'; ch++) { - if (is_yunmu_char(ch)) - char_flags_[ch - 'A'] = char_flags_[ch - 'A'] | kHalfIdSzmMask; - } - } else { - for (char ch = 'A'; ch <= 'Z'; ch++) { - if (is_yunmu_char(ch)) - char_flags_[ch - 'A'] = char_flags_[ch - 'A'] & (kHalfIdSzmMask ^ 0xff); - } - } -} - -bool SpellingTrie::is_szm_enabled(char ch) const { - return char_flags_[ch - 'A'] & kHalfIdSzmMask; -} - -const SpellingTrie* SpellingTrie::get_cpinstance() { - return &get_instance(); -} - -SpellingTrie& SpellingTrie::get_instance() { - if (NULL == instance_) - instance_ = new SpellingTrie(); - - return *instance_; -} - -uint16 SpellingTrie::half2full_num(uint16 
half_id) const { - if (NULL == root_ || half_id >= kFullSplIdStart) - return 0; - return h2f_num_[half_id]; -} - -uint16 SpellingTrie::half_to_full(uint16 half_id, uint16 *spl_id_start) const { - if (NULL == spl_id_start || NULL == root_ || half_id >= kFullSplIdStart) - return 0; - - *spl_id_start = h2f_start_[half_id]; - return h2f_num_[half_id]; -} - -uint16 SpellingTrie::full_to_half(uint16 full_id) const { - if (NULL == root_ || full_id < kFullSplIdStart || - full_id > spelling_num_ + kFullSplIdStart) - return 0; - - return f2h_[full_id - kFullSplIdStart]; -} - -void SpellingTrie::free_son_trie(SpellingNode* node) { - if (NULL == node) - return; - - for (size_t pos = 0; pos < node->num_of_son; pos++) { - free_son_trie(node->first_son + pos); - } - - if (NULL != node->first_son) - delete [] node->first_son; -} - -bool SpellingTrie::construct(const char* spelling_arr, size_t item_size, - size_t item_num, float score_amplifier, - unsigned char average_score) { - if (spelling_arr == NULL) - return false; - - memset(h2f_start_, 0, sizeof(uint16) * kFullSplIdStart); - memset(h2f_num_, 0, sizeof(uint16) * kFullSplIdStart); - - // If the arr is the same as the buf, means this function is called by - // load_table(), the table data are ready; otherwise the array should be - // saved. 
- if (spelling_arr != spelling_buf_) { - if (NULL != spelling_buf_) - delete [] spelling_buf_; - spelling_buf_ = new char[item_size * item_num]; - if (NULL == spelling_buf_) - return false; - memcpy(spelling_buf_, spelling_arr, sizeof(char) * item_size * item_num); - } - - spelling_size_ = item_size; - spelling_num_ = item_num; - - score_amplifier_ = score_amplifier; - average_score_ = average_score; - - if (NULL != splstr_queried_) - delete [] splstr_queried_; - splstr_queried_ = new char[spelling_size_]; - if (NULL == splstr_queried_) - return false; - - if (NULL != splstr16_queried_) - delete [] splstr16_queried_; - splstr16_queried_ = new char16[spelling_size_]; - if (NULL == splstr16_queried_) - return false; - - // First, sort the buf to ensure they are in ascendant order - qsort(spelling_buf_, spelling_num_, spelling_size_, compare_spl); - -#ifdef ___BUILD_MODEL___ - node_num_ = 1; -#endif - - root_ = new SpellingNode(); - memset(root_, 0, sizeof(SpellingNode)); - - dumb_node_ = new SpellingNode(); - memset(dumb_node_, 0, sizeof(SpellingNode)); - dumb_node_->score = average_score_; - - splitter_node_ = new SpellingNode(); - memset(splitter_node_, 0, sizeof(SpellingNode)); - splitter_node_->score = average_score_; - - memset(level1_sons_, 0, sizeof(SpellingNode*) * kValidSplCharNum); - - root_->first_son = construct_spellings_subset(0, spelling_num_, 0, root_); - - // Root's score should be cleared. 
- root_->score = 0; - - if (NULL == root_->first_son) - return false; - - h2f_start_[0] = h2f_num_[0] = 0; - - if (!build_f2h()) - return false; - -#ifdef ___BUILD_MODEL___ - if (kPrintDebug0) { - printf("---SpellingTrie Nodes: %d\n", (int)node_num_); - } - return build_ym_info(); -#else - return true; -#endif -} - -#ifdef ___BUILD_MODEL___ -const char* SpellingTrie::get_ym_str(const char *spl_str) { - bool start_ZCS = false; - if (is_shengmu_char(*spl_str)) { - if ('Z' == *spl_str || 'C' == *spl_str || 'S' == *spl_str) - start_ZCS = true; - spl_str += 1; - if (start_ZCS && 'h' == *spl_str) - spl_str += 1; - } - return spl_str; -} - -bool SpellingTrie::build_ym_info() { - bool sucess; - SpellingTable *spl_table = new SpellingTable(); - - sucess = spl_table->init_table(kMaxPinyinSize - 1, 2 * kMaxYmNum, false); - assert(sucess); - - for (uint16 pos = 0; pos < spelling_num_; pos++) { - const char *spl_str = spelling_buf_ + spelling_size_ * pos; - spl_str = get_ym_str(spl_str); - if ('\0' != spl_str[0]) { - sucess = spl_table->put_spelling(spl_str, 0); - assert(sucess); - } - } - - size_t ym_item_size; // '\0' is included - size_t ym_num; - const char* ym_buf; - ym_buf = spl_table->arrange(&ym_item_size, &ym_num); - - if (NULL != ym_buf_) - delete [] ym_buf_; - ym_buf_ = new char[ym_item_size * ym_num]; - if (NULL == ym_buf_) { - delete spl_table; - return false; - } - - memcpy(ym_buf_, ym_buf, sizeof(char) * ym_item_size * ym_num); - ym_size_ = ym_item_size; - ym_num_ = ym_num; - - delete spl_table; - - // Generate the maping from the spelling ids to the Yunmu ids. 
- if (spl_ym_ids_) - delete spl_ym_ids_; - spl_ym_ids_ = new uint8[spelling_num_ + kFullSplIdStart]; - if (NULL == spl_ym_ids_) - return false; - - memset(spl_ym_ids_, 0, sizeof(uint8) * (spelling_num_ + kFullSplIdStart)); - - for (uint16 id = 1; id < spelling_num_ + kFullSplIdStart; id++) { - const char *str = get_spelling_str(id); - - str = get_ym_str(str); - if ('\0' != str[0]) { - uint8 ym_id = get_ym_id(str); - spl_ym_ids_[id] = ym_id; - assert(ym_id > 0); - } else { - spl_ym_ids_[id] = 0; - } - } - return true; -} -#endif - -SpellingNode* SpellingTrie::construct_spellings_subset( - size_t item_start, size_t item_end, size_t level, SpellingNode* parent) { - if (level >= spelling_size_ || item_end <= item_start || NULL == parent) - return NULL; - - SpellingNode *first_son = NULL; - uint16 num_of_son = 0; - unsigned char min_son_score = 255; - - const char *spelling_last_start = spelling_buf_ + spelling_size_ * item_start; - char char_for_node = spelling_last_start[level]; - assert((char_for_node >= 'A' && char_for_node <= 'Z') || - 'h' == char_for_node); - - // Scan the array to find how many sons - for (size_t i = item_start + 1; i < item_end; i++) { - const char *spelling_current = spelling_buf_ + spelling_size_ * i; - char char_current = spelling_current[level]; - if (char_current != char_for_node) { - num_of_son++; - char_for_node = char_current; - } - } - num_of_son++; - - // Allocate memory -#ifdef ___BUILD_MODEL___ - node_num_ += num_of_son; -#endif - first_son = new SpellingNode[num_of_son]; - memset(first_son, 0, sizeof(SpellingNode)*num_of_son); - - // Now begin construct tree - size_t son_pos = 0; - - spelling_last_start = spelling_buf_ + spelling_size_ * item_start; - char_for_node = spelling_last_start[level]; - - bool spelling_endable = true; - if (spelling_last_start[level + 1] != '\0') - spelling_endable = false; - - size_t item_start_next = item_start; - - for (size_t i = item_start + 1; i < item_end; i++) { - const char *spelling_current = 
spelling_buf_ + spelling_size_ * i; - char char_current = spelling_current[level]; - assert(is_valid_spl_char(char_current)); - - if (char_current != char_for_node) { - // Construct a node - SpellingNode *node_current = first_son + son_pos; - node_current->char_this_node = char_for_node; - - // For quick search in the first level - if (0 == level) - level1_sons_[char_for_node - 'A'] = node_current; - - if (spelling_endable) { - node_current->spelling_idx = kFullSplIdStart + item_start_next; - } - - if (spelling_last_start[level + 1] != '\0' || i - item_start_next > 1) { - size_t real_start = item_start_next; - if (spelling_last_start[level + 1] == '\0') - real_start++; - - node_current->first_son = - construct_spellings_subset(real_start, i, level + 1, - node_current); - - if (real_start == item_start_next + 1) { - uint16 score_this = static_cast<unsigned char>( - spelling_last_start[spelling_size_ - 1]); - if (score_this < node_current->score) - node_current->score = score_this; - } - } else { - node_current->first_son = NULL; - node_current->score = static_cast<unsigned char>( - spelling_last_start[spelling_size_ - 1]); - } - - if (node_current->score < min_son_score) - min_son_score = node_current->score; - - bool is_half = false; - if (level == 0 && is_szm_char(char_for_node)) { - node_current->spelling_idx = - static_cast<uint16>(char_for_node - 'A' + 1); - - if (char_for_node > 'C') - node_current->spelling_idx++; - if (char_for_node > 'S') - node_current->spelling_idx++; - - h2f_num_[node_current->spelling_idx] = i - item_start_next; - is_half = true; - } else if (level == 1 && char_for_node == 'h') { - char ch_level0 = spelling_last_start[0]; - uint16 part_id = 0; - if (ch_level0 == 'C') - part_id = 'C' - 'A' + 1 + 1; - else if (ch_level0 == 'S') - part_id = 'S' - 'A' + 1 + 2; - else if (ch_level0 == 'Z') - part_id = 'Z' - 'A' + 1 + 3; - if (0 != part_id) { - node_current->spelling_idx = part_id; - h2f_num_[node_current->spelling_idx] = i - item_start_next; 
- is_half = true; - } - } - - if (is_half) { - if (h2f_num_[node_current->spelling_idx] > 0) - h2f_start_[node_current->spelling_idx] = - item_start_next + kFullSplIdStart; - else - h2f_start_[node_current->spelling_idx] = 0; - } - - // for next sibling - spelling_last_start = spelling_current; - char_for_node = char_current; - item_start_next = i; - spelling_endable = true; - if (spelling_current[level + 1] != '\0') - spelling_endable = false; - - son_pos++; - } - } - - // the last one - SpellingNode *node_current = first_son + son_pos; - node_current->char_this_node = char_for_node; - - // For quick search in the first level - if (0 == level) - level1_sons_[char_for_node - 'A'] = node_current; - - if (spelling_endable) { - node_current->spelling_idx = kFullSplIdStart + item_start_next; - } - - if (spelling_last_start[level + 1] != '\0' || - item_end - item_start_next > 1) { - size_t real_start = item_start_next; - if (spelling_last_start[level + 1] == '\0') - real_start++; - - node_current->first_son = - construct_spellings_subset(real_start, item_end, level + 1, - node_current); - - if (real_start == item_start_next + 1) { - uint16 score_this = static_cast<unsigned char>( - spelling_last_start[spelling_size_ - 1]); - if (score_this < node_current->score) - node_current->score = score_this; - } - } else { - node_current->first_son = NULL; - node_current->score = static_cast<unsigned char>( - spelling_last_start[spelling_size_ - 1]); - } - - if (node_current->score < min_son_score) - min_son_score = node_current->score; - - assert(son_pos + 1 == num_of_son); - - bool is_half = false; - if (level == 0 && szm_is_enabled(char_for_node)) { - node_current->spelling_idx = static_cast<uint16>(char_for_node - 'A' + 1); - - if (char_for_node > 'C') - node_current->spelling_idx++; - if (char_for_node > 'S') - node_current->spelling_idx++; - - h2f_num_[node_current->spelling_idx] = item_end - item_start_next; - is_half = true; - } else if (level == 1 && char_for_node == 'h') 
{ - char ch_level0 = spelling_last_start[0]; - uint16 part_id = 0; - if (ch_level0 == 'C') - part_id = 'C' - 'A' + 1 + 1; - else if (ch_level0 == 'S') - part_id = 'S' - 'A' + 1 + 2; - else if (ch_level0 == 'Z') - part_id = 'Z' - 'A' + 1 + 3; - if (0 != part_id) { - node_current->spelling_idx = part_id; - h2f_num_[node_current->spelling_idx] = item_end - item_start_next; - is_half = true; - } - } - if (is_half) { - if (h2f_num_[node_current->spelling_idx] > 0) - h2f_start_[node_current->spelling_idx] = - item_start_next + kFullSplIdStart; - else - h2f_start_[node_current->spelling_idx] = 0; - } - - parent->num_of_son = num_of_son; - parent->score = min_son_score; - return first_son; -} - -bool SpellingTrie::save_spl_trie(FILE *fp) { - if (NULL == fp || NULL == spelling_buf_) - return false; - - if (fwrite(&spelling_size_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(&spelling_num_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fwrite(&score_amplifier_, sizeof(float), 1, fp) != 1) - return false; - - if (fwrite(&average_score_, sizeof(unsigned char), 1, fp) != 1) - return false; - - if (fwrite(spelling_buf_, sizeof(char) * spelling_size_, - spelling_num_, fp) != spelling_num_) - return false; - - return true; -} - -bool SpellingTrie::load_spl_trie(FILE *fp) { - if (NULL == fp) - return false; - - if (fread(&spelling_size_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(&spelling_num_, sizeof(uint32), 1, fp) != 1) - return false; - - if (fread(&score_amplifier_, sizeof(float), 1, fp) != 1) - return false; - - if (fread(&average_score_, sizeof(unsigned char), 1, fp) != 1) - return false; - - if (NULL != spelling_buf_) - delete [] spelling_buf_; - - spelling_buf_ = new char[spelling_size_ * spelling_num_]; - if (NULL == spelling_buf_) - return false; - - if (fread(spelling_buf_, sizeof(char) * spelling_size_, - spelling_num_, fp) != spelling_num_) - return false; - - return construct(spelling_buf_, spelling_size_, spelling_num_, - 
score_amplifier_, average_score_); -} - -bool SpellingTrie::build_f2h() { - if (NULL != f2h_) - delete [] f2h_; - f2h_ = new uint16[spelling_num_]; - if (NULL == f2h_) - return false; - - for (uint16 hid = 0; hid < kFullSplIdStart; hid++) { - for (uint16 fid = h2f_start_[hid]; - fid < h2f_start_[hid] + h2f_num_[hid]; fid++) - f2h_[fid - kFullSplIdStart] = hid; - } - - return true; -} - -size_t SpellingTrie::get_spelling_num() { - return spelling_num_; -} - -uint8 SpellingTrie::get_ym_id(const char *ym_str) { - if (NULL == ym_str || NULL == ym_buf_) - return 0; - - for (uint8 pos = 0; pos < ym_num_; pos++) - if (strcmp(ym_buf_ + ym_size_ * pos, ym_str) == 0) - return pos + 1; - - return 0; -} - -const char* SpellingTrie::get_spelling_str(uint16 splid) { - splstr_queried_[0] = '\0'; - - if (splid >= kFullSplIdStart) { - splid -= kFullSplIdStart; - snprintf(splstr_queried_, spelling_size_, "%s", - spelling_buf_ + splid * spelling_size_); - } else { - if (splid == 'C' - 'A' + 1 + 1) { - snprintf(splstr_queried_, spelling_size_, "%s", "Ch"); - } else if (splid == 'S' - 'A' + 1 + 2) { - snprintf(splstr_queried_, spelling_size_, "%s", "Sh"); - } else if (splid == 'Z' - 'A' + 1 + 3) { - snprintf(splstr_queried_, spelling_size_, "%s", "Zh"); - } else { - if (splid > 'C' - 'A' + 1) - splid--; - if (splid > 'S' - 'A' + 1) - splid--; - splstr_queried_[0] = 'A' + splid - 1; - splstr_queried_[1] = '\0'; - } - } - return splstr_queried_; -} - -const char16* SpellingTrie::get_spelling_str16(uint16 splid) { - splstr16_queried_[0] = '\0'; - - if (splid >= kFullSplIdStart) { - splid -= kFullSplIdStart; - for (size_t pos = 0; pos < spelling_size_; pos++) { - splstr16_queried_[pos] = static_cast<char16> - (spelling_buf_[splid * spelling_size_ + pos]); - } - } else { - if (splid == 'C' - 'A' + 1 + 1) { - splstr16_queried_[0] = static_cast<char16>('C'); - splstr16_queried_[1] = static_cast<char16>('h'); - splstr16_queried_[2] = static_cast<char16>('\0'); - } else if (splid == 'S' - 'A' + 
1 + 2) { - splstr16_queried_[0] = static_cast<char16>('S'); - splstr16_queried_[1] = static_cast<char16>('h'); - splstr16_queried_[2] = static_cast<char16>('\0'); - } else if (splid == 'Z' - 'A' + 1 + 3) { - splstr16_queried_[0] = static_cast<char16>('Z'); - splstr16_queried_[1] = static_cast<char16>('h'); - splstr16_queried_[2] = static_cast<char16>('\0'); - } else { - if (splid > 'C' - 'A' + 1) - splid--; - if (splid > 'S' - 'A' + 1) - splid--; - splstr16_queried_[0] = 'A' + splid - 1; - splstr16_queried_[1] = '\0'; - } - } - return splstr16_queried_; -} - -size_t SpellingTrie::get_spelling_str16(uint16 splid, char16 *splstr16, - size_t splstr16_len) { - if (NULL == splstr16 || splstr16_len < kMaxPinyinSize + 1) return 0; - - if (splid >= kFullSplIdStart) { - splid -= kFullSplIdStart; - for (size_t pos = 0; pos <= kMaxPinyinSize; pos++) { - splstr16[pos] = static_cast<char16> - (spelling_buf_[splid * spelling_size_ + pos]); - if (static_cast<char16>('\0') == splstr16[pos]) { - return pos; - } - } - } else { - if (splid == 'C' - 'A' + 1 + 1) { - splstr16[0] = static_cast<char16>('C'); - splstr16[1] = static_cast<char16>('h'); - splstr16[2] = static_cast<char16>('\0'); - return 2; - } else if (splid == 'S' - 'A' + 1 + 2) { - splstr16[0] = static_cast<char16>('S'); - splstr16[1] = static_cast<char16>('h'); - splstr16[2] = static_cast<char16>('\0'); - return 2; - } else if (splid == 'Z' - 'A' + 1 + 3) { - splstr16[0] = static_cast<char16>('Z'); - splstr16[1] = static_cast<char16>('h'); - splstr16[2] = static_cast<char16>('\0'); - return 2; - } else { - if (splid > 'C' - 'A' + 1) - splid--; - if (splid > 'S' - 'A' + 1) - splid--; - splstr16[0] = 'A' + splid - 1; - splstr16[1] = '\0'; - return 1; - } - } - - // Not reachable. 
- return 0; -} - -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/splparser.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/splparser.cpp deleted file mode 100644 index d75aec6a..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/splparser.cpp +++ /dev/null @@ -1,341 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <assert.h> -#include "../include/splparser.h" - -namespace ime_pinyin { - -SpellingParser::SpellingParser() { - spl_trie_ = SpellingTrie::get_cpinstance(); -} - -bool SpellingParser::is_valid_to_parse(char ch) { - return SpellingTrie::is_valid_spl_char(ch); -} - -uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len, - uint16 spl_idx[], uint16 start_pos[], - uint16 max_size, bool &last_is_pre) { - if (NULL == splstr || 0 == max_size || 0 == str_len) - return 0; - - if (!SpellingTrie::is_valid_spl_char(splstr[0])) - return 0; - - last_is_pre = false; - - const SpellingNode *node_this = spl_trie_->root_; - - uint16 str_pos = 0; - uint16 idx_num = 0; - if (NULL != start_pos) - start_pos[0] = 0; - bool last_is_splitter = false; - - while (str_pos < str_len) { - char char_this = splstr[str_pos]; - // all characters outside of [a, z] are considered as splitters - if (!SpellingTrie::is_valid_spl_char(char_this)) { - // test if the current node is endable - uint16 id_this = node_this->spelling_idx; - if 
(spl_trie_->if_valid_id_update(&id_this)) { - spl_idx[idx_num] = id_this; - - idx_num++; - str_pos++; - if (NULL != start_pos) - start_pos[idx_num] = str_pos; - if (idx_num >= max_size) - return idx_num; - - node_this = spl_trie_->root_; - last_is_splitter = true; - continue; - } else { - if (last_is_splitter) { - str_pos++; - if (NULL != start_pos) - start_pos[idx_num] = str_pos; - continue; - } else { - return idx_num; - } - } - } - - last_is_splitter = false; - - SpellingNode *found_son = NULL; - - if (0 == str_pos) { - if (char_this >= 'a') - found_son = spl_trie_->level1_sons_[char_this - 'a']; - else - found_son = spl_trie_->level1_sons_[char_this - 'A']; - } else { - SpellingNode *first_son = node_this->first_son; - // Because for Zh/Ch/Sh nodes, they are the last in the buffer and - // frequently used, so we scan from the end. - for (int i = 0; i < node_this->num_of_son; i++) { - SpellingNode *this_son = first_son + i; - if (SpellingTrie::is_same_spl_char( - this_son->char_this_node, char_this)) { - found_son = this_son; - break; - } - } - } - - // found, just move the current node pointer to the the son - if (NULL != found_son) { - node_this = found_son; - } else { - // not found, test if it is endable - uint16 id_this = node_this->spelling_idx; - if (spl_trie_->if_valid_id_update(&id_this)) { - // endable, remember the index - spl_idx[idx_num] = id_this; - - idx_num++; - if (NULL != start_pos) - start_pos[idx_num] = str_pos; - if (idx_num >= max_size) - return idx_num; - node_this = spl_trie_->root_; - continue; - } else { - return idx_num; - } - } - - str_pos++; - } - - uint16 id_this = node_this->spelling_idx; - if (spl_trie_->if_valid_id_update(&id_this)) { - // endable, remember the index - spl_idx[idx_num] = id_this; - - idx_num++; - if (NULL != start_pos) - start_pos[idx_num] = str_pos; - } - - last_is_pre = !last_is_splitter; - - return idx_num; -} - -uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len, - uint16 spl_idx[], 
uint16 start_pos[], - uint16 max_size, bool &last_is_pre) { - uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos, - max_size, last_is_pre); - for (uint16 pos = 0; pos < idx_num; pos++) { - if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { - spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); - if (pos == idx_num - 1) { - last_is_pre = false; - } - } - } - return idx_num; -} - -uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len, - uint16 spl_idx[], uint16 start_pos[], - uint16 max_size, bool &last_is_pre) { - if (NULL == splstr || 0 == max_size || 0 == str_len) - return 0; - - if (!SpellingTrie::is_valid_spl_char(splstr[0])) - return 0; - - last_is_pre = false; - - const SpellingNode *node_this = spl_trie_->root_; - - uint16 str_pos = 0; - uint16 idx_num = 0; - if (NULL != start_pos) - start_pos[0] = 0; - bool last_is_splitter = false; - - while (str_pos < str_len) { - char16 char_this = splstr[str_pos]; - // all characters outside of [a, z] are considered as splitters - if (!SpellingTrie::is_valid_spl_char(char_this)) { - // test if the current node is endable - uint16 id_this = node_this->spelling_idx; - if (spl_trie_->if_valid_id_update(&id_this)) { - spl_idx[idx_num] = id_this; - - idx_num++; - str_pos++; - if (NULL != start_pos) - start_pos[idx_num] = str_pos; - if (idx_num >= max_size) - return idx_num; - - node_this = spl_trie_->root_; - last_is_splitter = true; - continue; - } else { - if (last_is_splitter) { - str_pos++; - if (NULL != start_pos) - start_pos[idx_num] = str_pos; - continue; - } else { - return idx_num; - } - } - } - - last_is_splitter = false; - - SpellingNode *found_son = NULL; - - if (0 == str_pos) { - if (char_this >= 'a') - found_son = spl_trie_->level1_sons_[char_this - 'a']; - else - found_son = spl_trie_->level1_sons_[char_this - 'A']; - } else { - SpellingNode *first_son = node_this->first_son; - // Because for Zh/Ch/Sh nodes, they are the last in the buffer and - // frequently used, so we 
scan from the end. - for (int i = 0; i < node_this->num_of_son; i++) { - SpellingNode *this_son = first_son + i; - if (SpellingTrie::is_same_spl_char( - this_son->char_this_node, char_this)) { - found_son = this_son; - break; - } - } - } - - // found, just move the current node pointer to the the son - if (NULL != found_son) { - node_this = found_son; - } else { - // not found, test if it is endable - uint16 id_this = node_this->spelling_idx; - if (spl_trie_->if_valid_id_update(&id_this)) { - // endable, remember the index - spl_idx[idx_num] = id_this; - - idx_num++; - if (NULL != start_pos) - start_pos[idx_num] = str_pos; - if (idx_num >= max_size) - return idx_num; - node_this = spl_trie_->root_; - continue; - } else { - return idx_num; - } - } - - str_pos++; - } - - uint16 id_this = node_this->spelling_idx; - if (spl_trie_->if_valid_id_update(&id_this)) { - // endable, remember the index - spl_idx[idx_num] = id_this; - - idx_num++; - if (NULL != start_pos) - start_pos[idx_num] = str_pos; - } - - last_is_pre = !last_is_splitter; - - return idx_num; -} - -uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len, - uint16 spl_idx[], uint16 start_pos[], - uint16 max_size, bool &last_is_pre) { - uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos, - max_size, last_is_pre); - for (uint16 pos = 0; pos < idx_num; pos++) { - if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { - spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); - if (pos == idx_num - 1) { - last_is_pre = false; - } - } - } - return idx_num; -} - -uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len, - bool *is_pre) { - if (NULL == is_pre) - return 0; - - uint16 spl_idx[2]; - uint16 start_pos[3]; - - if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) - return 0; - - if (start_pos[1] != str_len) - return 0; - return spl_idx[0]; -} - -uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len, - bool 
*is_pre) { - if (NULL == is_pre) - return 0; - - uint16 spl_idx[2]; - uint16 start_pos[3]; - - if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) - return 0; - - if (start_pos[1] != str_len) - return 0; - if (spl_trie_->is_half_id_yunmu(spl_idx[0])) { - spl_trie_->half_to_full(spl_idx[0], spl_idx); - *is_pre = false; - } - - return spl_idx[0]; -} - -uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len, - uint16 splidx[], uint16 max_size, - uint16 &full_id_num, bool &is_pre) { - if (max_size <= 0 || !is_valid_to_parse(splstr[0])) - return 0; - - splidx[0] = get_splid_by_str(splstr, str_len, &is_pre); - full_id_num = 0; - if (0 != splidx[0]) { - if (splidx[0] >= kFullSplIdStart) - full_id_num = 1; - return 1; - } - return 0; -} - -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/sync.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/sync.cpp deleted file mode 100644 index 91e27b88..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/sync.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
#ifdef ___SYNC_ENABLED___

namespace ime_pinyin {

// Creates an idle Sync object; begin() must be called before use.
Sync::Sync()
    : userdict_(NULL),
      dictfile_(NULL),
      last_count_(0) {
}

// Releases the dictionary and file name if the session was never finished,
// so destroying an open Sync object does not leak them.
Sync::~Sync() {
  finish();
}

// Opens the user dictionary stored in |filename| for synchronization.
// Any session that is still open is finished first.
// Returns false if |filename| is NULL, memory cannot be obtained, or the
// dictionary cannot be loaded; the object is left idle in that case.
bool Sync::begin(const char * filename) {
  if (userdict_) {
    finish();
  }

  if (!filename) {
    return false;
  }

  dictfile_ = strdup(filename);
  if (!dictfile_) {
    return false;
  }

  userdict_ = new UserDict();
  if (!userdict_) {
    free(dictfile_);
    dictfile_ = NULL;
    return false;
  }

  if (userdict_->load_dict((const char*)dictfile_, kUserDictIdStart,
                           kUserDictIdEnd) == false) {
    delete userdict_;
    userdict_ = NULL;
    free(dictfile_);
    dictfile_ = NULL;
    return false;
  }

  userdict_->set_limit(kUserDictMaxLemmaCount, kUserDictMaxLemmaSize,
                       kUserDictRatio);

  return true;
}

// Forwards |len| characters of |lemmas| to the user dictionary.
// Returns 0 when no session is open.
int Sync::put_lemmas(char16 * lemmas, int len) {
  if (!userdict_) {
    return 0;
  }
  return userdict_->put_lemmas_no_sync_from_utf16le_string(lemmas, len);
}

// Fetches pending sync lemmas into |str| (capacity |size|) and records how
// many were fetched in last_count_. Returns 0 when no session is open.
int Sync::get_lemmas(char16 * str, int size) {
  if (!userdict_) {
    return 0;
  }
  return userdict_->get_sync_lemmas_in_utf16le_string_from_beginning(
      str, size, &last_count_);
}

// Returns the count recorded by the most recent get_lemmas() call.
int Sync::get_last_got_count() {
  return last_count_;
}

// Returns the dictionary's sync count, or 0 when no session is open.
int Sync::get_total_count() {
  if (!userdict_) {
    return 0;
  }
  return userdict_->get_sync_count();
}

// Clears the lemmas fetched by the most recent get_lemmas() call.
void Sync::clear_last_got() {
  if (!userdict_ || last_count_ < 0) {
    return;
  }
  userdict_->clear_sync_lemmas(0, last_count_);
  last_count_ = 0;
}

// Closes the dictionary and returns the object to the idle state.
// Does nothing when no session is open.
void Sync::finish() {
  if (!userdict_) {
    return;
  }
  userdict_->close_dict();
  delete userdict_;
  userdict_ = NULL;
  free(dictfile_);
  dictfile_ = NULL;
  last_count_ = 0;
}

// Returns the lemma limit minus the current lemma count as reported by the
// dictionary, or 0 when no session is open.
int Sync::get_capacity() {
  if (!userdict_) {
    return 0;
  }
  UserDict::UserDictStat stat;
  userdict_->state(&stat);
  return stat.limit_lemma_count - stat.lemma_count;
}

}
#endif
+0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../include/userdict.h" -#include "../include/splparser.h" -#include "../include/ngram.h" -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#ifdef ___DEBUG_PERF___ -#include <cutils/log.h> -#endif -#ifdef _WIN32 -#include <io.h> -#else -#include <unistd.h> -#endif -#include <fcntl.h> -#include <sys/stat.h> -#include <assert.h> -#include <ctype.h> -#include <sys/types.h> -#ifdef _WIN32 -#undef max -#undef min -#include <QDateTime> -#include <QMutex> -#else -#include <pthread.h> -#endif -#include <math.h> - -namespace ime_pinyin { - -#ifdef _WIN32 -static int gettimeofday(struct timeval *tp, void *) { - const qint64 current_msecs_since_epoch = QDateTime::currentMSecsSinceEpoch(); - tp->tv_sec = (long)(current_msecs_since_epoch / 1000); - tp->tv_usec = (long)((current_msecs_since_epoch % 1000) * 1000); - return 0; -} -#endif - -#ifdef ___DEBUG_PERF___ -static uint64 _ellapse_ = 0; -static struct timeval _tv_start_, _tv_end_; -#define DEBUG_PERF_BEGIN \ - do { \ - gettimeofday(&_tv_start_, NULL); \ - } while (0) -#define DEBUG_PERF_END \ - do { \ - gettimeofday(&_tv_end_, NULL); \ - _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \ - (_tv_end_.tv_usec - _tv_start_.tv_usec); \ - } while (0) -#define LOGD_PERF(message) \ - ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_); -#else -#define 
DEBUG_PERF_BEGIN -#define DEBUG_PERF_END -#define LOGD_PERF(message) -#endif - -// XXX File load and write are thread-safe by g_mutex_ -#ifdef _WIN32 -static QMutex g_mutex_; -#define pthread_mutex_lock(MUTEX) ((MUTEX)->lock()) -#define pthread_mutex_unlock(MUTEX) ((MUTEX)->unlock()) -#define pthread_mutex_trylock(MUTEX) (!(MUTEX)->tryLock(0)) -#else -static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER; -#endif -static struct timeval g_last_update_ = {0, 0}; - -inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) { - return (4 + info->lemma_size + (info->lemma_count << 3) -#ifdef ___PREDICT_ENABLED___ - + (info->lemma_count << 2) -#endif -#ifdef ___SYNC_ENABLED___ - + (info->sync_count << 2) -#endif - + sizeof(*info)); -} - -inline LmaScoreType UserDict::translate_score(int raw_score) { - // 1) ori_freq: original user frequency - uint32 ori_freq = extract_score_freq(raw_score); - // 2) lmt_off: lmt index (week offset for example) - uint64 lmt_off = ((raw_score & 0xffff0000) >> 16); - if (kUserDictLMTBitWidth < 16) { - uint64 mask = ~(1 << kUserDictLMTBitWidth); - lmt_off &= mask; - } - // 3) now_off: current time index (current week offset for example) - // assuming load_time_ is around current time - uint64 now_off = load_time_.tv_sec; - now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity; - now_off = (now_off << (64 - kUserDictLMTBitWidth)); - now_off = (now_off >> (64 - kUserDictLMTBitWidth)); - // 4) factor: decide expand-factor - int delta = now_off - lmt_off; - if (delta > 4) - delta = 4; - int factor = 80 - (delta << 4); - - double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_); - return (LmaScoreType)(log((double)factor * (double)ori_freq / tf) - * NGram::kLogValueAmplifier); -} - -inline int UserDict::extract_score_freq(int raw_score) { - // Frequence stored in lowest 16 bits - int freq = (raw_score & 0x0000ffff); - return freq; -} - -inline uint64 UserDict::extract_score_lmt(int raw_score) { - uint64 lmt = 
((raw_score & 0xffff0000) >> 16); - if (kUserDictLMTBitWidth < 16) { - uint64 mask = ~(1 << kUserDictLMTBitWidth); - lmt &= mask; - } - lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince; - return lmt; -} - -inline int UserDict::build_score(uint64 lmt, int freq) { - lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity; - lmt = (lmt << (64 - kUserDictLMTBitWidth)); - lmt = (lmt >> (64 - kUserDictLMTBitWidth)); - uint16 lmt16 = (uint16)lmt; - int s = freq; - s &= 0x0000ffff; - s = (lmt16 << 16) | s; - return s; -} - -inline int64 UserDict::utf16le_atoll(uint16 *s, int len) { - int64 ret = 0; - if (len <= 0) - return ret; - - int flag = 1; - const uint16 * endp = s + len; - if (*s == '-') { - flag = -1; - s++; - } else if (*s == '+') { - s++; - } - - while (*s >= '0' && *s <= '9' && s < endp) { - ret += ret * 10 + (*s) - '0'; - s++; - } - return ret * flag; -} - -inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) { - if (!s || size <= 0) - return 0; - uint16 *endp = s + size; - int ret_len = 0; - if (v < 0) { - *(s++) = '-'; - ++ret_len; - v *= -1; - } - - uint16 *b = s; - while (s < endp && v != 0) { - *(s++) = '0' + (v % 10); - v = v / 10; - ++ret_len; - } - - if (v != 0) - return 0; - - --s; - - while (b < s) { - *b = *s; - ++b, --s; - } - - return ret_len; -} - -inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) { - offset &= kUserDictOffsetMask; - lemmas_[offset] |= flag; -} - -inline char UserDict::get_lemma_flag(uint32 offset) { - offset &= kUserDictOffsetMask; - return (char)(lemmas_[offset]); -} - -inline char UserDict::get_lemma_nchar(uint32 offset) { - offset &= kUserDictOffsetMask; - return (char)(lemmas_[offset + 1]); -} - -inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) { - offset &= kUserDictOffsetMask; - return (uint16 *)(lemmas_ + offset + 2); -} - -inline uint16 * UserDict::get_lemma_word(uint32 offset) { - offset &= kUserDictOffsetMask; - uint8 nchar = get_lemma_nchar(offset); - return (uint16 
*)(lemmas_ + offset + 2 + (nchar << 1)); -} - -inline LemmaIdType UserDict::get_max_lemma_id() { - // When a lemma is deleted, we don't not claim its id back for - // simplicity and performance - return start_id_ + dict_info_.lemma_count - 1; -} - -inline bool UserDict::is_valid_lemma_id(LemmaIdType id) { - if (id >= start_id_ && id <= get_max_lemma_id()) - return true; - return false; -} - -inline bool UserDict::is_valid_state() { - if (state_ == USER_DICT_NONE) - return false; - return true; -} - -UserDict::UserDict() - : start_id_(0), - version_(0), - lemmas_(NULL), - offsets_(NULL), - scores_(NULL), - ids_(NULL), -#ifdef ___PREDICT_ENABLED___ - predicts_(NULL), -#endif -#ifdef ___SYNC_ENABLED___ - syncs_(NULL), - sync_count_size_(0), -#endif - offsets_by_id_(NULL), - lemma_count_left_(0), - lemma_size_left_(0), - dict_file_(NULL), - state_(USER_DICT_NONE) { - memset(&dict_info_, 0, sizeof(dict_info_)); - memset(&load_time_, 0, sizeof(load_time_)); -#ifdef ___CACHE_ENABLED___ - cache_init(); -#endif -} - -UserDict::~UserDict() { - close_dict(); -} - -bool UserDict::load_dict(const char *file_name, LemmaIdType start_id, - LemmaIdType end_id) { -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - dict_file_ = strdup(file_name); - if (!dict_file_) - return false; - - start_id_ = start_id; - - if (false == validate(file_name) && false == reset(file_name)) { - goto error; - } - if (false == load(file_name, start_id)) { - goto error; - } - - state_ = USER_DICT_SYNC; - - gettimeofday(&load_time_, NULL); - -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("load_dict"); -#endif - return true; - error: - free((void*)dict_file_); - dict_file_ = NULL; - start_id_ = 0; - return false; -} - -bool UserDict::close_dict() { - if (state_ == USER_DICT_NONE) - return true; - if (state_ == USER_DICT_SYNC) - goto out; - - // If dictionary is written back by others, - // we can not simply write back here - // To do a safe flush, we have to discard all newly added - // lemmas 
and try to reload dict file. - pthread_mutex_lock(&g_mutex_); - if (load_time_.tv_sec > g_last_update_.tv_sec || - (load_time_.tv_sec == g_last_update_.tv_sec && - load_time_.tv_usec > g_last_update_.tv_usec)) { - write_back(); - gettimeofday(&g_last_update_, NULL); - } - pthread_mutex_unlock(&g_mutex_); - - out: - free((void*)dict_file_); - free(lemmas_); - free(offsets_); - free(offsets_by_id_); - free(scores_); - free(ids_); -#ifdef ___PREDICT_ENABLED___ - free(predicts_); -#endif - - version_ = 0; - dict_file_ = NULL; - lemmas_ = NULL; -#ifdef ___SYNC_ENABLED___ - syncs_ = NULL; - sync_count_size_ = 0; -#endif - offsets_ = NULL; - offsets_by_id_ = NULL; - scores_ = NULL; - ids_ = NULL; -#ifdef ___PREDICT_ENABLED___ - predicts_ = NULL; -#endif - - memset(&dict_info_, 0, sizeof(dict_info_)); - lemma_count_left_ = 0; - lemma_size_left_ = 0; - state_ = USER_DICT_NONE; - - return true; -} - -size_t UserDict::number_of_lemmas() { - return dict_info_.lemma_count; -} - -void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { - return; -} - -MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle, - const DictExtPara *dep, - LmaPsbItem *lpi_items, - size_t lpi_max, size_t *lpi_num) { - if (is_valid_state() == false) - return 0; - - bool need_extend = false; - -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1, - lpi_items, lpi_max, &need_extend); -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("extend_dict"); -#endif - return ((*lpi_num > 0 || need_extend) ? 
1 : 0); -} - -int UserDict::is_fuzzy_prefix_spell_id( - const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) { - if (len1 < searchable->splids_len) - return 0; - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - uint32 i = 0; - for (i = 0; i < searchable->splids_len; i++) { - const char py1 = *spl_trie.get_spelling_str(id1[i]); - uint16 off = 8 * (i % 4); - const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off); - if (py1 == py2) - continue; - return 0; - } - return 1; -} - -int UserDict::fuzzy_compare_spell_id( - const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) { - if (len1 < searchable->splids_len) - return -1; - if (len1 > searchable->splids_len) - return 1; - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - uint32 i = 0; - for (i = 0; i < len1; i++) { - const char py1 = *spl_trie.get_spelling_str(id1[i]); - uint16 off = 8 * (i % 4); - const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off); - if (py1 == py2) - continue; - if (py1 > py2) - return 1; - return -1; - } - return 0; -} - -bool UserDict::is_prefix_spell_id( - const uint16 * fullids, uint16 fulllen, - const UserDictSearchable *searchable) { - if (fulllen < searchable->splids_len) - return false; - - uint32 i = 0; - for (; i < searchable->splids_len; i++) { - uint16 start_id = searchable->splid_start[i]; - uint16 count = searchable->splid_count[i]; - if (fullids[i] >= start_id && fullids[i] < start_id + count) - continue; - else - return false; - } - return true; -} - -bool UserDict::equal_spell_id( - const uint16 * fullids, uint16 fulllen, - const UserDictSearchable *searchable) { - if (fulllen != searchable->splids_len) - return false; - - uint32 i = 0; - for (; i < fulllen; i++) { - uint16 start_id = searchable->splid_start[i]; - uint16 count = searchable->splid_count[i]; - if (fullids[i] >= start_id && fullids[i] < start_id + count) - continue; - else - return false; - } - return true; -} - -int32 
UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) { - int32 begin = 0; - int32 end = dict_info_.lemma_count - 1; - int32 middle = -1; - - int32 first_prefix = middle; - int32 last_matched = middle; - - while (begin <= end) { - middle = (begin + end) >> 1; - uint32 offset = offsets_[middle]; - uint8 nchar = get_lemma_nchar(offset); - const uint16 * splids = get_lemma_spell_ids(offset); - int cmp = fuzzy_compare_spell_id(splids, nchar, searchable); - int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable); - - if (pre) - first_prefix = middle; - - if (cmp < 0) { - begin = middle + 1; - } else if (cmp > 0) { - end = middle - 1; - } else { - end = middle - 1; - last_matched = middle; - } - } - - return first_prefix; -} - -void UserDict::prepare_locate(UserDictSearchable *searchable, - const uint16 *splid_str, - uint16 splid_str_len) { - searchable->splids_len = splid_str_len; - memset(searchable->signature, 0, sizeof(searchable->signature)); - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - uint32 i = 0; - for (; i < splid_str_len; i++) { - if (spl_trie.is_half_id(splid_str[i])) { - searchable->splid_count[i] = - spl_trie.half_to_full(splid_str[i], - &(searchable->splid_start[i])); - } else { - searchable->splid_count[i] = 1; - searchable->splid_start[i] = splid_str[i]; - } - const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]); - searchable->signature[i>>2] |= (py << (8 * (i % 4))); - } -} - -size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len, - LmaPsbItem *lpi_items, size_t lpi_max) { - return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL); -} - -size_t UserDict::_get_lpis(const uint16 *splid_str, - uint16 splid_str_len, LmaPsbItem *lpi_items, - size_t lpi_max, bool * need_extend) { - bool tmp_extend; - if (!need_extend) - need_extend = &tmp_extend; - - *need_extend = false; - - if (is_valid_state() == false) - return 0; - if (lpi_max <= 0) - return 0; - - if (0 == 
pthread_mutex_trylock(&g_mutex_)) { - if (load_time_.tv_sec < g_last_update_.tv_sec || - (load_time_.tv_sec == g_last_update_.tv_sec && - load_time_.tv_usec < g_last_update_.tv_usec)) { - // Others updated disk file, have to reload - pthread_mutex_unlock(&g_mutex_); - flush_cache(); - } else { - pthread_mutex_unlock(&g_mutex_); - } - } else { - } - - UserDictSearchable searchable; - prepare_locate(&searchable, splid_str, splid_str_len); - - uint32 max_off = dict_info_.lemma_count; -#ifdef ___CACHE_ENABLED___ - int32 middle; - uint32 start, count; - bool cached = cache_hit(&searchable, &start, &count); - if (cached) { - middle = start; - max_off = start + count; - } else { - middle = locate_first_in_offsets(&searchable); - start = middle; - } -#else - int32 middle = locate_first_in_offsets(&searchable); -#endif - - if (middle == -1) { -#ifdef ___CACHE_ENABLED___ - if (!cached) - cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0); -#endif - return 0; - } - - size_t lpi_current = 0; - - bool fuzzy_break = false; - bool prefix_break = false; - while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) { - if (lpi_current >= lpi_max) - break; - uint32 offset = offsets_[middle]; - // Ignore deleted lemmas - if (offset & kUserDictOffsetFlagRemove) { - middle++; - continue; - } - uint8 nchar = get_lemma_nchar(offset); - uint16 * splids = get_lemma_spell_ids(offset); -#ifdef ___CACHE_ENABLED___ - if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { -#else - if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { -#endif - fuzzy_break = true; - } - - if (prefix_break == false) { - if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) { - if (*need_extend == false && - is_prefix_spell_id(splids, nchar, &searchable)) { - *need_extend = true; - } - } else { - prefix_break = true; - } - } - - if (equal_spell_id(splids, nchar, &searchable) == true) { - lpi_items[lpi_current].psb = translate_score(scores_[middle]); - 
lpi_items[lpi_current].id = ids_[middle]; - lpi_items[lpi_current].lma_len = nchar; - lpi_current++; - } - middle++; - } - -#ifdef ___CACHE_ENABLED___ - if (!cached) { - count = middle - start; - cache_push(USER_DICT_CACHE, &searchable, start, count); - } -#endif - - return lpi_current; -} - -uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf, - uint16 str_max) { - if (is_valid_state() == false) - return 0; - if (is_valid_lemma_id(id_lemma) == false) - return 0; - uint32 offset = offsets_by_id_[id_lemma - start_id_]; - uint8 nchar = get_lemma_nchar(offset); - char16 * str = get_lemma_word(offset); - uint16 m = nchar < str_max -1 ? nchar : str_max - 1; - int i = 0; - for (; i < m; i++) { - str_buf[i] = str[i]; - } - str_buf[i] = 0; - return m; -} - -uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, - uint16 splids_max, bool arg_valid) { - if (is_valid_lemma_id(id_lemma) == false) - return 0; - uint32 offset = offsets_by_id_[id_lemma - start_id_]; - uint8 nchar = get_lemma_nchar(offset); - const uint16 * ids = get_lemma_spell_ids(offset); - int i = 0; - for (; i < nchar && i < splids_max; i++) - splids[i] = ids[i]; - return i; -} - -size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len, - NPredictItem *npre_items, size_t npre_max, - size_t b4_used) { - uint32 new_added = 0; -#ifdef ___PREDICT_ENABLED___ - int32 end = dict_info_.lemma_count - 1; - int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len); - if (j == -1) - return 0; - - while (j <= end) { - uint32 offset = predicts_[j]; - // Ignore deleted lemmas - if (offset & kUserDictOffsetFlagRemove) { - j++; - continue; - } - uint32 nchar = get_lemma_nchar(offset); - uint16 * words = get_lemma_word(offset); - uint16 * splids = get_lemma_spell_ids(offset); - - if (nchar <= hzs_len) { - j++; - continue; - } - - if (memcmp(words, last_hzs, hzs_len << 1) == 0) { - if (new_added >= npre_max) { - return new_added; - } - uint32 cpy_len = - (nchar < 
kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1)) - - (hzs_len << 1); - npre_items[new_added].his_len = hzs_len; - npre_items[new_added].psb = get_lemma_score(words, splids, nchar); - memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len); - if ((cpy_len >> 1) < kMaxPredictSize) { - npre_items[new_added].pre_hzs[cpy_len >> 1] = 0; - } - new_added++; - } else { - break; - } - - j++; - } -#endif - return new_added; -} - -int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[], - uint16 lemma_len) { - int32 max_off = dict_info_.lemma_count; - - UserDictSearchable searchable; - prepare_locate(&searchable, splid_str, lemma_len); -#ifdef ___CACHE_ENABLED___ - int32 off; - uint32 start, count; - bool cached = load_cache(&searchable, &start, &count); - if (cached) { - off = start; - max_off = start + count; - } else { - off = locate_first_in_offsets(&searchable); - start = off; - } -#else - int32 off = locate_first_in_offsets(&searchable); -#endif - - if (off == -1) { - return off; - } - - while (off < max_off) { - uint32 offset = offsets_[off]; - if (offset & kUserDictOffsetFlagRemove) { - off++; - continue; - } - uint16 * splids = get_lemma_spell_ids(offset); -#ifdef ___CACHE_ENABLED___ - if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) - break; -#else - if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) - break; -#endif - if (equal_spell_id(splids, lemma_len, &searchable) == true) { - uint16 * str = get_lemma_word(offset); - uint32 i = 0; - for (i = 0; i < lemma_len; i++) { - if (str[i] == lemma_str[i]) - continue; - break; - } - if (i < lemma_len) { - off++; - continue; - } -#ifdef ___CACHE_ENABLED___ - // No need to save_cache here, since current function is invoked by - // put_lemma. It's rarely possible for a user input same lemma twice. - // That means first time user type a new lemma, it is newly added into - // user dictionary, then it's possible that user type the same lemma - // again. 
- // Another reason save_cache can not be invoked here is this function - // aborts when lemma is found, and it never knows the count. -#endif - return off; - } - off++; - } - - return -1; -} - -#ifdef ___PREDICT_ENABLED___ -uint32 UserDict::locate_where_to_insert_in_predicts( - const uint16 * words, int lemma_len) { - int32 begin = 0; - int32 end = dict_info_.lemma_count - 1; - int32 middle = end; - - uint32 last_matched = middle; - - while (begin <= end) { - middle = (begin + end) >> 1; - uint32 offset = offsets_[middle]; - uint8 nchar = get_lemma_nchar(offset); - const uint16 * ws = get_lemma_word(offset); - - uint32 minl = nchar < lemma_len ? nchar : lemma_len; - uint32 k = 0; - int cmp = 0; - - for (; k < minl; k++) { - if (ws[k] < words[k]) { - cmp = -1; - break; - } else if (ws[k] > words[k]) { - cmp = 1; - break; - } - } - if (cmp == 0) { - if (nchar < lemma_len) - cmp = -1; - else if (nchar > lemma_len) - cmp = 1; - } - - if (cmp < 0) { - begin = middle + 1; - last_matched = middle; - } else if (cmp > 0) { - end = middle - 1; - } else { - end = middle - 1; - last_matched = middle; - } - } - - return last_matched; -} - -int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) { - int32 begin = 0; - int32 end = dict_info_.lemma_count - 1; - int32 middle = -1; - - int32 last_matched = middle; - - while (begin <= end) { - middle = (begin + end) >> 1; - uint32 offset = offsets_[middle]; - uint8 nchar = get_lemma_nchar(offset); - const uint16 * ws = get_lemma_word(offset); - - uint32 minl = nchar < lemma_len ? 
nchar : lemma_len; - uint32 k = 0; - int cmp = 0; - - for (; k < minl; k++) { - if (ws[k] < words[k]) { - cmp = -1; - break; - } else if (ws[k] > words[k]) { - cmp = 1; - break; - } - } - if (cmp == 0) { - if (nchar >= lemma_len) - last_matched = middle; - if (nchar < lemma_len) - cmp = -1; - else if (nchar > lemma_len) - cmp = 1; - } - - if (cmp < 0) { - begin = middle + 1; - } else if (cmp > 0) { - end = middle - 1; - } else { - end = middle - 1; - } - } - - return last_matched; -} - -#endif - -LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[], - uint16 lemma_len) { - int32 off = locate_in_offsets(lemma_str, splids, lemma_len); - if (off == -1) { - return 0; - } - - return ids_[off]; -} - -LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) { - if (is_valid_state() == false) - return 0; - if (is_valid_lemma_id(lemma_id) == false) - return 0; - - return translate_score(_get_lemma_score(lemma_id)); -} - -LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[], - uint16 lemma_len) { - if (is_valid_state() == false) - return 0; - return translate_score(_get_lemma_score(lemma_str, splids, lemma_len)); -} - -int UserDict::_get_lemma_score(LemmaIdType lemma_id) { - if (is_valid_state() == false) - return 0; - if (is_valid_lemma_id(lemma_id) == false) - return 0; - - uint32 offset = offsets_by_id_[lemma_id - start_id_]; - - uint32 nchar = get_lemma_nchar(offset); - uint16 * spl = get_lemma_spell_ids(offset); - uint16 * wrd = get_lemma_word(offset); - - int32 off = locate_in_offsets(wrd, spl, nchar); - if (off == -1) { - return 0; - } - - return scores_[off]; -} - -int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[], - uint16 lemma_len) { - if (is_valid_state() == false) - return 0; - - int32 off = locate_in_offsets(lemma_str, splids, lemma_len); - if (off == -1) { - return 0; - } - - return scores_[off]; -} - -#ifdef ___SYNC_ENABLED___ -void UserDict::remove_lemma_from_sync_list(uint32 offset) { - offset &= 
kUserDictOffsetMask; - uint32 i = 0; - for (; i < dict_info_.sync_count; i++) { - unsigned int off = (syncs_[i] & kUserDictOffsetMask); - if (off == offset) - break; - } - if (i < dict_info_.sync_count) { - syncs_[i] = syncs_[dict_info_.sync_count - 1]; - dict_info_.sync_count--; - } -} -#endif - -#ifdef ___PREDICT_ENABLED___ -void UserDict::remove_lemma_from_predict_list(uint32 offset) { - offset &= kUserDictOffsetMask; - uint32 i = 0; - for (; i < dict_info_.lemma_count; i++) { - unsigned int off = (predicts_[i] & kUserDictOffsetMask); - if (off == offset) { - predicts_[i] |= kUserDictOffsetFlagRemove; - break; - } - } -} -#endif - -bool UserDict::remove_lemma_by_offset_index(int offset_index) { - if (is_valid_state() == false) - return 0; - - int32 off = offset_index; - if (off == -1) { - return false; - } - - uint32 offset = offsets_[off]; - uint32 nchar = get_lemma_nchar(offset); - - offsets_[off] |= kUserDictOffsetFlagRemove; - -#ifdef ___SYNC_ENABLED___ - // Remove corresponding sync item - remove_lemma_from_sync_list(offset); -#endif - -#ifdef ___PREDICT_ENABLED___ - remove_lemma_from_predict_list(offset); -#endif - dict_info_.free_count++; - dict_info_.free_size += (2 + (nchar << 2)); - - if (state_ < USER_DICT_OFFSET_DIRTY) - state_ = USER_DICT_OFFSET_DIRTY; - return true; -} - -bool UserDict::remove_lemma(LemmaIdType lemma_id) { - if (is_valid_state() == false) - return 0; - if (is_valid_lemma_id(lemma_id) == false) - return false; - uint32 offset = offsets_by_id_[lemma_id - start_id_]; - - uint32 nchar = get_lemma_nchar(offset); - uint16 * spl = get_lemma_spell_ids(offset); - uint16 * wrd = get_lemma_word(offset); - - int32 off = locate_in_offsets(wrd, spl, nchar); - - return remove_lemma_by_offset_index(off); -} - -void UserDict::flush_cache() { - LemmaIdType start_id = start_id_; - if (!dict_file_) - return; - const char * file = strdup(dict_file_); - if (!file) - return; - close_dict(); - load_dict(file, start_id, kUserDictIdEnd); - 
free((void*)file); -#ifdef ___CACHE_ENABLED___ - cache_init(); -#endif - return; -} - -bool UserDict::reset(const char *file) { - FILE *fp = fopen(file, "w+"); - if (!fp) { - return false; - } - uint32 version = kUserDictVersion; - size_t wred = fwrite(&version, 1, 4, fp); - UserDictInfo info; - memset(&info, 0, sizeof(info)); - // By default, no limitation for lemma count and size - // thereby, reclaim_ratio is never used - wred += fwrite(&info, 1, sizeof(info), fp); - if (wred != sizeof(info) + sizeof(version)) { - fclose(fp); - unlink(file); - return false; - } - fclose(fp); - return true; -} - -bool UserDict::validate(const char *file) { - // b is ignored in POSIX compatible os including Linux - // while b is important flag for Windows to specify binary mode - FILE *fp = fopen(file, "rb"); - if (!fp) { - return false; - } - - size_t size; - size_t readed; - uint32 version; - UserDictInfo dict_info; - - // validate - int err = fseek(fp, 0, SEEK_END); - if (err) { - goto error; - } - - size = ftell(fp); - if (size < 4 + sizeof(dict_info)) { - goto error; - } - - err = fseek(fp, 0, SEEK_SET); - if (err) { - goto error; - } - - readed = fread(&version, 1, sizeof(version), fp); - if (readed < sizeof(version)) { - goto error; - } - if (version != kUserDictVersion) { - goto error; - } - - err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); - if (err) { - goto error; - } - - readed = fread(&dict_info, 1, sizeof(dict_info), fp); - if (readed != sizeof(dict_info)) { - goto error; - } - - if (size != get_dict_file_size(&dict_info)) { - goto error; - } - - fclose(fp); - return true; - - error: - fclose(fp); - return false; -} - -bool UserDict::load(const char *file, LemmaIdType start_id) { - if (0 != pthread_mutex_trylock(&g_mutex_)) { - return false; - } - // b is ignored in POSIX compatible os including Linux - // while b is important flag for Windows to specify binary mode - FILE *fp = fopen(file, "rb"); - if (!fp) { - pthread_mutex_unlock(&g_mutex_); - return false; - } 
- - size_t readed, toread; - UserDictInfo dict_info; - uint8 *lemmas = NULL; - uint32 *offsets = NULL; -#ifdef ___SYNC_ENABLED___ - uint32 *syncs = NULL; -#endif - uint32 *scores = NULL; - uint32 *ids = NULL; - uint32 *offsets_by_id = NULL; -#ifdef ___PREDICT_ENABLED___ - uint32 *predicts = NULL; -#endif - size_t i; - int err; - - err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); - if (err) goto error; - - readed = fread(&dict_info, 1, sizeof(dict_info), fp); - if (readed != sizeof(dict_info)) goto error; - - lemmas = (uint8 *)malloc( - dict_info.lemma_size + - (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)))); - - if (!lemmas) goto error; - - offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!offsets) goto error; - -#ifdef ___PREDICT_ENABLED___ - predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!predicts) goto error; -#endif - -#ifdef ___SYNC_ENABLED___ - syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2); - if (!syncs) goto error; -#endif - - scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!scores) goto error; - - ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!ids) goto error; - - offsets_by_id = (uint32 *)malloc( - (dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!offsets_by_id) goto error; - - err = fseek(fp, 4, SEEK_SET); - if (err) goto error; - - readed = 0; - while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) { - readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp); - } - if (readed < dict_info.lemma_size) - goto error; - - toread = (dict_info.lemma_count << 2); - readed = 0; - while (readed < toread && !ferror(fp) && !feof(fp)) { - readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp); - } - if (readed < toread) - goto error; - -#ifdef ___PREDICT_ENABLED___ - toread = (dict_info.lemma_count << 2); - readed = 0; - while (readed < 
toread && !ferror(fp) && !feof(fp)) { - readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp); - } - if (readed < toread) - goto error; -#endif - - readed = 0; - while (readed < toread && !ferror(fp) && !feof(fp)) { - readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp); - } - if (readed < toread) - goto error; - -#ifdef ___SYNC_ENABLED___ - toread = (dict_info.sync_count << 2); - readed = 0; - while (readed < toread && !ferror(fp) && !feof(fp)) { - readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp); - } - if (readed < toread) - goto error; -#endif - - for (i = 0; i < dict_info.lemma_count; i++) { - ids[i] = start_id + i; - offsets_by_id[i] = offsets[i]; - } - - lemmas_ = lemmas; - offsets_ = offsets; -#ifdef ___SYNC_ENABLED___ - syncs_ = syncs; - sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc; -#endif - offsets_by_id_ = offsets_by_id; - scores_ = scores; - ids_ = ids; -#ifdef ___PREDICT_ENABLED___ - predicts_ = predicts; -#endif - lemma_count_left_ = kUserDictPreAlloc; - lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)); - memcpy(&dict_info_, &dict_info, sizeof(dict_info)); - state_ = USER_DICT_SYNC; - - fclose(fp); - - pthread_mutex_unlock(&g_mutex_); - return true; - - error: - if (lemmas) free(lemmas); - if (offsets) free(offsets); -#ifdef ___SYNC_ENABLED___ - if (syncs) free(syncs); -#endif - if (scores) free(scores); - if (ids) free(ids); - if (offsets_by_id) free(offsets_by_id); -#ifdef ___PREDICT_ENABLED___ - if (predicts) free(predicts); -#endif - fclose(fp); - pthread_mutex_unlock(&g_mutex_); - return false; -} - -void UserDict::write_back() { - // XXX write back is only allowed from close_dict due to thread-safe sake - if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC) - return; - int fd = open(dict_file_, O_WRONLY); - if (fd == -1) - return; - switch (state_) { - case USER_DICT_DEFRAGMENTED: - write_back_all(fd); - break; - case USER_DICT_LEMMA_DIRTY: - 
write_back_lemma(fd); - break; - case USER_DICT_OFFSET_DIRTY: - write_back_offset(fd); - break; - case USER_DICT_SCORE_DIRTY: - write_back_score(fd); - break; -#ifdef ___SYNC_ENABLED___ - case USER_DICT_SYNC_DIRTY: - write_back_sync(fd); - break; -#endif - default: - break; - } - // It seems truncate is not need on Linux, Windows except Mac - // I am doing it here anyway for safety. - off_t cur = lseek(fd, 0, SEEK_CUR); -#ifndef _WIN32 - ftruncate(fd, cur); -#endif - close(fd); - state_ = USER_DICT_SYNC; -} - -#ifdef ___SYNC_ENABLED___ -void UserDict::write_back_sync(int fd) { - int err = lseek(fd, 4 + dict_info_.lemma_size - + (dict_info_.lemma_count << 3) -#ifdef ___PREDICT_ENABLED___ - + (dict_info_.lemma_count << 2) -#endif - , SEEK_SET); - if (err == -1) - return; - write(fd, syncs_, dict_info_.sync_count << 2); - write(fd, &dict_info_, sizeof(dict_info_)); -} -#endif - -void UserDict::write_back_offset(int fd) { - int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET); - if (err == -1) - return; - write(fd, offsets_, dict_info_.lemma_count << 2); -#ifdef ___PREDICT_ENABLED___ - write(fd, predicts_, dict_info_.lemma_count << 2); -#endif - write(fd, scores_, dict_info_.lemma_count << 2); -#ifdef ___SYNC_ENABLED___ - write(fd, syncs_, dict_info_.sync_count << 2); -#endif - write(fd, &dict_info_, sizeof(dict_info_)); -} - -void UserDict::write_back_score(int fd) { - int err = lseek(fd, 4 + dict_info_.lemma_size - + (dict_info_.lemma_count << 2) -#ifdef ___PREDICT_ENABLED___ - + (dict_info_.lemma_count << 2) -#endif - , SEEK_SET); - if (err == -1) - return; - write(fd, scores_, dict_info_.lemma_count << 2); -#ifdef ___SYNC_ENABLED___ - write(fd, syncs_, dict_info_.sync_count << 2); -#endif - write(fd, &dict_info_, sizeof(dict_info_)); -} - -void UserDict::write_back_lemma(int fd) { - int err = lseek(fd, 4, SEEK_SET); - if (err == -1) - return; - // New lemmas are always appended, no need to write whole lemma block - size_t need_write = kUserDictPreAlloc * - (2 + 
(kUserDictAverageNchar << 2)) - lemma_size_left_; - err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR); - if (err == -1) - return; - write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write); - - write(fd, offsets_, dict_info_.lemma_count << 2); -#ifdef ___PREDICT_ENABLED___ - write(fd, predicts_, dict_info_.lemma_count << 2); -#endif - write(fd, scores_, dict_info_.lemma_count << 2); -#ifdef ___SYNC_ENABLED___ - write(fd, syncs_, dict_info_.sync_count << 2); -#endif - write(fd, &dict_info_, sizeof(dict_info_)); -} - -void UserDict::write_back_all(int fd) { - // XXX lemma_size is handled differently in writeall - // and writelemma. I update lemma_size and lemma_count in different - // places for these two cases. Should fix it to make it consistent. - int err = lseek(fd, 4, SEEK_SET); - if (err == -1) - return; - write(fd, lemmas_, dict_info_.lemma_size); - write(fd, offsets_, dict_info_.lemma_count << 2); -#ifdef ___PREDICT_ENABLED___ - write(fd, predicts_, dict_info_.lemma_count << 2); -#endif - write(fd, scores_, dict_info_.lemma_count << 2); -#ifdef ___SYNC_ENABLED___ - write(fd, syncs_, dict_info_.sync_count << 2); -#endif - write(fd, &dict_info_, sizeof(dict_info_)); -} - -#ifdef ___CACHE_ENABLED___ -bool UserDict::load_cache(UserDictSearchable *searchable, - uint32 *offset, uint32 *length) { - UserDictCache *cache = &caches_[searchable->splids_len - 1]; - if (cache->head == cache->tail) - return false; - - uint16 j, sig_len = kMaxLemmaSize / 4; - uint16 i = cache->head; - while (1) { - j = 0; - for (; j < sig_len; j++) { - if (cache->signatures[i][j] != searchable->signature[j]) - break; - } - if (j < sig_len) { - i++; - if (i >= kUserDictCacheSize) - i -= kUserDictCacheSize; - if (i == cache->tail) - break; - continue; - } - *offset = cache->offsets[i]; - *length = cache->lengths[i]; - return true; - } - return false; -} - -void UserDict::save_cache(UserDictSearchable *searchable, - uint32 offset, uint32 length) { - UserDictCache *cache 
= &caches_[searchable->splids_len - 1]; - uint16 next = cache->tail; - - cache->offsets[next] = offset; - cache->lengths[next] = length; - uint16 sig_len = kMaxLemmaSize / 4; - uint16 j = 0; - for (; j < sig_len; j++) { - cache->signatures[next][j] = searchable->signature[j]; - } - - if (++next >= kUserDictCacheSize) { - next -= kUserDictCacheSize; - } - if (next == cache->head) { - cache->head++; - if (cache->head >= kUserDictCacheSize) { - cache->head -= kUserDictCacheSize; - } - } - cache->tail = next; -} - -void UserDict::reset_cache() { - memset(caches_, 0, sizeof(caches_)); -} - -bool UserDict::load_miss_cache(UserDictSearchable *searchable) { - UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; - if (cache->head == cache->tail) - return false; - - uint16 j, sig_len = kMaxLemmaSize / 4; - uint16 i = cache->head; - while (1) { - j = 0; - for (; j < sig_len; j++) { - if (cache->signatures[i][j] != searchable->signature[j]) - break; - } - if (j < sig_len) { - i++; - if (i >= kUserDictMissCacheSize) - i -= kUserDictMissCacheSize; - if (i == cache->tail) - break; - continue; - } - return true; - } - return false; -} - -void UserDict::save_miss_cache(UserDictSearchable *searchable) { - UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; - uint16 next = cache->tail; - - uint16 sig_len = kMaxLemmaSize / 4; - uint16 j = 0; - for (; j < sig_len; j++) { - cache->signatures[next][j] = searchable->signature[j]; - } - - if (++next >= kUserDictMissCacheSize) { - next -= kUserDictMissCacheSize; - } - if (next == cache->head) { - cache->head++; - if (cache->head >= kUserDictMissCacheSize) { - cache->head -= kUserDictMissCacheSize; - } - } - cache->tail = next; -} - -void UserDict::reset_miss_cache() { - memset(miss_caches_, 0, sizeof(miss_caches_)); -} - -void UserDict::cache_init() { - reset_cache(); - reset_miss_cache(); -} - -bool UserDict::cache_hit(UserDictSearchable *searchable, - uint32 *offset, uint32 *length) { - bool hit = 
load_miss_cache(searchable); - if (hit) { - *offset = 0; - *length = 0; - return true; - } - hit = load_cache(searchable, offset, length); - if (hit) { - return true; - } - return false; -} - -void UserDict::cache_push(UserDictCacheType type, - UserDictSearchable *searchable, - uint32 offset, uint32 length) { - switch (type) { - case USER_DICT_MISS_CACHE: - save_miss_cache(searchable); - break; - case USER_DICT_CACHE: - save_cache(searchable, offset, length); - break; - default: - break; - } -} - -#endif - -void UserDict::defragment(void) { -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - if (is_valid_state() == false) - return; - // Fixup offsets_, set REMOVE flag to lemma's flag if needed - size_t first_freed = 0; - size_t first_inuse = 0; - while (first_freed < dict_info_.lemma_count) { - // Find first freed offset - while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 && - first_freed < dict_info_.lemma_count) { - first_freed++; - } - if (first_freed < dict_info_.lemma_count) { - // Save REMOVE flag to lemma flag - int off = offsets_[first_freed]; - set_lemma_flag(off, kUserDictLemmaFlagRemove); - } else { - break; - } - // Find first inuse offse after first_freed - first_inuse = first_freed + 1; - while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) && - (first_inuse < dict_info_.lemma_count)) { - // Save REMOVE flag to lemma flag - int off = offsets_[first_inuse]; - set_lemma_flag(off, kUserDictLemmaFlagRemove); - first_inuse++; - } - if (first_inuse >= dict_info_.lemma_count) { - break; - } - // Swap offsets_ - int tmp = offsets_[first_inuse]; - offsets_[first_inuse] = offsets_[first_freed]; - offsets_[first_freed] = tmp; - // Move scores_, no need to swap - tmp = scores_[first_inuse]; - scores_[first_inuse] = scores_[first_freed]; - scores_[first_freed] = tmp; - // Swap ids_ - LemmaIdType tmpid = ids_[first_inuse]; - ids_[first_inuse] = ids_[first_freed]; - ids_[first_freed] = tmpid; - // Go on - first_freed++; - } -#ifdef 
___PREDICT_ENABLED___ - // Fixup predicts_ - first_freed = 0; - first_inuse = 0; - while (first_freed < dict_info_.lemma_count) { - // Find first freed offset - while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 && - first_freed < dict_info_.lemma_count) { - first_freed++; - } - if (first_freed >= dict_info_.lemma_count) - break; - // Find first inuse offse after first_freed - first_inuse = first_freed + 1; - while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove) - && (first_inuse < dict_info_.lemma_count)) { - first_inuse++; - } - if (first_inuse >= dict_info_.lemma_count) { - break; - } - // Swap offsets_ - int tmp = predicts_[first_inuse]; - predicts_[first_inuse] = predicts_[first_freed]; - predicts_[first_freed] = tmp; - // Go on - first_freed++; - } -#endif - dict_info_.lemma_count = first_freed; - // Fixup lemmas_ - size_t begin = 0; - size_t end = 0; - size_t dst = 0; - int total_size = dict_info_.lemma_size + lemma_size_left_; - int total_count = dict_info_.lemma_count + lemma_count_left_; - size_t real_size = total_size - lemma_size_left_; - while (dst < real_size) { - unsigned char flag = get_lemma_flag(dst); - unsigned char nchr = get_lemma_nchar(dst); - if ((flag & kUserDictLemmaFlagRemove) == 0) { - dst += nchr * 4 + 2; - continue; - } - break; - } - if (dst >= real_size) - return; - - end = dst; - while (end < real_size) { - begin = end + get_lemma_nchar(end) * 4 + 2; - repeat: - // not used any more - if (begin >= real_size) - break; - unsigned char flag = get_lemma_flag(begin); - unsigned char nchr = get_lemma_nchar(begin); - if (flag & kUserDictLemmaFlagRemove) { - begin += nchr * 4 + 2; - goto repeat; - } - end = begin + nchr * 4 + 2; - while (end < real_size) { - unsigned char eflag = get_lemma_flag(end); - unsigned char enchr = get_lemma_nchar(end); - if ((eflag & kUserDictLemmaFlagRemove) == 0) { - end += enchr * 4 + 2; - continue; - } - break; - } - memmove(lemmas_ + dst, lemmas_ + begin, end - begin); - for (size_t j = 0; 
j < dict_info_.lemma_count; j++) { - if (offsets_[j] >= begin && offsets_[j] < end) { - offsets_[j] -= (begin - dst); - offsets_by_id_[ids_[j] - start_id_] = offsets_[j]; - } -#ifdef ___PREDICT_ENABLED___ - if (predicts_[j] >= begin && predicts_[j] < end) { - predicts_[j] -= (begin - dst); - } -#endif - } -#ifdef ___SYNC_ENABLED___ - for (size_t j = 0; j < dict_info_.sync_count; j++) { - if (syncs_[j] >= begin && syncs_[j] < end) { - syncs_[j] -= (begin - dst); - } - } -#endif - dst += (end - begin); - } - - dict_info_.free_count = 0; - dict_info_.free_size = 0; - dict_info_.lemma_size = dst; - lemma_size_left_ = total_size - dict_info_.lemma_size; - lemma_count_left_ = total_count - dict_info_.lemma_count; - - // XXX Without following code, - // offsets_by_id_ is not reordered. - // That's to say, all removed lemmas' ids are not collected back. - // There may not be room for addition of new lemmas due to - // offsests_by_id_ reason, although lemma_size_left_ is fixed. - // By default, we do want defrag as fast as possible, because - // during defrag procedure, other peers can not write new lemmas - // to user dictionary file. - // XXX If write-back is invoked immediately after - // this defragment, no need to fix up following in-mem data. 
- for (uint32 i = 0; i < dict_info_.lemma_count; i++) { - ids_[i] = start_id_ + i; - offsets_by_id_[i] = offsets_[i]; - } - - state_ = USER_DICT_DEFRAGMENTED; - -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("defragment"); -#endif -} - -#ifdef ___SYNC_ENABLED___ -void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) { - if (is_valid_state() == false) - return; - if (end > dict_info_.sync_count) - end = dict_info_.sync_count; - memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2); - dict_info_.sync_count -= (end - start); - if (state_ < USER_DICT_SYNC_DIRTY) - state_ = USER_DICT_SYNC_DIRTY; -} - -int UserDict::get_sync_count() { - if (is_valid_state() == false) - return 0; - return dict_info_.sync_count; -} - -LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt) { - int again = 0; - begin: - LemmaIdType id; - uint32 * syncs_bak = syncs_; - syncs_ = NULL; - id = _put_lemma(lemma_str, splids, lemma_len, count, lmt); - syncs_ = syncs_bak; - if (id == 0 && again == 0) { - if ((dict_info_.limit_lemma_count > 0 && - dict_info_.lemma_count >= dict_info_.limit_lemma_count) - || (dict_info_.limit_lemma_size > 0 && - dict_info_.lemma_size + (2 + (lemma_len << 2)) - > dict_info_.limit_lemma_size)) { - // XXX Always reclaim and defrag in sync code path - // sync thread is background thread and ok with heavy work - reclaim(); - defragment(); - flush_cache(); - again = 1; - goto begin; - } - } - return id; -} - -int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) { - int newly_added = 0; - - SpellingParser * spl_parser = new SpellingParser(); - if (!spl_parser) { - return 0; - } -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - char16 *ptr = lemmas; - - // Extract pinyin,words,frequence,last_mod_time - char16 * p = ptr, * py16 = ptr; - char16 * hz16 = NULL; - int py16_len = 0; - uint16 splid[kMaxLemmaSize]; - int splid_len = 0; - int 
hz16_len = 0; - char16 * fr16 = NULL; - int fr16_len = 0; - - while (p - ptr < len) { - // Pinyin - py16 = p; - splid_len = 0; - while (*p != 0x2c && (p - ptr) < len) { - if (*p == 0x20) - splid_len++; - p++; - } - splid_len++; - if (p - ptr == len) - break; - py16_len = p - py16; - if (kMaxLemmaSize < splid_len) { - break; - } - bool is_pre; - int splidl = spl_parser->splstr16_to_idxs_f( - py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre); - if (splidl != splid_len) - break; - // Phrase - hz16 = ++p; - while (*p != 0x2c && (p - ptr) < len) { - p++; - } - hz16_len = p - hz16; - if (hz16_len != splid_len) - break; - // Frequency - fr16 = ++p; - fr16_len = 0; - while (*p != 0x2c && (p - ptr) < len) { - p++; - } - fr16_len = p - fr16; - uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len); - // Last modified time - fr16 = ++p; - fr16_len = 0; - while (*p != 0x3b && (p - ptr) < len) { - p++; - } - fr16_len = p - fr16; - uint64 last_mod = utf16le_atoll(fr16, fr16_len); - - put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod); - newly_added++; - - p++; - } - -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("put_lemmas_no_sync_from_utf16le_string"); -#endif - return newly_added; -} - -int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning( - char16 * str, int size, int * count) { - int len = 0; - *count = 0; - - int left_len = size; - - if (is_valid_state() == false) - return len; - - SpellingTrie * spl_trie = &SpellingTrie::get_instance(); - if (!spl_trie) { - return 0; - } - - uint32 i; - for (i = 0; i < dict_info_.sync_count; i++) { - int offset = syncs_[i]; - uint32 nchar = get_lemma_nchar(offset); - uint16 *spl = get_lemma_spell_ids(offset); - uint16 *wrd = get_lemma_word(offset); - int score = _get_lemma_score(wrd, spl, nchar); - - static char score_temp[32], *pscore_temp = score_temp; - static char16 temp[256], *ptemp = temp; - - pscore_temp = score_temp; - ptemp = temp; - - uint32 j; - // Add pinyin - for (j = 0; j < nchar; j++) { - int ret_len 
= spl_trie->get_spelling_str16( - spl[j], ptemp, temp + sizeof(temp) - ptemp); - if (ret_len <= 0) - break; - ptemp += ret_len; - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ' '; - } else { - j = 0; - break; - } - } - if (j < nchar) { - continue; - } - ptemp--; - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ','; - } else { - continue; - } - // Add phrase - for (j = 0; j < nchar; j++) { - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = wrd[j]; - } else { - break; - } - } - if (j < nchar) { - continue; - } - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ','; - } else { - continue; - } - // Add frequency - uint32 intf = extract_score_freq(score); - int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp); - if (ret_len <= 0) - continue; - ptemp += ret_len; - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ','; - } else { - continue; - } - // Add last modified time - uint64 last_mod = extract_score_lmt(score); - ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp); - if (ret_len <= 0) - continue; - ptemp += ret_len; - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ';'; - } else { - continue; - } - - // Write to string - int need_len = ptemp - temp; - if (need_len > left_len) - break; - memcpy(str + len, temp, need_len * 2); - left_len -= need_len; - - len += need_len; - (*count)++; - } - - if (len > 0) { - if (state_ < USER_DICT_SYNC_DIRTY) - state_ = USER_DICT_SYNC_DIRTY; - } - return len; -} - -#endif - -bool UserDict::state(UserDictStat * stat) { - if (is_valid_state() == false) - return false; - if (!stat) - return false; - stat->version = version_; - stat->file_name = dict_file_; - stat->load_time.tv_sec = load_time_.tv_sec; - stat->load_time.tv_usec = load_time_.tv_usec; - pthread_mutex_lock(&g_mutex_); - stat->last_update.tv_sec = g_last_update_.tv_sec; - stat->last_update.tv_usec = g_last_update_.tv_usec; - pthread_mutex_unlock(&g_mutex_); - stat->disk_size = 
get_dict_file_size(&dict_info_); - stat->lemma_count = dict_info_.lemma_count; - stat->lemma_size = dict_info_.lemma_size; - stat->delete_count = dict_info_.free_count; - stat->delete_size = dict_info_.free_size; -#ifdef ___SYNC_ENABLED___ - stat->sync_count = dict_info_.sync_count; -#endif - stat->limit_lemma_count = dict_info_.limit_lemma_count; - stat->limit_lemma_size = dict_info_.limit_lemma_size; - stat->reclaim_ratio = dict_info_.reclaim_ratio; - return true; -} - -void UserDict::set_limit(uint32 max_lemma_count, - uint32 max_lemma_size, uint32 reclaim_ratio) { - dict_info_.limit_lemma_count = max_lemma_count; - dict_info_.limit_lemma_size = max_lemma_size; - if (reclaim_ratio > 100) - reclaim_ratio = 100; - dict_info_.reclaim_ratio = reclaim_ratio; -} - -void UserDict::reclaim() { - if (is_valid_state() == false) - return; - - switch (dict_info_.reclaim_ratio) { - case 0: - return; - case 100: - // TODO: CLEAR to be implemented - assert(false); - return; - default: - break; - } - - // XXX Reclaim is only based on count, not size - uint32 count = dict_info_.lemma_count; - int rc = count * dict_info_.reclaim_ratio / 100; - - UserDictScoreOffsetPair * score_offset_pairs = NULL; - score_offset_pairs = (UserDictScoreOffsetPair *)malloc( - sizeof(UserDictScoreOffsetPair) * rc); - if (score_offset_pairs == NULL) { - return; - } - - for (int i = 0; i < rc; i++) { - int s = scores_[i]; - score_offset_pairs[i].score = s; - score_offset_pairs[i].offset_index = i; - } - - for (int i = (rc + 1) / 2; i >= 0; i--) - shift_down(score_offset_pairs, i, rc); - - for (uint32 i = rc; i < dict_info_.lemma_count; i++) { - int s = scores_[i]; - if (s < score_offset_pairs[0].score) { - score_offset_pairs[0].score = s; - score_offset_pairs[0].offset_index = i; - shift_down(score_offset_pairs, 0, rc); - } - } - - for (int i = 0; i < rc; i++) { - int off = score_offset_pairs[i].offset_index; - remove_lemma_by_offset_index(off); - } - if (rc > 0) { - if (state_ < 
USER_DICT_OFFSET_DIRTY) - state_ = USER_DICT_OFFSET_DIRTY; - } - - free(score_offset_pairs); -} - -inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) { - int s = sop[i].score; - int p = sop[i].offset_index; - sop[i].score = sop[j].score; - sop[i].offset_index = sop[j].offset_index; - sop[j].score = s; - sop[j].offset_index = p; -} - -void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) { - int par = i; - while (par < n) { - int left = par * 2 + 1; - int right = left + 1; - if (left >= n && right >= n) - break; - if (right >= n) { - if (sop[left].score > sop[par].score) { - swap(sop, left, par); - par = left; - continue; - } - } else if (sop[left].score > sop[right].score && - sop[left].score > sop[par].score) { - swap(sop, left, par); - par = left; - continue; - } else if (sop[right].score > sop[left].score && - sop[right].score > sop[par].score) { - swap(sop, right, par); - par = right; - continue; - } - break; - } -} - -LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count) { - return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL)); -} - -LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt) { -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - if (is_valid_state() == false) - return 0; - int32 off = locate_in_offsets(lemma_str, splids, lemma_len); - if (off != -1) { - int delta_score = count - scores_[off]; - dict_info_.total_nfreq += delta_score; - scores_[off] = build_score(lmt, count); - if (state_ < USER_DICT_SCORE_DIRTY) - state_ = USER_DICT_SCORE_DIRTY; -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("_put_lemma(update)"); -#endif - return ids_[off]; - } else { - if ((dict_info_.limit_lemma_count > 0 && - dict_info_.lemma_count >= dict_info_.limit_lemma_count) - || (dict_info_.limit_lemma_size > 0 && - dict_info_.lemma_size + (2 + (lemma_len << 2)) - > dict_info_.limit_lemma_size)) { - // 
XXX Don't defragment here, it's too time-consuming. - return 0; - } - int flushed = 0; - if (lemma_count_left_ == 0 || - lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) { - - // XXX When there is no space for new lemma, we flush to disk - // flush_cache() may be called by upper user - // and better place shoule be found instead of here - flush_cache(); - flushed = 1; - // Or simply return and do nothing - // return 0; - } -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)"); -#endif - LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt); -#ifdef ___SYNC_ENABLED___ - if (syncs_ && id != 0) { - queue_lemma_for_sync(id); - } -#endif - return id; - } - return 0; -} - -#ifdef ___SYNC_ENABLED___ -void UserDict::queue_lemma_for_sync(LemmaIdType id) { - if (dict_info_.sync_count < sync_count_size_) { - syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; - } else { - uint32 * syncs = (uint32*)realloc( - syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2); - if (syncs) { - sync_count_size_ += kUserDictPreAlloc; - syncs_ = syncs; - syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; - } - } -} -#endif - -LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count, - bool selected) { -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - if (is_valid_state() == false) - return 0; - if (is_valid_lemma_id(lemma_id) == false) - return 0; - uint32 offset = offsets_by_id_[lemma_id - start_id_]; - uint8 lemma_len = get_lemma_nchar(offset); - char16 * lemma_str = get_lemma_word(offset); - uint16 * splids = get_lemma_spell_ids(offset); - - int32 off = locate_in_offsets(lemma_str, splids, lemma_len); - if (off != -1) { - int score = scores_[off]; - int count = extract_score_freq(score); - uint64 lmt = extract_score_lmt(score); - if (count + delta_count > kUserDictMaxFrequency || - count + delta_count < count) { - delta_count = kUserDictMaxFrequency - count; - } - 
count += delta_count; - dict_info_.total_nfreq += delta_count; - if (selected) { - lmt = time(NULL); - } - scores_[off] = build_score(lmt, count); - if (state_ < USER_DICT_SCORE_DIRTY) - state_ = USER_DICT_SCORE_DIRTY; -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("update_lemma"); -#endif -#ifdef ___SYNC_ENABLED___ - queue_lemma_for_sync(ids_[off]); -#endif - return ids_[off]; - } - return 0; -} - -size_t UserDict::get_total_lemma_count() { - return dict_info_.total_nfreq; -} - -void UserDict::set_total_lemma_count_of_others(size_t count) { - total_other_nfreq_ = count; -} - -LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[], - uint16 lemma_len, uint16 count, uint64 lmt) { - LemmaIdType id = get_max_lemma_id() + 1; - size_t offset = dict_info_.lemma_size; - if (offset > kUserDictOffsetMask) - return 0; - - lemmas_[offset] = 0; - lemmas_[offset + 1] = (uint8)lemma_len; - for (size_t i = 0; i < lemma_len; i++) { - *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i]; - *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)]) - = lemma_str[i]; - } - uint32 off = dict_info_.lemma_count; - offsets_[off] = offset; - scores_[off] = build_score(lmt, count); - ids_[off] = id; -#ifdef ___PREDICT_ENABLED___ - predicts_[off] = offset; -#endif - - offsets_by_id_[id - start_id_] = offset; - - dict_info_.lemma_count++; - dict_info_.lemma_size += (2 + (lemma_len << 2)); - lemma_count_left_--; - lemma_size_left_ -= (2 + (lemma_len << 2)); - - // Sort - - UserDictSearchable searchable; - prepare_locate(&searchable, splids, lemma_len); - - size_t i = 0; - while (i < off) { - offset = offsets_[i]; - uint32 nchar = get_lemma_nchar(offset); - uint16 * spl = get_lemma_spell_ids(offset); - - if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable)) - break; - i++; - } - if (i != off) { - uint32 temp = offsets_[off]; - memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2); - offsets_[i] = temp; - - temp = scores_[off]; - memmove(scores_ + i + 1, 
scores_ + i, (off - i) << 2); - scores_[i] = temp; - - temp = ids_[off]; - memmove(ids_ + i + 1, ids_ + i, (off - i) << 2); - ids_[i] = temp; - } - -#ifdef ___PREDICT_ENABLED___ - uint32 j = 0; - uint16 * words_new = get_lemma_word(predicts_[off]); - j = locate_where_to_insert_in_predicts(words_new, lemma_len); - if (j != off) { - uint32 temp = predicts_[off]; - memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2); - predicts_[j] = temp; - } -#endif - - if (state_ < USER_DICT_LEMMA_DIRTY) - state_ = USER_DICT_LEMMA_DIRTY; - -#ifdef ___CACHE_ENABLED___ - cache_init(); -#endif - - dict_info_.total_nfreq += count; - return id; -} -} diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/utf16char.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/utf16char.cpp deleted file mode 100644 index fadb6cf2..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/utf16char.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <stdlib.h> -#include "../include/utf16char.h" - -namespace ime_pinyin { - -#ifdef __cplusplus -extern "C" { -#endif - - char16* utf16_strtok(char16 *utf16_str, size_t *token_size, - char16 **utf16_str_next) { - if (NULL == utf16_str || NULL == token_size || NULL == utf16_str_next) { - return NULL; - } - - // Skip the splitters - size_t pos = 0; - while ((char16)' ' == utf16_str[pos] || (char16)'\n' == utf16_str[pos] - || (char16)'\t' == utf16_str[pos]) - pos++; - - utf16_str += pos; - pos = 0; - - while ((char16)'\0' != utf16_str[pos] && (char16)' ' != utf16_str[pos] - && (char16)'\n' != utf16_str[pos] - && (char16)'\t' != utf16_str[pos]) { - pos++; - } - - char16 *ret_val = utf16_str; - if ((char16)'\0' == utf16_str[pos]) { - *utf16_str_next = NULL; - if (0 == pos) - return NULL; - } else { - *utf16_str_next = utf16_str + pos + 1; - } - - utf16_str[pos] = (char16)'\0'; - *token_size = pos; - - return ret_val; - } - - int utf16_atoi(const char16 *utf16_str) { - if (NULL == utf16_str) - return 0; - - int value = 0; - int sign = 1; - size_t pos = 0; - - if ((char16)'-' == utf16_str[pos]) { - sign = -1; - pos++; - } - - while ((char16)'0' <= utf16_str[pos] && - (char16)'9' >= utf16_str[pos]) { - value = value * 10 + static_cast<int>(utf16_str[pos] - (char16)'0'); - pos++; - } - - return value*sign; - } - - float utf16_atof(const char16 *utf16_str) { - // A temporary implemetation. 
- char char8[256]; - if (utf16_strlen(utf16_str) >= 256) return 0; - - utf16_strcpy_tochar(char8, utf16_str); - return atof(char8); - } - - size_t utf16_strlen(const char16 *utf16_str) { - if (NULL == utf16_str) - return 0; - - size_t size = 0; - while ((char16)'\0' != utf16_str[size]) - size++; - return size; - } - - int utf16_strcmp(const char16* str1, const char16* str2) { - size_t pos = 0; - while (str1[pos] == str2[pos] && (char16)'\0' != str1[pos]) - pos++; - - return static_cast<int>(str1[pos]) - static_cast<int>(str2[pos]); - } - - int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size) { - size_t pos = 0; - while (pos < size && str1[pos] == str2[pos] && (char16)'\0' != str1[pos]) - pos++; - - if (pos == size) - return 0; - - return static_cast<int>(str1[pos]) - static_cast<int>(str2[pos]); - } - - // we do not consider overlapping - char16* utf16_strcpy(char16 *dst, const char16 *src) { - if (NULL == src || NULL == dst) - return NULL; - - char16* cp = dst; - - while ((char16)'\0' != *src) { - *cp = *src; - cp++; - src++; - } - - *cp = *src; - - return dst; - } - - char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size) { - if (NULL == src || NULL == dst || 0 == size) - return NULL; - - if (src == dst) - return dst; - - char16* cp = dst; - - if (dst < src || (dst > src && dst >= src + size)) { - while (size-- && (*cp++ = *src++)) - ; - } else { - cp += size - 1; - src += size - 1; - while (size-- && (*cp-- == *src--)) - ; - } - return dst; - } - - // We do not handle complicated cases like overlapping, because in this - // codebase, it is not necessary. 
- char* utf16_strcpy_tochar(char *dst, const char16 *src) { - if (NULL == src || NULL == dst) - return NULL; - - char* cp = dst; - - while ((char16)'\0' != *src) { - *cp = static_cast<char>(*src); - cp++; - src++; - } - *cp = *src; - - return dst; - } - -#ifdef __cplusplus -} -#endif -} // namespace ime_pinyin diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/utf16reader.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/utf16reader.cpp deleted file mode 100644 index d8e5de59..00000000 --- a/src/virtualkeyboard/3rdparty/pinyin/share/utf16reader.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "../include/utf16reader.h" - -namespace ime_pinyin { - -#define MIN_BUF_LEN 128 -#define MAX_BUF_LEN 65535 - -Utf16Reader::Utf16Reader() { - fp_ = NULL; - buffer_ = NULL; - buffer_total_len_ = 0; - buffer_next_pos_ = 0; - buffer_valid_len_ = 0; -} - -Utf16Reader::~Utf16Reader() { - if (NULL != fp_) - fclose(fp_); - - if (NULL != buffer_) - delete [] buffer_; -} - - -bool Utf16Reader::open(const char* filename, size_t buffer_len) { - if (filename == NULL) - return false; - - if (buffer_len < MIN_BUF_LEN) - buffer_len = MIN_BUF_LEN; - else if (buffer_len > MAX_BUF_LEN) - buffer_len = MAX_BUF_LEN; - - buffer_total_len_ = buffer_len; - - if (NULL != buffer_) - delete [] buffer_; - buffer_ = new char16[buffer_total_len_]; - if (NULL == buffer_) - return false; - - if ((fp_ = fopen(filename, "rb")) == NULL) - return false; - - // the UTF16 file header, skip - char16 header; - if (fread(&header, sizeof(header), 1, fp_) != 1 || header != 0xfeff) { - fclose(fp_); - fp_ = NULL; - return false; - } - - return true; -} - -char16* Utf16Reader::readline(char16* read_buf, size_t max_len) { - if (NULL == fp_ || NULL == read_buf || 0 == max_len) - return NULL; - - size_t ret_len = 0; - - do { - if (buffer_valid_len_ == 0) { - buffer_next_pos_ = 0; - buffer_valid_len_ = fread(buffer_, sizeof(char16), - buffer_total_len_, fp_); - if (buffer_valid_len_ == 0) { - if (0 == ret_len) - return NULL; - read_buf[ret_len] = (char16)'\0'; - return read_buf; - } - } - - for (size_t i = 0; i < buffer_valid_len_; i++) { - if (i == max_len - 1 || - buffer_[buffer_next_pos_ + i] == (char16)'\n') { - if (ret_len + i > 0 && read_buf[ret_len + i - 1] == (char16)'\r') { - read_buf[ret_len + i - 1] = (char16)'\0'; - } else { - read_buf[ret_len + i] = (char16)'\0'; - } - - i++; - buffer_next_pos_ += i; - buffer_valid_len_ -= i; - if (buffer_next_pos_ == buffer_total_len_) { - buffer_next_pos_ = 0; - buffer_valid_len_ = 0; - } - return read_buf; - } else { - read_buf[ret_len + i] = 
buffer_[buffer_next_pos_ + i]; - } - } - - ret_len += buffer_valid_len_; - buffer_valid_len_ = 0; - } while (true); - - // Never reach here - return NULL; -} - -bool Utf16Reader::close() { - if (NULL != fp_) - fclose(fp_); - fp_ = NULL; - - if (NULL != buffer_) - delete [] buffer_; - buffer_ = NULL; - return true; -} -} // namespace ime_pinyin |