path: root/src/virtualkeyboard/3rdparty/pinyin
Diffstat (limited to 'src/virtualkeyboard/3rdparty/pinyin')
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/NOTICE                              |  190
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/command/Makefile                    |   33
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/command/pinyinime_dictbuilder.cpp   |   56
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/data/dict_pinyin.dat                |  bin 1068442 -> 0 bytes
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/data/rawdict_utf16_65105_freq.txt   |  bin 3570346 -> 0 bytes
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/data/valid_utf16.txt                |  bin 32934 -> 0 bytes
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h              |  269
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h               |  171
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h                   |  157
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h                  |  120
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h                  |  233
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h                  |   62
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h              |  460
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h                  |   32
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/ngram.h                     |   96
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h                 |  223
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h             |  142
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h             |  111
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h              |  258
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/splparser.h                 |   96
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/sync.h                      |   85
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/userdict.h                  |  434
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h                 |   56
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h               |   48
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/pinyin.pro                          |   59
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/qt_attribution.json                 |   13
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/dictbuilder.cpp               | 1070
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/dictlist.cpp                  |  446
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp                  |  941
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/lpicache.cpp                  |   81
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/matrixsearch.cpp              | 1981
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/mystdlib.cpp                  |   34
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/ngram.cpp                     |  342
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/pinyinime.cpp                 |  197
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/searchutility.cpp             |  210
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/spellingtable.cpp             |  313
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/spellingtrie.cpp              |  832
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/splparser.cpp                 |  341
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/sync.cpp                      |  112
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/userdict.cpp                  | 2286
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/utf16char.cpp                 |  182
-rw-r--r--  src/virtualkeyboard/3rdparty/pinyin/share/utf16reader.cpp               |  131
42 files changed, 0 insertions, 12903 deletions
diff --git a/src/virtualkeyboard/3rdparty/pinyin/NOTICE b/src/virtualkeyboard/3rdparty/pinyin/NOTICE
deleted file mode 100644
index 64aaa8db..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/NOTICE
+++ /dev/null
@@ -1,190 +0,0 @@
-
- Copyright (c) 2009, The Android Open Source Project
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
diff --git a/src/virtualkeyboard/3rdparty/pinyin/command/Makefile b/src/virtualkeyboard/3rdparty/pinyin/command/Makefile
deleted file mode 100644
index 8ef2315c..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/command/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
-CC=gcc
-CFLAGS= -g -Wall -std=c99
-CPP=g++
-CPPFLAGS= -g3 -Wall -lpthread -D___BUILD_MODEL___
-
-PINYINIME_DICTBUILDER=pinyinime_dictbuilder
-
-LIBRARY_SRC= \
- ../share/dictbuilder.cpp \
- ../share/dictlist.cpp \
- ../share/dicttrie.cpp \
- ../share/lpicache.cpp \
- ../share/mystdlib.cpp \
- ../share/ngram.cpp \
- ../share/searchutility.cpp \
- ../share/spellingtable.cpp \
- ../share/spellingtrie.cpp \
- ../share/splparser.cpp \
- ../share/utf16char.cpp \
- ../share/utf16reader.cpp \
-
-all: engine
-
-engine: $(PINYINIME_DICTBUILDER)
-
-$(PINYINIME_DICTBUILDER): $(LIBRARY_SRC) pinyinime_dictbuilder.cpp
- @$(CPP) $(CPPFLAGS) -o $@ $?
-
-
-clean:
- -rm -rf $(PINYINIME_DICTBUILDER)
-
-.PHONY: clean
diff --git a/src/virtualkeyboard/3rdparty/pinyin/command/pinyinime_dictbuilder.cpp b/src/virtualkeyboard/3rdparty/pinyin/command/pinyinime_dictbuilder.cpp
deleted file mode 100644
index 41ea648d..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/command/pinyinime_dictbuilder.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <time.h>
-#include <unistd.h>
-#include "../include/dicttrie.h"
-
-using namespace ime_pinyin;
-
-/**
- * Build binary dictionary model. Make sure that ___BUILD_MODEL___ is defined
- * in dictdef.h.
- */
-int main(int argc, char* argv[]) {
- DictTrie* dict_trie = new DictTrie();
- bool success;
- if (argc >= 3)
- success = dict_trie->build_dict(argv[1], argv[2]);
- else
- success = dict_trie->build_dict("../data/rawdict_utf16_65105_freq.txt",
- "../data/valid_utf16.txt");
-
- if (success) {
- printf("Build dictionary successfully.\n");
- } else {
- printf("Build dictionary unsuccessfully.\n");
- return -1;
- }
-
- success = dict_trie->save_dict("../data/dict_pinyin.dat");
-
- if (success) {
- printf("Save dictionary successfully.\n");
- } else {
- printf("Save dictionary unsuccessfully.\n");
- return -1;
- }
-
- return 0;
-}
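The builder above writes ../data/dict_pinyin.dat; at runtime the engine consumes that file through DictTrie::load_dict(), declared in include/dicttrie.h further below. The following is a hypothetical consumer-side sketch (not part of the removed sources); it relies only on the load_dict() signature and the kSysDictIdEnd constant from include/dictdef.h, and the id range chosen here is an assumption for illustration.

// Hypothetical sketch: load the binary dictionary produced by
// pinyinime_dictbuilder. Assumes the DictTrie API from include/dicttrie.h;
// system lemma ids start from 1 and end at kSysDictIdEnd (see dictdef.h).
#include <stdio.h>
#include "../include/dicttrie.h"

using namespace ime_pinyin;

int main() {
  DictTrie dict_trie;
  if (!dict_trie.load_dict("../data/dict_pinyin.dat", 1, kSysDictIdEnd)) {
    printf("Failed to load dictionary.\n");
    return -1;
  }
  printf("Dictionary loaded.\n");
  return 0;
}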
diff --git a/src/virtualkeyboard/3rdparty/pinyin/data/dict_pinyin.dat b/src/virtualkeyboard/3rdparty/pinyin/data/dict_pinyin.dat
deleted file mode 100644
index 1be3f9c7..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/data/dict_pinyin.dat
+++ /dev/null
Binary files differ
diff --git a/src/virtualkeyboard/3rdparty/pinyin/data/rawdict_utf16_65105_freq.txt b/src/virtualkeyboard/3rdparty/pinyin/data/rawdict_utf16_65105_freq.txt
deleted file mode 100644
index 28805ba6..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/data/rawdict_utf16_65105_freq.txt
+++ /dev/null
Binary files differ
diff --git a/src/virtualkeyboard/3rdparty/pinyin/data/valid_utf16.txt b/src/virtualkeyboard/3rdparty/pinyin/data/valid_utf16.txt
deleted file mode 100644
index fecc67eb..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/data/valid_utf16.txt
+++ /dev/null
Binary files differ
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h b/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h
deleted file mode 100644
index 0a70a510..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/atomdictbase.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * This class defines AtomDictBase class which is the base class for all atom
- * dictionaries. Atom dictionaries are managed by the decoder class
- * MatrixSearch.
- *
- * When the user appends a new character to the Pinyin string, all enabled atom
- * dictionaries' extend_dict() will be called at least once to get candidates
- * ended in this step (the information of starting step is also given in the
- * parameter). Usually, when extend_dict() is called, a MileStoneHandle object
- * returned by a previous call for an earlier step is given to speed up the
- * look-up process, and a new MileStoneHandle object will be returned if
- * the extension is successful.
- *
- * A returned MileStoneHandle object should stay alive until
- * reset_milestones() is called and this object is notified to be reset.
- *
- * Usually, the atom dictionary can use step information to manage its
- * MileStoneHandle objects, or it can keep the objects in ascending order to
- * make the reset easier.
- *
- * When the decoder loads the dictionary, it will give a starting lemma id for
- * this atom dictionary to map an inner id to a global id. Global ids should be
- * used when an atom dictionary talks to any component outside.
- */
-#ifndef PINYINIME_INCLUDE_ATOMDICTBASE_H__
-#define PINYINIME_INCLUDE_ATOMDICTBASE_H__
-
-#include <stdlib.h>
-#include "./dictdef.h"
-#include "./searchutility.h"
-
-namespace ime_pinyin {
-class AtomDictBase {
- public:
- virtual ~AtomDictBase() {}
-
- /**
- * Load an atom dictionary from a file.
- *
- * @param file_name The file name to load dictionary.
- * @param start_id The starting id used for this atom dictionary.
- * @param end_id The end id (included) which can be used for this atom
- * dictionary. User dictionary will always use the last id space, so it can
- * ignore this parameter. All other atom dictionaries should check this
- * parameter.
- * @return True if succeed.
- */
- virtual bool load_dict(const char *file_name, LemmaIdType start_id,
- LemmaIdType end_id) = 0;
-
- /**
- * Close this atom dictionary.
- *
- * @return True if succeed.
- */
- virtual bool close_dict() = 0;
-
- /**
- * Get the total number of lemmas in this atom dictionary.
- *
- * @return The total number of lemmas.
- */
- virtual size_t number_of_lemmas() = 0;
-
- /**
- * This function is called by the decoder when user deletes a character from
- * the input string, or begins a new input string.
- *
- * Different atom dictionaries may implement this function in different ways.
- * An atom dictionary can use one of these two parameters (or both) to reset
- * its corresponding MileStoneHandle objects according to its detailed
- * implementation.
- *
- * For example, if an atom dictionary uses step information to manage its
- * MileStoneHandle objects, parameter from_step can be used to identify which
- * objects should be reset; otherwise, if another atom dictionary does not
- * use the detailed step information, it only uses ascending handles
- * (according to step. For the same step, earlier call, smaller handle), it
- * can easily reset those MileStoneHandle which are larger than from_handle.
- *
- * The decoder always resets the decoding state by step. So when it begins
- * resetting, it will call reset_milestones() of its atom dictionaries with
- * the step information, and the MileStoneHandle objects returned by the
- * earliest calling of extend_dict() for that step.
- *
- * If an atom dictionary does not implement incremental search, this function
- * can be totally ignored.
- *
- * @param from_step From which step(included) the MileStoneHandle
- * objects should be reset.
- * @param from_handle The earliest MileStoneHandle object for step from_step
- */
- virtual void reset_milestones(uint16 from_step,
- MileStoneHandle from_handle) = 0;
-
- /**
- * Used to extend in this dictionary. The handle returned should keep valid
- * until reset_milestones() is called.
- *
- * @param from_handle Its previous returned extended handle without the new
- * spelling id, it can be used to speed up the extending.
- * @param dep The parameter used for extending.
- * @param lpi_items Used to fill in the lemmas matched.
- * @param lpi_max The length of the buffer
- * @param lpi_num Used to return the newly added items.
- * @return The new mile stone for this extending. 0 if fail.
- */
- virtual MileStoneHandle extend_dict(MileStoneHandle from_handle,
- const DictExtPara *dep,
- LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num) = 0;
-
- /**
- * Get lemma items with scores according to a spelling id stream.
- * This atom dictionary does not need to sort the returned items.
- *
- * @param splid_str The spelling id stream buffer.
- * @param splid_str_len The length of the spelling id stream buffer.
- * @param lpi_items Used to return matched lemma items with scores.
- * @param lpi_max The maximum size of the buffer to return result.
- * @return The number of matched items which have been filled in to lpi_items.
- */
- virtual size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
- LmaPsbItem *lpi_items, size_t lpi_max) = 0;
-
- /**
- * Get a lemma string (The Chinese string) by the given lemma id.
- *
- * @param id_lemma The lemma id to get the string.
- * @param str_buf The buffer to return the Chinese string.
- * @param str_max The maximum size of the buffer.
- * @return The length of the string, 0 if fail.
- */
- virtual uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
- uint16 str_max) = 0;
-
- /**
- * Get the full spelling ids for the given lemma id.
- * If the given buffer is too short, return 0.
- *
- * @param splids Used to return the spelling ids.
- * @param splids_max The maximum buffer length of splids.
- * @param arg_valid Used to indicate if the incoming parameters have been
- * initialized and are valid. If it is true, the splids and splids_max are valid
- * and there may be half ids in splids to be updated to full ids. In this
- * case, splids_max is the number of valid ids in splids.
- * @return The number of ids in the buffer.
- */
- virtual uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
- uint16 splids_max, bool arg_valid) = 0;
-
- /**
- * Function used for prediction.
- * No need to sort the newly added items.
- *
- * @param last_hzs The last n Chinese characters (called Hanzi); its length
- * should be less than or equal to kMaxPredictSize.
- * @param hzs_len Specifies the length (<= kMaxPredictSize) of the history.
- * @param npre_items Used to return the result.
- * @param npre_max The length of the buffer to return the result.
- * @param b4_used Number of prediction results (from npre_items[-b4_used])
- * from other atom dictionaries. An atom dictionary can just ignore it.
- * @return The number of prediction result from this atom dictionary.
- */
- virtual size_t predict(const char16 last_hzs[], uint16 hzs_len,
- NPredictItem *npre_items, size_t npre_max,
- size_t b4_used) = 0;
-
- /**
- * Add a lemma to the dictionary. If the dictionary allows adding new
- * items and this item does not exist, add it.
- *
- * @param lemma_str The Chinese string of the lemma.
- * @param splids The spelling ids of the lemma.
- * @param lemma_len The length of the Chinese lemma.
- * @param count The frequency count for this lemma.
- */
- virtual LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count) = 0;
-
- /**
- * Update a lemma's occurrence count.
- *
- * @param lemma_id The lemma id to update.
- * @param delta_count The frequency count to adjust.
- * @param selected Indicate whether this lemma is selected by user and
- * submitted to target edit box.
- * @return The id if succeed, 0 if fail.
- */
- virtual LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
- bool selected) = 0;
-
- /**
- * Get the lemma id for the given lemma.
- *
- * @param lemma_str The Chinese string of the lemma.
- * @param splids The spelling ids of the lemma.
- * @param lemma_len The length of the lemma.
- * @return The matched lemma id, or 0 if fail.
- */
- virtual LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len) = 0;
-
- /**
- * Get the lemma score.
- *
- * @param lemma_id The lemma id to get score.
- * @return The score of the lemma, or 0 if fail.
- */
- virtual LmaScoreType get_lemma_score(LemmaIdType lemma_id) = 0;
-
- /**
- * Get the lemma score.
- *
- * @param lemma_str The Chinese string of the lemma.
- * @param splids The spelling ids of the lemma.
- * @param lemma_len The length of the lemma.
- * @return The score of the lemma, or 0 if fail.
- */
- virtual LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len) = 0;
-
- /**
- * If the dictionary allows it, remove a lemma from it.
- *
- * @param lemma_id The id of the lemma to remove.
- * @return True if succeed.
- */
- virtual bool remove_lemma(LemmaIdType lemma_id) = 0;
-
- /**
- * Get the total occurrence count of this atom dictionary.
- *
- * @return The total occurrence count of this atom dictionary.
- */
- virtual size_t get_total_lemma_count() = 0;
-
- /**
- * Set the total occurrence count of other atom dictionaries.
- *
- * @param count The total occurrence count of other atom dictionaries.
- */
- virtual void set_total_lemma_count_of_others(size_t count) = 0;
-
- /**
- * Notify this atom dictionary to flush the cached data to persistent storage
- * if necessary.
- */
- virtual void flush_cache() = 0;
-};
-}
-
-#endif // PINYINIME_INCLUDE_ATOMDICTBASE_H__
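The extend/reset contract documented in this header is easiest to see from the caller's side. Below is a hedged, decoder-side sketch (not part of the removed sources) that uses only the virtual signatures declared above; the DictExtPara and LmaPsbItem details come from searchutility.h and are left opaque here, and the helper names are hypothetical.

// Hypothetical decoder-side sketch for the AtomDictBase contract.
// 'dict' is any concrete atom dictionary (system or user); 'dep' has already
// been filled by the caller for the newly appended spelling id.
#include "atomdictbase.h"

namespace ime_pinyin {

// Extend one step from the milestone returned for the previous step. The
// returned handle stays valid until reset_milestones() covers its step;
// a return value of 0 means the extension failed for this dictionary.
MileStoneHandle extend_one_step(AtomDictBase *dict, const DictExtPara *dep,
                                MileStoneHandle prev_handle,
                                LmaPsbItem *lpi_items, size_t lpi_max,
                                size_t *lpi_num) {
  return dict->extend_dict(prev_handle, dep, lpi_items, lpi_max, lpi_num);
}

// When the user deletes a character (or starts a new input string), reset by
// step, handing back the earliest milestone obtained for that step.
void rollback_to_step(AtomDictBase *dict, uint16 from_step,
                      MileStoneHandle earliest_handle_of_step) {
  dict->reset_milestones(from_step, earliest_handle_of_step);
}

}  // namespace ime_pinyin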
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h
deleted file mode 100644
index da0d6cd3..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/dictbuilder.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
-#define PINYINIME_INCLUDE_DICTBUILDER_H__
-
-#include <stdlib.h>
-#include "./utf16char.h"
-#include "./dictdef.h"
-#include "./dictlist.h"
-#include "./spellingtable.h"
-#include "./spellingtrie.h"
-#include "./splparser.h"
-
-namespace ime_pinyin {
-
-#ifdef ___BUILD_MODEL___
-
-#define ___DO_STATISTICS___
-
-class DictTrie;
-
-class DictBuilder {
- private:
- // The raw lemma array buffer.
- LemmaEntry *lemma_arr_;
- size_t lemma_num_;
-
- // Used to store all possible single char items.
- // Two items may have the same Hanzi while their spelling ids are different.
- SingleCharItem *scis_;
- size_t scis_num_;
-
- // In the tree, root's level is -1.
- // Lemma nodes for root, and level 0
- LmaNodeLE0 *lma_nodes_le0_;
-
- // Lemma nodes for layers whose levels are deeper than 0
- LmaNodeGE1 *lma_nodes_ge1_;
-
- // Number of used lemma nodes
- size_t lma_nds_used_num_le0_;
- size_t lma_nds_used_num_ge1_;
-
- // Used to store homophonies' ids.
- LemmaIdType *homo_idx_buf_;
- // Number of homophonies each of which only contains one Chinese character.
- size_t homo_idx_num_eq1_;
- // Number of homophonies each of which contains more than one character.
- size_t homo_idx_num_gt1_;
-
- // The items with highest scores.
- LemmaEntry *top_lmas_;
- size_t top_lmas_num_;
-
- SpellingTable *spl_table_;
- SpellingParser *spl_parser_;
-
-#ifdef ___DO_STATISTICS___
- size_t max_sonbuf_len_[kMaxLemmaSize];
- size_t max_homobuf_len_[kMaxLemmaSize];
-
- size_t total_son_num_[kMaxLemmaSize];
- size_t total_node_hasson_[kMaxLemmaSize];
- size_t total_sonbuf_num_[kMaxLemmaSize];
- size_t total_sonbuf_allnoson_[kMaxLemmaSize];
- size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
- size_t total_homo_num_[kMaxLemmaSize];
-
- size_t sonbufs_num1_; // Number of son buffer with only 1 son
- size_t sonbufs_numgt1_; // Number of son buffer with more 1 son;
-
- size_t total_lma_node_num_;
-
- void stat_init();
- void stat_print();
-#endif
-
- public:
-
- DictBuilder();
- ~DictBuilder();
-
- // Build dictionary trie from the file fn_raw. File fn_validhzs provides
- // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
- // included.
- bool build_dict(const char* fn_raw, const char* fn_validhzs,
- DictTrie *dict_trie);
-
- private:
- // Fill in the buffer with id. The caller guarantees that the parameters are
- // valid.
- void id_to_charbuf(unsigned char *buf, LemmaIdType id);
-
- // Update the offset of sons for a node.
- void set_son_offset(LmaNodeGE1 *node, size_t offset);
-
- // Update the offset of homophonies' ids for a node.
- void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);
-
- // Format a spelling string.
- void format_spelling_str(char *spl_str);
-
- // Sort the lemma_arr by the hanzi string, and give each unique item an id.
- // The lemma list is sorted according to the Hanzi strings so that items
- // starting with a given prefix string can be found for prediction.
- // Actually, the single char items may be in another order, for example,
- // in spelling id order, etc.
- // Return value is the next un-allocated idx available.
- LemmaIdType sort_lemmas_by_hz();
-
- // Build the SingleCharItem list, and fill the hanzi_scis_ids in the
- // lemma buffer lemma_arr_.
- // This function should be called after the lemma array is ready.
- // Return the number of unique SingleCharItem elements.
- size_t build_scis();
-
- // Construct a subtree using a subset of the spelling array (from
- // item_star to item_end)
- // parent is the parent node to update the necessary information
- // parent can be a member of LmaNodeLE0 or LmaNodeGE1
- bool construct_subset(void* parent, LemmaEntry* lemma_arr,
- size_t item_start, size_t item_end, size_t level);
-
-
- // Read valid Chinese Hanzis from the given file.
- // num is used to return number of chars.
- // The return buffer is sorted and caller needs to free the returned buffer.
- char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);
-
-
- // Read a raw dictionary. max_item is the maximum number of items. If there
- // are more items in the dictionary, only the first max_item will be read.
- // Returned value is the number of items successfully read from the file.
- size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,
- size_t max_item);
-
- // Try to find if a character is in hzs buffer.
- bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);
-
- // Try to find if all characters in str are in hzs buffer.
- bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
- const char16 *str, size_t str_len);
-
- // Get the lemmas with the highest scores.
- void get_top_lemmas();
-
- // Allocate resource to build dictionary.
- // lma_num is the number of items to be loaded
- bool alloc_resource(size_t lma_num);
-
- // Free resource.
- void free_resource();
-};
-#endif // ___BUILD_MODEL___
-}
-
-#endif // PINYINIME_INCLUDE_DICTBUILDER_H__
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h
deleted file mode 100644
index 5e1d7818..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/dictdef.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_DICTDEF_H__
-#define PINYINIME_INCLUDE_DICTDEF_H__
-
-#include <stdlib.h>
-#include "./utf16char.h"
-
-namespace ime_pinyin {
-
-// Enable the following line when building the binary dictionary model.
-// #define ___BUILD_MODEL___
-
-typedef unsigned char uint8;
-typedef unsigned short uint16;
-typedef unsigned int uint32;
-
-typedef signed char int8;
-typedef short int16;
-typedef int int32;
-typedef long long int64;
-typedef unsigned long long uint64;
-
-const bool kPrintDebug0 = false;
-const bool kPrintDebug1 = false;
-const bool kPrintDebug2 = false;
-
-// The max length of a lemma.
-const size_t kMaxLemmaSize = 8;
-
-// The max length of a Pinyin (spelling).
-const size_t kMaxPinyinSize = 6;
-
-// The number of half spelling ids. For Chinese Pinyin, there are 30 half ids.
-// See SpellingTrie.h for details.
-const size_t kHalfSpellingIdNum = 29;
-
-// The maximum number of full spellings. For Chinese Pinyin, there are only
-// about 410 spellings.
-// If this value is changed to be bigger (needs more bits), please also update
-// other structures like SpellingNode, to make sure that a spelling id can be
-// stored.
-// -1 is because 0 is never used.
-const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;
-const size_t kMaxSearchSteps = 40;
-
-// One character predicts its following characters.
-const size_t kMaxPredictSize = (kMaxLemmaSize - 1);
-
-// LemmaIdType must always be size_t.
-typedef size_t LemmaIdType;
-const size_t kLemmaIdSize = 3; // Actually, an id occupies 3 bytes in storage.
-const size_t kLemmaIdComposing = 0xffffff;
-
-typedef uint16 LmaScoreType;
-typedef uint16 KeyScoreType;
-
-// Number of items with the highest scores kept for prediction purposes.
-const size_t kTopScoreLemmaNum = 10;
-
-const size_t kMaxPredictNumByGt3 = 1;
-const size_t kMaxPredictNumBy3 = 2;
-const size_t kMaxPredictNumBy2 = 2;
-
-// The last lemma id (included) for the system dictionary. The system
-// dictionary's ids always start from 1.
-const LemmaIdType kSysDictIdEnd = 500000;
-
-// The first lemma id for the user dictionary.
-const LemmaIdType kUserDictIdStart = 500001;
-
-// The last lemma id (included) for the user dictionary.
-const LemmaIdType kUserDictIdEnd = 600000;
-
-typedef struct {
- uint16 half_splid:5;
- uint16 full_splid:11;
-} SpellingId, *PSpellingId;
-
-
-/**
- * We use different node types for different layers
- * Statistical data of the building result for a testing dictionary:
- * root, level 0, level 1, level 2, level 3
- * max son num of one node: 406 280 41 2 -
- * max homo num of one node: 0 90 23 2 2
- * total node num of a layer: 1 406 31766 13516 993
- * total homo num of a layer: 9 5674 44609 12667 995
- *
- * The node number for root and level 0 won't be larger than 500
- * According to the information above, two kinds of nodes can be used; one for
- * root and level 0, the other for these layers deeper than 0.
- *
- * LE = less than or equal.
- * A node occupies 16 bytes, so in total less than 16 * 500 = 8K.
- */
-struct LmaNodeLE0 {
- uint32 son_1st_off;
- uint32 homo_idx_buf_off;
- uint16 spl_idx;
- uint16 num_of_son;
- uint16 num_of_homo;
-};
-
-/**
- * GE = greater than or equal.
- * A node occupies 8 bytes.
- */
-struct LmaNodeGE1 {
- uint16 son_1st_off_l; // Low bits of the son_1st_off
- uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1
- uint16 spl_idx;
- unsigned char num_of_son; // number of son nodes
- unsigned char num_of_homo; // number of homo words
- unsigned char son_1st_off_h; // high bits of the son_1st_off
- unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off
-};
-
-#ifdef ___BUILD_MODEL___
-struct SingleCharItem {
- float freq;
- char16 hz;
- SpellingId splid;
-};
-
-struct LemmaEntry {
- LemmaIdType idx_by_py;
- LemmaIdType idx_by_hz;
- char16 hanzi_str[kMaxLemmaSize + 1];
-
- // The SingleCharItem id for each Hanzi.
- uint16 hanzi_scis_ids[kMaxLemmaSize];
-
- uint16 spl_idx_arr[kMaxLemmaSize + 1];
- char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
- unsigned char hz_str_len;
- float freq;
-};
-#endif // ___BUILD_MODEL___
-
-} // namespace ime_pinyin
-
-#endif // PINYINIME_INCLUDE_DICTDEF_H__
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h b/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h
deleted file mode 100644
index 27fa6d8e..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/dictlist.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_DICTLIST_H__
-#define PINYINIME_INCLUDE_DICTLIST_H__
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "./dictdef.h"
-#include "./searchutility.h"
-#include "./spellingtrie.h"
-#include "./utf16char.h"
-
-namespace ime_pinyin {
-
-class DictList {
- private:
- bool initialized_;
-
- const SpellingTrie *spl_trie_;
-
- // Number of SingleCharItems. The first is blank, because id 0 is invalid.
- uint32 scis_num_;
- char16 *scis_hz_;
- SpellingId *scis_splid_;
-
- // The large memory block to store the word list.
- char16 *buf_;
-
- // Starting position of those words whose lengths are i+1, counted in
- // char16
- uint32 start_pos_[kMaxLemmaSize + 1];
-
- uint32 start_id_[kMaxLemmaSize + 1];
-
- int (*cmp_func_[kMaxLemmaSize])(const void *, const void *);
-
- bool alloc_resource(size_t buf_size, size_t scim_num);
-
- void free_resource();
-
-#ifdef ___BUILD_MODEL___
- // Calculate the requested memory, including the start_pos[] buffer.
- size_t calculate_size(const LemmaEntry *lemma_arr, size_t lemma_num);
-
- void fill_scis(const SingleCharItem *scis, size_t scis_num);
-
- // Copy the related content to the inner buffer
- // It should be called after calculate_size()
- void fill_list(const LemmaEntry *lemma_arr, size_t lemma_num);
-
- // Find the starting position for the buffer of those 2-character Chinese words
- // whose first character is the given Chinese character.
- char16* find_pos2_startedbyhz(char16 hz_char);
-#endif
-
- // Find the starting position for the buffer of those words whose lengths are
- // word_len. The given parameter cmp_func decides how many characters from
- // beginning will be used to compare.
- char16* find_pos_startedbyhzs(const char16 last_hzs[],
- size_t word_Len,
- int (*cmp_func)(const void *, const void *));
-
- public:
-
- DictList();
- ~DictList();
-
- bool save_list(FILE *fp);
- bool load_list(FILE *fp);
-
-#ifdef ___BUILD_MODEL___
- // Init the list from the LemmaEntry array.
- // lemma_arr should have been sorted by the hanzi_str, and have been given
- // ids from 1
- bool init_list(const SingleCharItem *scis, size_t scis_num,
- const LemmaEntry *lemma_arr, size_t lemma_num);
-#endif
-
- // Get the hanzi string for the given id
- uint16 get_lemma_str(LemmaIdType id_hz, char16 *str_buf, uint16 str_max);
-
- void convert_to_hanzis(char16 *str, uint16 str_len);
-
- void convert_to_scis_ids(char16 *str, uint16 str_len);
-
- // last_hzs stores the last n Chinese characters of history; its length should
- // be less than or equal to kMaxPredictSize.
- // hzs_len specifies the length(<= kMaxPredictSize).
- // predict_buf is used to store the result.
- // buf_len specifies the buffer length.
- // b4_used specifies how many items before predict_buf have been used.
- // Returned value is the number of newly added items.
- size_t predict(const char16 last_hzs[], uint16 hzs_len,
- NPredictItem *npre_items, size_t npre_max,
- size_t b4_used);
-
- // If half_splid is a valid half spelling id, return those full spelling
- // ids which share this half id.
- uint16 get_splids_for_hanzi(char16 hanzi, uint16 half_splid,
- uint16 *splids, uint16 max_splids);
-
- LemmaIdType get_lemma_id(const char16 *str, uint16 str_len);
-};
-}
-
-#endif // PINYINIME_INCLUDE_DICTLIST_H__
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h b/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h
deleted file mode 100644
index 75b7ee05..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/dicttrie.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_DICTTRIE_H__
-#define PINYINIME_INCLUDE_DICTTRIE_H__
-
-#include <stdlib.h>
-#include "./atomdictbase.h"
-#include "./dictdef.h"
-#include "./dictlist.h"
-#include "./searchutility.h"
-
-namespace ime_pinyin {
-
-class DictTrie : AtomDictBase {
- private:
- struct ParsingMark {
- size_t node_offset:24;
- size_t node_num:8; // Number of nodes with this spelling id given
- // by spl_id. If spl_id is a Shengmu, for nodes
- // in the first layer of DictTrie, it equals to
- // SpellingTrie::shm2full_num(); but for those
- // nodes which are not in the first layer,
- // node_num < SpellingTrie::shm2full_num().
- // For a full spelling id, node_num = 1;
- };
-
- // Used to indicate an extended mile stone.
- // An extended mile stone is used to mark a partial match in the dictionary
- // trie to speed up further potential extending.
- // For example, when the user inputs "w", a mile stone is created to mark the
- // partial match status, so that when user inputs another char 'm', it will be
- // faster to extend search space based on this mile stone.
- //
- // For partial match status of "wm", there can be more than one sub mile
- // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so
- // there may be more than one parsing mark used to mark these partial matchings.
- // A mile stone records the starting position in the mark list and number of
- // marks.
- struct MileStone {
- uint16 mark_start;
- uint16 mark_num;
- };
-
- DictList* dict_list_;
-
- const SpellingTrie *spl_trie_;
-
- LmaNodeLE0* root_; // Nodes for root and the first layer.
- LmaNodeGE1* nodes_ge1_; // Nodes for other layers.
-
- // A quick index from spelling id to the LmaNodeLE0 node buffer, or
- // to the root_ buffer.
- // Index length:
- // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
- // to get the end.
- // All Shengmu ids are not indexed because they will be converted into
- // corresponding full ids.
- // So, given an id splid, the son is:
- // root_[splid_le0_index_[splid - kFullSplIdStart]]
- uint16 *splid_le0_index_;
-
- uint32 lma_node_num_le0_;
- uint32 lma_node_num_ge1_;
-
- // The first part is for homophonies, and the last top_lmas_num_ items are
- // lemmas with highest scores.
- unsigned char *lma_idx_buf_;
- uint32 lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte.
- uint32 total_lma_num_; // Total number of lemmas in this dictionary.
- uint32 top_lmas_num_; // Number of lemma with highest scores.
-
- // Parsing mark list used to mark the detailed extended statuses.
- ParsingMark *parsing_marks_;
- // The position for next available mark.
- uint16 parsing_marks_pos_;
-
- // Mile stone list used to mark the extended status.
- MileStone *mile_stones_;
- // The position for the next available mile stone. We use positions (except 0)
- // as handles.
- MileStoneHandle mile_stones_pos_;
-
- // Get the offset of sons for a node.
- inline size_t get_son_offset(const LmaNodeGE1 *node);
-
- // Get the offset of homophone ids for a node.
- inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);
-
- // Get the lemma id by the offset.
- inline LemmaIdType get_lemma_id(size_t id_offset);
-
- void free_resource(bool free_dict_list);
-
- bool load_dict(FILE *fp);
-
- // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
- // them into the lpi_items buffer.
- // This function is called by the search engine.
- size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
- LmaNodeLE0 *node);
-
- // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
- // them into the lpi_items buffer.
- // This function is called by inner functions extend_dict0(), extend_dict1()
- // and extend_dict2().
- size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
- size_t homo_buf_off, LmaNodeGE1 *node,
- uint16 lma_len);
-
- // Extend in the trie from level 0.
- MileStoneHandle extend_dict0(MileStoneHandle from_handle,
- const DictExtPara *dep, LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num);
-
- // Extend in the trie from level 1.
- MileStoneHandle extend_dict1(MileStoneHandle from_handle,
- const DictExtPara *dep, LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num);
-
- // Extend in the trie from level 2.
- MileStoneHandle extend_dict2(MileStoneHandle from_handle,
- const DictExtPara *dep, LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num);
-
- // Try to extend the given spelling id buffer, and if the given id_lemma can
- // be successfully obtained, return true.
- // The given spelling ids are all valid full ids.
- bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);
-
-#ifdef ___BUILD_MODEL___
- bool save_dict(FILE *fp);
-#endif // ___BUILD_MODEL___
-
- static const int kMaxMileStone = 100;
- static const int kMaxParsingMark = 600;
- static const MileStoneHandle kFirstValidMileStoneHandle = 1;
-
- friend class DictParser;
- friend class DictBuilder;
-
- public:
-
- DictTrie();
- ~DictTrie();
-
-#ifdef ___BUILD_MODEL___
- // Construct the tree from the file fn_raw.
- // fn_validhzs provides the valid hanzi list. If fn_validhzs is
- // NULL, only chars in GB2312 will be included.
- bool build_dict(const char *fn_raw, const char *fn_validhzs);
-
- // Save the binary dictionary
- // Actually, the SpellingTrie/DictList instance will be also saved.
- bool save_dict(const char *filename);
-#endif // ___BUILD_MODEL___
-
- void convert_to_hanzis(char16 *str, uint16 str_len);
-
- void convert_to_scis_ids(char16 *str, uint16 str_len);
-
- // Load a binary dictionary
- // The SpellingTrie instance/DictList will be also loaded
- bool load_dict(const char *filename, LemmaIdType start_id,
- LemmaIdType end_id);
- bool load_dict_fd(int sys_fd, long start_offset, long length,
- LemmaIdType start_id, LemmaIdType end_id);
- bool close_dict() {return true;}
- size_t number_of_lemmas() {return 0;}
-
- void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
-
- MileStoneHandle extend_dict(MileStoneHandle from_handle,
- const DictExtPara *dep,
- LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num);
-
- size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
- LmaPsbItem *lpi_items, size_t lpi_max);
-
- uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
-
- uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
- uint16 splids_max, bool arg_valid);
-
- size_t predict(const char16 *last_hzs, uint16 hzs_len,
- NPredictItem *npre_items, size_t npre_max,
- size_t b4_used);
-
- LemmaIdType put_lemma(char16 /*lemma_str*/[], uint16 /*splids*/[],
- uint16 /*lemma_len*/, uint16 /*count*/) {return 0;}
-
- LemmaIdType update_lemma(LemmaIdType /*lemma_id*/, int16 /*delta_count*/,
- bool /*selected*/) {return 0;}
-
- LemmaIdType get_lemma_id(char16 /*lemma_str*/[], uint16 /*splids*/[],
- uint16 /*lemma_len*/) {return 0;}
-
- LmaScoreType get_lemma_score(LemmaIdType /*lemma_id*/) {return 0;}
-
- LmaScoreType get_lemma_score(char16 /*lemma_str*/[], uint16 /*splids*/[],
- uint16 /*lemma_len*/) {return 0;}
-
- bool remove_lemma(LemmaIdType /*lemma_id*/) {return false;}
-
- size_t get_total_lemma_count() {return 0;}
- void set_total_lemma_count_of_others(size_t count);
-
- void flush_cache() {}
-
- LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);
-
- // Fill the lemmas with highest scores to the prediction buffer.
- // his_len is the history length to fill in the prediction buffer.
- size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
- size_t npre_max, size_t b4_used);
-};
-}
-
-#endif // PINYINIME_INCLUDE_DICTTRIE_H__
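As a usage note, the read-only overrides above are what the search engine calls against a loaded DictTrie. The sketch below is hypothetical (not part of the removed sources) and uses only get_lpis() and get_lemma_str() as declared above; the buffer sizes and the lemma id passed to get_lemma_str() are arbitrary choices for illustration.

// Hypothetical sketch: query candidates for a sequence of full spelling ids
// and fetch the Hanzi string of one lemma id from a loaded DictTrie.
#include "dicttrie.h"

using namespace ime_pinyin;

size_t lookup_candidates(DictTrie &dict, const uint16 *splids,
                         uint16 splid_len) {
  LmaPsbItem items[64];  // Arbitrary buffer size for the sketch.
  size_t num = dict.get_lpis(splids, splid_len, items, 64);

  // Fetch the Chinese string for a lemma id (id 1 is just a placeholder);
  // 16 char16 is enough because kMaxLemmaSize is 8.
  char16 str_buf[16];
  uint16 len = dict.get_lemma_str(1, str_buf, 16);
  (void)len;

  return num;  // Number of matched LmaPsbItems filled into 'items'.
}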
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h b/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h
deleted file mode 100644
index 60735971..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/lpicache.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_ANDPY_INCLUDE_LPICACHE_H__
-#define PINYINIME_ANDPY_INCLUDE_LPICACHE_H__
-
-#include <stdlib.h>
-#include "./searchutility.h"
-#include "./spellingtrie.h"
-
-namespace ime_pinyin {
-
-// Used to cache LmaPsbItem list for half spelling ids.
-class LpiCache {
- private:
- static LpiCache *instance_;
- static const int kMaxLpiCachePerId = 15;
-
- LmaPsbItem *lpi_cache_;
- uint16 *lpi_cache_len_;
-
- public:
- LpiCache();
- ~LpiCache();
-
- static LpiCache& get_instance();
-
- // Test if the LPI list of the given splid has been cached.
- // If splid is a full spelling id, it returns false, because we only cache
- // list for half ids.
- bool is_cached(uint16 splid);
-
- // Put an LPI list into the cache. If the length of the list, lpi_num, is
- // longer than the cache buffer, the list will be truncated, and the function
- // returns the maximum length of the cache buffer.
- // Note: splid must be a half id, and lpi_items must be not NULL. The
- // caller of this function should guarantee this.
- size_t put_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_num);
-
- // Get the cached list for the given half id.
- // Return the length of the cached buffer.
- // Note: splid must be a half id, and lpi_items must be not NULL. The
- // caller of this function should guarantee this.
- size_t get_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_max);
-};
-
-} // namespace
-
-#endif // PINYINIME_ANDPY_INCLUDE_LPICACHE_H__
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h b/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h
deleted file mode 100644
index 61e78aa6..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/matrixsearch.h
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
-#define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
-
-#include <stdlib.h>
-#include "./atomdictbase.h"
-#include "./dicttrie.h"
-#include "./searchutility.h"
-#include "./spellingtrie.h"
-#include "./splparser.h"
-
-namespace ime_pinyin {
-
-static const size_t kMaxRowNum = kMaxSearchSteps;
-
-typedef struct {
- // MileStoneHandle objects for the system and user dictionaries.
- MileStoneHandle dict_handles[2];
- // From which DMI node. -1 means it's from root.
- PoolPosType dmi_fr;
- // The spelling id for the Pinyin string from the previous DMI to this node.
- // If it is a half id like Shengmu, the node pointed by dict_node is the first
- // node with this Shengmu,
- uint16 spl_id;
- // What's the level of the dict node. Level of root is 0, but root is never
- // recorded by dict_node.
- unsigned char dict_level:7;
- // If this node is for composing phrase, this bit is 1.
- unsigned char c_phrase:1;
- // Whether the spl_id is parsed with a split character at the end.
- unsigned char splid_end_split:1;
- // What's the length of the spelling string for this match, for the whole
- // word.
- unsigned char splstr_len:7;
- // Used to indicate whether all spelling ids from the root are full spelling
- // ids. This information is useful for keymapping mode(not finished). Because
- // in this mode, there is no clear boundaries, we prefer those results which
- // have full spelling ids.
- unsigned char all_full_id:1;
-} DictMatchInfo, *PDictMatchInfo;
-
-typedef struct MatrixNode {
- LemmaIdType id;
- float score;
- MatrixNode *from;
- // From which DMI node. Used to trace the spelling segmentation.
- PoolPosType dmi_fr;
- uint16 step;
-} MatrixNode, *PMatrixNode;
-
-typedef struct {
- // The MatrixNode position in the matrix pool
- PoolPosType mtrx_nd_pos;
- // The DictMatchInfo position in the DictMatchInfo pool.
- PoolPosType dmi_pos;
- uint16 mtrx_nd_num;
- uint16 dmi_num:15;
- // Used to indicate whether there are dmi nodes in this step with full
- // spelling id. This information is used to decide whether a substring of a
- // valid Pinyin should be extended.
- //
- // Example1: shoudao
- // When the last char 'o' is added, the parser will find "dao" is a valid
- // Pinyin, and because all dmi nodes at location 'd' (including those for
- // "shoud", and those for "d") have Shengmu id only, so it is not necessary
- // to extend "ao", otherwise the result may be "shoud ao", that is not
- // reasonable.
- //
- // Example2: hengao
- // When the last 'o' is added, the parser finds "gao" is a valid Pinyin.
- // Because some dmi nodes at 'g' have Shengmu ids (hen'g and g), but some dmi
- // nodes at 'g' have full ids ('heng'), it is necessary to extend "ao", thus
- // "heng ao" can also be the result.
- //
- // Similarly, "ganga" is expanded to "gang a".
- //
- // For the Pinyin string "xian", "xian" is a valid Pinyin and all dmi nodes at
- // 'x' only have Shengmu ids, so the parser will not try "x ian" (and it is
- // not valid either). If the parser uses break in the loop, the result will
- // always be "xian"; but if the parser uses continue in the loop, "xi an" will
- // also be tried. This behaviour can be set via the function
- // set_xi_an_switch().
- uint16 dmi_has_full_id:1;
- // Points to a MatrixNode of the current step to indicate which choice the
- // user selects.
- MatrixNode *mtrx_nd_fixed;
-} MatrixRow, *PMatrixRow;
-
-// When the user inputs and selects candidates, the fixed lemma ids are stored
-// in lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how
-// many lemmas from the beginning are fixed. If the user deletes Pinyin
-// characters one by one from the end, these fixed lemmas can be unlocked one
-// by one when necessary. Whenever the user deletes a Chinese character and its
-// spelling string in these fixed lemmas, all fixed lemmas will be merged
-// together into a unit named ComposingPhrase with the lemma id
-// kLemmaIdComposing, and this composing phrase will be the first lemma in the
-// sentence. Because it contains some modified lemmas (by deleting a
-// character), these merged lemmas are called sub lemmas (sublma), and each of
-// them is represented individually, so that when the user deletes Pinyin
-// characters from the end, these sub lemmas can also be unlocked one by one.
-typedef struct {
- uint16 spl_ids[kMaxRowNum];
- uint16 spl_start[kMaxRowNum];
- char16 chn_str[kMaxRowNum]; // Chinese string.
- uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters.
- size_t sublma_num;
- uint16 length; // Counted in Chinese characters.
-} ComposingPhrase, *TComposingPhrase;
-
-class MatrixSearch {
- private:
- // If it is true, the prediction list for a history string whose length is
- // greater than 1 will be limited to a reasonable number.
- static const bool kPredictLimitGt1 = false;
-
- // If it is true, the engine will prefer long history based prediction,
- // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are
- // based on the two-character history.
- static const bool kPreferLongHistoryPredict = true;
-
- // If it is true, prediction will only be based on user dictionary. this flag
- // is for debug purpose.
- static const bool kOnlyUserDictPredict = false;
-
- // The maximum buffer to store LmaPsbItems.
- static const size_t kMaxLmaPsbItems = 1450;
-
- // The maximum number of matrix nodes in a row (one row per step).
- static const size_t kMaxNodeARow = 5;
-
- // The maximum length of the sentence candidates, counted in Chinese
- // characters.
- static const size_t kMaxSentenceLength = 16;
-
- // The size of the matrix node pool.
- static const size_t kMtrxNdPoolSize = 200;
-
- // The size of the DMI node pool.
- static const size_t kDmiPoolSize = 800;
-
- // Used to indicate whether this object has been initialized.
- bool inited_;
-
- // Spelling trie.
- const SpellingTrie *spl_trie_;
-
- // Used to indicate the switch status: when "xian" is parsed, should
- // "xi an" also be extended. Default is false.
- // These cases include: xia, xian, xiang, zhuan, jiang, etc. The string
- // should be valid for a FULL spelling, or a combination of two spellings,
- // the first of which is a FULL id too. So even if it is true, "da" will never
- // be split into "d a", because "d" is not a full spelling id.
- bool xi_an_enabled_;
-
- // System dictionary.
- DictTrie* dict_trie_;
-
- // User dictionary.
- AtomDictBase* user_dict_;
-
- // Spelling parser.
- SpellingParser* spl_parser_;
-
- // The maximum allowed length of spelling string (such as a Pinyin string).
- size_t max_sps_len_;
-
- // The maximum allowed length of a result Chinese string.
- size_t max_hzs_len_;
-
- // Pinyin string. Max length: kMaxRowNum - 1
- char pys_[kMaxRowNum];
-
- // The length of the string that has been decoded successfully.
- size_t pys_decoded_len_;
-
- // Shared buffer for multiple purposes.
- size_t *share_buf_;
-
- MatrixNode *mtrx_nd_pool_;
- PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool
- DictMatchInfo *dmi_pool_;
- PoolPosType dmi_pool_used_; // How many items used in the pool
-
- MatrixRow *matrix_; // The first row is for starting
-
- DictExtPara *dep_; // Parameter used to extend DMI nodes.
-
- NPredictItem *npre_items_; // Used to do prediction
- size_t npre_items_len_;
-
- // The starting positions and lemma ids for the full sentence candidate.
- size_t lma_id_num_;
- uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids.
- LemmaIdType lma_id_[kMaxRowNum];
- size_t fixed_lmas_;
-
- // If fixed_lmas_ is bigger than i, element i is used to indicate whether
- // the i'th lemma id in lma_id_ is the first candidate for that step.
- // If all candidates are the first one for their steps, the whole string can
- // be decoded by the engine automatically, so there is no need to add it to
- // the user dictionary. (We are considering adding it to the user dictionary
- // in the future.)
- uint8 fixed_lmas_no1_[kMaxRowNum];
-
- // Composing phrase
- ComposingPhrase c_phrase_;
-
- // If dmi_c_phrase_ is true, the decoder will try to match the
- // composing phrase (And definitely it will match successfully). If it
- // is false, the decoder will try to match lemmas items in dictionaries.
- bool dmi_c_phrase_;
-
- // The starting positions and spelling ids for the first full sentence
- // candidate.
- size_t spl_id_num_; // Number of spelling ids
- uint16 spl_start_[kMaxRowNum]; // Starting positions
- uint16 spl_id_[kMaxRowNum]; // Spelling ids
- // Used to remember the last fixed position, counted in Hanzi.
- size_t fixed_hzs_;
-
- // Lemma items with possibility score, used for two purposes:
- // 1. In Viterbi decoding, this buffer is used to get all possible candidates
- // for the current step;
- // 2. When the search is done, this buffer is used to get candidates from the
- // first un-fixed step and show them to the user.
- LmaPsbItem lpi_items_[kMaxLmaPsbItems];
- size_t lpi_total_;
-
- // Assign the pointers to NULL. The caller makes sure that all pointers are
- // not valid before calling it. This function will only be called in the
- // constructor and free_resource().
- void reset_pointers_to_null();
-
- bool alloc_resource();
-
- void free_resource();
-
- // Reset the search space totally.
- bool reset_search0();
-
- // Reset the search space from the ch_pos step. For example, if the original
- // input Pinyin is "an", reset_search(1) will reset the search space to the
- // result of "a". If the given position is out of range, return false.
- // If clear_fixed_this_step is true, and the ch_pos step is a fixed step,
- // clear its fixed status. If clear_dmi_this_step is true, clear the DMI nodes
- // of this step; otherwise the DMI nodes will be kept. If clear_mtrx_this_step
- // is true, clear the matrix nodes of this step.
- //
- // Note: this function should not destroy content of pys_.
- bool reset_search(size_t ch_pos, bool clear_fixed_this_step,
- bool clear_dmi_this_step, bool clear_mtrx_this_step);
-
- // Delete a part of the content in pys_.
- void del_in_pys(size_t start, size_t len);
-
- // Delete a spelling id and its corresponding Chinese character, and merge
- // the fixed lemmas into the composing phrase.
- // del_spl_pos indicates which spelling id needs to be deleted.
- // This function will update the lemma and spelling segmentation information.
- // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within
- // the fixed lemmas.
- void merge_fixed_lmas(size_t del_spl_pos);
-
- // Get spelling start positions and ids. The result will be stored in
- // spl_id_num_, spl_start_[], spl_id_[].
- // fixed_hzs_ will be also assigned.
- void get_spl_start_id();
-
- // Get all lemma ids that match the given spelling id stream (shorter than
- // the maximum length of a word).
- // If pfullsent is not NULL, it means the full sentence candidate may be the
- // same as a returned lemma string; if so, remove that lemma.
- // The result is sorted in descending order by the frequency score.
- size_t get_lpis(const uint16* splid_str, size_t splid_str_len,
- LmaPsbItem* lma_buf, size_t max_lma_buf,
- const char16 *pfullsent, bool sort_by_psb);
-
- uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
-
- uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
- uint16 splids_max, bool arg_valid);
-
-
- // Extend a DMI node with a spelling id. ext_len is the length of the rows
- // to extend; actually, it is the size of the spelling string of splid.
- // The return value can be 1 or 0.
- // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in
- // the pool).
- // 0 means either the dmi node can not be extended with splid, or the splid
- // is a Shengmu id, which is only used to get lpi_items, or the result node
- // in DictTrie has no son, so it is not necessary to keep the new DMI.
- //
- // This function modifies the content of lpi_items_ and lpi_total_.
- // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size.
- // The function's returned value has no relation with the value of lpi_num.
- //
- // If dmi == NULL, this function will extend the root node of DictTrie
- //
- // This function will not change dmi_nd_pool_used_. Please change it after
- // calling this function if necessary.
- //
- // The caller should guarantee that NULL != dep.
- size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s);
-
- // Extend dmi for the composing phrase.
- size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s);
-
- // Extend a MatrixNode with the given LmaPsbItem list.
- // res_row is the destination row number.
- // This function does not change mtrx_nd_pool_used_. Please change it after
- // calling this function if necessary.
- // Returns 0 always.
- size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
- size_t lpi_num, PoolPosType dmi_fr, size_t res_row);
-
-
- // Try to find a dmi node at step_to position, and the found dmi node should
- // match the given spelling id strings.
- PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num);
-
- bool add_char(char ch);
- bool prepare_add_char(char ch);
-
- // Called after prepare_add_char, so the input char has been saved.
- bool add_char_qwerty();
-
- // Prepare candidates from the last fixed hanzi position.
- void prepare_candidates();
-
- // Is the character in step pos a splitter character?
- // The caller guarantees that the position is valid.
- bool is_split_at(uint16 pos);
-
- void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
- PoolPosType dmi_fr,
- uint16 spl_id, uint16 node_num, unsigned char dict_level,
- bool splid_end_split, unsigned char splstr_len,
- unsigned char all_full_id);
-
- size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num,
- char16 predict_buf[][kMaxPredictSize + 1],
- size_t buf_len);
-
- // Add the first candidate to the user dictionary.
- bool try_add_cand0_to_userdict();
-
- // Add a user lemma to the user dictionary. This lemma is a subset of
- // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the
- // number of lemmas to be combined together as a new lemma. The caller
- // guarantees that the combined new lemma's length is less than or equal to
- // kMaxLemmaSize.
- bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score);
-
- // Update dictionary frequencies.
- void update_dict_freq();
-
- void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level);
-
- public:
- MatrixSearch();
- ~MatrixSearch();
-
- bool init(const char *fn_sys_dict, const char *fn_usr_dict);
-
- bool init_fd(int sys_fd, long start_offset, long length,
- const char *fn_usr_dict);
-
- void init_user_dictionary(const char *fn_usr_dict);
-
- bool is_user_dictionary_enabled() const;
-
- void set_max_lens(size_t max_sps_len, size_t max_hzs_len);
-
- void close();
-
- void flush_cache();
-
- void set_xi_an_switch(bool xi_an_enabled);
-
- bool get_xi_an_switch();
-
- // Reset the search space. Equivalent to reset_search(0).
- // If initialized, always return true.
- bool reset_search();
-
- // Search a Pinyin string.
- // Return value is the position successfully parsed.
- size_t search(const char *py, size_t py_len);
-
- // Used to delete something in the Pinyin string kept by the engine, and do
- // a re-search.
- // Return value is the new length of Pinyin string kept by the engine which
- // is parsed successfully.
- // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin
- // character needs to be deleted. If is_pos_in_splid is true, all Pinyin
- // characters for pos-th spelling id needs to be deleted.
- // If the deleted character(s) is just after a fixed lemma or sub lemma in
- // composing phrase, clear_fixed_this_step indicates whether we need to
- // unlock the last fixed lemma or sub lemma.
- // If is_pos_in_splid is false, and pos-th character is in the range for the
- // fixed lemmas or composing string, this function will do nothing and just
- // return the result of the previous search.
- size_t delsearch(size_t pos, bool is_pos_in_splid,
- bool clear_fixed_this_step);
-
- // Get the number of candidates, called after search().
- size_t get_candidate_num();
-
- // Get the Pinyin string stored by the engine.
- // *decoded_len returns the length of the successfully decoded string.
- const char* get_pystr(size_t *decoded_len);
-
- // Get the spelling boundaries for the first sentence candidate.
- // Number of spellings will be returned. The number of valid elements in
- // spl_start is one more than the return value because the last one is used
- // to indicate the beginning of the next un-input spelling.
- // For a Pinyin "women", the returned value is 2, and spl_start is [0, 2, 5].
- size_t get_spl_start(const uint16 *&spl_start);
-
- // Get one candidate string. If the full sentence candidate is available, it
- // will be the first one.
- // be the first one.
- char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len);
-
- // Get the first candidate, which is a "full sentence".
- // If retstr_len is not NULL, it will be used to return the string length.
- // If only_unfixed is true, only the unfixed part will be fetched.
- char16* get_candidate0(char16* cand_str, size_t max_len,
- uint16 *retstr_len, bool only_unfixed);
-
- // Choose a candidate. The decoder will do a search after the fixed position.
- size_t choose(size_t cand_id);
-
- // Cancel the last choosing operation, and return the new number of choices.
- size_t cancel_last_choice();
-
- // Get the length of fixed Hanzis.
- size_t get_fixedlen();
-
- size_t get_predicts(const char16 fixed_buf[],
- char16 predict_buf[][kMaxPredictSize + 1],
- size_t buf_len);
-};
-}
-
-#endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
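Editor's note: a minimal usage sketch of MatrixSearch, based only on the declarations above; the dictionary file names, the input string, and the buffer sizes are illustrative assumptions.

#include "./matrixsearch.h"

using namespace ime_pinyin;

void matrix_search_demo() {
  MatrixSearch matrix;
  // init() loads the system and user dictionaries (paths are assumptions).
  if (!matrix.init("dict_pinyin.dat", "user_dict.dat"))
    return;

  // search() decodes a Pinyin string and returns the number of candidates.
  size_t cand_num = matrix.search("nihao", 5);

  char16 buf[32];
  for (size_t i = 0; i < cand_num && i < 5; i++) {
    // Candidate 0 is the full-sentence candidate when one is available.
    matrix.get_candidate(i, buf, 32);
  }

  // Fix the first candidate; the decoder then searches after the fixed part.
  matrix.choose(0);
  matrix.close();
}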
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h b/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h
deleted file mode 100644
index dfcf980b..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/mystdlib.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_MYSTDLIB_H__
-#define PINYINIME_INCLUDE_MYSTDLIB_H__
-
-#include <stdlib.h>
-
-namespace ime_pinyin {
-
-void myqsort(void *p, size_t n, size_t es,
- int (*cmp)(const void *, const void *));
-
-void *mybsearch(const void *key, const void *base,
- size_t nmemb, size_t size,
- int (*compar)(const void *, const void *));
-}
-
-#endif // PINYINIME_INCLUDE_MYSTDLIB_H__
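Editor's note: these two helpers mirror the standard qsort()/bsearch() contracts. A small self-contained sketch with an assumed integer comparator:

#include "./mystdlib.h"

using namespace ime_pinyin;

// Illustrative comparator, not part of the original sources.
static int cmp_int(const void *a, const void *b) {
  int ia = *static_cast<const int *>(a);
  int ib = *static_cast<const int *>(b);
  return (ia > ib) - (ia < ib);
}

void mystdlib_demo() {
  int values[] = {42, 7, 19, 3};
  myqsort(values, 4, sizeof(int), cmp_int);  // values becomes {3, 7, 19, 42}

  int key = 19;
  const int *found = static_cast<const int *>(
      mybsearch(&key, values, 4, sizeof(int), cmp_int));
  (void)found;  // Points to 19 if present, NULL otherwise.
}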
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h b/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h
deleted file mode 100644
index 7adb46d8..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/ngram.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_NGRAM_H__
-#define PINYINIME_INCLUDE_NGRAM_H__
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "./dictdef.h"
-
-namespace ime_pinyin {
-
-typedef unsigned char CODEBOOK_TYPE;
-
-static const size_t kCodeBookSize = 256;
-
-class NGram {
- public:
- // The maximum score of a lemma item.
- static const LmaScoreType kMaxScore = 0x3fff;
-
- // In order to reduce the storage size, the original log value is amplified by
- // kLogValueAmplifier, and stored as LmaScoreType.
- // After this process, an item with a lower score has a higher frequency.
- static const int kLogValueAmplifier = -800;
-
- // System words' total frequency. It is not the real total frequency; instead,
- // it is only used to adjust system lemmas' scores when the user dictionary's
- // total frequency changes.
- // In this version, frequencies of system lemmas are fixed. We are considering
- // making them changeable in the next version.
- static const size_t kSysDictTotalFreq = 100000000;
-
- private:
-
- static NGram* instance_;
-
- bool initialized_;
- uint32 idx_num_;
-
- size_t total_freq_none_sys_;
-
- // Score compensation for system dictionary lemmas.
- // Because after user adds some user lemmas, the total frequency changes, and
- // we use this value to normalize the score.
- float sys_score_compensation_;
-
-#ifdef ___BUILD_MODEL___
- double *freq_codes_df_;
-#endif
- LmaScoreType *freq_codes_;
- CODEBOOK_TYPE *lma_freq_idx_;
-
- public:
- NGram();
- ~NGram();
-
- static NGram& get_instance();
-
- bool save_ngram(FILE *fp);
- bool load_ngram(FILE *fp);
-
- // Set the total frequency of all non-system dictionaries.
- void set_total_freq_none_sys(size_t freq_none_sys);
-
- float get_uni_psb(LemmaIdType lma_id);
-
- // Convert a probability to a score. Actually, the score will be limited to
- // kMaxScore, but at runtime, we also need the float expression to get an
- // accurate value of the score.
- // After the conversion, a lower score indicates a higher probability of the
- // item.
- static float convert_psb_to_score(double psb);
-
-#ifdef ___BUILD_MODEL___
- // For constructing the unigram model.
- bool build_unigram(LemmaEntry *lemma_arr, size_t num,
- LemmaIdType next_idx_unused);
-#endif
-};
-}
-
-#endif // PINYINIME_INCLUDE_NGRAM_H__
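Editor's note: a hedged sketch of how the NGram singleton might be used, based only on the declarations above; the FILE handle, the frequency value, and the lemma id are illustrative assumptions, and the exact score formula lives in ngram.cpp.

#include <stdio.h>
#include "./ngram.h"

using namespace ime_pinyin;

void ngram_demo(FILE *fp_sys_dict) {
  // The class is a singleton; unigram data is loaded from an already
  // positioned FILE* (fp_sys_dict is assumed to point at the ngram block).
  NGram &ngram = NGram::get_instance();
  if (!ngram.load_ngram(fp_sys_dict))
    return;

  // When the user dictionary grows, report its total frequency so system
  // lemma scores can be renormalized against kSysDictTotalFreq.
  ngram.set_total_freq_none_sys(5000);

  // Query the unigram score of a lemma id; after convert_psb_to_score(),
  // a lower value indicates a more probable lemma.
  float score = ngram.get_uni_psb(123);
  (void)score;
}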
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h b/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h
deleted file mode 100644
index e376c20c..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/pinyinime.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_ANDPYIME_H__
-#define PINYINIME_INCLUDE_ANDPYIME_H__
-
-#include <stdlib.h>
-#include "./dictdef.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- namespace ime_pinyin {
-
- /**
- * Open the decoder engine via the system and user dictionary file names.
- *
- * @param fn_sys_dict The file name of the system dictionary.
- * @param fn_usr_dict The file name of the user dictionary.
- * @return true if the decoder engine is opened successfully.
- */
- bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict);
-
- /**
- * Open the decoder engine via the system dictionary FD and user dictionary
- * file name. Because on Android, the system dictionary is embedded in the
- * whole application apk file.
- *
- * @param sys_fd The file in which the system dictionary is embedded.
- * @param start_offset The starting position of the system dictionary in the
- * file sys_fd.
- * @param length The length of the system dictionary in the file sys_fd,
- * counted in byte.
- * @return true if succeed.
- */
- bool im_open_decoder_fd(int sys_fd, long start_offset, long length,
- const char *fn_usr_dict);
-
- /**
- * Close the decoder engine.
- */
- void im_close_decoder();
-
- /**
- * Set maximum limitations for decoding. If this function is not called,
- * default values will be used. For example, due to screen size limitation,
- * the UI engine of the IME can only show a certain number of letters (input)
- * to decode, and a certain number of Chinese characters (output). If, after
- * the user adds a new letter, the input or the output string is longer than
- * the limitations, the engine will discard the most recent letter.
- *
- * @param max_sps_len Maximum length of the spelling string(Pinyin string).
- * @param max_hzs_len Maximum length of the decoded Chinese character string.
- */
- void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len);
-
- /**
- * Flush cached data to persistent memory. Because at runtime, in order to
- * achieve the best performance, some data is only stored in memory.
- */
- void im_flush_cache();
-
- /**
- * Use a spelling string (Pinyin string) to search. The engine will try to do
- * an incremental search based on its previous search result, so if the new
- * string has the same prefix with the previous one stored in the decoder,
- * the decoder will only continue the search from the end of the prefix.
- * If the caller needs to do a brand new search, please call im_reset_search()
- * first. Calling im_search() is equivalent to calling im_add_letter() one by
- * one.
- *
- * @param sps_buf The spelling string buffer to decode.
- * @param sps_len The length of the spelling string buffer.
- * @return The number of candidates.
- */
- size_t im_search(const char* sps_buf, size_t sps_len);
-
- /**
- * Make a delete operation in the current search result, and do a re-search
- * if necessary.
- *
- * @param pos The position of the char in the spelling string to delete, or
- * the position of the spelling id in the result string to delete.
- * @param is_pos_in_splid Indicate whether the pos parameter is the position
- * in the spelling string, or the position in the result spelling id string.
- * @param clear_fixed_this_step Indicates whether the last fixed lemma should
- * be unlocked when the deleted character immediately follows it.
- * @return The number of candidates.
- */
- size_t im_delsearch(size_t pos, bool is_pos_in_splid,
- bool clear_fixed_this_step);
-
- /**
- * Reset the previous search result.
- */
- void im_reset_search();
-
- /**
- * Add a Pinyin letter to the current spelling string kept by decoder. If the
- * decoder fails in adding the letter, it will do nothing. im_get_sps_str()
- * can be used to get the spelling string kept by decoder currently.
- *
- * @param ch The letter to add.
- * @return The number of candidates.
- */
- size_t im_add_letter(char ch);
-
- /**
- * Get the spelling string kept by the decoder.
- *
- * @param decoded_len Used to return how many characters in the spelling
- * string are successfully parsed.
- * @return The spelling string kept by the decoder.
- */
- const char *im_get_sps_str(size_t *decoded_len);
-
- /**
- * Get a candidate(or choice) string.
- *
- * @param cand_id The id to get a candidate. Starting from 0. Usually, id 0
- * is a sentence-level candidate.
- * @param cand_str The buffer to store the candidate.
- * @param max_len The maximum length of the buffer.
- * @return cand_str if succeeds, otherwise NULL.
- */
- char16* im_get_candidate(size_t cand_id, char16* cand_str,
- size_t max_len);
-
- /**
- * Get the segmentation information(the starting positions) of the spelling
- * string.
- *
- * @param spl_start Used to return the starting positions.
- * @return The number of spelling ids. If it is L, there will be L+1 valid
- * elements in spl_start, and spl_start[L] is the position after the end of
- * the last spelling id.
- */
- size_t im_get_spl_start_pos(const uint16 *&spl_start);
-
- /**
- * Choose a candidate and make it fixed. If the candidate does not match
- * the end of all spelling ids, new candidates will be provided from the
- * first unfixed position. If the candidate matches the end of all
- * spelling ids, there will be only one new candidate, which is the whole
- * fixed sentence.
- *
- * @param cand_id The id of candidate to select and make it fixed.
- * @return The number of candidates. If after the selection, the whole result
- * string has been fixed, there will be only one candidate.
- */
- size_t im_choose(size_t cand_id);
-
- /**
- * Cancel the last selection, or revert the last operation of im_choose().
- *
- * @return The number of candidates.
- */
- size_t im_cancel_last_choice();
-
- /**
- * Get the number of fixed spelling ids, or Chinese characters.
- *
- * @return The number of fixed spelling ids, or Chinese characters.
- */
- size_t im_get_fixed_len();
-
- /**
- * Cancel the input state and reset the search workspace.
- */
- bool im_cancel_input();
-
- /**
- * Get prediction candidates based on the given fixed Chinese string as the
- * history.
- *
- * @param his_buf The history buffer to do the prediction. It should end
- * with '\0'.
- * @param pre_buf Used to return the prediction result list.
- * @return The number of predicted result strings.
- */
- size_t im_get_predicts(const char16 *his_buf,
- char16 (*&pre_buf)[kMaxPredictSize + 1]);
-
- /**
- * Enable Shengmus in ShouZiMu mode.
- */
- void im_enable_shm_as_szm(bool enable);
-
- /**
- * Enable Yunmus in ShouZiMu mode.
- */
- void im_enable_ym_as_szm(bool enable);
-
- /**
- * Initializes or uninitializes the user dictionary.
- *
- * @param fn_usr_dict The file name of the user dictionary.
- */
- void im_init_user_dictionary(const char *fn_usr_dict);
-
- /**
- * Returns the current status of the user dictionary.
- */
- bool im_is_user_dictionary_enabled(void);
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // PINYINIME_INCLUDE_ANDPYIME_H__
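Editor's note: a sketch of a typical decoding session through the C API declared above; the dictionary file names and the query string are assumptions made for illustration.

#include "./pinyinime.h"

using namespace ime_pinyin;

void pinyinime_demo() {
  if (!im_open_decoder("dict_pinyin.dat", "user_dict.dat"))
    return;

  // Optional: limit the spelling string and the decoded Hanzi string.
  im_set_max_lens(28, 16);

  // Incremental search; equivalent to calling im_add_letter() for each char.
  size_t cand_num = im_search("pinyin", 6);

  char16 cand[32];
  for (size_t i = 0; i < cand_num && i < 5; i++) {
    // Candidate 0 is usually the sentence-level candidate.
    im_get_candidate(i, cand, 32);
  }

  // Fix candidate 0; once the whole result is fixed, only one candidate
  // remains and predictions can be fetched via im_get_predicts().
  im_choose(0);

  im_reset_search();
  im_close_decoder();
}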
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h b/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h
deleted file mode 100644
index f1357107..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/searchutility.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__
-#define PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__
-
-#include <stdlib.h>
-#include "./spellingtrie.h"
-
-namespace ime_pinyin {
-
-// Type used to identify the size of a pool, such as id pool, etc.
-typedef uint16 PoolPosType;
-
-// Type used to identify a parsing milestone in an atom dictionary.
-typedef uint16 MileStoneHandle;
-
-// Type used to express a lemma and its probability score.
-typedef struct {
- size_t id:(kLemmaIdSize * 8);
- size_t lma_len:4;
- uint16 psb; // The score, the lower psb, the higher possibility.
- // For single character items, we may also need Hanzi.
- // For multiple character items, ignore it.
- char16 hanzi;
-} LmaPsbItem, *PLmaPsbItem;
-
-// LmaPsbItem extended with string.
-typedef struct {
- LmaPsbItem lpi;
- char16 str[kMaxLemmaSize + 1];
-} LmaPsbStrItem, *PLmaPsbStrItem;
-
-
-typedef struct {
- float psb;
- char16 pre_hzs[kMaxPredictSize];
- uint16 his_len; // The length of the history used to do the prediction.
-} NPredictItem, *PNPredictItem;
-
-// Parameter structure used to extend in a dictionary. All dictionaries
-// receive the same DictExtPara and a dictionary-specific MileStoneHandle for
-// extending.
-//
-// When the user inputs a new character, AtomDictBase::extend_dict() will be
-// called at least once for each dictionary.
-//
-// For example, when the user inputs "wm", extend_dict() will be called twice,
-// and the DictExtPara parameter are as follows respectively:
-// 1. splids = {w, m}; splids_extended = 1; ext_len = 1; step_no = 1;
-// splid_end_split = false; id_start = wa(the first id start with 'w');
-// id_num = number of ids starting with 'w'.
-// 2. splids = {m}; splids_extended = 0; ext_len = 1; step_no = 1;
-// splid_end_split = false; id_start = wa; id_num = number of ids starting with
-// 'w'.
-//
-// For string "women", one of the cases of the DictExtPara parameter is:
-// splids = {wo, men}, splids_extended = 1, ext_len = 3 (length of "men"),
-// step_no = 4; splid_end_split = false; id_start = men, id_num = 1.
-//
-typedef struct {
- // Spelling ids for extending, there are splids_extended + 1 ids in the
- // buffer.
- // For a normal lemma, there can be at most kMaxLemmaSize spelling ids,
- // but for a composing phrase, there can be kMaxSearchSteps spelling ids.
- uint16 splids[kMaxSearchSteps];
-
- // Number of ids that have been used before. splids[splids_extended] is the
- // newly added id for the current extension.
- uint16 splids_extended;
-
- // The step span of the extension. It is also the size of the string for
- // the newly added spelling id.
- uint16 ext_len;
-
- // The step number for the current extension. It is also the ending position
- // in the input Pinyin string for the substring of spelling ids in splids[].
- // For example, when the user inputs "women", step_no = 4.
- // This parameter may be useful to manage the MileStoneHandle list for each
- // step. When the user deletes a character from the string, MileStoneHandle
- // objects for the steps after that character should be reset; when the
- // user begins a new string, all MileStoneHandle objects should be reset.
- uint16 step_no;
-
- // Indicate whether the newly added spelling ends with a splitting character
- bool splid_end_split;
-
- // If the newly added id is a half id, id_start is the first id of the
- // corresponding full ids; if the newly added id is a full id, id_start is
- // that id.
- uint16 id_start;
-
- // If the newly added id is a half id, id_num is the number of corresponding
- // ids; if it is a full id, id_num == 1.
- uint16 id_num;
-}DictExtPara, *PDictExtPara;
-
-bool is_system_lemma(LemmaIdType lma_id);
-bool is_user_lemma(LemmaIdType lma_id);
-bool is_composing_lemma(LemmaIdType lma_id);
-
-int cmp_lpi_with_psb(const void *p1, const void *p2);
-int cmp_lpi_with_unified_psb(const void *p1, const void *p2);
-int cmp_lpi_with_id(const void *p1, const void *p2);
-int cmp_lpi_with_hanzi(const void *p1, const void *p2);
-
-int cmp_lpsi_with_str(const void *p1, const void *p2);
-
-int cmp_hanzis_1(const void *p1, const void *p2);
-int cmp_hanzis_2(const void *p1, const void *p2);
-int cmp_hanzis_3(const void *p1, const void *p2);
-int cmp_hanzis_4(const void *p1, const void *p2);
-int cmp_hanzis_5(const void *p1, const void *p2);
-int cmp_hanzis_6(const void *p1, const void *p2);
-int cmp_hanzis_7(const void *p1, const void *p2);
-int cmp_hanzis_8(const void *p1, const void *p2);
-
-int cmp_npre_by_score(const void *p1, const void *p2);
-int cmp_npre_by_hislen_score(const void *p1, const void *p2);
-int cmp_npre_by_hanzi_score(const void *p1, const void *p2);
-
-
-size_t remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num);
-
-size_t align_to_size_t(size_t size);
-
-} // namespace
-
-#endif // PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__
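Editor's note: a hedged sketch of sorting a candidate buffer with one of the comparators declared above and myqsort() from mystdlib.h; the sample values are made up, and combining exactly these two helpers this way is an assumption for illustration.

#include "./searchutility.h"
#include "./mystdlib.h"

using namespace ime_pinyin;

void sort_candidates_demo() {
  LmaPsbItem items[3] = {};
  items[0].id = 10; items[0].psb = 300;
  items[1].id = 11; items[1].psb = 120;
  items[2].id = 12; items[2].psb = 250;

  // cmp_lpi_with_psb orders items by their psb field; a lower psb means a
  // higher possibility (the exact ordering is defined in searchutility.cpp).
  myqsort(items, 3, sizeof(LmaPsbItem), cmp_lpi_with_psb);
}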
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h b/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h
deleted file mode 100644
index fd79c6ef..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtable.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_SPELLINGTABLE_H__
-#define PINYINIME_INCLUDE_SPELLINGTABLE_H__
-
-#include <stdlib.h>
-#include "./dictdef.h"
-
-namespace ime_pinyin {
-
-#ifdef ___BUILD_MODEL___
-
-const size_t kMaxSpellingSize = kMaxPinyinSize;
-
-typedef struct {
- char str[kMaxSpellingSize + 1];
- double freq;
-} RawSpelling, *PRawSpelling;
-
-// This class is used to store the spelling strings.
-// The length of the input spelling string should be less than or equal to the
-// spelling_size_ (set by init_table). If the input string is too long,
-// we only keep its first spelling_size_ chars.
-class SpellingTable {
- private:
- static const size_t kNotSupportNum = 3;
- static const char kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1];
-
- bool need_score_;
-
- size_t spelling_max_num_;
-
- RawSpelling *raw_spellings_;
-
- // Used to store spelling strings. If the spelling table needs to calculate
- // score, an extra char after each spelling string is the score.
- // An item with a lower score has a higher probability.
- char *spelling_buf_;
- size_t spelling_size_;
-
- double total_freq_;
-
- size_t spelling_num_;
-
- double score_amplifier_;
-
- unsigned char average_score_;
-
- // If frozen is true, put_spelling() and contain() are not allowed to be called.
- bool frozen_;
-
- size_t get_hash_pos(const char* spelling_str);
- size_t hash_pos_next(size_t hash_pos);
- void free_resource();
- public:
- SpellingTable();
- ~SpellingTable();
-
- // pure_spl_size is the pure maximum spelling string size. For example,
- // "zhuang" is the longgest item in Pinyin, so pure_spl_size should be 6.
- // spl_max_num is the maximum number of spelling strings to store.
- // need_score is used to indicate whether the caller needs to calculate a
- // score for each spelling.
- bool init_table(size_t pure_spl_size, size_t spl_max_num, bool need_score);
-
- // Put a spelling string to the table.
- // It always returns false if called after arrange() without a new
- // init_table() operation.
- // spl_count is the spelling's occurrence count.
- // If the spelling is already in the table, its occurrence count will be
- // accumulated.
- bool put_spelling(const char* spelling_str, double spl_count);
-
- // Test whether a spelling string is in the table.
- // It always returns false when called after arrange() without a new
- // init_table() operation.
- bool contain(const char* spelling_str);
-
- // Sort the spelling strings and put them from the beginning of the buffer.
- // Return the pointer to the sorted spelling strings.
- // item_size and spl_num return the item size and the number of spellings.
- // Because each spelling uses a '\0' as terminator, the returned item_size is
- // at least one char longer than the pure_spl_size parameter specified by
- // init_table(). If the table is initialized to calculate scores, item_size
- // will be increased by 1, and current_spl_str[item_size - 1] stores an
- // unsigned char score.
- // An item with a lower score has a higher probability.
- // Do not call put_spelling() and contain() after arrange().
- const char* arrange(size_t *item_size, size_t *spl_num);
-
- float get_score_amplifier();
-
- unsigned char get_average_score();
-};
-#endif // ___BUILD_MODEL___
-}
-
-#endif // PINYINIME_INCLUDE_SPELLINGTABLE_H__
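Editor's note: a build-time sketch (only meaningful when ___BUILD_MODEL___ is defined) of collecting raw spellings and arranging them into a sorted buffer; the spellings and counts are illustrative assumptions.

#include "./spellingtable.h"

using namespace ime_pinyin;

#ifdef ___BUILD_MODEL___
void spelling_table_demo() {
  SpellingTable table;
  // "zhuang" (6 chars) is the longest Pinyin item; keep room for 2000
  // spellings and ask the table to compute a score per spelling.
  if (!table.init_table(6, 2000, true))
    return;

  table.put_spelling("ZHUANG", 150.0);
  table.put_spelling("WO", 900.0);
  table.put_spelling("WO", 100.0);  // Occurrence counts accumulate.

  size_t item_size = 0, spl_num = 0;
  const char *buf = table.arrange(&item_size, &spl_num);
  // buf holds spl_num fixed-size items; when scoring was requested, the
  // last byte of each item stores an unsigned char score.
  (void)buf;
}
#endif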
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h b/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h
deleted file mode 100644
index 03510ed3..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/spellingtrie.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__
-#define PINYINIME_INCLUDE_SPELLINGTRIE_H__
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "./dictdef.h"
-
-namespace ime_pinyin {
-
-static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1;
-
-// Node used for the trie of spellings
-struct SpellingNode {
- SpellingNode *first_son;
- // The spelling id for each node. If you need more bits to store
- // spelling id, please adjust this structure.
- uint16 spelling_idx:11;
- uint16 num_of_son:5;
- char char_this_node;
- unsigned char score;
-};
-
-class SpellingTrie {
- private:
- static const int kMaxYmNum = 64;
- static const size_t kValidSplCharNum = 26;
-
- static const uint16 kHalfIdShengmuMask = 0x01;
- static const uint16 kHalfIdYunmuMask = 0x02;
- static const uint16 kHalfIdSzmMask = 0x04;
-
- // Map from half spelling id to single char.
- // For half ids of Zh/Ch/Sh, map to z/c/s (lower case) respectively.
- // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ...,
- // 28 to 'Z', 29 to 'z'.
- // [0] is not used to achieve better efficiency.
- static const char kHalfId2Sc_[kFullSplIdStart + 1];
-
- static unsigned char char_flags_[];
- static SpellingTrie* instance_;
-
- // The spelling table
- char *spelling_buf_;
-
- // The size of the longest spelling string, including '\0' and an extra char
- // to store the score. For example, "zhuang" is the longest item in the
- // Pinyin list, so spelling_size_ is 8.
- // Structure: The string ended with '\0' + score char.
- // An item with a lower score has a higher probability.
- uint32 spelling_size_;
-
- // Number of full spelling ids.
- uint32 spelling_num_;
-
- float score_amplifier_;
- unsigned char average_score_;
-
- // The Yunmu id list for the spelling ids (for half ids of Shengmu,
- // the Yunmu id is 0).
- // The length of the list is spelling_num_ + kFullSplIdStart,
- // so that spl_ym_ids_[splid] is the Yunmu id of the splid.
- uint8 *spl_ym_ids_;
-
- // The Yunmu table.
- // Each Yunmu will be assigned with Yunmu id from 1.
- char *ym_buf_;
- size_t ym_size_; // The size of the longest Yunmu string, '\0' included.
- size_t ym_num_;
-
- // The spelling string just queried
- char *splstr_queried_;
-
- // The spelling string just queried
- char16 *splstr16_queried_;
-
- // The root node of the spelling tree
- SpellingNode* root_;
-
- // If a non-QWERTY key such as a function key like ENTER is given, this node
- // will be used to indicate that this is not a QWERTY node.
- SpellingNode* dumb_node_;
-
- // If a splitter key is pressed, this node will be used to indicate that this
- // is a splitter key.
- SpellingNode* splitter_node_;
-
- // Used to get the first level sons.
- SpellingNode* level1_sons_[kValidSplCharNum];
-
- // The full spl_id range for specific half id.
- // h2f means half to full.
- // A half id can be a ShouZiMu id (id to represent the first char of a full
- // spelling, including Shengmu and Yunmu), or id of zh/ch/sh.
- // [1..kFullSplIdStart-1] is the range of half ids.
- uint16 h2f_start_[kFullSplIdStart];
- uint16 h2f_num_[kFullSplIdStart];
-
- // Map from full id to half id.
- uint16 *f2h_;
-
-#ifdef ___BUILD_MODEL___
- // How many nodes are used to build the trie.
- size_t node_num_;
-#endif
-
- SpellingTrie();
-
- void free_son_trie(SpellingNode* node);
-
- // Construct a subtree using a subset of the spelling array (from
- // item_start to item_end).
- // Members spelling_buf_ and spelling_size_ should be valid.
- // parent is used to update its num_of_son and score.
- SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end,
- size_t level, SpellingNode *parent);
- bool build_f2h();
-
- // The caller should guarantee ch >= 'A' && ch <= 'Z'
- bool is_shengmu_char(char ch) const;
-
- // The caller should guarantee ch >= 'A' && ch <= 'Z'
- bool is_yunmu_char(char ch) const;
-
-#ifdef ___BUILD_MODEL___
- // Given a spelling string, return its Yunmu string.
- // The caller guarantees spl_str is valid.
- const char* get_ym_str(const char *spl_str);
-
- // Build the Yunmu list, and the mapping relation between the full ids and the
- // Yunmu ids. This function is called after the spelling trie is built.
- bool build_ym_info();
-#endif
-
- friend class SpellingParser;
- friend class SmartSplParser;
- friend class SmartSplParser2;
-
- public:
- ~SpellingTrie();
-
- inline static bool is_valid_spl_char(char ch) {
- return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
- }
-
- // The caller guarantees that the two chars are valid spelling chars.
- inline static bool is_same_spl_char(char ch1, char ch2) {
- return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A';
- }
-
- // Construct the tree from the input pinyin array
- // The given string list should have been sorted.
- // score_amplifier is used to convert a possibility value into score.
- // average_score is the average_score of all spellings. The dumb node is
- // assigned with this score.
- bool construct(const char* spelling_arr, size_t item_size, size_t item_num,
- float score_amplifier, unsigned char average_score);
-
- // Test if the given id is a valid spelling id.
- // If the function returns true, the given splid may be updated like this:
- // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is
- // first given as a half id 1, but because 'A' is a one-char Yunmu and
- // it is a valid id, it needs to be updated to its corresponding full id.
- bool if_valid_id_update(uint16 *splid) const;
-
- // Test if the given id is a half id.
- bool is_half_id(uint16 splid) const;
-
- bool is_full_id(uint16 splid) const;
-
- // Test if the given id is a one-char Yunmu id (obviously, it is also a half
- // id), such as 'A', 'E' and 'O'.
- bool is_half_id_yunmu(uint16 splid) const;
-
- // Test if this char is a ShouZiMu char. This ShouZiMu char may not be enabled.
- // For Pinyin, only i/u/v is not a ShouZiMu char.
- // The caller should guarantee that ch >= 'A' && ch <= 'Z'
- bool is_szm_char(char ch) const;
-
- // Test if this char is enabled in ShouZiMu mode.
- // The caller should guarantee that ch >= 'A' && ch <= 'Z'
- bool szm_is_enabled(char ch) const;
-
- // Enable/disable Shengmus in ShouZiMu mode (using the first char of a spelling
- // to input).
- void szm_enable_shm(bool enable);
-
- // Enable/disable Yunmus in ShouZiMu mode.
- void szm_enable_ym(bool enable);
-
- // Test if this char is enabled in ShouZiMu mode.
- // The caller should guarantee ch >= 'A' && ch <= 'Z'
- bool is_szm_enabled(char ch) const;
-
- // Return the number of full ids for the given half id.
- uint16 half2full_num(uint16 half_id) const;
-
- // Return the number of full ids for the given half id, and fill spl_id_start
- // to return the first full id.
- uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const;
-
- // Return the corresponding half id for the given full id.
- // Not frequently used; low efficiency.
- // Return 0 if fails.
- uint16 full_to_half(uint16 full_id) const;
-
- // To test whether a half id is compatible with a full id.
- // Generally, when half_id == full_to_half(full_id), return true.
- // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible
- // with a full id like "Zhe". (Fussy mode is not ready).
- bool half_full_compatible(uint16 half_id, uint16 full_id) const;
-
- static const SpellingTrie* get_cpinstance();
-
- static SpellingTrie& get_instance();
-
- // Save to the file stream
- bool save_spl_trie(FILE *fp);
-
- // Load from the file stream
- bool load_spl_trie(FILE *fp);
-
- // Get the number of spellings
- size_t get_spelling_num();
-
- // Return the Yunmu id for the given Yunmu string.
- // If the string is not valid, return 0;
- uint8 get_ym_id(const char* ym_str);
-
- // Get the readonly Pinyin string for a given spelling id
- const char* get_spelling_str(uint16 splid);
-
- // Get the readonly Pinyin string for a given spelling id
- const char16* get_spelling_str16(uint16 splid);
-
- // Get Pinyin string for a given spelling id. Return the length of the
- // string, and fill-in '\0' at the end.
- size_t get_spelling_str16(uint16 splid, char16 *splstr16,
- size_t splstr16_len);
-};
-}
-
-#endif // PINYINIME_INCLUDE_SPELLINGTRIE_H__
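Editor's note: a sketch of querying the SpellingTrie singleton once the system dictionary has been loaded elsewhere; the spelling id value below is an illustrative assumption.

#include "./spellingtrie.h"

using namespace ime_pinyin;

void spelling_trie_demo() {
  SpellingTrie &trie = SpellingTrie::get_instance();

  // Character-level checks are static and need no loaded data.
  bool valid = SpellingTrie::is_valid_spl_char('z');      // true
  bool same = SpellingTrie::is_same_spl_char('a', 'A');   // true

  // Id-level queries: ids below kFullSplIdStart are half ids (e.g. a lone
  // Shengmu); the rest are full spellings such as "zhuang".
  uint16 splid = kFullSplIdStart;  // assumed here to be a valid full id
  if (trie.is_full_id(splid)) {
    const char *str = trie.get_spelling_str(splid);
    (void)str;
  }
  (void)valid;
  (void)same;
}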
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h b/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h
deleted file mode 100644
index d783bd73..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/splparser.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_SPLPARSER_H__
-#define PINYINIME_INCLUDE_SPLPARSER_H__
-
-#include "./dictdef.h"
-#include "./spellingtrie.h"
-
-namespace ime_pinyin {
-
-class SpellingParser {
- protected:
- const SpellingTrie *spl_trie_;
-
- public:
- SpellingParser();
-
- // Given a string, parse it into a spelling id stream.
- // If the whole string is successfully parsed, last_is_pre will be true;
- // if the whole string is not fully parsed, last_is_pre will return whether
- // the last part of the string is a prefix of a full spelling string. For
- // example, given the string "zhengzhon", "zhon" is not a valid spelling, but
- // it is the prefix of "zhong".
- //
- // If splstr starts with a character not in ['a'-'z'] (i.e. a split char),
- // return 0.
- // Split char can only appear in the middle of the string or at the end.
- uint16 splstr_to_idxs(const char *splstr, uint16 str_len, uint16 splidx[],
- uint16 start_pos[], uint16 max_size, bool &last_is_pre);
-
- // Similar to splstr_to_idxs(), the only difference is that splstr_to_idxs()
- // converts single-character Yunmus into half ids, while this function converts
- // them into full ids.
- uint16 splstr_to_idxs_f(const char *splstr, uint16 str_len, uint16 splidx[],
- uint16 start_pos[], uint16 max_size, bool &last_is_pre);
-
- // Similar to splstr_to_idxs(), the only difference is that this function
- // uses char16 instead of char8.
- uint16 splstr16_to_idxs(const char16 *splstr, uint16 str_len, uint16 splidx[],
- uint16 start_pos[], uint16 max_size, bool &last_is_pre);
-
- // Similar to splstr_to_idxs_f(), the only difference is that this function
- // uses char16 instead of char8.
- uint16 splstr16_to_idxs_f(const char16 *splstr16, uint16 str_len,
- uint16 splidx[], uint16 start_pos[],
- uint16 max_size, bool &last_is_pre);
-
- // If the given string is a spelling, return the id; otherwise, return 0.
- // If the given string is a single-char Yunmu like "A", and the char is
- // enabled in ShouZiMu mode, the returned spelling id will be a half id.
- // When the returned spelling id is a half id, *is_pre returns whether it
- // is a prefix of a full spelling string.
- uint16 get_splid_by_str(const char *splstr, uint16 str_len, bool *is_pre);
-
- // If the given string is a spelling, return the id; otherwise, return 0.
- // If the given string is a single-char Yunmu like "a", no matter whether the
- // char is enabled in ShouZiMu mode or not, the returned spelling id will be
- // a full id.
- // When the returned spelling id is a half id, *p_is_pre returns whether it
- // is a prefix of a full spelling string.
- uint16 get_splid_by_str_f(const char *splstr, uint16 str_len, bool *is_pre);
-
- // Splitter chars are not included.
- bool is_valid_to_parse(char ch);
-
- // When auto-correction is not enabled, get_splid_by_str() will be called to
- // return the single result. When auto-correction is enabled, this function
- // will be called to get the results. Auto-correction is not ready.
- // full_id_num returns number of full spelling ids.
- // is_pre returns whether the given string is the prefix of a full spelling
- // string.
- // If splstr starts with a character not in [a-zA-Z] (it is a split char),
- // return 0.
- // Split char can only appear in the middle of the string or at the end.
- // The caller should guarantee NULL != splstr && str_len > 0 && NULL != splidx
- uint16 get_splids_parallel(const char *splstr, uint16 str_len,
- uint16 splidx[], uint16 max_size,
- uint16 &full_id_num, bool &is_pre);
-};
-}
-
-#endif // PINYINIME_INCLUDE_SPLPARSER_H__
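Editor's note: a sketch of parsing a raw Pinyin string into spelling ids with the parser declared above; the buffer sizes and the input string are illustrative assumptions.

#include "./splparser.h"

using namespace ime_pinyin;

void spl_parser_demo() {
  SpellingParser parser;

  const char *input = "zhengzhon";
  uint16 splids[8];
  uint16 starts[8];
  bool last_is_pre = false;

  // Returns the number of spelling ids parsed; for the input above the
  // trailing "zhon" is only a prefix of "zhong", so last_is_pre is expected
  // to come back true.
  uint16 num = parser.splstr_to_idxs(input, 9, splids, starts, 8,
                                     last_is_pre);
  (void)num;

  // Single-spelling query; returns 0 when the string is not a valid spelling.
  bool is_pre = false;
  uint16 id = parser.get_splid_by_str("zhong", 5, &is_pre);
  (void)id;
}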
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/sync.h b/src/virtualkeyboard/3rdparty/pinyin/include/sync.h
deleted file mode 100644
index bf42d1f1..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/sync.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_SYNC_H__
-#define PINYINIME_INCLUDE_SYNC_H__
-
-#define ___SYNC_ENABLED___
-
-#ifdef ___SYNC_ENABLED___
-
-#include "userdict.h"
-
-namespace ime_pinyin {
-
-// Class for user dictionary synchronization
-// This class is not thread safe
-// Normal invoking flow will be
-// begin() ->
-// put_lemmas() x N ->
-// {
-// get_lemmas() ->
-// [ get_last_got_count() ] ->
-// clear_last_got() ->
-// } x N ->
-// finish()
-class Sync {
- public:
- Sync();
- ~Sync();
-
- static const int kUserDictMaxLemmaCount = 5000;
- static const int kUserDictMaxLemmaSize = 200000;
- static const int kUserDictRatio = 20;
-
- bool begin(const char * filename);
-
- // Merge lemmas downloaded from the sync server into the local dictionary.
- // lemmas, lemma string encoded in UTF16LE
- // len, length of the lemma string
- // Return how many lemmas were merged successfully
- int put_lemmas(char16 * lemmas, int len);
-
- // Get local new user lemmas into UTF16LE string
- // str, buffer ptr to store new user lemmas
- // size, size of buffer
- // Return the length of the returned buffer, measured in UTF16LE units
- int get_lemmas(char16 * str, int size);
-
- // Return the lemma count from the last get_lemmas()
- int get_last_got_count();
-
- // Return the total lemma count that needs to be fetched by get_lemmas()
- int get_total_count();
-
- // Clear the lemmas obtained by the most recent get_lemmas()
- void clear_last_got();
-
- void finish();
-
- int get_capacity();
-
- private:
- UserDict * userdict_;
- char * dictfile_;
- int last_count_;
-};
-
-}
-
-#endif
-
-#endif // PINYINIME_INCLUDE_SYNC_H__
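Editor's note: a sketch of the invoking flow documented above (begin -> put_lemmas -> get_lemmas/clear_last_got -> finish); the file name, the downloaded lemma string, and the buffer size are illustrative assumptions.

#include "sync.h"

using namespace ime_pinyin;

#ifdef ___SYNC_ENABLED___
void sync_demo(char16 *downloaded, int downloaded_len) {
  Sync sync;
  if (!sync.begin("user_dict.dat"))
    return;

  // Merge lemmas received from the sync server into the local dictionary.
  int merged = sync.put_lemmas(downloaded, downloaded_len);
  (void)merged;

  // Upload local additions: fetch a chunk, transmit it, then clear it so
  // that it will not be returned again.
  char16 out[1024];
  int len = sync.get_lemmas(out, 1024);
  if (len > 0) {
    // ... transmit 'out' (len UTF16LE units) to the server here ...
    sync.clear_last_got();
  }

  sync.finish();
}
#endif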
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h b/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h
deleted file mode 100644
index db010912..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/userdict.h
+++ /dev/null
@@ -1,434 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_USERDICT_H__
-#define PINYINIME_INCLUDE_USERDICT_H__
-
-#define ___CACHE_ENABLED___
-#define ___SYNC_ENABLED___
-#define ___PREDICT_ENABLED___
-
-// Debug performance for operations
-// #define ___DEBUG_PERF___
-
-#ifdef _WIN32
-#include <time.h>
-#include <winsock.h> // timeval
-#else
-#include <pthread.h>
-#include <sys/time.h>
-#endif
-#include "atomdictbase.h"
-
-namespace ime_pinyin {
-
-class UserDict : public AtomDictBase {
- public:
- UserDict();
- ~UserDict();
-
- bool load_dict(const char *file_name, LemmaIdType start_id,
- LemmaIdType end_id);
-
- bool close_dict();
-
- size_t number_of_lemmas();
-
- void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
-
- MileStoneHandle extend_dict(MileStoneHandle from_handle,
- const DictExtPara *dep, LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num);
-
- size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
- LmaPsbItem *lpi_items, size_t lpi_max);
-
- uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
- uint16 str_max);
-
- uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
- uint16 splids_max, bool arg_valid);
-
- size_t predict(const char16 last_hzs[], uint16 hzs_len,
- NPredictItem *npre_items, size_t npre_max,
- size_t b4_used);
-
- // Full spelling ids are required
- LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count);
-
- LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
- bool selected);
-
- LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len);
-
- LmaScoreType get_lemma_score(LemmaIdType lemma_id);
-
- LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len);
-
- bool remove_lemma(LemmaIdType lemma_id);
-
- size_t get_total_lemma_count();
- void set_total_lemma_count_of_others(size_t count);
-
- void flush_cache();
-
- void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
- uint32 reclaim_ratio);
-
- void reclaim();
-
- void defragment();
-
-#ifdef ___SYNC_ENABLED___
- void clear_sync_lemmas(unsigned int start, unsigned int end);
-
- int get_sync_count();
-
- LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count, uint64 lmt);
- /**
- * Add lemmas encoded in UTF-16LE into the dictionary without adding the sync flag.
- *
- * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
- * @param len length of lemmas string in UTF-16LE
- * @return newly added lemma count
- */
- int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
-
- /**
- * Get lemmas that need sync into a UTF-16LE string of the above format.
- * Note: the input buffer (str) must not be too small. If str is too small to
- * contain a single lemma, there might be a dead loop.
- *
- * @param str buffer to write lemmas
- * @param size buffer size in UTF-16LE
- * @param count output parameter for the number of lemmas returned
- * @return UTF-16LE string length
- */
- int get_sync_lemmas_in_utf16le_string_from_beginning(
- char16 * str, int size, int * count);
-
-#endif
-
- struct UserDictStat {
- uint32 version;
- const char * file_name;
- struct timeval load_time;
- struct timeval last_update;
- uint32 disk_size;
- uint32 lemma_count;
- uint32 lemma_size;
- uint32 delete_count;
- uint32 delete_size;
-#ifdef ___SYNC_ENABLED___
- uint32 sync_count;
-#endif
- uint32 reclaim_ratio;
- uint32 limit_lemma_count;
- uint32 limit_lemma_size;
- };
-
- bool state(UserDictStat * stat);
-
- private:
- uint32 total_other_nfreq_;
- struct timeval load_time_;
- LemmaIdType start_id_;
- uint32 version_;
- uint8 * lemmas_;
-
- // In-Memory-Only flag for each lemma
- static const uint8 kUserDictLemmaFlagRemove = 1;
- // Inuse lemmas' offset
- uint32 * offsets_;
- // Highest bit in offset tells whether corresponding lemma is removed
- static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
- // Maximum possible for the offset
- static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
- // Bit width for last modified time, from 1 to 16
- static const uint32 kUserDictLMTBitWidth = 16;
- // Granularity for last modified time, in seconds
- static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
- // Maximum frequency count
- static const uint16 kUserDictMaxFrequency = 0xFFFF;
-
-#define COARSE_UTC(year, month, day, hour, minute, second) \
- ( \
- (year - 1970) * 365 * 24 * 60 * 60 + \
- (month - 1) * 30 * 24 * 60 * 60 + \
- (day - 1) * 24 * 60 * 60 + \
- (hour - 0) * 60 * 60 + \
- (minute - 0) * 60 + \
- (second - 0) \
- )
- static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
-
- // Correspond to offsets_
- uint32 * scores_;
- // Following two fields are only valid in memory
- uint32 * ids_;
-#ifdef ___PREDICT_ENABLED___
- uint32 * predicts_;
-#endif
-#ifdef ___SYNC_ENABLED___
- uint32 * syncs_;
- size_t sync_count_size_;
-#endif
- uint32 * offsets_by_id_;
-
- size_t lemma_count_left_;
- size_t lemma_size_left_;
-
- const char * dict_file_;
-
- // Be sure size is 4xN
- struct UserDictInfo {
- // When limitation reached, how much percentage will be reclaimed (1 ~ 100)
- uint32 reclaim_ratio;
- // maximum lemma count, 0 means no limitation
- uint32 limit_lemma_count;
- // Maximum total lemma size; this differs from the whole
- // disk file size and the in-memory dict size.
- // 0 means no limitation
- uint32 limit_lemma_size;
- // Total lemma count, including deleted and in-use lemmas.
- // Also indicates the size of offsets_
- uint32 lemma_count;
- // Total size of lemmas including used and freed
- uint32 lemma_size;
- // Freed lemma count
- uint32 free_count;
- // Freed lemma size in byte
- uint32 free_size;
-#ifdef ___SYNC_ENABLED___
- uint32 sync_count;
-#endif
- int32 total_nfreq;
- } dict_info_;
-
- static const uint32 kUserDictVersion = 0x0ABCDEF0;
-
- static const uint32 kUserDictPreAlloc = 32;
- static const uint32 kUserDictAverageNchar = 8;
-
- enum UserDictState {
- // Keep in order
- USER_DICT_NONE = 0,
- USER_DICT_SYNC,
-#ifdef ___SYNC_ENABLED___
- USER_DICT_SYNC_DIRTY,
-#endif
- USER_DICT_SCORE_DIRTY,
- USER_DICT_OFFSET_DIRTY,
- USER_DICT_LEMMA_DIRTY,
-
- USER_DICT_DEFRAGMENTED,
- } state_;
-
- struct UserDictSearchable {
- uint16 splids_len;
- uint16 splid_start[kMaxLemmaSize];
- uint16 splid_count[kMaxLemmaSize];
- // Compact initial letters for both FuzzyCompareSpellId and the cache system
- uint32 signature[kMaxLemmaSize / 4];
- };
-
-#ifdef ___CACHE_ENABLED___
- enum UserDictCacheType {
- USER_DICT_CACHE,
- USER_DICT_MISS_CACHE,
- };
-
- static const int kUserDictCacheSize = 4;
- static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
-
- struct UserDictMissCache {
- uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
- uint16 head, tail;
- } miss_caches_[kMaxLemmaSize];
-
- struct UserDictCache {
- uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
- uint32 offsets[kUserDictCacheSize];
- uint32 lengths[kUserDictCacheSize];
- // Ring buffer
- uint16 head, tail;
- } caches_[kMaxLemmaSize];
-
- void cache_init();
-
- void cache_push(UserDictCacheType type,
- UserDictSearchable *searchable,
- uint32 offset, uint32 length);
-
- bool cache_hit(UserDictSearchable *searchable,
- uint32 *offset, uint32 *length);
-
- bool load_cache(UserDictSearchable *searchable,
- uint32 *offset, uint32 *length);
-
- void save_cache(UserDictSearchable *searchable,
- uint32 offset, uint32 length);
-
- void reset_cache();
-
- bool load_miss_cache(UserDictSearchable *searchable);
-
- void save_miss_cache(UserDictSearchable *searchable);
-
- void reset_miss_cache();
-#endif
-
- LmaScoreType translate_score(int f);
-
- int extract_score_freq(int raw_score);
-
- uint64 extract_score_lmt(int raw_score);
-
- inline int build_score(uint64 lmt, int freq);
-
- inline int64 utf16le_atoll(uint16 *s, int len);
-
- inline int utf16le_lltoa(int64 v, uint16 *s, int size);
-
- LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count, uint64 lmt);
-
- size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
- LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend);
-
- int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
-
- int _get_lemma_score(LemmaIdType lemma_id);
-
- int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
- const UserDictSearchable *searchable);
-
- bool is_prefix_spell_id(const uint16 * fullids,
- uint16 fulllen, const UserDictSearchable *searchable);
-
- uint32 get_dict_file_size(UserDictInfo * info);
-
- bool reset(const char *file);
-
- bool validate(const char *file);
-
- bool load(const char *file, LemmaIdType start_id);
-
- bool is_valid_state();
-
- bool is_valid_lemma_id(LemmaIdType id);
-
- LemmaIdType get_max_lemma_id();
-
- void set_lemma_flag(uint32 offset, uint8 flag);
-
- char get_lemma_flag(uint32 offset);
-
- char get_lemma_nchar(uint32 offset);
-
- uint16 * get_lemma_spell_ids(uint32 offset);
-
- uint16 * get_lemma_word(uint32 offset);
-
- // Prepare searchable to speed up the locate process
- void prepare_locate(UserDictSearchable *searchable,
- const uint16 * splids, uint16 len);
-
- // Compare initial letters only
- int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
- const UserDictSearchable *searchable);
-
- // Compare two spelling ids exactly
- // The first argument must be a full spelling id
- bool equal_spell_id(const uint16 * fullids,
- uint16 fulllen, const UserDictSearchable *searchable);
-
- // Find first item by initial letters
- int32 locate_first_in_offsets(const UserDictSearchable *searchable);
-
- LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count, uint64 lmt);
-
- // Check if a lemma is in dictionary
- int32 locate_in_offsets(char16 lemma_str[],
- uint16 splid_str[], uint16 lemma_len);
-
- bool remove_lemma_by_offset_index(int offset_index);
-#ifdef ___PREDICT_ENABLED___
- uint32 locate_where_to_insert_in_predicts(const uint16 * words,
- int lemma_len);
-
- int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
-
- void remove_lemma_from_predict_list(uint32 offset);
-#endif
-#ifdef ___SYNC_ENABLED___
- void queue_lemma_for_sync(LemmaIdType id);
-
- void remove_lemma_from_sync_list(uint32 offset);
-
- void write_back_sync(int fd);
-#endif
- void write_back_score(int fd);
- void write_back_offset(int fd);
- void write_back_lemma(int fd);
- void write_back_all(int fd);
- void write_back();
-
- struct UserDictScoreOffsetPair {
- int score;
- uint32 offset_index;
- };
-
- inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
-
- void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
-
- // On-disk format for each lemma
- // +-------------+
- // | Version (4) |
- // +-------------+
- // +-----------+-----------+--------------------+-------------------+
- // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
- // +-----------+-----------+--------------------+-------------------+
- // ...
- // +-----------------------+ +-------------+ <---Offset of offset
- // | Offset1 by_splids (4) | ... | OffsetN (4) |
- // +-----------------------+ +-------------+
-#ifdef ___PREDICT_ENABLED___
- // +----------------------+ +-------------+
- // | Offset1 by_lemma (4) | ... | OffsetN (4) |
- // +----------------------+ +-------------+
-#endif
- // +------------+ +------------+
- // | Score1 (4) | ... | ScoreN (4) |
- // +------------+ +------------+
-#ifdef ___SYNC_ENABLED___
- // +-------------+ +-------------+
- // | NewAdd1 (4) | ... | NewAddN (4) |
- // +-------------+ +-------------+
-#endif
- // +----------------+
- // | Dict Info (4x) |
- // +----------------+
-};
-}
-
-#endif
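A hypothetical illustration (not part of the deleted sources): the 'pinyin,ABBREV,score' exchange format documented for put_lemmas_no_sync_from_utf16le_string() above, parsed with plain std::string to keep the sketch self-contained; the real API operates on UTF-16LE char16 buffers instead, and all names below are made up for the example.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct SyncLemma {
  std::string spellings;  // e.g. "wo men"
  std::string abbrev;     // e.g. "WM"
  double score;           // e.g. 0.32
};

// Split "wo men,WM,0.32;da jia,DJ,0.12" into records at ';' and fields at ','.
static std::vector<SyncLemma> parse_sync_lemmas(const std::string &input) {
  std::vector<SyncLemma> result;
  std::istringstream records(input);
  std::string record;
  while (std::getline(records, record, ';')) {
    std::istringstream fields(record);
    SyncLemma lemma;
    std::string score_text;
    if (std::getline(fields, lemma.spellings, ',') &&
        std::getline(fields, lemma.abbrev, ',') &&
        std::getline(fields, score_text)) {
      lemma.score = std::stod(score_text);
      result.push_back(lemma);
    }
  }
  return result;
}

int main() {
  for (const SyncLemma &l : parse_sync_lemmas("wo men,WM,0.32;da jia,DJ,0.12"))
    std::cout << l.spellings << " / " << l.abbrev << " / " << l.score << "\n";
  return 0;
}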
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h b/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h
deleted file mode 100644
index 7e957db5..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/utf16char.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_UTF16CHAR_H__
-#define PINYINIME_INCLUDE_UTF16CHAR_H__
-
-#include <stdlib.h>
-
-namespace ime_pinyin {
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- typedef unsigned short char16;
-
- // Get a token from utf16_str,
- // Returned pointer is a '\0'-terminated utf16 string, or NULL
- // *utf16_str_next returns the next part of the string for further tokenizing
- char16* utf16_strtok(char16 *utf16_str, size_t *token_size,
- char16 **utf16_str_next);
-
- int utf16_atoi(const char16 *utf16_str);
-
- float utf16_atof(const char16 *utf16_str);
-
- size_t utf16_strlen(const char16 *utf16_str);
-
- int utf16_strcmp(const char16 *str1, const char16 *str2);
- int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size);
-
- char16* utf16_strcpy(char16 *dst, const char16 *src);
- char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size);
-
-
- char* utf16_strcpy_tochar(char *dst, const char16 *src);
-
-#ifdef __cplusplus
-}
-#endif
-}
-
-#endif // PINYINIME_INCLUDE_UTF16CHAR_H__
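A hypothetical usage sketch (not from the deleted sources): how the UTF-16 string helpers declared above are typically chained to walk the tokens of a raw dictionary line, mirroring what DictBuilder::read_raw_dict() does further down in this patch. The include path and the function visit_tokens() are assumptions for illustration.

#include <cstddef>
#include "utf16char.h"  // path assumed relative to the deleted include/ dir

using ime_pinyin::char16;

// Walk every token of one raw-dictionary line.
void visit_tokens(char16 *line_buf) {
  size_t token_size = 0;
  char16 *rest = line_buf;
  // utf16_strtok() returns a '\0'-terminated token, stores its length in
  // token_size, and advances 'rest' so the next call continues after it.
  char16 *token = ime_pinyin::utf16_strtok(rest, &token_size, &rest);
  while (token != NULL) {
    // In the raw dictionary the first token is the Hanzi string, the second
    // the frequency (utf16_atof), the third the GBK flag (utf16_atoi), etc.
    token = ime_pinyin::utf16_strtok(rest, &token_size, &rest);
  }
}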
diff --git a/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h b/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h
deleted file mode 100644
index b6d6719e..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/include/utf16reader.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PINYINIME_INCLUDE_UTF16READER_H__
-#define PINYINIME_INCLUDE_UTF16READER_H__
-
-#include <stdio.h>
-#include "./utf16char.h"
-
-namespace ime_pinyin {
-
-class Utf16Reader {
- private:
- FILE *fp_;
- char16 *buffer_;
- size_t buffer_total_len_;
- size_t buffer_next_pos_;
-
- // Always less than buffer_total_len_ - buffer_next_pos_
- size_t buffer_valid_len_;
-
- public:
- Utf16Reader();
- ~Utf16Reader();
-
- // filename is the name of the file to open.
- // buffer_len specifies how large a buffer should be allocated to speed up
- // future reading
- bool open(const char* filename, size_t buffer_len);
- char16* readline(char16* read_buf, size_t max_len);
- bool close();
-};
-}
-
-#endif // PINYINIME_INCLUDE_UTF16READER_H__
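A hypothetical usage sketch (not from the deleted sources): the read loop the Utf16Reader API above is designed for, as used by the dictionary builder later in this patch. The helper name dump_line_count() and the buffer sizes are assumptions for illustration.

#include <cstddef>
#include "utf16reader.h"  // path assumed relative to the deleted include/ dir

bool dump_line_count(const char *path) {
  ime_pinyin::Utf16Reader reader;
  if (!reader.open(path, 1024 * 10))   // 10 KiB read-ahead buffer
    return false;

  ime_pinyin::char16 line[512];
  size_t lines = 0;
  // readline() returns NULL at end of file.
  while (reader.readline(line, 512) != NULL)
    lines++;

  reader.close();
  return lines > 0;
}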
diff --git a/src/virtualkeyboard/3rdparty/pinyin/pinyin.pro b/src/virtualkeyboard/3rdparty/pinyin/pinyin.pro
deleted file mode 100644
index 9ad9a318..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/pinyin.pro
+++ /dev/null
@@ -1,59 +0,0 @@
-TARGET = qtpinyin
-
-VERSION = 1.0.0
-CONFIG += static
-CONFIG += warn_off
-
-MODULE_INCLUDEPATH = $$PWD/include
-
-SOURCES += \
- share/dictbuilder.cpp \
- share/dictlist.cpp \
- share/dicttrie.cpp \
- share/lpicache.cpp \
- share/matrixsearch.cpp \
- share/mystdlib.cpp \
- share/ngram.cpp \
- share/pinyinime.cpp \
- share/searchutility.cpp \
- share/spellingtable.cpp \
- share/spellingtrie.cpp \
- share/splparser.cpp \
- share/sync.cpp \
- share/userdict.cpp \
- share/utf16char.cpp \
- share/utf16reader.cpp
-
-HEADERS += \
- include/atomdictbase.h \
- include/dictbuilder.h \
- include/dictdef.h \
- include/dictlist.h \
- include/dicttrie.h \
- include/lpicache.h \
- include/matrixsearch.h \
- include/mystdlib.h \
- include/ngram.h \
- include/pinyinime.h \
- include/searchutility.h \
- include/spellingtable.h \
- include/spellingtrie.h \
- include/splparser.h \
- include/sync.h \
- include/userdict.h \
- include/utf16char.h \
- include/utf16reader.h
-
-OTHER_FILES +=\
- data/rawdict_utf16_65105_freq.txt \
- data/valid_utf16.txt
-
-load(qt_helper_lib)
-
-# On Windows, the library uses Qt for platform abstraction.
-win32 {
- CONFIG += qt
- QT = core
-} else {
- CONFIG *= thread
-}
diff --git a/src/virtualkeyboard/3rdparty/pinyin/qt_attribution.json b/src/virtualkeyboard/3rdparty/pinyin/qt_attribution.json
deleted file mode 100644
index c739749f..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/qt_attribution.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
- "Id": "pinyin",
- "Name": "PinyinIME",
- "QDocModule": "qtvirtualkeyboard",
- "Description": "PinyinIME is an input method engine for Pinyin (the official romanization system for Standard Chinese
-in mainland China, Malaysia, Singapore, and Taiwan) from the Android Open Source Project.",
- "QtUsage": "Optionally used in Qt Virtual Keyboard.",
-
- "License": "Apache License 2.0",
- "LicenseId": "Apache-2.0",
- "LicenseFile": "NOTICE",
- "Copyright": "Copyright (C) 2009 The Android Open Source Project"
-}
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/dictbuilder.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/dictbuilder.cpp
deleted file mode 100644
index 6f0bd4f7..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/dictbuilder.cpp
+++ /dev/null
@@ -1,1070 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "../include/dictbuilder.h"
-#include "../include/dicttrie.h"
-#include "../include/mystdlib.h"
-#include "../include/ngram.h"
-#include "../include/searchutility.h"
-#include "../include/spellingtable.h"
-#include "../include/spellingtrie.h"
-#include "../include/splparser.h"
-#include "../include/utf16reader.h"
-
-namespace ime_pinyin {
-
-#ifdef ___BUILD_MODEL___
-
-static const size_t kReadBufLen = 512;
-static const size_t kSplTableHashLen = 2000;
-
-// Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
-// frequencies.
-int cmp_scis_hz_splid_freq(const void* p1, const void* p2) {
- const SingleCharItem *s1, *s2;
- s1 = static_cast<const SingleCharItem*>(p1);
- s2 = static_cast<const SingleCharItem*>(p2);
-
- if (s1->hz < s2->hz)
- return -1;
- if (s1->hz > s2->hz)
- return 1;
-
- if (s1->splid.half_splid < s2->splid.half_splid)
- return -1;
- if (s1->splid.half_splid > s2->splid.half_splid)
- return 1;
-
- if (s1->splid.full_splid < s2->splid.full_splid)
- return -1;
- if (s1->splid.full_splid > s2->splid.full_splid)
- return 1;
-
- if (s1->freq > s2->freq)
- return -1;
- if (s1->freq < s2->freq)
- return 1;
- return 0;
-}
-
-int cmp_scis_hz_splid(const void* p1, const void* p2) {
- const SingleCharItem *s1, *s2;
- s1 = static_cast<const SingleCharItem*>(p1);
- s2 = static_cast<const SingleCharItem*>(p2);
-
- if (s1->hz < s2->hz)
- return -1;
- if (s1->hz > s2->hz)
- return 1;
-
- if (s1->splid.half_splid < s2->splid.half_splid)
- return -1;
- if (s1->splid.half_splid > s2->splid.half_splid)
- return 1;
-
- if (s1->splid.full_splid < s2->splid.full_splid)
- return -1;
- if (s1->splid.full_splid > s2->splid.full_splid)
- return 1;
-
- return 0;
-}
-
-int cmp_lemma_entry_hzs(const void* p1, const void* p2) {
- size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
- size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
- if (size1 < size2)
- return -1;
- else if (size1 > size2)
- return 1;
-
- return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
- ((const LemmaEntry*)p2)->hanzi_str);
-}
-
-int compare_char16(const void* p1, const void* p2) {
- if (*((const char16*)p1) < *((const char16*)p2))
- return -1;
- if (*((const char16*)p1) > *((const char16*)p2))
- return 1;
- return 0;
-}
-
-int compare_py(const void* p1, const void* p2) {
- int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
- ((const LemmaEntry*)p2)->spl_idx_arr);
-
- if (0 != ret)
- return ret;
-
- return static_cast<int>(((const LemmaEntry*)p2)->freq) -
- static_cast<int>(((const LemmaEntry*)p1)->freq);
-}
-
-// First hanzi, if the same, then Pinyin
-int cmp_lemma_entry_hzspys(const void* p1, const void* p2) {
- size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
- size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
- if (size1 < size2)
- return -1;
- else if (size1 > size2)
- return 1;
- int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
- ((const LemmaEntry*)p2)->hanzi_str);
-
- if (0 != ret)
- return ret;
-
- ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
- ((const LemmaEntry*)p2)->spl_idx_arr);
- return ret;
-}
-
-int compare_splid2(const void* p1, const void* p2) {
- int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
- ((const LemmaEntry*)p2)->spl_idx_arr);
- return ret;
-}
-
-DictBuilder::DictBuilder() {
- lemma_arr_ = NULL;
- lemma_num_ = 0;
-
- scis_ = NULL;
- scis_num_ = 0;
-
- lma_nodes_le0_ = NULL;
- lma_nodes_ge1_ = NULL;
-
- lma_nds_used_num_le0_ = 0;
- lma_nds_used_num_ge1_ = 0;
-
- homo_idx_buf_ = NULL;
- homo_idx_num_eq1_ = 0;
- homo_idx_num_gt1_ = 0;
-
- top_lmas_ = NULL;
- top_lmas_num_ = 0;
-
- spl_table_ = NULL;
- spl_parser_ = NULL;
-}
-
-DictBuilder::~DictBuilder() {
- free_resource();
-}
-
-bool DictBuilder::alloc_resource(size_t lma_num) {
- if (0 == lma_num)
- return false;
-
- free_resource();
-
- lemma_num_ = lma_num;
- lemma_arr_ = new LemmaEntry[lemma_num_];
-
- top_lmas_num_ = 0;
- top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];
-
- // Allocate the scis_ buffer at its maximum possible size.
- scis_num_ = lemma_num_ * kMaxLemmaSize;
- scis_ = new SingleCharItem[scis_num_];
-
- // The number of root and first-level nodes is less than kMaxSpellingNum + 1
- lma_nds_used_num_le0_ = 0;
- lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];
-
- // The number of other nodes is less than lemma_num
- lma_nds_used_num_ge1_ = 0;
- lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];
-
- homo_idx_buf_ = new LemmaIdType[lemma_num_];
- spl_table_ = new SpellingTable();
- spl_parser_ = new SpellingParser();
-
- if (NULL == lemma_arr_ || NULL == top_lmas_ ||
- NULL == scis_ || NULL == spl_table_ ||
- NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
- NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
- free_resource();
- return false;
- }
-
- memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
- memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
- memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
- memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
- memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
- spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);
-
- return true;
-}
-
-char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) {
- if (NULL == fn_validhzs || NULL == num)
- return NULL;
-
- *num = 0;
- FILE *fp = fopen(fn_validhzs, "rb");
- if (NULL == fp)
- return NULL;
-
- char16 utf16header;
- if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
- 0xfeff != utf16header) {
- fclose(fp);
- return NULL;
- }
-
- fseek(fp, 0, SEEK_END);
- *num = ftell(fp) / sizeof(char16);
- assert(*num >= 1);
- *num -= 1;
-
- char16 *hzs = new char16[*num];
- if (NULL == hzs) {
- fclose(fp);
- return NULL;
- }
-
- fseek(fp, 2, SEEK_SET);
-
- if (fread(hzs, sizeof(char16), *num, fp) != *num) {
- fclose(fp);
- delete [] hzs;
- return NULL;
- }
- fclose(fp);
-
- myqsort(hzs, *num, sizeof(char16), compare_char16);
- return hzs;
-}
-
-bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len,
- char16 hz) {
- if (NULL == hzs)
- return false;
-
- char16 *found;
- found = static_cast<char16*>(
- mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
- if (NULL == found)
- return false;
-
- assert(*found == hz);
- return true;
-}
-
-// The caller makes sure that the parameters are valid.
-bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
- const char16 *str, size_t str_len) {
- if (NULL == hzs || NULL == str)
- return false;
-
- for (size_t pos = 0; pos < str_len; pos++) {
- if (!hz_in_hanzis_list(hzs, hzs_len, str[pos]))
- return false;
- }
- return true;
-}
-
-void DictBuilder::get_top_lemmas() {
- top_lmas_num_ = 0;
- if (NULL == lemma_arr_)
- return;
-
- for (size_t pos = 0; pos < lemma_num_; pos++) {
- if (0 == top_lmas_num_) {
- top_lmas_[0] = lemma_arr_[pos];
- top_lmas_num_ = 1;
- continue;
- }
-
- if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
- if (kTopScoreLemmaNum > top_lmas_num_)
- top_lmas_num_ += 1;
-
- size_t move_pos;
- for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
- top_lmas_[move_pos] = top_lmas_[move_pos - 1];
- if (0 == move_pos - 1 ||
- (move_pos - 1 > 0 &&
- top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
- break;
- }
- }
- assert(move_pos > 0);
- top_lmas_[move_pos - 1] = lemma_arr_[pos];
- } else if (kTopScoreLemmaNum > top_lmas_num_) {
- top_lmas_[top_lmas_num_] = lemma_arr_[pos];
- top_lmas_num_ += 1;
- }
- }
-
- if (kPrintDebug0) {
- printf("\n------Top Lemmas------------------\n");
- for (size_t pos = 0; pos < top_lmas_num_; pos++) {
- printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
- top_lmas_[pos].freq);
- }
- }
-}
-
-void DictBuilder::free_resource() {
- if (NULL != lemma_arr_)
- delete [] lemma_arr_;
-
- if (NULL != scis_)
- delete [] scis_;
-
- if (NULL != lma_nodes_le0_)
- delete [] lma_nodes_le0_;
-
- if (NULL != lma_nodes_ge1_)
- delete [] lma_nodes_ge1_;
-
- if (NULL != homo_idx_buf_)
- delete [] homo_idx_buf_;
-
- if (NULL != spl_table_)
- delete spl_table_;
-
- if (NULL != spl_parser_)
- delete spl_parser_;
-
- lemma_arr_ = NULL;
- scis_ = NULL;
- lma_nodes_le0_ = NULL;
- lma_nodes_ge1_ = NULL;
- homo_idx_buf_ = NULL;
- spl_table_ = NULL;
- spl_parser_ = NULL;
-
- lemma_num_ = 0;
- lma_nds_used_num_le0_ = 0;
- lma_nds_used_num_ge1_ = 0;
- homo_idx_num_eq1_ = 0;
- homo_idx_num_gt1_ = 0;
-}
-
-size_t DictBuilder::read_raw_dict(const char* fn_raw,
- const char *fn_validhzs,
- size_t max_item) {
- if (NULL == fn_raw) return 0;
-
- Utf16Reader utf16_reader;
- if (!utf16_reader.open(fn_raw, kReadBufLen * 10))
- return false;
-
- char16 read_buf[kReadBufLen];
-
- // Read the number of lemmas in the file
- size_t lemma_num = 240000;
-
- // Allocate the required resources
- if (!alloc_resource(lemma_num)) {
- utf16_reader.close();
- return false;
- }
-
- // Read the valid Hanzi list.
- char16 *valid_hzs = NULL;
- size_t valid_hzs_num = 0;
- valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);
-
- // Begin reading the lemma entries
- for (size_t i = 0; i < max_item; i++) {
- // read next entry
- if (!utf16_reader.readline(read_buf, kReadBufLen)) {
- lemma_num = i;
- break;
- }
-
- size_t token_size;
- char16 *token;
- char16 *to_tokenize = read_buf;
-
- // Get the Hanzi string
- token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
- if (NULL == token) {
- free_resource();
- utf16_reader.close();
- return false;
- }
-
- size_t lemma_size = utf16_strlen(token);
-
- if (lemma_size > kMaxLemmaSize) {
- i--;
- continue;
- }
-
- if (lemma_size > 4) {
- i--;
- continue;
- }
-
- // Copy to the lemma entry
- utf16_strcpy(lemma_arr_[i].hanzi_str, token);
-
- lemma_arr_[i].hz_str_len = token_size;
-
- // Get the freq string
- token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
- if (NULL == token) {
- free_resource();
- utf16_reader.close();
- return false;
- }
- lemma_arr_[i].freq = utf16_atof(token);
-
- if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
- i--;
- continue;
- }
-
- // Get the GBK mark. If no valid Hanzi list is available, all items which
- // contain GBK characters will be discarded. Otherwise, all items which
- // contain characters outside of the valid Hanzi list will be discarded.
- token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
- assert(NULL != token);
- int gbk_flag = utf16_atoi(token);
- if (NULL == valid_hzs || 0 == valid_hzs_num) {
- if (0 != gbk_flag) {
- i--;
- continue;
- }
- } else {
- if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
- lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
- i--;
- continue;
- }
- }
-
- // Get spelling String
- bool spelling_not_support = false;
- for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
- hz_pos++) {
- // Get a Pinyin
- token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
- if (NULL == token) {
- free_resource();
- utf16_reader.close();
- return false;
- }
-
- assert(utf16_strlen(token) <= kMaxPinyinSize);
-
- utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);
-
- format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);
-
- // Put the pinyin to the spelling table
- if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
- lemma_arr_[i].freq)) {
- spelling_not_support = true;
- break;
- }
- }
-
- // The whole line must have been parsed fully, otherwise discard this one.
- token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
- if (spelling_not_support || NULL != token) {
- i--;
- continue;
- }
- }
-
- delete [] valid_hzs;
- utf16_reader.close();
-
- printf("read succesfully, lemma num: %d\n", lemma_num);
-
- return lemma_num;
-}
-
-bool DictBuilder::build_dict(const char *fn_raw,
- const char *fn_validhzs,
- DictTrie *dict_trie) {
- if (NULL == fn_raw || NULL == dict_trie)
- return false;
-
- lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
- if (0 == lemma_num_)
- return false;
-
- // Arrange the spelling table, and build a spelling tree
- // The size of a spelling. '\0' is included. If the spelling table is
- // initialized to calculate the spelling scores, the last char in the
- // spelling string will be the score, and it is also included in spl_item_size.
- size_t spl_item_size;
- size_t spl_num;
- const char* spl_buf;
- spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
- if (NULL == spl_buf) {
- free_resource();
- return false;
- }
-
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
-
- if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
- spl_table_->get_score_amplifier(),
- spl_table_->get_average_score())) {
- free_resource();
- return false;
- }
-
- printf("spelling tree construct successfully.\n");
-
- // Convert the spelling string to idxs
- for (size_t i = 0; i < lemma_num_; i++) {
- for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
- hz_pos++) {
- uint16 spl_idxs[2];
- uint16 spl_start_pos[3];
- bool is_pre = true;
- int spl_idx_num =
- spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
- strlen(lemma_arr_[i].pinyin_str[hz_pos]),
- spl_idxs, spl_start_pos, 2, is_pre);
- assert(1 == spl_idx_num);
-
- if (spl_trie.is_half_id(spl_idxs[0])) {
- uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
- assert(0 != num);
- }
- lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
- }
- }
-
- // Sort the lemma items according to the hanzi, and give each unique item
- // an id
- sort_lemmas_by_hz();
-
- scis_num_ = build_scis();
-
- // Construct the dict list
- dict_trie->dict_list_ = new DictList();
- bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
- lemma_arr_, lemma_num_);
- assert(dl_success);
-
- // Construct the NGram information
- NGram& ngram = NGram::get_instance();
- ngram.build_unigram(lemma_arr_, lemma_num_,
- lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
-
- // sort the lemma items according to the spelling idx string
- myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);
-
- get_top_lemmas();
-
-#ifdef ___DO_STATISTICS___
- stat_init();
-#endif
-
- lma_nds_used_num_le0_ = 1; // The root node
- bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
- lemma_arr_, 0, lemma_num_, 0);
- if (!dt_success) {
- free_resource();
- return false;
- }
-
-#ifdef ___DO_STATISTICS___
- stat_print();
-#endif
-
- // Move the node data and homo data to the DictTrie
- dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
- dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
- size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
- dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
- assert(NULL != dict_trie->root_);
- assert(NULL != dict_trie->lma_idx_buf_);
- dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
- dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
- dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
- dict_trie->top_lmas_num_ = top_lmas_num_;
-
- memcpy(dict_trie->root_, lma_nodes_le0_,
- sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
- memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
- sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);
-
- for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
- id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
- homo_idx_buf_[pos]);
- }
-
- for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
- pos < lma_idx_num; pos++) {
- LemmaIdType idx =
- top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
- id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
- }
-
- if (kPrintDebug0) {
- printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_);
- printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_);
- printf("top_lmas_num_: %d\n", top_lmas_num_);
- }
-
- free_resource();
-
- if (kPrintDebug0) {
- printf("Building dict succeds\n");
- }
- return dt_success;
-}
-
-void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) {
- if (NULL == buf) return;
- for (size_t pos = 0; pos < kLemmaIdSize; pos++) {
- (buf)[pos] = (unsigned char)(id >> (pos * 8));
- }
-}
-
-void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) {
- node->son_1st_off_l = static_cast<uint16>(offset);
- node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
-}
-
-void DictBuilder::set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) {
- node->homo_idx_buf_off_l = static_cast<uint16>(offset);
- node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);
-
-}
-
-// All spelling strings will be converted to upper case, except that
-// spellings starting with "ZH"/"CH"/"SH" will be converted to
-// "Zh"/"Ch"/"Sh"
-void DictBuilder::format_spelling_str(char *spl_str) {
- if (NULL == spl_str)
- return;
-
- uint16 pos = 0;
- while ('\0' != spl_str[pos]) {
- if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z')
- spl_str[pos] = spl_str[pos] - 'a' + 'A';
-
- if (1 == pos && 'H' == spl_str[pos]) {
- if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
- spl_str[pos] = 'h';
- }
- }
- pos++;
- }
-}
-
-LemmaIdType DictBuilder::sort_lemmas_by_hz() {
- if (NULL == lemma_arr_ || 0 == lemma_num_)
- return 0;
-
- myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);
-
- lemma_arr_[0].idx_by_hz = 1;
- LemmaIdType idx_max = 1;
- for (size_t i = 1; i < lemma_num_; i++) {
- if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) {
- idx_max++;
- lemma_arr_[i].idx_by_hz = idx_max;
- } else {
- idx_max++;
- lemma_arr_[i].idx_by_hz = idx_max;
- }
- }
- return idx_max + 1;
-}
-
-size_t DictBuilder::build_scis() {
- if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_)
- return 0;
-
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
-
- // The first one is blank, because id 0 is invalid.
- scis_[0].freq = 0;
- scis_[0].hz = 0;
- scis_[0].splid.full_splid = 0;
- scis_[0].splid.half_splid = 0;
- scis_num_ = 1;
-
- // Copy the hanzis to the buffer
- for (size_t pos = 0; pos < lemma_num_; pos++) {
- size_t hz_num = lemma_arr_[pos].hz_str_len;
- for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
- scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
- scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
- scis_[scis_num_].splid.half_splid =
- spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
- if (1 == hz_num)
- scis_[scis_num_].freq = lemma_arr_[pos].freq;
- else
- scis_[scis_num_].freq = 0.000001;
- scis_num_++;
- }
- }
-
- myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);
-
- // Remove repeated items
- size_t unique_scis_num = 1;
- for (size_t pos = 1; pos < scis_num_; pos++) {
- if (scis_[pos].hz == scis_[pos - 1].hz &&
- scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid)
- continue;
- scis_[unique_scis_num] = scis_[pos];
- scis_[unique_scis_num].splid.half_splid =
- spl_trie.full_to_half(scis_[pos].splid.full_splid);
- unique_scis_num++;
- }
-
- scis_num_ = unique_scis_num;
-
- // Update the lemma list.
- for (size_t pos = 0; pos < lemma_num_; pos++) {
- size_t hz_num = lemma_arr_[pos].hz_str_len;
- for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
- SingleCharItem key;
- key.hz = lemma_arr_[pos].hanzi_str[hzpos];
- key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
- key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);
-
- SingleCharItem *found;
- found = static_cast<SingleCharItem*>(mybsearch(&key, scis_,
- unique_scis_num,
- sizeof(SingleCharItem),
- cmp_scis_hz_splid));
-
- assert(found);
-
- lemma_arr_[pos].hanzi_scis_ids[hzpos] =
- static_cast<uint16>(found - scis_);
- lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
- }
- }
-
- return scis_num_;
-}
-
-bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr,
- size_t item_start, size_t item_end,
- size_t level) {
- if (level >= kMaxLemmaSize || item_end <= item_start)
- return false;
-
- // 1. Scan for how many sons
- size_t parent_son_num = 0;
- // LemmaNode *son_1st = NULL;
- // parent.num_of_son = 0;
-
- LemmaEntry *lma_last_start = lemma_arr_ + item_start;
- uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];
-
- // Scan for how many sons need to be allocated
- for (size_t i = item_start + 1; i< item_end; i++) {
- LemmaEntry *lma_current = lemma_arr + i;
- uint16 spl_idx_current = lma_current->spl_idx_arr[level];
- if (spl_idx_current != spl_idx_node) {
- parent_son_num++;
- spl_idx_node = spl_idx_current;
- }
- }
- parent_son_num++;
-
-#ifdef ___DO_STATISTICS___
- // Used to indicate whether all nodes of this layer have no sons.
- bool allson_noson = true;
-
- assert(level < kMaxLemmaSize);
- if (parent_son_num > max_sonbuf_len_[level])
- max_sonbuf_len_[level] = parent_son_num;
-
- total_son_num_[level] += parent_son_num;
- total_sonbuf_num_[level] += 1;
-
- if (parent_son_num == 1)
- sonbufs_num1_++;
- else
- sonbufs_numgt1_++;
- total_lma_node_num_ += parent_son_num;
-#endif
-
- // 2. Update the parent's information
- // Update the parent's son list;
- LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used
- LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used.
- if (0 == level) { // the parent is root
- (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
- lma_nds_used_num_le0_;
- son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
- lma_nds_used_num_le0_ += parent_son_num;
-
- assert(parent_son_num <= 65535);
- (static_cast<LmaNodeLE0*>(parent))->num_of_son =
- static_cast<uint16>(parent_son_num);
- } else if (1 == level) { // the parent is a son of root
- (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
- lma_nds_used_num_ge1_;
- son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
- lma_nds_used_num_ge1_ += parent_son_num;
-
- assert(parent_son_num <= 65535);
- (static_cast<LmaNodeLE0*>(parent))->num_of_son =
- static_cast<uint16>(parent_son_num);
- } else {
- set_son_offset((static_cast<LmaNodeGE1*>(parent)),
- lma_nds_used_num_ge1_);
- son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
- lma_nds_used_num_ge1_ += parent_son_num;
-
- assert(parent_son_num <= 255);
- (static_cast<LmaNodeGE1*>(parent))->num_of_son =
- (unsigned char)parent_son_num;
- }
-
- // 3. Now begin to construct the son one by one
- size_t son_pos = 0;
-
- lma_last_start = lemma_arr_ + item_start;
- spl_idx_node = lma_last_start->spl_idx_arr[level];
-
- size_t homo_num = 0;
- if (lma_last_start->spl_idx_arr[level + 1] == 0)
- homo_num = 1;
-
- size_t item_start_next = item_start;
-
- for (size_t i = item_start + 1; i < item_end; i++) {
- LemmaEntry* lma_current = lemma_arr_ + i;
- uint16 spl_idx_current = lma_current->spl_idx_arr[level];
-
- if (spl_idx_current == spl_idx_node) {
- if (lma_current->spl_idx_arr[level + 1] == 0)
- homo_num++;
- } else {
- // Construct a node
- LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid
- LmaNodeGE1 *node_cur_ge1 = NULL;
- if (0 == level) {
- node_cur_le0 = son_1st_le0 + son_pos;
- node_cur_le0->spl_idx = spl_idx_node;
- node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
- node_cur_le0->son_1st_off = 0;
- homo_idx_num_eq1_ += homo_num;
- } else {
- node_cur_ge1 = son_1st_ge1 + son_pos;
- node_cur_ge1->spl_idx = spl_idx_node;
-
- set_homo_id_buf_offset(node_cur_ge1,
- (homo_idx_num_eq1_ + homo_idx_num_gt1_));
- set_son_offset(node_cur_ge1, 0);
- homo_idx_num_gt1_ += homo_num;
- }
-
- if (homo_num > 0) {
- LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
- homo_idx_num_gt1_ - homo_num;
- if (0 == level) {
- assert(homo_num <= 65535);
- node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
- } else {
- assert(homo_num <= 255);
- node_cur_ge1->num_of_homo = (unsigned char)homo_num;
- }
-
- for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
- idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
- }
-
-#ifdef ___DO_STATISTICS___
- if (homo_num > max_homobuf_len_[level])
- max_homobuf_len_[level] = homo_num;
-
- total_homo_num_[level] += homo_num;
-#endif
- }
-
- if (i - item_start_next > homo_num) {
- void *next_parent;
- if (0 == level)
- next_parent = static_cast<void*>(node_cur_le0);
- else
- next_parent = static_cast<void*>(node_cur_ge1);
- construct_subset(next_parent, lemma_arr,
- item_start_next + homo_num, i, level + 1);
-#ifdef ___DO_STATISTICS___
-
- total_node_hasson_[level] += 1;
- allson_noson = false;
-#endif
- }
-
- // for the next son
- lma_last_start = lma_current;
- spl_idx_node = spl_idx_current;
- item_start_next = i;
- homo_num = 0;
- if (lma_current->spl_idx_arr[level + 1] == 0)
- homo_num = 1;
-
- son_pos++;
- }
- }
-
- // 4. The last one to construct
- LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid
- LmaNodeGE1 *node_cur_ge1 = NULL;
- if (0 == level) {
- node_cur_le0 = son_1st_le0 + son_pos;
- node_cur_le0->spl_idx = spl_idx_node;
- node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
- node_cur_le0->son_1st_off = 0;
- homo_idx_num_eq1_ += homo_num;
- } else {
- node_cur_ge1 = son_1st_ge1 + son_pos;
- node_cur_ge1->spl_idx = spl_idx_node;
-
- set_homo_id_buf_offset(node_cur_ge1,
- (homo_idx_num_eq1_ + homo_idx_num_gt1_));
- set_son_offset(node_cur_ge1, 0);
- homo_idx_num_gt1_ += homo_num;
- }
-
- if (homo_num > 0) {
- LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
- homo_idx_num_gt1_ - homo_num;
- if (0 == level) {
- assert(homo_num <= 65535);
- node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
- } else {
- assert(homo_num <= 255);
- node_cur_ge1->num_of_homo = (unsigned char)homo_num;
- }
-
- for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
- idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
- }
-
-#ifdef ___DO_STATISTICS___
- if (homo_num > max_homobuf_len_[level])
- max_homobuf_len_[level] = homo_num;
-
- total_homo_num_[level] += homo_num;
-#endif
- }
-
- if (item_end - item_start_next > homo_num) {
- void *next_parent;
- if (0 == level)
- next_parent = static_cast<void*>(node_cur_le0);
- else
- next_parent = static_cast<void*>(node_cur_ge1);
- construct_subset(next_parent, lemma_arr,
- item_start_next + homo_num, item_end, level + 1);
-#ifdef ___DO_STATISTICS___
-
- total_node_hasson_[level] += 1;
- allson_noson = false;
-#endif
- }
-
-#ifdef ___DO_STATISTICS___
- if (allson_noson) {
- total_sonbuf_allnoson_[level] += 1;
- total_node_in_sonbuf_allnoson_[level] += parent_son_num;
- }
-#endif
-
- assert(son_pos + 1 == parent_son_num);
- return true;
-}
-
-#ifdef ___DO_STATISTICS___
-void DictBuilder::stat_init() {
- memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
- memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
- memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize);
- memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize);
- memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize);
- memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
- memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
- memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize);
-
- sonbufs_num1_ = 0;
- sonbufs_numgt1_ = 0;
- total_lma_node_num_ = 0;
-}
-
-void DictBuilder::stat_print() {
- printf("\n------------STAT INFO-------------\n");
- printf("[root is layer -1]\n");
- printf(".. max_sonbuf_len per layer(from layer 0):\n ");
- for (size_t i = 0; i < kMaxLemmaSize; i++)
- printf("%d, ", max_sonbuf_len_[i]);
- printf("-, \n");
-
- printf(".. max_homobuf_len per layer:\n -, ");
- for (size_t i = 0; i < kMaxLemmaSize; i++)
- printf("%d, ", max_homobuf_len_[i]);
- printf("\n");
-
- printf(".. total_son_num per layer:\n ");
- for (size_t i = 0; i < kMaxLemmaSize; i++)
- printf("%d, ", total_son_num_[i]);
- printf("-, \n");
-
- printf(".. total_node_hasson per layer:\n 1, ");
- for (size_t i = 0; i < kMaxLemmaSize; i++)
- printf("%d, ", total_node_hasson_[i]);
- printf("\n");
-
- printf(".. total_sonbuf_num per layer:\n ");
- for (size_t i = 0; i < kMaxLemmaSize; i++)
- printf("%d, ", total_sonbuf_num_[i]);
- printf("-, \n");
-
- printf(".. total_sonbuf_allnoson per layer:\n ");
- for (size_t i = 0; i < kMaxLemmaSize; i++)
- printf("%d, ", total_sonbuf_allnoson_[i]);
- printf("-, \n");
-
- printf(".. total_node_in_sonbuf_allnoson per layer:\n ");
- for (size_t i = 0; i < kMaxLemmaSize; i++)
- printf("%d, ", total_node_in_sonbuf_allnoson_[i]);
- printf("-, \n");
-
- printf(".. total_homo_num per layer:\n 0, ");
- for (size_t i = 0; i < kMaxLemmaSize; i++)
- printf("%d, ", total_homo_num_[i]);
- printf("\n");
-
- printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_);
- printf(".. son buf allocation number with more than 1 son: %d\n",
- sonbufs_numgt1_);
- printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1);
-}
-#endif // ___DO_STATISTICS___
-
-#endif // ___BUILD_MODEL___
-} // namespace ime_pinyin
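Illustration only (not part of the deleted sources): the spelling normalization performed by DictBuilder::format_spelling_str() above, restated over std::string so it can be tried standalone. Spellings are upper-cased, except that a leading "ZH"/"CH"/"SH" keeps a lower-case 'h'; the function name normalize_spelling() is invented for this sketch.

#include <cassert>
#include <cctype>
#include <string>

static std::string normalize_spelling(std::string spl) {
  // Upper-case every letter, as the builder does character by character.
  for (char &c : spl)
    c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
  // Restore the lower-case 'h' for the retroflex initials Zh/Ch/Sh.
  if (spl.size() >= 2 && spl[1] == 'H' &&
      (spl[0] == 'Z' || spl[0] == 'C' || spl[0] == 'S'))
    spl[1] = 'h';
  return spl;
}

int main() {
  assert(normalize_spelling("ma") == "MA");
  assert(normalize_spelling("zhong") == "Zhong");
  assert(normalize_spelling("shi") == "Shi");
  return 0;
}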
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/dictlist.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/dictlist.cpp
deleted file mode 100644
index 64d8d085..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/dictlist.cpp
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include "../include/dictlist.h"
-#include "../include/mystdlib.h"
-#include "../include/ngram.h"
-#include "../include/searchutility.h"
-
-namespace ime_pinyin {
-
-DictList::DictList() {
- initialized_ = false;
- scis_num_ = 0;
- scis_hz_ = NULL;
- scis_splid_ = NULL;
- buf_ = NULL;
- spl_trie_ = SpellingTrie::get_cpinstance();
-
- assert(kMaxLemmaSize == 8);
- cmp_func_[0] = cmp_hanzis_1;
- cmp_func_[1] = cmp_hanzis_2;
- cmp_func_[2] = cmp_hanzis_3;
- cmp_func_[3] = cmp_hanzis_4;
- cmp_func_[4] = cmp_hanzis_5;
- cmp_func_[5] = cmp_hanzis_6;
- cmp_func_[6] = cmp_hanzis_7;
- cmp_func_[7] = cmp_hanzis_8;
-}
-
-DictList::~DictList() {
- free_resource();
-}
-
-bool DictList::alloc_resource(size_t buf_size, size_t scis_num) {
- // Allocate memory
- buf_ = static_cast<char16*>(malloc(buf_size * sizeof(char16)));
- if (NULL == buf_)
- return false;
-
- scis_num_ = scis_num;
-
- scis_hz_ = static_cast<char16*>(malloc(scis_num_ * sizeof(char16)));
- if (NULL == scis_hz_)
- return false;
-
- scis_splid_ = static_cast<SpellingId*>
- (malloc(scis_num_ * sizeof(SpellingId)));
-
- if (NULL == scis_splid_)
- return false;
-
- return true;
-}
-
-void DictList::free_resource() {
- if (NULL != buf_)
- free(buf_);
- buf_ = NULL;
-
- if (NULL != scis_hz_)
- free(scis_hz_);
- scis_hz_ = NULL;
-
- if (NULL != scis_splid_)
- free(scis_splid_);
- scis_splid_ = NULL;
-}
-
-#ifdef ___BUILD_MODEL___
-bool DictList::init_list(const SingleCharItem *scis, size_t scis_num,
- const LemmaEntry *lemma_arr, size_t lemma_num) {
- if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num)
- return false;
-
- initialized_ = false;
-
- if (NULL != buf_)
- free(buf_);
-
- // calculate the size
- size_t buf_size = calculate_size(lemma_arr, lemma_num);
- if (0 == buf_size)
- return false;
-
- if (!alloc_resource(buf_size, scis_num))
- return false;
-
- fill_scis(scis, scis_num);
-
- // Copy the related content from the array to inner buffer
- fill_list(lemma_arr, lemma_num);
-
- initialized_ = true;
- return true;
-}
-
-size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) {
- size_t last_hz_len = 0;
- size_t list_size = 0;
- size_t id_num = 0;
-
- for (size_t i = 0; i < lemma_num; i++) {
- if (0 == i) {
- last_hz_len = lemma_arr[i].hz_str_len;
-
- assert(last_hz_len > 0);
- assert(lemma_arr[0].idx_by_hz == 1);
-
- id_num++;
- start_pos_[0] = 0;
- start_id_[0] = id_num;
-
- last_hz_len = 1;
- list_size += last_hz_len;
- } else {
- size_t current_hz_len = lemma_arr[i].hz_str_len;
-
- assert(current_hz_len >= last_hz_len);
-
- if (current_hz_len == last_hz_len) {
- list_size += current_hz_len;
- id_num++;
- } else {
- for (size_t len = last_hz_len; len < current_hz_len - 1; len++) {
- start_pos_[len] = start_pos_[len - 1];
- start_id_[len] = start_id_[len - 1];
- }
-
- start_pos_[current_hz_len - 1] = list_size;
-
- id_num++;
- start_id_[current_hz_len - 1] = id_num;
-
- last_hz_len = current_hz_len;
- list_size += current_hz_len;
- }
- }
- }
-
- for (size_t i = last_hz_len; i <= kMaxLemmaSize; i++) {
- if (0 == i) {
- start_pos_[0] = 0;
- start_id_[0] = 1;
- } else {
- start_pos_[i] = list_size;
- start_id_[i] = id_num;
- }
- }
-
- return start_pos_[kMaxLemmaSize];
-}
-
-void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) {
- assert(scis_num_ == scis_num);
-
- for (size_t pos = 0; pos < scis_num_; pos++) {
- scis_hz_[pos] = scis[pos].hz;
- scis_splid_[pos] = scis[pos].splid;
- }
-}
-
-void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) {
- size_t current_pos = 0;
-
- utf16_strncpy(buf_, lemma_arr[0].hanzi_str,
- lemma_arr[0].hz_str_len);
-
- current_pos = lemma_arr[0].hz_str_len;
-
- size_t id_num = 1;
-
- for (size_t i = 1; i < lemma_num; i++) {
- utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str,
- lemma_arr[i].hz_str_len);
-
- id_num++;
- current_pos += lemma_arr[i].hz_str_len;
- }
-
- assert(current_pos == start_pos_[kMaxLemmaSize]);
- assert(id_num == start_id_[kMaxLemmaSize]);
-}
-
-char16* DictList::find_pos2_startedbyhz(char16 hz_char) {
- char16 *found_2w = static_cast<char16*>
- (mybsearch(&hz_char, buf_ + start_pos_[1],
- (start_pos_[2] - start_pos_[1]) / 2,
- sizeof(char16) * 2, cmp_hanzis_1));
- if (NULL == found_2w)
- return NULL;
-
- while (found_2w > buf_ + start_pos_[1] && *found_2w == *(found_2w - 1))
- found_2w -= 2;
-
- return found_2w;
-}
-#endif // ___BUILD_MODEL___
-
-char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[],
- size_t word_len, int (*cmp_func)(const void *, const void *)) {
- char16 *found_w = static_cast<char16*>
- (mybsearch(last_hzs, buf_ + start_pos_[word_len - 1],
- (start_pos_[word_len] - start_pos_[word_len - 1])
- / word_len,
- sizeof(char16) * word_len, cmp_func));
-
- if (NULL == found_w)
- return NULL;
-
- while (found_w > buf_ + start_pos_[word_len -1] &&
- cmp_func(found_w, found_w - word_len) == 0)
- found_w -= word_len;
-
- return found_w;
-}
-
-size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len,
- NPredictItem *npre_items, size_t npre_max,
- size_t b4_used) {
- assert(hzs_len <= kMaxPredictSize && hzs_len > 0);
-
- // 1. Prepare work
- int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1];
-
- NGram& ngram = NGram::get_instance();
-
- size_t item_num = 0;
-
- // 2. Do prediction
- for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len;
- pre_len++) {
- uint16 word_len = hzs_len + pre_len;
- char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func);
- if (NULL == w_buf)
- continue;
- while (w_buf < buf_ + start_pos_[word_len] &&
- cmp_func(w_buf, last_hzs) == 0 &&
- item_num < npre_max) {
- memset(npre_items + item_num, 0, sizeof(NPredictItem));
- utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len);
- npre_items[item_num].psb =
- ngram.get_uni_psb((size_t)(w_buf - buf_ - start_pos_[word_len - 1])
- / word_len + start_id_[word_len - 1]);
- npre_items[item_num].his_len = hzs_len;
- item_num++;
- w_buf += word_len;
- }
- }
-
- size_t new_num = 0;
- for (size_t i = 0; i < item_num; i++) {
- // Try to find it in the existing items
- size_t e_pos;
- for (e_pos = 1; e_pos <= b4_used; e_pos++) {
- if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs,
- kMaxPredictSize) == 0)
- break;
- }
- if (e_pos <= b4_used)
- continue;
-
- // If not found, append it to the buffer
- npre_items[new_num] = npre_items[i];
- new_num++;
- }
-
- return new_num;
-}
-
-uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
- uint16 str_max) {
- if (!initialized_ || id_lemma >= start_id_[kMaxLemmaSize] || NULL == str_buf
- || str_max <= 1)
- return 0;
-
- // Find the range
- for (uint16 i = 0; i < kMaxLemmaSize; i++) {
- if (i + 1 > str_max - 1)
- return 0;
- if (start_id_[i] <= id_lemma && start_id_[i + 1] > id_lemma) {
- size_t id_span = id_lemma - start_id_[i];
-
- uint16 *buf = buf_ + start_pos_[i] + id_span * (i + 1);
- for (uint16 len = 0; len <= i; len++) {
- str_buf[len] = buf[len];
- }
- str_buf[i+1] = (char16)'\0';
- return i + 1;
- }
- }
- return 0;
-}
-
-uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid,
- uint16 *splids, uint16 max_splids) {
- char16 *hz_found = static_cast<char16*>
- (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1));
- assert(NULL != hz_found && hanzi == *hz_found);
-
- // Move to the first one.
- while (hz_found > scis_hz_ && hanzi == *(hz_found - 1))
- hz_found--;
-
- // First, check whether any entry strictly matches half_splid.
- char16 *hz_f = hz_found;
- bool strict = false;
- while (hz_f < scis_hz_ + scis_num_ && hanzi == *hz_f) {
- uint16 pos = hz_f - scis_hz_;
- if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) {
- strict = true;
- }
- hz_f++;
- }
-
- uint16 found_num = 0;
- while (hz_found < scis_hz_ + scis_num_ && hanzi == *hz_found) {
- uint16 pos = hz_found - scis_hz_;
- if (0 == half_splid ||
- (strict && scis_splid_[pos].half_splid == half_splid) ||
- (!strict && spl_trie_->half_full_compatible(half_splid,
- scis_splid_[pos].full_splid))) {
- assert(found_num + 1 < max_splids);
- splids[found_num] = scis_splid_[pos].full_splid;
- found_num++;
- }
- hz_found++;
- }
-
- return found_num;
-}
-
-LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) {
- if (NULL == str || str_len > kMaxLemmaSize)
- return 0;
-
- char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]);
- if (NULL == found)
- return 0;
-
- assert(found > buf_);
- assert(static_cast<size_t>(found - buf_) >= start_pos_[str_len - 1]);
- return static_cast<LemmaIdType>
- (start_id_[str_len - 1] +
- (found - buf_ - start_pos_[str_len - 1]) / str_len);
-}
-
-void DictList::convert_to_hanzis(char16 *str, uint16 str_len) {
- assert(NULL != str);
-
- for (uint16 str_pos = 0; str_pos < str_len; str_pos++) {
- str[str_pos] = scis_hz_[str[str_pos]];
- }
-}
-
-void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) {
- assert(NULL != str);
-
- for (uint16 str_pos = 0; str_pos < str_len; str_pos++) {
- str[str_pos] = 0x100;
- }
-}
-
-bool DictList::save_list(FILE *fp) {
- if (!initialized_ || NULL == fp)
- return false;
-
- if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] ||
- NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_)
- return false;
-
- if (fwrite(&scis_num_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fwrite(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) !=
- kMaxLemmaSize + 1)
- return false;
-
- if (fwrite(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) !=
- kMaxLemmaSize + 1)
- return false;
-
- if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_)
- return false;
-
- if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_)
- return false;
-
- if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) !=
- start_pos_[kMaxLemmaSize])
- return false;
-
- return true;
-}
-
-bool DictList::load_list(FILE *fp) {
- if (NULL == fp)
- return false;
-
- initialized_ = false;
-
- if (fread(&scis_num_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fread(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) !=
- kMaxLemmaSize + 1)
- return false;
-
- if (fread(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) !=
- kMaxLemmaSize + 1)
- return false;
-
- free_resource();
-
- if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_))
- return false;
-
- if (fread(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_)
- return false;
-
- if (fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_)
- return false;
-
- if (fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) !=
- start_pos_[kMaxLemmaSize])
- return false;
-
- initialized_ = true;
- return true;
-}
-} // namespace ime_pinyin
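Illustration only (not from the deleted sources): the index arithmetic that DictList::get_lemma_id() and get_lemma_str() above rely on. Words of each length are stored back to back in buf_, so a lemma id maps to a buffer offset (and back) using start_pos_[] and start_id_[] as per-length bases. The two helper names below and the raw-pointer parameters are assumptions made to keep the sketch standalone.

#include <cstddef>

// 'len' is the word length in Hanzi; start_pos/start_id stand in for the
// DictList members start_pos_ and start_id_.
size_t id_to_offset(size_t id, size_t len,
                    const size_t *start_pos, const size_t *start_id) {
  size_t id_span = id - start_id[len - 1];
  return start_pos[len - 1] + id_span * len;  // as in get_lemma_str()
}

size_t offset_to_id(size_t offset, size_t len,
                    const size_t *start_pos, const size_t *start_id) {
  // Inverse mapping, as computed at the end of get_lemma_id().
  return start_id[len - 1] + (offset - start_pos[len - 1]) / len;
}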
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp
deleted file mode 100644
index 0cdd0982..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/dicttrie.cpp
+++ /dev/null
@@ -1,941 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <stdio.h>
-#include <string.h>
-#include "../include/dicttrie.h"
-#include "../include/dictbuilder.h"
-#include "../include/lpicache.h"
-#include "../include/mystdlib.h"
-#include "../include/ngram.h"
-
-namespace ime_pinyin {
-
-DictTrie::DictTrie() {
- spl_trie_ = SpellingTrie::get_cpinstance();
-
- root_ = NULL;
- splid_le0_index_ = NULL;
- lma_node_num_le0_ = 0;
- nodes_ge1_ = NULL;
- lma_node_num_ge1_ = 0;
- lma_idx_buf_ = NULL;
- lma_idx_buf_len_ = 0;
- total_lma_num_ = 0;
- top_lmas_num_ = 0;
- dict_list_ = NULL;
-
- parsing_marks_ = NULL;
- mile_stones_ = NULL;
- reset_milestones(0, kFirstValidMileStoneHandle);
-}
-
-DictTrie::~DictTrie() {
- free_resource(true);
-}
-
-void DictTrie::free_resource(bool free_dict_list) {
- if (NULL != root_)
- free(root_);
- root_ = NULL;
-
- if (NULL != splid_le0_index_)
- free(splid_le0_index_);
- splid_le0_index_ = NULL;
-
- if (NULL != nodes_ge1_)
- free(nodes_ge1_);
- nodes_ge1_ = NULL;
-
- if (NULL != lma_idx_buf_)
- free(lma_idx_buf_);
- lma_idx_buf_ = NULL;
-
- if (free_dict_list) {
- if (NULL != dict_list_) {
- delete dict_list_;
- }
- dict_list_ = NULL;
- }
-
- if (parsing_marks_)
- delete [] parsing_marks_;
- parsing_marks_ = NULL;
-
- if (mile_stones_)
- delete [] mile_stones_;
- mile_stones_ = NULL;
-
- reset_milestones(0, kFirstValidMileStoneHandle);
-}
-
-inline size_t DictTrie::get_son_offset(const LmaNodeGE1 *node) {
- return ((size_t)node->son_1st_off_l + ((size_t)node->son_1st_off_h << 16));
-}
-
-inline size_t DictTrie::get_homo_idx_buf_offset(const LmaNodeGE1 *node) {
- return ((size_t)node->homo_idx_buf_off_l +
- ((size_t)node->homo_idx_buf_off_h << 16));
-}
-
-inline LemmaIdType DictTrie::get_lemma_id(size_t id_offset) {
- LemmaIdType id = 0;
- for (uint16 pos = kLemmaIdSize - 1; pos > 0; pos--)
- id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + pos];
- id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize];
- return id;
-}
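DictTrie::get_lemma_id() above reassembles a lemma id from consecutive bytes stored least-significant byte first in lma_idx_buf_. The sketch below shows the same unpacking in isolation; it assumes a 3-byte id purely for illustration (the real width comes from the library's kLemmaIdSize constant), and unpack_id is a hypothetical helper, not part of the deleted file.

#include <cstddef>
#include <cstdint>
#include <cstdio>

static uint32_t unpack_id(const unsigned char *buf, size_t offset, size_t id_size) {
  uint32_t id = 0;
  for (size_t pos = id_size; pos > 0; pos--)            // start at the highest byte ...
    id = (id << 8) + buf[offset * id_size + pos - 1];   // ... and shift down to byte 0
  return id;
}

int main() {
  unsigned char buf[] = {0x39, 0x30, 0x00,   // id 0: 0x003039 == 12345
                         0x01, 0x00, 0x01};  // id 1: 0x010001 == 65537
  printf("%u %u\n", unpack_id(buf, 0, 3), unpack_id(buf, 1, 3));
  return 0;
}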
-
-#ifdef ___BUILD_MODEL___
-bool DictTrie::build_dict(const char* fn_raw, const char* fn_validhzs) {
- DictBuilder* dict_builder = new DictBuilder();
-
- free_resource(true);
-
- return dict_builder->build_dict(fn_raw, fn_validhzs, this);
-}
-
-bool DictTrie::save_dict(FILE *fp) {
- if (NULL == fp)
- return false;
-
- if (fwrite(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fwrite(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fwrite(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fwrite(&top_lmas_num_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fwrite(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp)
- != lma_node_num_le0_)
- return false;
-
- if (fwrite(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp)
- != lma_node_num_ge1_)
- return false;
-
- if (fwrite(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) !=
- lma_idx_buf_len_)
- return false;
-
- return true;
-}
-
-bool DictTrie::save_dict(const char *filename) {
- if (NULL == filename)
- return false;
-
- if (NULL == root_ || NULL == dict_list_)
- return false;
-
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
- NGram &ngram = NGram::get_instance();
-
- FILE *fp = fopen(filename, "wb");
- if (NULL == fp)
- return false;
-
- if (!spl_trie.save_spl_trie(fp) || !dict_list_->save_list(fp) ||
- !save_dict(fp) || !ngram.save_ngram(fp)) {
- fclose(fp);
- return false;
- }
-
- fclose(fp);
- return true;
-}
-#endif // ___BUILD_MODEL___
-
-bool DictTrie::load_dict(FILE *fp) {
- if (NULL == fp)
- return false;
- if (fread(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fread(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fread(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fread(&top_lmas_num_, sizeof(uint32), 1, fp) != 1 ||
- top_lmas_num_ >= lma_idx_buf_len_)
- return false;
-
- free_resource(false);
-
- root_ = static_cast<LmaNodeLE0*>
- (malloc(lma_node_num_le0_ * sizeof(LmaNodeLE0)));
- nodes_ge1_ = static_cast<LmaNodeGE1*>
- (malloc(lma_node_num_ge1_ * sizeof(LmaNodeGE1)));
- lma_idx_buf_ = (unsigned char*)malloc(lma_idx_buf_len_);
- total_lma_num_ = lma_idx_buf_len_ / kLemmaIdSize;
-
- size_t buf_size = SpellingTrie::get_instance().get_spelling_num() + 1;
- assert(lma_node_num_le0_ <= buf_size);
- splid_le0_index_ = static_cast<uint16*>(malloc(buf_size * sizeof(uint16)));
-
- // Init the space for parsing.
- parsing_marks_ = new ParsingMark[kMaxParsingMark];
- mile_stones_ = new MileStone[kMaxMileStone];
- reset_milestones(0, kFirstValidMileStoneHandle);
-
- if (NULL == root_ || NULL == nodes_ge1_ || NULL == lma_idx_buf_ ||
- NULL == splid_le0_index_ || NULL == parsing_marks_ ||
- NULL == mile_stones_) {
- free_resource(false);
- return false;
- }
-
- if (fread(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp)
- != lma_node_num_le0_)
- return false;
-
- if (fread(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp)
- != lma_node_num_ge1_)
- return false;
-
- if (fread(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) !=
- lma_idx_buf_len_)
- return false;
-
- // The quick index for the first level sons
- uint16 last_splid = kFullSplIdStart;
- size_t last_pos = 0;
- for (size_t i = 1; i < lma_node_num_le0_; i++) {
- for (uint16 splid = last_splid; splid < root_[i].spl_idx; splid++)
- splid_le0_index_[splid - kFullSplIdStart] = last_pos;
-
- splid_le0_index_[root_[i].spl_idx - kFullSplIdStart] =
- static_cast<uint16>(i);
- last_splid = root_[i].spl_idx;
- last_pos = i;
- }
-
- for (uint16 splid = last_splid + 1;
- splid < buf_size + kFullSplIdStart; splid++) {
- assert(static_cast<size_t>(splid - kFullSplIdStart) < buf_size);
- splid_le0_index_[splid - kFullSplIdStart] = last_pos + 1;
- }
-
- return true;
-}
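The loop at the end of load_dict() above builds splid_le0_index_ so that a full spelling id maps directly to the root son covering it, with gaps between ids filled by the previous position. The sketch below mirrors that indexing with invented ids and sizes; it is an illustration, not code from the deleted file.

#include <cstdio>
#include <vector>

int main() {
  // Sorted ids held by elements 1..n-1 (element 0 plays the role of the root).
  std::vector<int> spl_idx = {0, 3, 5, 6, 9};   // hypothetical spelling ids
  const int base = 3, range = 8;                // ids base .. base + range - 1
  std::vector<int> index(range, 0);

  int last_id = base, last_pos = 0;
  for (size_t i = 1; i < spl_idx.size(); i++) {
    for (int id = last_id; id < spl_idx[i]; id++)
      index[id - base] = last_pos;              // gaps point at the previous element
    index[spl_idx[i] - base] = static_cast<int>(i);
    last_id = spl_idx[i];
    last_pos = static_cast<int>(i);
  }
  for (int id = last_id + 1; id < base + range; id++)
    index[id - base] = last_pos + 1;            // ids after the last real one

  for (int id = 0; id < range; id++)
    printf("id %d -> node %d\n", base + id, index[id]);
  return 0;
}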
-
-bool DictTrie::load_dict(const char *filename, LemmaIdType start_id,
- LemmaIdType end_id) {
- if (NULL == filename || end_id <= start_id)
- return false;
-
- FILE *fp = fopen(filename, "rb");
- if (NULL == fp)
- return false;
-
- free_resource(true);
-
- dict_list_ = new DictList();
- if (NULL == dict_list_) {
- fclose(fp);
- return false;
- }
-
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
- NGram &ngram = NGram::get_instance();
-
- if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) ||
- !load_dict(fp) || !ngram.load_ngram(fp) ||
- total_lma_num_ > end_id - start_id + 1) {
- free_resource(true);
- fclose(fp);
- return false;
- }
-
- fclose(fp);
- return true;
-}
-
-bool DictTrie::load_dict_fd(int sys_fd, long start_offset,
- long length, LemmaIdType start_id,
- LemmaIdType end_id) {
- if (start_offset < 0 || length <= 0 || end_id <= start_id)
- return false;
-
- FILE *fp = fdopen(sys_fd, "rb");
- if (NULL == fp)
- return false;
-
- if (-1 == fseek(fp, start_offset, SEEK_SET)) {
- fclose(fp);
- return false;
- }
-
- free_resource(true);
-
- dict_list_ = new DictList();
- if (NULL == dict_list_) {
- fclose(fp);
- return false;
- }
-
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
- NGram &ngram = NGram::get_instance();
-
- if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) ||
- !load_dict(fp) || !ngram.load_ngram(fp) ||
- ftell(fp) < start_offset + length ||
- total_lma_num_ > end_id - start_id + 1) {
- free_resource(true);
- fclose(fp);
- return false;
- }
-
- fclose(fp);
- return true;
-}
-
-size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max,
- LmaNodeLE0 *node) {
- size_t lpi_num = 0;
- NGram& ngram = NGram::get_instance();
- for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) {
- lpi_items[lpi_num].id = get_lemma_id(node->homo_idx_buf_off +
- homo);
- lpi_items[lpi_num].lma_len = 1;
- lpi_items[lpi_num].psb =
- static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id));
- lpi_num++;
- if (lpi_num >= lpi_max)
- break;
- }
-
- return lpi_num;
-}
-
-size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max,
- size_t homo_buf_off, LmaNodeGE1 *node,
- uint16 lma_len) {
- size_t lpi_num = 0;
- NGram& ngram = NGram::get_instance();
- for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) {
- lpi_items[lpi_num].id = get_lemma_id(homo_buf_off + homo);
- lpi_items[lpi_num].lma_len = lma_len;
- lpi_items[lpi_num].psb =
- static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id));
- lpi_num++;
- if (lpi_num >= lpi_max)
- break;
- }
-
- return lpi_num;
-}
-
-void DictTrie::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
- if (0 == from_step) {
- parsing_marks_pos_ = 0;
- mile_stones_pos_ = kFirstValidMileStoneHandle;
- } else {
- if (from_handle > 0 && from_handle < mile_stones_pos_) {
- mile_stones_pos_ = from_handle;
-
- MileStone *mile_stone = mile_stones_ + from_handle;
- parsing_marks_pos_ = mile_stone->mark_start;
- }
- }
-}
-
-MileStoneHandle DictTrie::extend_dict(MileStoneHandle from_handle,
- const DictExtPara *dep,
- LmaPsbItem *lpi_items, size_t lpi_max,
- size_t *lpi_num) {
- if (NULL == dep)
- return 0;
-
- // from LmaNodeLE0 (root) to LmaNodeLE0
- if (0 == from_handle) {
- assert(0 == dep->splids_extended);
- return extend_dict0(from_handle, dep, lpi_items, lpi_max, lpi_num);
- }
-
- // from LmaNodeLE0 to LmaNodeGE1
- if (1 == dep->splids_extended)
- return extend_dict1(from_handle, dep, lpi_items, lpi_max, lpi_num);
-
- // From LmaNodeGE1 to LmaNodeGE1
- return extend_dict2(from_handle, dep, lpi_items, lpi_max, lpi_num);
-}
-
-MileStoneHandle DictTrie::extend_dict0(MileStoneHandle from_handle,
- const DictExtPara *dep,
- LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num) {
- assert(NULL != dep && 0 == from_handle);
- *lpi_num = 0;
- MileStoneHandle ret_handle = 0;
-
- uint16 splid = dep->splids[dep->splids_extended];
- uint16 id_start = dep->id_start;
- uint16 id_num = dep->id_num;
-
- LpiCache& lpi_cache = LpiCache::get_instance();
- bool cached = lpi_cache.is_cached(splid);
-
- // 2. Begin extending
- // 2.1 Get the LmaPsbItem list
- LmaNodeLE0 *node = root_;
- size_t son_start = splid_le0_index_[id_start - kFullSplIdStart];
- size_t son_end = splid_le0_index_[id_start + id_num - kFullSplIdStart];
- for (size_t son_pos = son_start; son_pos < son_end; son_pos++) {
- assert(1 == node->son_1st_off);
- LmaNodeLE0 *son = root_ + son_pos;
- assert(son->spl_idx >= id_start && son->spl_idx < id_start + id_num);
-
- if (!cached && *lpi_num < lpi_max) {
- bool need_lpi = true;
- if (spl_trie_->is_half_id_yunmu(splid) && son_pos != son_start)
- need_lpi = false;
-
- if (need_lpi)
- *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
- lpi_max - *lpi_num, son);
- }
-
- // If necessary, fill in a new mile stone.
- if (son->spl_idx == id_start) {
- if (mile_stones_pos_ < kMaxMileStone &&
- parsing_marks_pos_ < kMaxParsingMark) {
- parsing_marks_[parsing_marks_pos_].node_offset = son_pos;
- parsing_marks_[parsing_marks_pos_].node_num = id_num;
- mile_stones_[mile_stones_pos_].mark_start = parsing_marks_pos_;
- mile_stones_[mile_stones_pos_].mark_num = 1;
- ret_handle = mile_stones_pos_;
- parsing_marks_pos_++;
- mile_stones_pos_++;
- }
- }
-
- if (son->spl_idx >= id_start + id_num -1)
- break;
- }
-
- // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
- // mile_stones_pos_);
- return ret_handle;
-}
-
-MileStoneHandle DictTrie::extend_dict1(MileStoneHandle from_handle,
- const DictExtPara *dep,
- LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num) {
- assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_);
-
- MileStoneHandle ret_handle = 0;
-
- // 1. If this is a half Id, get its corresponding full starting Id and
- // the number of full Ids.
- size_t ret_val = 0;
-
- uint16 id_start = dep->id_start;
- uint16 id_num = dep->id_num;
-
- // 2. Begin extending.
- MileStone *mile_stone = mile_stones_ + from_handle;
-
- for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) {
- ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos];
- uint16 ext_num = p_mark.node_num;
- for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) {
- LmaNodeLE0 *node = root_ + p_mark.node_offset + ext_pos;
- size_t found_start = 0;
- size_t found_num = 0;
- for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) {
- assert(node->son_1st_off <= lma_node_num_ge1_);
- LmaNodeGE1 *son = nodes_ge1_ + node->son_1st_off + son_pos;
- if (son->spl_idx >= id_start
- && son->spl_idx < id_start + id_num) {
- if (*lpi_num < lpi_max) {
- size_t homo_buf_off = get_homo_idx_buf_offset(son);
- *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
- lpi_max - *lpi_num, homo_buf_off, son,
- 2);
- }
-
- // If necessary, fill in the new DTMI
- if (0 == found_num) {
- found_start = son_pos;
- }
- found_num++;
- }
- if (son->spl_idx >= id_start + id_num - 1 || son_pos ==
- (size_t)node->num_of_son - 1) {
- if (found_num > 0) {
- if (mile_stones_pos_ < kMaxMileStone &&
- parsing_marks_pos_ < kMaxParsingMark) {
- parsing_marks_[parsing_marks_pos_].node_offset =
- node->son_1st_off + found_start;
- parsing_marks_[parsing_marks_pos_].node_num = found_num;
- if (0 == ret_val)
- mile_stones_[mile_stones_pos_].mark_start =
- parsing_marks_pos_;
- parsing_marks_pos_++;
- }
-
- ret_val++;
- }
- break;
- }
- } // for son_pos
- } // for ext_pos
- } // for h_pos
-
- if (ret_val > 0) {
- mile_stones_[mile_stones_pos_].mark_num = ret_val;
- ret_handle = mile_stones_pos_;
- mile_stones_pos_++;
- ret_val = 1;
- }
-
- // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
- // mile_stones_pos_);
- return ret_handle;
-}
-
-MileStoneHandle DictTrie::extend_dict2(MileStoneHandle from_handle,
- const DictExtPara *dep,
- LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num) {
- assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_);
-
- MileStoneHandle ret_handle = 0;
-
- // 1. If this is a half Id, get its corresponding full starting Id and
- // the number of full Ids.
- size_t ret_val = 0;
-
- uint16 id_start = dep->id_start;
- uint16 id_num = dep->id_num;
-
- // 2. Begin extending.
- MileStone *mile_stone = mile_stones_ + from_handle;
-
- for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) {
- ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos];
- uint16 ext_num = p_mark.node_num;
- for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) {
- LmaNodeGE1 *node = nodes_ge1_ + p_mark.node_offset + ext_pos;
- size_t found_start = 0;
- size_t found_num = 0;
-
- for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) {
- assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0);
- LmaNodeGE1 *son = nodes_ge1_ + get_son_offset(node) + son_pos;
- if (son->spl_idx >= id_start
- && son->spl_idx < id_start + id_num) {
- if (*lpi_num < lpi_max) {
- size_t homo_buf_off = get_homo_idx_buf_offset(son);
- *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
- lpi_max - *lpi_num, homo_buf_off, son,
- dep->splids_extended + 1);
- }
-
- // If necessary, fill in the new DTMI
- if (0 == found_num) {
- found_start = son_pos;
- }
- found_num++;
- }
- if (son->spl_idx >= id_start + id_num - 1 || son_pos ==
- (size_t)node->num_of_son - 1) {
- if (found_num > 0) {
- if (mile_stones_pos_ < kMaxMileStone &&
- parsing_marks_pos_ < kMaxParsingMark) {
- parsing_marks_[parsing_marks_pos_].node_offset =
- get_son_offset(node) + found_start;
- parsing_marks_[parsing_marks_pos_].node_num = found_num;
- if (0 == ret_val)
- mile_stones_[mile_stones_pos_].mark_start =
- parsing_marks_pos_;
- parsing_marks_pos_++;
- }
-
- ret_val++;
- }
- break;
- }
- } // for son_pos
- } // for ext_pos
- } // for h_pos
-
- if (ret_val > 0) {
- mile_stones_[mile_stones_pos_].mark_num = ret_val;
- ret_handle = mile_stones_pos_;
- mile_stones_pos_++;
- }
-
- // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
- // mile_stones_pos_);
- return ret_handle;
-}
-
-bool DictTrie::try_extend(const uint16 *splids, uint16 splid_num,
- LemmaIdType id_lemma) {
- if (0 == splid_num || NULL == splids)
- return false;
-
- void *node = root_ + splid_le0_index_[splids[0] - kFullSplIdStart];
-
- for (uint16 pos = 1; pos < splid_num; pos++) {
- if (1 == pos) {
- LmaNodeLE0 *node_le0 = reinterpret_cast<LmaNodeLE0*>(node);
- LmaNodeGE1 *node_son;
- uint16 son_pos;
- for (son_pos = 0; son_pos < static_cast<uint16>(node_le0->num_of_son);
- son_pos++) {
- assert(node_le0->son_1st_off <= lma_node_num_ge1_);
- node_son = nodes_ge1_ + node_le0->son_1st_off
- + son_pos;
- if (node_son->spl_idx == splids[pos])
- break;
- }
- if (son_pos < node_le0->num_of_son)
- node = reinterpret_cast<void*>(node_son);
- else
- return false;
- } else {
- LmaNodeGE1 *node_ge1 = reinterpret_cast<LmaNodeGE1*>(node);
- LmaNodeGE1 *node_son;
- uint16 son_pos;
- for (son_pos = 0; son_pos < static_cast<uint16>(node_ge1->num_of_son);
- son_pos++) {
- assert(node_ge1->son_1st_off_l > 0 || node_ge1->son_1st_off_h > 0);
- node_son = nodes_ge1_ + get_son_offset(node_ge1) + son_pos;
- if (node_son->spl_idx == splids[pos])
- break;
- }
- if (son_pos < node_ge1->num_of_son)
- node = reinterpret_cast<void*>(node_son);
- else
- return false;
- }
- }
-
- if (1 == splid_num) {
- LmaNodeLE0* node_le0 = reinterpret_cast<LmaNodeLE0*>(node);
- size_t num_of_homo = (size_t)node_le0->num_of_homo;
- for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
- LemmaIdType id_this = get_lemma_id(node_le0->homo_idx_buf_off + homo_pos);
- char16 str[2];
- get_lemma_str(id_this, str, 2);
- if (id_this == id_lemma)
- return true;
- }
- } else {
- LmaNodeGE1* node_ge1 = reinterpret_cast<LmaNodeGE1*>(node);
- size_t num_of_homo = (size_t)node_ge1->num_of_homo;
- for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
- size_t node_homo_off = get_homo_idx_buf_offset(node_ge1);
- if (get_lemma_id(node_homo_off + homo_pos) == id_lemma)
- return true;
- }
- }
-
- return false;
-}
-
-size_t DictTrie::get_lpis(const uint16* splid_str, uint16 splid_str_len,
- LmaPsbItem* lma_buf, size_t max_lma_buf) {
- if (splid_str_len > kMaxLemmaSize)
- return 0;
-
-#define MAX_EXTENDBUF_LEN 200
-
- size_t* node_buf1[MAX_EXTENDBUF_LEN]; // use size_t for data alignment
- size_t* node_buf2[MAX_EXTENDBUF_LEN];
- LmaNodeLE0** node_fr_le0 =
- reinterpret_cast<LmaNodeLE0**>(node_buf1); // Nodes from.
- LmaNodeLE0** node_to_le0 =
- reinterpret_cast<LmaNodeLE0**>(node_buf2); // Nodes to.
- LmaNodeGE1** node_fr_ge1 = NULL;
- LmaNodeGE1** node_to_ge1 = NULL;
- size_t node_fr_num = 1;
- size_t node_to_num = 0;
- node_fr_le0[0] = root_;
- if (NULL == node_fr_le0[0])
- return 0;
-
- size_t spl_pos = 0;
-
- while (spl_pos < splid_str_len) {
- uint16 id_num = 1;
- uint16 id_start = splid_str[spl_pos];
- // If it is a half id
- if (spl_trie_->is_half_id(splid_str[spl_pos])) {
- id_num = spl_trie_->half_to_full(splid_str[spl_pos], &id_start);
- assert(id_num > 0);
- }
-
- // Extend the nodes
- if (0 == spl_pos) { // From LmaNodeLE0 (root) to LmaNodeLE0 nodes
- for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
- LmaNodeLE0 *node = node_fr_le0[node_fr_pos];
- assert(node == root_ && 1 == node_fr_num);
- size_t son_start = splid_le0_index_[id_start - kFullSplIdStart];
- size_t son_end =
- splid_le0_index_[id_start + id_num - kFullSplIdStart];
- for (size_t son_pos = son_start; son_pos < son_end; son_pos++) {
- assert(1 == node->son_1st_off);
- LmaNodeLE0 *node_son = root_ + son_pos;
- assert(node_son->spl_idx >= id_start
- && node_son->spl_idx < id_start + id_num);
- if (node_to_num < MAX_EXTENDBUF_LEN) {
- node_to_le0[node_to_num] = node_son;
- node_to_num++;
- }
- // id_start + id_num - 1 is the last one, which has just been
- // recorded.
- if (node_son->spl_idx >= id_start + id_num - 1)
- break;
- }
- }
-
- spl_pos++;
- if (spl_pos >= splid_str_len || node_to_num == 0)
- break;
- // Prepare the nodes for next extending
- // next time, from LmaNodeLE0 to LmaNodeGE1
- LmaNodeLE0** node_tmp = node_fr_le0;
- node_fr_le0 = node_to_le0;
- node_to_le0 = NULL;
- node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_tmp);
- } else if (1 == spl_pos) { // From LmaNodeLE0 to LmaNodeGE1 nodes
- for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
- LmaNodeLE0 *node = node_fr_le0[node_fr_pos];
- for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son;
- son_pos++) {
- assert(node->son_1st_off <= lma_node_num_ge1_);
- LmaNodeGE1 *node_son = nodes_ge1_ + node->son_1st_off
- + son_pos;
- if (node_son->spl_idx >= id_start
- && node_son->spl_idx < id_start + id_num) {
- if (node_to_num < MAX_EXTENDBUF_LEN) {
- node_to_ge1[node_to_num] = node_son;
- node_to_num++;
- }
- }
- // id_start + id_num - 1 is the last one, which has just been
- // recorded.
- if (node_son->spl_idx >= id_start + id_num - 1)
- break;
- }
- }
-
- spl_pos++;
- if (spl_pos >= splid_str_len || node_to_num == 0)
- break;
- // Prepare the nodes for next extending
- // next time, from LmaNodeGE1 to LmaNodeGE1
- node_fr_ge1 = node_to_ge1;
- node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_fr_le0);
- node_fr_le0 = NULL;
- node_to_le0 = NULL;
- } else { // From LmaNodeGE1 to LmaNodeGE1 nodes
- for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
- LmaNodeGE1 *node = node_fr_ge1[node_fr_pos];
- for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son;
- son_pos++) {
- assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0);
- LmaNodeGE1 *node_son = nodes_ge1_
- + get_son_offset(node) + son_pos;
- if (node_son->spl_idx >= id_start
- && node_son->spl_idx < id_start + id_num) {
- if (node_to_num < MAX_EXTENDBUF_LEN) {
- node_to_ge1[node_to_num] = node_son;
- node_to_num++;
- }
- }
- // id_start + id_num - 1 is the last one, which has just been
- // recorded.
- if (node_son->spl_idx >= id_start + id_num - 1)
- break;
- }
- }
-
- spl_pos++;
- if (spl_pos >= splid_str_len || node_to_num == 0)
- break;
- // Prepare the nodes for next extending
- // next time, from LmaNodeGE1 to LmaNodeGE1
- LmaNodeGE1 **node_tmp = node_fr_ge1;
- node_fr_ge1 = node_to_ge1;
- node_to_ge1 = node_tmp;
- }
-
- // The number of node for next extending
- node_fr_num = node_to_num;
- node_to_num = 0;
- } // while
-
- if (0 == node_to_num)
- return 0;
-
- NGram &ngram = NGram::get_instance();
- size_t lma_num = 0;
-
- // If the length is 1, and the splid is a one-char Yunmu like 'a', 'o', 'e',
- // only those candidates for the full matched one-char id will be returned.
- if (1 == splid_str_len && spl_trie_->is_half_id_yunmu(splid_str[0]))
- node_to_num = node_to_num > 0 ? 1 : 0;
-
- for (size_t node_pos = 0; node_pos < node_to_num; node_pos++) {
- size_t num_of_homo = 0;
- if (spl_pos <= 1) { // Get from LmaNodeLE0 nodes
- LmaNodeLE0* node_le0 = node_to_le0[node_pos];
- num_of_homo = (size_t)node_le0->num_of_homo;
- for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
- size_t ch_pos = lma_num + homo_pos;
- lma_buf[ch_pos].id =
- get_lemma_id(node_le0->homo_idx_buf_off + homo_pos);
- lma_buf[ch_pos].lma_len = 1;
- lma_buf[ch_pos].psb =
- static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id));
-
- if (lma_num + homo_pos >= max_lma_buf - 1)
- break;
- }
- } else { // Get from LmaNodeGE1 nodes
- LmaNodeGE1* node_ge1 = node_to_ge1[node_pos];
- num_of_homo = (size_t)node_ge1->num_of_homo;
- for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
- size_t ch_pos = lma_num + homo_pos;
- size_t node_homo_off = get_homo_idx_buf_offset(node_ge1);
- lma_buf[ch_pos].id = get_lemma_id(node_homo_off + homo_pos);
- lma_buf[ch_pos].lma_len = splid_str_len;
- lma_buf[ch_pos].psb =
- static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id));
-
- if (lma_num + homo_pos >= max_lma_buf - 1)
- break;
- }
- }
-
- lma_num += num_of_homo;
- if (lma_num >= max_lma_buf) {
- lma_num = max_lma_buf;
- break;
- }
- }
- return lma_num;
-}
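get_lpis() above walks the trie one spelling at a time: a "from" buffer holds the nodes reached so far, a "to" buffer collects their matching sons, and the two are swapped for the next step. The miniature below shows only that double-buffer expansion; the child table and step count are made up and the snippet is not part of the deleted file.

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Hypothetical children per node; node 0 is the root.
  std::vector<std::vector<int>> children = {{1, 2}, {3}, {4, 5}, {}, {}, {}};

  std::vector<int> frontier_from = {0};   // nodes reached so far
  std::vector<int> frontier_to;           // nodes reached after one more step

  for (int step = 0; step < 2; step++) {
    frontier_to.clear();
    for (int node : frontier_from)
      for (int son : children[node])
        frontier_to.push_back(son);         // extend every node by one level
    std::swap(frontier_from, frontier_to);  // reuse the buffers next step
    printf("after step %d: %zu nodes\n", step + 1, frontier_from.size());
  }
  return 0;
}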
-
-uint16 DictTrie::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
- uint16 str_max) {
- return dict_list_->get_lemma_str(id_lemma, str_buf, str_max);
-}
-
-uint16 DictTrie::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
- uint16 splids_max, bool arg_valid) {
- char16 lma_str[kMaxLemmaSize + 1];
- uint16 lma_len = get_lemma_str(id_lemma, lma_str, kMaxLemmaSize + 1);
- assert((!arg_valid && splids_max >= lma_len) || lma_len == splids_max);
-
- uint16 spl_mtrx[kMaxLemmaSize * 5];
- uint16 spl_start[kMaxLemmaSize + 1];
- spl_start[0] = 0;
- uint16 try_num = 1;
-
- for (uint16 pos = 0; pos < lma_len; pos++) {
- uint16 cand_splids_this = 0;
- if (arg_valid && spl_trie_->is_full_id(splids[pos])) {
- spl_mtrx[spl_start[pos]] = splids[pos];
- cand_splids_this = 1;
- } else {
- cand_splids_this = dict_list_->get_splids_for_hanzi(lma_str[pos],
- arg_valid ? splids[pos] : 0, spl_mtrx + spl_start[pos],
- kMaxLemmaSize * 5 - spl_start[pos]);
- assert(cand_splids_this > 0);
- }
- spl_start[pos + 1] = spl_start[pos] + cand_splids_this;
- try_num *= cand_splids_this;
- }
-
- for (uint16 try_pos = 0; try_pos < try_num; try_pos++) {
- uint16 mod = 1;
- for (uint16 pos = 0; pos < lma_len; pos++) {
- uint16 radix = spl_start[pos + 1] - spl_start[pos];
- splids[pos] = spl_mtrx[ spl_start[pos] + try_pos / mod % radix];
- mod *= radix;
- }
-
- if (try_extend(splids, lma_len, id_lemma))
- return lma_len;
- }
-
- return 0;
-}
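The try_pos / mod % radix arithmetic in get_lemma_splids() above enumerates every combination of candidate spelling ids as a mixed-radix number, one digit per Hanzi. The stand-alone sketch below reproduces that enumeration with hypothetical candidate lists; it is illustrative only and not part of the deleted file.

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical candidates: 2 choices for position 0, 3 for position 1.
  std::vector<std::vector<int>> cands = {{11, 12}, {21, 22, 23}};
  unsigned try_num = 1;
  for (const auto &c : cands) try_num *= static_cast<unsigned>(c.size());

  for (unsigned try_pos = 0; try_pos < try_num; try_pos++) {
    unsigned mod = 1;
    for (size_t pos = 0; pos < cands.size(); pos++) {
      unsigned radix = static_cast<unsigned>(cands[pos].size());
      printf("%d ", cands[pos][try_pos / mod % radix]);  // digit for this position
      mod *= radix;
    }
    printf("\n");  // each line is one combination that try_extend() would test
  }
  return 0;
}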
-
-void DictTrie::set_total_lemma_count_of_others(size_t count) {
- NGram& ngram = NGram::get_instance();
- ngram.set_total_freq_none_sys(count);
-}
-
-void DictTrie::convert_to_hanzis(char16 *str, uint16 str_len) {
- return dict_list_->convert_to_hanzis(str, str_len);
-}
-
-void DictTrie::convert_to_scis_ids(char16 *str, uint16 str_len) {
- return dict_list_->convert_to_scis_ids(str, str_len);
-}
-
-LemmaIdType DictTrie::get_lemma_id(const char16 lemma_str[], uint16 lemma_len) {
- if (NULL == lemma_str || lemma_len > kMaxLemmaSize)
- return 0;
-
- return dict_list_->get_lemma_id(lemma_str, lemma_len);
-}
-
-size_t DictTrie::predict_top_lmas(size_t his_len, NPredictItem *npre_items,
- size_t npre_max, size_t b4_used) {
- NGram &ngram = NGram::get_instance();
-
- size_t item_num = 0;
- size_t top_lmas_id_offset = lma_idx_buf_len_ / kLemmaIdSize - top_lmas_num_;
- size_t top_lmas_pos = 0;
- while (item_num < npre_max && top_lmas_pos < top_lmas_num_) {
- memset(npre_items + item_num, 0, sizeof(NPredictItem));
- LemmaIdType top_lma_id = get_lemma_id(top_lmas_id_offset + top_lmas_pos);
- top_lmas_pos += 1;
- if (dict_list_->get_lemma_str(top_lma_id,
- npre_items[item_num].pre_hzs,
- kMaxLemmaSize - 1) == 0) {
- continue;
- }
- npre_items[item_num].psb = ngram.get_uni_psb(top_lma_id);
- npre_items[item_num].his_len = his_len;
- item_num++;
- }
- return item_num;
-}
-
-size_t DictTrie::predict(const char16 *last_hzs, uint16 hzs_len,
- NPredictItem *npre_items, size_t npre_max,
- size_t b4_used) {
- return dict_list_->predict(last_hzs, hzs_len, npre_items, npre_max, b4_used);
-}
-} // namespace ime_pinyin
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/lpicache.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/lpicache.cpp
deleted file mode 100644
index 4bb4ca26..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/lpicache.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include "../include/lpicache.h"
-
-namespace ime_pinyin {
-
-LpiCache* LpiCache::instance_ = NULL;
-
-LpiCache::LpiCache() {
- lpi_cache_ = new LmaPsbItem[kFullSplIdStart * kMaxLpiCachePerId];
- lpi_cache_len_ = new uint16[kFullSplIdStart];
- assert(NULL != lpi_cache_);
- assert(NULL != lpi_cache_len_);
- for (uint16 id = 0; id < kFullSplIdStart; id++)
- lpi_cache_len_[id] = 0;
-}
-
-LpiCache::~LpiCache() {
- if (NULL != lpi_cache_)
- delete [] lpi_cache_;
-
- if (NULL != lpi_cache_len_)
- delete [] lpi_cache_len_;
-}
-
-LpiCache& LpiCache::get_instance() {
- if (NULL == instance_) {
- instance_ = new LpiCache();
- assert(NULL != instance_);
- }
- return *instance_;
-}
-
-bool LpiCache::is_cached(uint16 splid) {
- if (splid >= kFullSplIdStart)
- return false;
- return lpi_cache_len_[splid] != 0;
-}
-
-size_t LpiCache::put_cache(uint16 splid, LmaPsbItem lpi_items[],
- size_t lpi_num) {
- uint16 num = kMaxLpiCachePerId;
- if (num > lpi_num)
- num = static_cast<uint16>(lpi_num);
-
- LmaPsbItem *lpi_cache_this = lpi_cache_ + splid * kMaxLpiCachePerId;
- for (uint16 pos = 0; pos < num; pos++)
- lpi_cache_this[pos] = lpi_items[pos];
-
- lpi_cache_len_[splid] = num;
- return num;
-}
-
-size_t LpiCache::get_cache(uint16 splid, LmaPsbItem lpi_items[],
- size_t lpi_max) {
- if (lpi_max > lpi_cache_len_[splid])
- lpi_max = lpi_cache_len_[splid];
-
- LmaPsbItem *lpi_cache_this = lpi_cache_ + splid * kMaxLpiCachePerId;
- for (uint16 pos = 0; pos < lpi_max; pos++) {
- lpi_items[pos] = lpi_cache_this[pos];
- }
- return lpi_max;
-}
-
-} // namespace ime_pinyin
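LpiCache above keeps all cached items in one flat array: every spelling id owns a fixed row of kMaxLpiCachePerId slots, plus a separate per-id length. The few lines below illustrate that layout with invented sizes and values; they are a sketch, not part of the deleted file.

#include <cstdio>
#include <cstring>

int main() {
  const int kIds = 4, kSlotsPerId = 3;
  int cache[kIds * kSlotsPerId];
  int cache_len[kIds] = {0};

  // put: copy at most kSlotsPerId items into the row owned by id 2.
  int items[] = {7, 8, 9, 10};
  int num = sizeof(items) / sizeof(items[0]);
  if (num > kSlotsPerId) num = kSlotsPerId;
  memcpy(cache + 2 * kSlotsPerId, items, num * sizeof(int));
  cache_len[2] = num;

  // get: read back at most cache_len[2] items from the same row.
  for (int pos = 0; pos < cache_len[2]; pos++)
    printf("%d ", cache[2 * kSlotsPerId + pos]);
  printf("\n");
  return 0;
}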
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/matrixsearch.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/matrixsearch.cpp
deleted file mode 100644
index 41e11433..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/matrixsearch.cpp
+++ /dev/null
@@ -1,1981 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include "../include/lpicache.h"
-#include "../include/matrixsearch.h"
-#include "../include/mystdlib.h"
-#include "../include/ngram.h"
-#include "../include/userdict.h"
-
-namespace ime_pinyin {
-
-#define PRUMING_SCORE 8000.0
-
-MatrixSearch::MatrixSearch() {
- inited_ = false;
- spl_trie_ = SpellingTrie::get_cpinstance();
-
- reset_pointers_to_null();
-
- pys_decoded_len_ = 0;
- mtrx_nd_pool_used_ = 0;
- dmi_pool_used_ = 0;
- xi_an_enabled_ = false;
- dmi_c_phrase_ = false;
-
- assert(kMaxSearchSteps > 0);
- max_sps_len_ = kMaxSearchSteps - 1;
- max_hzs_len_ = kMaxSearchSteps;
-}
-
-MatrixSearch::~MatrixSearch() {
- free_resource();
-}
-
-void MatrixSearch::reset_pointers_to_null() {
- dict_trie_ = NULL;
- user_dict_ = NULL;
- spl_parser_ = NULL;
-
- share_buf_ = NULL;
-
- // The following four buffers are used for decoding, and they are based on
- // share_buf_, no need to delete them.
- mtrx_nd_pool_ = NULL;
- dmi_pool_ = NULL;
- matrix_ = NULL;
- dep_ = NULL;
-
- // Based on share_buf_, no need to delete them.
- npre_items_ = NULL;
-}
-
-bool MatrixSearch::alloc_resource() {
- free_resource();
-
- dict_trie_ = new DictTrie();
- user_dict_ = static_cast<AtomDictBase*>(new UserDict());
- spl_parser_ = new SpellingParser();
-
- size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
- mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t);
- size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
- dmi_size = align_to_size_t(dmi_size) / sizeof(size_t);
- size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum;
- matrix_size = align_to_size_t(matrix_size) / sizeof(size_t);
- size_t dep_size = sizeof(DictExtPara);
- dep_size = align_to_size_t(dep_size) / sizeof(size_t);
-
- // share_buf's size is determined by the buffers for search.
- share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size];
-
- if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
- NULL == share_buf_)
- return false;
-
- // The buffers for search are based on the share buffer
- mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
- dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
- matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
- dep_ = reinterpret_cast<DictExtPara*>
- (share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
-
- // The prediction buffer is also based on the share buffer.
- npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
- npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
- sizeof(size_t) / sizeof(NPredictItem);
- return true;
-}
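alloc_resource() above sizes each decoding pool in whole size_t units and carves them all out of the single share_buf_ allocation, so one new[] backs several differently typed buffers. The reduced sketch below shows only that technique; the struct names and element counts are invented and the snippet is not part of the deleted file.

#include <cstdio>
#include <cstdlib>

static size_t align_to_size_t(size_t bytes) {            // round up to sizeof(size_t)
  return (bytes + sizeof(size_t) - 1) / sizeof(size_t) * sizeof(size_t);
}

struct NodeA { int x; };
struct NodeB { double y; };

int main() {
  const size_t a_num = 100, b_num = 50;
  size_t a_words = align_to_size_t(sizeof(NodeA) * a_num) / sizeof(size_t);
  size_t b_words = align_to_size_t(sizeof(NodeB) * b_num) / sizeof(size_t);

  size_t *share_buf = new size_t[a_words + b_words];      // one shared allocation
  NodeA *a_pool = reinterpret_cast<NodeA*>(share_buf);
  NodeB *b_pool = reinterpret_cast<NodeB*>(share_buf + a_words);

  printf("%zu words for A, %zu words for B\n", a_words, b_words);
  (void)a_pool; (void)b_pool;
  delete [] share_buf;                                    // frees every pool at once
  return 0;
}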
-
-void MatrixSearch::free_resource() {
- if (NULL != dict_trie_)
- delete dict_trie_;
-
- if (NULL != user_dict_)
- delete user_dict_;
-
- if (NULL != spl_parser_)
- delete spl_parser_;
-
- if (NULL != share_buf_)
- delete [] share_buf_;
-
- reset_pointers_to_null();
-}
-
-bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
- if (NULL == fn_sys_dict || NULL == fn_usr_dict)
- return false;
-
- if (!alloc_resource())
- return false;
-
- if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
- return false;
-
- // If the engine fails to load the user dictionary, reset the user
- // dictionary pointer to NULL.
- if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
- delete user_dict_;
- user_dict_ = NULL;
- } else {
- user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
- }
-
- reset_search0();
-
- inited_ = true;
- return true;
-}
-
-bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
- const char *fn_usr_dict) {
- if (NULL == fn_usr_dict)
- return false;
-
- if (!alloc_resource())
- return false;
-
- if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
- return false;
-
- if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
- delete user_dict_;
- user_dict_ = NULL;
- } else {
- user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
- }
-
- reset_search0();
-
- inited_ = true;
- return true;
-}
-
-void MatrixSearch::init_user_dictionary(const char *fn_usr_dict) {
- assert(inited_);
-
- if (NULL != user_dict_) {
- delete user_dict_;
- user_dict_ = NULL;
- }
-
- if (NULL != fn_usr_dict) {
- user_dict_ = static_cast<AtomDictBase*>(new UserDict());
- if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
- delete user_dict_;
- user_dict_ = NULL;
- }
- }
-
- reset_search0();
-}
-
-bool MatrixSearch::is_user_dictionary_enabled() const {
- return NULL != user_dict_;
-}
-
-void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) {
- if (0 != max_sps_len)
- max_sps_len_ = max_sps_len;
- if (0 != max_hzs_len)
- max_hzs_len_ = max_hzs_len;
-}
-
-void MatrixSearch::close() {
- flush_cache();
- free_resource();
- inited_ = false;
-}
-
-void MatrixSearch::flush_cache() {
- if (NULL != user_dict_)
- user_dict_->flush_cache();
-}
-
-void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
- xi_an_enabled_ = xi_an_enabled;
-}
-
-bool MatrixSearch::get_xi_an_switch() {
- return xi_an_enabled_;
-}
-
-bool MatrixSearch::reset_search() {
- if (!inited_)
- return false;
- return reset_search0();
-}
-
-bool MatrixSearch::reset_search0() {
- if (!inited_)
- return false;
-
- pys_decoded_len_ = 0;
- mtrx_nd_pool_used_ = 0;
- dmi_pool_used_ = 0;
-
- // Get a MatrixNode from the pool
- matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
- matrix_[0].mtrx_nd_num = 1;
- mtrx_nd_pool_used_ += 1;
-
- // Update the node, and make it a starting node
- MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
- node->id = 0;
- node->score = 0;
- node->from = NULL;
- node->step = 0;
- node->dmi_fr = (PoolPosType)-1;
-
- matrix_[0].dmi_pos = 0;
- matrix_[0].dmi_num = 0;
- matrix_[0].dmi_has_full_id = 1;
- matrix_[0].mtrx_nd_fixed = node;
-
- lma_start_[0] = 0;
- fixed_lmas_ = 0;
- spl_start_[0] = 0;
- fixed_hzs_ = 0;
-
- dict_trie_->reset_milestones(0, 0);
- if (NULL != user_dict_)
- user_dict_->reset_milestones(0, 0);
-
- return true;
-}
-
-bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step,
- bool clear_dmi_this_step,
- bool clear_mtrx_this_step) {
- if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
- return false;
-
- if (0 == ch_pos) {
- reset_search0();
- } else {
- // Prepare mile stones of this step to clear.
- MileStoneHandle *dict_handles_to_clear = NULL;
- if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
- dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
- }
-
- // If there are more steps, and this step is not allowed to clear, find
- // milestones of next step.
- if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
- dict_handles_to_clear = NULL;
- if (matrix_[ch_pos + 1].dmi_num > 0) {
- dict_handles_to_clear =
- dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
- }
- }
-
- if (NULL != dict_handles_to_clear) {
- dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
- if (NULL != user_dict_)
- user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
- }
-
- pys_decoded_len_ = ch_pos;
-
- if (clear_dmi_this_step) {
- dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
- + matrix_[ch_pos - 1].dmi_num;
- matrix_[ch_pos].dmi_num = 0;
- } else {
- dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
- }
-
- if (clear_mtrx_this_step) {
- mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
- + matrix_[ch_pos - 1].mtrx_nd_num;
- matrix_[ch_pos].mtrx_nd_num = 0;
- } else {
- mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
- + matrix_[ch_pos].mtrx_nd_num;
- }
-
- // Modify fixed_hzs_
- if (fixed_hzs_ > 0 &&
- ((kLemmaIdComposing != lma_id_[0]) ||
- (kLemmaIdComposing == lma_id_[0] &&
- spl_start_[c_phrase_.length] <= ch_pos))) {
- size_t fixed_ch_pos = ch_pos;
- if (clear_fixed_this_step)
- fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
- while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
- fixed_ch_pos--;
-
- fixed_lmas_ = 0;
- fixed_hzs_ = 0;
- if (fixed_ch_pos > 0) {
- while (spl_start_[fixed_hzs_] < fixed_ch_pos)
- fixed_hzs_++;
- assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
-
- while (lma_start_[fixed_lmas_] < fixed_hzs_)
- fixed_lmas_++;
- assert(lma_start_[fixed_lmas_] == fixed_hzs_);
- }
-
- // Re-search the Pinyin string for the unlocked lemma
- // which was previously fixed.
- //
- // Prepare mile stones of this step to clear.
- MileStoneHandle *dict_handles_to_clear = NULL;
- if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
- matrix_[fixed_ch_pos].dmi_num > 0) {
- dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
- }
-
- // If there are more steps, and this step is not allowed to clear, find
- // milestones of next step.
- if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
- dict_handles_to_clear = NULL;
- if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
- dict_handles_to_clear =
- dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
- }
- }
-
- if (NULL != dict_handles_to_clear) {
- dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
- if (NULL != user_dict_)
- user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
- }
-
-
- pys_decoded_len_ = fixed_ch_pos;
-
- if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
- dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
- + matrix_[fixed_ch_pos - 1].dmi_num;
- matrix_[fixed_ch_pos].dmi_num = 0;
- } else {
- dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
- matrix_[fixed_ch_pos].dmi_num;
- }
-
- if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
- mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
- + matrix_[fixed_ch_pos - 1].mtrx_nd_num;
- matrix_[fixed_ch_pos].mtrx_nd_num = 0;
- } else {
- mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
- + matrix_[fixed_ch_pos].mtrx_nd_num;
- }
-
- for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
- add_char(pys_[re_pos]);
- }
- } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
- for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
- uint16 splpos_begin = c_phrase_.sublma_start[subpos];
- uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
- for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
- // If ch_pos is in this spelling
- uint16 spl_start = c_phrase_.spl_start[splpos];
- uint16 spl_end = c_phrase_.spl_start[splpos + 1];
- if (ch_pos >= spl_start && ch_pos < spl_end) {
- // Clear everything after this position
- c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
- c_phrase_.sublma_start[subpos + 1] = splpos;
- c_phrase_.sublma_num = subpos + 1;
- c_phrase_.length = splpos;
-
- if (splpos == splpos_begin) {
- c_phrase_.sublma_num = subpos;
- }
- }
- }
- }
-
- // Extend the composing phrase.
- reset_search0();
- dmi_c_phrase_ = true;
- uint16 c_py_pos = 0;
- while (c_py_pos < spl_start_[c_phrase_.length]) {
- bool b_ac_tmp = add_char(pys_[c_py_pos]);
- assert(b_ac_tmp);
- c_py_pos++;
- }
- dmi_c_phrase_ = false;
-
- lma_id_num_ = 1;
- fixed_lmas_ = 1;
- fixed_lmas_no1_[0] = 0; // A composing string is always modified.
- fixed_hzs_ = c_phrase_.length;
- lma_start_[1] = fixed_hzs_;
- lma_id_[0] = kLemmaIdComposing;
- matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
- matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
- }
- }
-
- return true;
-}
-
-void MatrixSearch::del_in_pys(size_t start, size_t len) {
- while (start < kMaxRowNum - len && '\0' != pys_[start]) {
- pys_[start] = pys_[start + len];
- start++;
- }
-}
-
-size_t MatrixSearch::search(const char *py, size_t py_len) {
- if (!inited_ || NULL == py)
- return 0;
-
- // If the search Pinyin string is too long, it will be truncated.
- if (py_len > kMaxRowNum - 1)
- py_len = kMaxRowNum - 1;
-
- // Compare the new string with the previous one. Find their common prefix to
- // increase search efficiency.
- size_t ch_pos = 0;
- for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
- if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
- break;
- }
-
- bool clear_fix = true;
- if (ch_pos == pys_decoded_len_)
- clear_fix = false;
-
- reset_search(ch_pos, clear_fix, false, false);
-
- memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
- pys_[py_len] = '\0';
-
- while ('\0' != pys_[ch_pos]) {
- if (!add_char(py[ch_pos])) {
- pys_decoded_len_ = ch_pos;
- break;
- }
- ch_pos++;
- }
-
- // Get spelling ids and starting positions.
- get_spl_start_id();
-
- // If there are too many spellings, remove the last letter until the spelling
- // number is acceptable.
- while (spl_id_num_ > 9) {
- py_len--;
- reset_search(py_len, false, false, false);
- pys_[py_len] = '\0';
- get_spl_start_id();
- }
-
- prepare_candidates();
-
- if (kPrintDebug0) {
- printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
- printf("--DMI Pool Used: %d\n", dmi_pool_used_);
-
- if (kPrintDebug1) {
- for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
- debug_print_dmi(pos, 1);
- }
- }
- }
-
- return ch_pos;
-}
-
-size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid,
- bool clear_fixed_this_step) {
- if (!inited_)
- return 0;
-
- size_t reset_pos = pos;
-
- // Out of range for both Pinyin mode and Spelling id mode.
- if (pys_decoded_len_ <= pos) {
- del_in_pys(pos, 1);
-
- reset_pos = pys_decoded_len_;
- // Decode the string after the un-decoded position
- while ('\0' != pys_[reset_pos]) {
- if (!add_char(pys_[reset_pos])) {
- pys_decoded_len_ = reset_pos;
- break;
- }
- reset_pos++;
- }
- get_spl_start_id();
- prepare_candidates();
- return pys_decoded_len_;
- }
-
- // Spelling id mode, but out of range.
- if (is_pos_in_splid && spl_id_num_ <= pos)
- return pys_decoded_len_;
-
- // Begin to handle two modes respectively.
- // Pinyin mode by default
- size_t c_py_len = 0; // The length of composing phrase's Pinyin
- size_t del_py_len = 1;
- if (!is_pos_in_splid) {
- // In Pinyin mode, deletion is only allowed beyond the fixed lemmas.
- if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
- return pys_decoded_len_;
-
- del_in_pys(pos, 1);
-
- // If the deleted character is just the one after the last fixed lemma
- if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
- // If all fixed lemmas have been merged, and the caller of the function
- // requests to unlock the last fixed lemma.
- if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
- // Unlock the last sub lemma in the composing phrase. Because it is not
- // easy to unlock it directly, we re-decode the modified composing
- // phrase instead.
- c_phrase_.sublma_num--;
- c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
- reset_pos = spl_start_[c_phrase_.length];
- c_py_len = reset_pos;
- }
- }
- } else {
- del_py_len = spl_start_[pos + 1] - spl_start_[pos];
-
- del_in_pys(spl_start_[pos], del_py_len);
-
- if (pos >= lma_start_[fixed_lmas_]) {
- c_py_len = 0;
- reset_pos = spl_start_[pos + 1] - del_py_len;
- } else {
- c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
- reset_pos = c_py_len;
- if (c_py_len > 0)
- merge_fixed_lmas(pos);
- }
- }
-
- if (c_py_len > 0) {
- assert(c_phrase_.length > 0 && c_py_len ==
- c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
- // The composing phrase is valid, reset all search space,
- // and begin a new search which will only extend the composing
- // phrase.
- reset_search0();
-
- dmi_c_phrase_ = true;
- // Extend the composing phrase.
- uint16 c_py_pos = 0;
- while (c_py_pos < c_py_len) {
- bool b_ac_tmp = add_char(pys_[c_py_pos]);
- assert(b_ac_tmp);
- c_py_pos++;
- }
- dmi_c_phrase_ = false;
-
- // Fix the composing phrase as the first choice.
- lma_id_num_ = 1;
- fixed_lmas_ = 1;
- fixed_lmas_no1_[0] = 0; // A composing string is always modified.
- fixed_hzs_ = c_phrase_.length;
- lma_start_[1] = fixed_hzs_;
- lma_id_[0] = kLemmaIdComposing;
- matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
- matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
- } else {
- // Resetting the search only clears pys_decoded_len_; the string is kept.
- reset_search(reset_pos, clear_fixed_this_step, false, false);
- }
-
- // Decode the string after the delete position.
- while ('\0' != pys_[reset_pos]) {
- if (!add_char(pys_[reset_pos])) {
- pys_decoded_len_ = reset_pos;
- break;
- }
- reset_pos++;
- }
-
- get_spl_start_id();
- prepare_candidates();
- return pys_decoded_len_;
-}
-
-size_t MatrixSearch::get_candidate_num() {
- if (!inited_ || 0 == pys_decoded_len_ ||
- 0 == matrix_[pys_decoded_len_].mtrx_nd_num)
- return 0;
-
- return 1 + lpi_total_;
-}
-
-char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str,
- size_t max_len) {
- if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
- return NULL;
-
- if (0 == cand_id) {
- return get_candidate0(cand_str, max_len, NULL, false);
- } else {
- cand_id--;
- }
-
- // For this case: the current sentence is a single word and the user has
- // fixed it, so the result will be fixed to the sentence space, and
- // lpi_total_ will be set to 0.
- if (0 == lpi_total_) {
- return get_candidate0(cand_str, max_len, NULL, false);
- }
-
- LemmaIdType id = lpi_items_[cand_id].id;
- char16 s[kMaxLemmaSize + 1];
-
- uint16 s_len = lpi_items_[cand_id].lma_len;
- if (s_len > 1) {
- s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
- } else {
- // For a single character, Hanzi is ready.
- s[0] = lpi_items_[cand_id].hanzi;
- s[1] = static_cast<char16>(0);
- }
-
- if (s_len > 0 && max_len > s_len) {
- utf16_strncpy(cand_str, s, s_len);
- cand_str[s_len] = (char16)'\0';
- return cand_str;
- }
-
- return NULL;
-}
-
-void MatrixSearch::update_dict_freq() {
- if (NULL != user_dict_) {
- // Update the total frequency of all lemmas, including system lemmas and
- // user dictionary lemmas.
- size_t total_freq = user_dict_->get_total_lemma_count();
- dict_trie_->set_total_lemma_count_of_others(total_freq);
- }
-}
-
-bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
- float score) {
- if (lma_to - lma_fr <= 1 || NULL == user_dict_)
- return false;
-
- char16 word_str[kMaxLemmaSize + 1];
- uint16 spl_ids[kMaxLemmaSize];
-
- uint16 spl_id_fr = 0;
-
- for (uint16 pos = lma_fr; pos < lma_to; pos++) {
- LemmaIdType lma_id = lma_id_[pos];
- if (is_user_lemma(lma_id)) {
- user_dict_->update_lemma(lma_id, 1, true);
- }
- uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
- utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
-
- uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
- kMaxLemmaSize + 1 - spl_id_fr);
- assert(tmp == lma_len);
-
- tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
- if (tmp != lma_len) {
- return false;
- }
-
- spl_id_fr += lma_len;
- }
-
- assert(spl_id_fr <= kMaxLemmaSize);
-
- return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
- spl_id_fr, 1);
-}
-
-void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
- if (dmi_pos >= dmi_pool_used_) return;
-
- DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
-
- if (1 == nest_level) {
- printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
- }
- if (dmi->dict_level > 1) {
- debug_print_dmi(dmi->dmi_fr, nest_level + 1);
- }
- printf("---%d\n", dmi->dict_level);
- printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
- printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
- get_spelling_str(dmi->spl_id), dmi->spl_id);
- printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
- if (1 == nest_level) {
- printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
- }
-}
-
-bool MatrixSearch::try_add_cand0_to_userdict() {
- size_t new_cand_num = get_candidate_num();
- if (fixed_hzs_ > 0 && 1 == new_cand_num) {
- float score_from = 0;
- uint16 lma_id_from = 0;
- uint16 pos = 0;
- bool modified = false;
- while (pos < fixed_lmas_) {
- if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
- static_cast<uint16>(kMaxLemmaSize)) {
- float score_to_add =
- mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
- .mtrx_nd_pos].score - score_from;
- if (modified) {
- score_to_add += 1.0;
- if (score_to_add > NGram::kMaxScore) {
- score_to_add = NGram::kMaxScore;
- }
- add_lma_to_userdict(lma_id_from, pos, score_to_add);
- }
- lma_id_from = pos;
- score_from += score_to_add;
-
- // Clear the flag for next user lemma.
- modified = false;
- }
-
- if (0 == fixed_lmas_no1_[pos]) {
- modified = true;
- }
- pos++;
- }
-
- // A single-character word is not allowed to be added to the user dictionary.
- if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
- float score_to_add =
- mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
- .mtrx_nd_pos].score - score_from;
- if (modified) {
- score_to_add += 1.0;
- if (score_to_add > NGram::kMaxScore) {
- score_to_add = NGram::kMaxScore;
- }
- add_lma_to_userdict(lma_id_from, pos, score_to_add);
- }
- }
- }
- return true;
-}
-
- // Choose a candidate, and prepare new candidates for the next step.
- // If the user finishes selection, we will try to communicate with the user
- // dictionary to add new items or update the scores of some existing items.
-//
-// Basic rule:
- // 1. If the user selects the first choice:
- // 1.1. If the first choice is not a sentence but a lemma:
-// 1.1.1. If the first choice is a user lemma, notify the user
- // dictionary that a user lemma is hit, and increase its
- // occurrence count by 1.
-// 1.1.2. If the first choice is a system lemma, do nothing.
-// 1.2. If the first choice is a sentence containing more than one lemma:
-// 1.2.1. The whole sentence will be added as a user lemma. If the
- // sentence contains user lemmas, they are hit and their
- // occurrence counts are increased by 1.
-size_t MatrixSearch::choose(size_t cand_id) {
- if (!inited_ || 0 == pys_decoded_len_)
- return 0;
-
- if (0 == cand_id) {
- fixed_hzs_ = spl_id_num_;
- matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
- matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
- for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) {
- fixed_lmas_no1_[pos] = 1;
- }
- fixed_lmas_ = lma_id_num_;
- lpi_total_ = 0; // Clean all other candidates.
-
- // 1. It is the first choice
- if (1 == lma_id_num_) {
- // 1.1. The first choice is not a sentence but a lemma
- if (is_user_lemma(lma_id_[0])) {
- // 1.1.1. The first choice is a user lemma, notify the user dictionary
- // that it is hit.
- if (NULL != user_dict_)
- user_dict_->update_lemma(lma_id_[0], 1, true);
- } else {
- // 1.1.2. Do nothing for a system lemma.
- }
- } else {
- // 1.2. The first choice is a sentence.
- // 1.2.1 Try to add the whole sentence to the user dictionary; the whole
- // sentence may be split into many items.
- if (NULL != user_dict_) {
- try_add_cand0_to_userdict();
- }
- }
- update_dict_freq();
- return 1;
- } else {
- cand_id--;
- }
-
- // 2. It is not the full sentence candidate.
- // Find the length of the candidate.
- LemmaIdType id_chosen = lpi_items_[cand_id].id;
- LmaScoreType score_chosen = lpi_items_[cand_id].psb;
- size_t cand_len = lpi_items_[cand_id].lma_len;
-
- assert(cand_len > 0);
-
- // Notify the atom dictionary that this item is hit.
- if (is_user_lemma(id_chosen)) {
- if (NULL != user_dict_) {
- user_dict_->update_lemma(id_chosen, 1, true);
- }
- update_dict_freq();
- }
-
- // 3. Fix the chosen item.
- // 3.1 Get the step numbers.
- size_t step_fr = spl_start_[fixed_hzs_];
- size_t step_to = spl_start_[fixed_hzs_ + cand_len];
-
- // 3.2 Save the length of the original string.
- size_t pys_decoded_len = pys_decoded_len_;
-
- // 3.3 Reset the space of the fixed part.
- reset_search(step_to, false, false, true);
-
- // 3.4 For the last character of the fixed part, the previous DMI
- // information will be kept, while the MTRX information will be re-extended,
- // and only one node will be extended.
- matrix_[step_to].mtrx_nd_num = 0;
-
- LmaPsbItem lpi_item;
- lpi_item.psb = score_chosen;
- lpi_item.id = id_chosen;
-
- PoolPosType step_to_dmi_fr = match_dmi(step_to,
- spl_id_ + fixed_hzs_, cand_len);
- //assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
-
- extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
- step_to_dmi_fr, step_to);
-
- matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
- mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
- matrix_[step_to].mtrx_nd_num;
-
- if (id_chosen == lma_id_[fixed_lmas_])
- fixed_lmas_no1_[fixed_lmas_] = 1;
- else
- fixed_lmas_no1_[fixed_lmas_] = 0;
- lma_id_[fixed_lmas_] = id_chosen;
- lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
- fixed_lmas_++;
- fixed_hzs_ = fixed_hzs_ + cand_len;
-
- while (step_to != pys_decoded_len) {
- bool b = add_char(pys_[step_to]);
- assert(b);
- step_to++;
- }
-
- if (fixed_hzs_ < spl_id_num_) {
- prepare_candidates();
- } else {
- lpi_total_ = 0;
- if (NULL != user_dict_) {
- try_add_cand0_to_userdict();
- }
- }
-
- return get_candidate_num();
-}
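The "basic rule" comment before choose() boils down to the branch sketched below. This is only an illustration of that rule; the types, text values, and printf stand-ins for dictionary updates are invented and do not reflect the real AtomDictBase/UserDict API.

#include <cstdio>
#include <string>
#include <vector>

struct Lemma { std::string text; bool from_user_dict; };

static void on_first_choice_selected(const std::vector<Lemma> &sentence) {
  if (sentence.size() == 1) {
    // 1.1 A single lemma: only a user lemma gets its hit count increased.
    if (sentence[0].from_user_dict)
      printf("hit user lemma '%s': count += 1\n", sentence[0].text.c_str());
  } else {
    // 1.2 A multi-lemma sentence: store it as a new user lemma and bump the
    //     count of every contained user lemma.
    std::string whole;
    for (const Lemma &l : sentence) {
      whole += l.text;
      if (l.from_user_dict)
        printf("hit user lemma '%s': count += 1\n", l.text.c_str());
    }
    printf("add sentence '%s' to the user dictionary\n", whole.c_str());
  }
}

int main() {
  on_first_choice_selected({{"ni hao", true}});
  on_first_choice_selected({{"bei jing", false}, {"huan ying", true}, {"ni", false}});
  return 0;
}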
-
-size_t MatrixSearch::cancel_last_choice() {
- if (!inited_ || 0 == pys_decoded_len_)
- return 0;
-
- size_t step_start = 0;
- if (fixed_hzs_ > 0) {
- size_t step_end = spl_start_[fixed_hzs_];
- MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
- assert(NULL != end_node);
-
- step_start = end_node->from->step;
-
- if (step_start > 0) {
- DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
- fixed_hzs_ -= dmi->dict_level;
- } else {
- fixed_hzs_ = 0;
- }
-
- reset_search(step_start, false, false, false);
-
- while (pys_[step_start] != '\0') {
- bool b = add_char(pys_[step_start]);
- assert(b);
- step_start++;
- }
-
- prepare_candidates();
- }
- return get_candidate_num();
-}
-
-size_t MatrixSearch::get_fixedlen() {
- if (!inited_ || 0 == pys_decoded_len_)
- return 0;
- return fixed_hzs_;
-}
-
-bool MatrixSearch::prepare_add_char(char ch) {
- if (pys_decoded_len_ >= kMaxRowNum - 1 ||
- (!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
- return false;
-
- if (dmi_pool_used_ >= kDmiPoolSize) return false;
-
- pys_[pys_decoded_len_] = ch;
- pys_decoded_len_++;
-
- MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
- mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
- mtrx_this_row->mtrx_nd_num = 0;
- mtrx_this_row->dmi_pos = dmi_pool_used_;
- mtrx_this_row->dmi_num = 0;
- mtrx_this_row->dmi_has_full_id = 0;
-
- return true;
-}
-
-bool MatrixSearch::is_split_at(uint16 pos) {
- return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
-}
-
-void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
- PoolPosType dmi_fr, uint16 spl_id,
- uint16 node_num, unsigned char dict_level,
- bool splid_end_split, unsigned char splstr_len,
- unsigned char all_full_id) {
- dmi->dict_handles[0] = handles[0];
- dmi->dict_handles[1] = handles[1];
- dmi->dmi_fr = dmi_fr;
- dmi->spl_id = spl_id;
- dmi->dict_level = dict_level;
- dmi->splid_end_split = splid_end_split ? 1 : 0;
- dmi->splstr_len = splstr_len;
- dmi->all_full_id = all_full_id;
- dmi->c_phrase = 0;
-}
-
-bool MatrixSearch::add_char(char ch) {
- if (!prepare_add_char(ch))
- return false;
- return add_char_qwerty();
-}
-
-bool MatrixSearch::add_char_qwerty() {
- matrix_[pys_decoded_len_].mtrx_nd_num = 0;
-
- bool spl_matched = false;
- uint16 longest_ext = 0;
- // Extend the search matrix from the oldest unfixed row. ext_len means the
- // extending length.
- for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
- if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
- continue;
-
- // Refer to the declaration of the variable dmi_has_full_id for the
- // explanation of this piece of code. In short, it is used to prevent
- // the unwise extending of "shoud ou" but to allow the reasonable
- // extending of "heng ao", "lang a", etc.
- if (ext_len > 1 && 0 != longest_ext &&
- 0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
- if (xi_an_enabled_)
- continue;
- else
- break;
- }
-
- uint16 oldrow = pys_decoded_len_ - ext_len;
-
- // 0. If that row is before the last fixed step, ignore.
- if (spl_start_[fixed_hzs_] > oldrow)
- continue;
-
- // 1. Check whether that old row has a valid MatrixNode. If not, that row
- // is not a boundary (neither a word boundary nor a spelling boundary).
- // When extending the composing phrase, it is OK to ignore a node count of 0.
- if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
- continue;
-
- // 2. Get spelling id(s) for the last ext_len chars.
- uint16 spl_idx;
- bool is_pre = false;
- spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
- ext_len, &is_pre);
- if (is_pre)
- spl_matched = true;
-
- if (0 == spl_idx)
- continue;
-
- bool splid_end_split = is_split_at(oldrow + ext_len);
-
- // 3. Extend the DMI nodes of that old row
- // + 1 is to extend an extra node from the root
- for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
- dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
- dmi_pos++) {
- DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
- if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
- dmi = NULL; // The last one, NULL means extending from the root.
- } else {
- // If the dmi is covered by the fixed arrange, ignore it.
- if (fixed_hzs_ > 0 &&
- pys_decoded_len_ - ext_len - dmi->splstr_len <
- spl_start_[fixed_hzs_]) {
- continue;
- }
- // If it is not in mode for composing phrase, and the source DMI node
- // is marked for composing phrase, ignore this node.
- if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
- continue;
- }
- }
-
- // For example, if "gao" is extended, "g ao" is not allowed;
- // or if "zh" has been passed, "z h" is not allowed.
- // Both word and word-connection will be prevented.
- if (longest_ext > ext_len) {
- if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
- continue;
- }
-
- // "z h" is not allowed.
- if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
- continue;
- }
- }
-
- dep_->splids_extended = 0;
- if (NULL != dmi) {
- uint16 prev_ids_num = dmi->dict_level;
- if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
- (dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) {
- continue;
- }
-
- DictMatchInfo *d = dmi;
- while (d) {
- dep_->splids[--prev_ids_num] = d->spl_id;
- if ((PoolPosType)-1 == d->dmi_fr)
- break;
- d = dmi_pool_ + d->dmi_fr;
- }
- assert(0 == prev_ids_num);
- dep_->splids_extended = dmi->dict_level;
- }
- dep_->splids[dep_->splids_extended] = spl_idx;
- dep_->ext_len = ext_len;
- dep_->splid_end_split = splid_end_split;
-
- dep_->id_num = 1;
- dep_->id_start = spl_idx;
- if (spl_trie_->is_half_id(spl_idx)) {
- // Get the full id list
- dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
- assert(dep_->id_num > 0);
- }
-
- uint16 new_dmi_num;
-
- new_dmi_num = extend_dmi(dep_, dmi);
-
- if (new_dmi_num > 0) {
- if (dmi_c_phrase_) {
- dmi_pool_[dmi_pool_used_].c_phrase = 1;
- }
- matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
- dmi_pool_used_ += new_dmi_num;
-
- if (!spl_trie_->is_half_id(spl_idx))
- matrix_[pys_decoded_len_].dmi_has_full_id = 1;
- }
-
- // If candidate lemmas were found, try to extend the path.
- if (lpi_total_ > 0) {
- uint16 fr_row;
- if (NULL == dmi) {
- fr_row = oldrow;
- } else {
- assert(oldrow >= dmi->splstr_len);
- fr_row = oldrow - dmi->splstr_len;
- }
- for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
- mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
- matrix_[fr_row].mtrx_nd_num;
- mtrx_nd_pos++) {
- MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
-
- extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
- dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
- if (longest_ext == 0)
- longest_ext = ext_len;
- }
- }
- } // for dmi_pos
- } // for ext_len
- mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
-
- if (dmi_c_phrase_)
- return true;
-
- return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
-}
-
-void MatrixSearch::prepare_candidates() {
- // Get candidates from the first unfixed step.
- uint16 lma_size_max = kMaxLemmaSize;
- if (lma_size_max > spl_id_num_ - fixed_hzs_)
- lma_size_max = spl_id_num_ - fixed_hzs_;
-
- uint16 lma_size = lma_size_max;
-
- // The full sentence candidate's unfixed part may be the same as a normal
- // lemma. Remove the lemma candidate in that case.
- char16 fullsent[kMaxLemmaSize + 1];
- char16 *pfullsent = NULL;
- uint16 sent_len;
- pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
-
- // If the unfixed part contains more than one id, it is not necessary to
- // check whether a lemma's string is the same as the unfixed part of the full
- // sentence candidate, so set it to NULL.
- if (sent_len > kMaxLemmaSize)
- pfullsent = NULL;
-
- lpi_total_ = 0;
- size_t lpi_num_full_match = 0; // Number of items which are fully-matched.
- while (lma_size > 0) {
- size_t lma_num;
- lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
- lpi_items_ + lpi_total_,
- size_t(kMaxLmaPsbItems - lpi_total_),
- pfullsent, lma_size == lma_size_max);
-
- if (lma_num > 0) {
- lpi_total_ += lma_num;
- // For the following lemma candidates, which are not the longest, it is not
- // necessary to compare with the full sentence candidate.
- pfullsent = NULL;
- }
- if (lma_size == lma_size_max) {
- lpi_num_full_match = lpi_total_;
- }
- lma_size--;
- }
-
- // Sort those partially-matched items by their unified scores.
- myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
- sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
-
- if (kPrintDebug0) {
- printf("-----Prepare candidates, score:\n");
- for (size_t a = 0; a < lpi_total_; a++) {
- printf("[%03d]%d ", a, lpi_items_[a].psb);
- if ((a + 1) % 6 == 0) printf("\n");
- }
- printf("\n");
- }
-
- if (kPrintDebug0) {
- printf("--- lpi_total_ = %d\n", lpi_total_);
- }
-}
-
-const char* MatrixSearch::get_pystr(size_t *decoded_len) {
- if (!inited_ || NULL == decoded_len)
- return NULL;
-
- *decoded_len = pys_decoded_len_;
- return pys_;
-}
-
-void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) {
- if (fixed_lmas_ == 0)
- return;
- // Update spelling segmentation information first.
- spl_id_num_ -= 1;
- uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
- for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) {
- spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
- if (pos == spl_id_num_)
- break;
- spl_id_[pos] = spl_id_[pos + 1];
- }
-
- // Begin to merge.
- uint16 phrase_len = 0;
-
- // Update the spelling ids to the composing phrase.
- // We need to convert these ids into full id in the future.
- memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
- memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
-
- // If composing phrase has not been created, first merge all fixed
- // lemmas into a composing phrase without deletion.
- if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
- uint16 bp = 1; // Begin position of real fixed lemmas.
- // There is no existing composing phrase.
- if (kLemmaIdComposing != lma_id_[0]) {
- c_phrase_.sublma_num = 0;
- bp = 0;
- }
-
- uint16 sub_num = c_phrase_.sublma_num;
- for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
- c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
- if (lma_start_[pos] > del_spl_pos) {
- c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
- }
-
- if (pos == fixed_lmas_)
- break;
-
- uint16 lma_len;
- char16 *lma_str = c_phrase_.chn_str +
- c_phrase_.sublma_start[sub_num] + phrase_len;
-
- lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
- assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
- phrase_len += lma_len;
- }
- assert(phrase_len == lma_start_[fixed_lmas_]);
- c_phrase_.length = phrase_len; // will be decreased by 1 below
- c_phrase_.sublma_num += fixed_lmas_ - bp;
- } else {
- for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
- if (c_phrase_.sublma_start[pos] > del_spl_pos) {
- c_phrase_.sublma_start[pos] -= 1;
- }
- }
- phrase_len = c_phrase_.length;
- }
-
- assert(phrase_len > 0);
- if (1 == phrase_len) {
- // After the only one is deleted, nothing will be left.
- fixed_lmas_ = 0;
- return;
- }
-
- // Delete the Chinese character in the merged phrase.
- // The corresponding elements in spl_ids and spl_start of the
- // phrase have been deleted.
- char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
- for (uint16 pos = 0;
- pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
- pos++) {
- chn_str[pos] = chn_str[pos + 1];
- }
- c_phrase_.length -= 1;
-
- // If the deleted spelling id is in a sub lemma which contains more than
- // one id, del_a_sub will be false; but if the deleted id is in a sub lemma
- // which only contains 1 id, the whole sub lemma needs to be deleted, so
- // del_a_sub will be true.
- bool del_a_sub = false;
- for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
- if (c_phrase_.sublma_start[pos - 1] ==
- c_phrase_.sublma_start[pos]) {
- del_a_sub = true;
- }
- if (del_a_sub) {
- c_phrase_.sublma_start[pos - 1] =
- c_phrase_.sublma_start[pos];
- }
- }
- if (del_a_sub)
- c_phrase_.sublma_num -= 1;
-
- return;
-}
-
-void MatrixSearch::get_spl_start_id() {
- lma_id_num_ = 0;
- lma_start_[0] = 0;
-
- spl_id_num_ = 0;
- spl_start_[0] = 0;
- if (!inited_ || 0 == pys_decoded_len_ ||
- 0 == matrix_[pys_decoded_len_].mtrx_nd_num)
- return;
-
- // Calculate the number of lemmas and spellings.
- // Only scan the part which is not fixed.
- lma_id_num_ = fixed_lmas_;
- spl_id_num_ = fixed_hzs_;
-
- MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
- while (mtrx_nd != mtrx_nd_pool_) {
- if (fixed_hzs_ > 0) {
- if (mtrx_nd->step <= spl_start_[fixed_hzs_])
- break;
- }
-
- // Update the spelling segmentation information
- unsigned char word_splstr_len = 0;
- PoolPosType dmi_fr = mtrx_nd->dmi_fr;
- if ((PoolPosType)-1 != dmi_fr)
- word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
-
- while ((PoolPosType)-1 != dmi_fr) {
- spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
- (word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
- spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
- spl_id_num_++;
- dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
- }
-
- // Update the lemma segmentation information
- lma_start_[lma_id_num_ + 1] = spl_id_num_;
- lma_id_[lma_id_num_] = mtrx_nd->id;
- lma_id_num_++;
-
- mtrx_nd = mtrx_nd->from;
- }
-
- // Reverse the result of spelling info
- for (size_t pos = fixed_hzs_;
- pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
- if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
- spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
- spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
- spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
-
- spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
- spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
- spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
- }
- }
-
- // Reverse the result of lemma info
- for (size_t pos = fixed_lmas_;
- pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
- assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
-
- if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
- lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
- lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
- lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
-
- lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
- lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
- lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
- }
- }
-
- for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
- if (pos < lma_id_num_)
- lma_start_[pos] = lma_start_[pos - 1] +
- (lma_start_[pos] - lma_start_[pos + 1]);
- else
- lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
- lma_start_[fixed_lmas_];
- }
-
- // Find the last fixed position
- fixed_hzs_ = 0;
- for (size_t pos = spl_id_num_; pos > 0; pos--) {
- if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
- fixed_hzs_ = pos;
- break;
- }
- }
-
- return;
-}
-
-size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) {
- get_spl_start_id();
- spl_start = spl_start_;
- return spl_id_num_;
-}
-
-size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
- if (dmi_pool_used_ >= kDmiPoolSize) return 0;
-
- if (dmi_c_phrase_)
- return extend_dmi_c(dep, dmi_s);
-
- LpiCache& lpi_cache = LpiCache::get_instance();
- uint16 splid = dep->splids[dep->splids_extended];
-
- bool cached = false;
- if (0 == dep->splids_extended)
- cached = lpi_cache.is_cached(splid);
-
- // 1. If this is a half Id, get its corresponding full starting Id and the
- // number of full Ids.
- size_t ret_val = 0;
- PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node
-
- lpi_total_ = 0;
-
- MileStoneHandle from_h[3];
- from_h[0] = 0;
- from_h[1] = 0;
-
- if (0 != dep->splids_extended) {
- from_h[0] = dmi_s->dict_handles[0];
- from_h[1] = dmi_s->dict_handles[1];
- }
-
- // 2. Begin extending in the system dictionary
- size_t lpi_num = 0;
- MileStoneHandle handles[2];
- handles[0] = handles[1] = 0;
- if (from_h[0] > 0 || NULL == dmi_s) {
- handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
- kMaxLmaPsbItems, &lpi_num);
- }
- if (handles[0] > 0)
- lpi_total_ = lpi_num;
-
- if (NULL == dmi_s) { // from root
- assert(0 != handles[0]);
- mtrx_dmi_fr = dmi_pool_used_;
- }
-
- // 3. Begin extending in the user dictionary
- if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
- handles[1] = user_dict_->extend_dict(from_h[1], dep,
- lpi_items_ + lpi_total_,
- kMaxLmaPsbItems - lpi_total_,
- &lpi_num);
- if (handles[1] > 0) {
- if (kPrintDebug0) {
- for (size_t t = 0; t < lpi_num; t++) {
- printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
- lpi_items_[lpi_total_ + t].psb);
- }
- }
- lpi_total_ += lpi_num;
- }
- }
-
- if (0 != handles[0] || 0 != handles[1]) {
- if (dmi_pool_used_ >= kDmiPoolSize) return 0;
-
- DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
- if (NULL == dmi_s) {
- fill_dmi(dmi_add, handles,
- (PoolPosType)-1, splid,
- 1, 1, dep->splid_end_split, dep->ext_len,
- spl_trie_->is_half_id(splid) ? 0 : 1);
- } else {
- fill_dmi(dmi_add, handles,
- dmi_s - dmi_pool_, splid, 1,
- dmi_s->dict_level + 1, dep->splid_end_split,
- dmi_s->splstr_len + dep->ext_len,
- spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
- }
-
- ret_val = 1;
- }
-
- if (!cached) {
- if (0 == lpi_total_)
- return ret_val;
-
- if (kPrintDebug0) {
- printf("--- lpi_total_ = %d\n", lpi_total_);
- }
-
- myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
- if (NULL == dmi_s && spl_trie_->is_half_id(splid))
- lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
- } else {
- assert(spl_trie_->is_half_id(splid));
- lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
- }
-
- return ret_val;
-}
-
-size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
- lpi_total_ = 0;
-
- uint16 pos = dep->splids_extended;
- assert(dmi_c_phrase_);
- if (pos >= c_phrase_.length)
- return 0;
-
- uint16 splid = dep->splids[pos];
- if (splid == c_phrase_.spl_ids[pos]) {
- DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
- MileStoneHandle handles[2]; // Actually never used.
- if (NULL == dmi_s)
- fill_dmi(dmi_add, handles,
- (PoolPosType)-1, splid,
- 1, 1, dep->splid_end_split, dep->ext_len,
- spl_trie_->is_half_id(splid) ? 0 : 1);
- else
- fill_dmi(dmi_add, handles,
- dmi_s - dmi_pool_, splid, 1,
- dmi_s->dict_level + 1, dep->splid_end_split,
- dmi_s->splstr_len + dep->ext_len,
- spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
-
- if (pos == c_phrase_.length - 1) {
- lpi_items_[0].id = kLemmaIdComposing;
- lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score.
- lpi_total_ = 1;
- }
- return 1;
- }
- return 0;
-}
-
-size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
- size_t lpi_num, PoolPosType dmi_fr,
- size_t res_row) {
- assert(NULL != mtrx_nd);
- matrix_[res_row].mtrx_nd_fixed = NULL;
-
- if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
- return 0;
-
- if (0 == mtrx_nd->step) {
- // Because the list is sorted, if the source step is 0, it is only
- // necessary to pick up the first kMaxNodeARow items.
- if (lpi_num > kMaxNodeARow)
- lpi_num = kMaxNodeARow;
- }
-
- MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
- for (size_t pos = 0; pos < lpi_num; pos++) {
- float score = mtrx_nd->score + lpi_items[pos].psb;
- if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
- break;
-
- // Try to add a new node
- size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
- MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
- bool replace = false;
- // Find its position
- while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
- if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
- *mtrx_nd_res = *(mtrx_nd_res - 1);
- mtrx_nd_res--;
- replace = true;
- }
- if (replace || (mtrx_nd_num < kMaxNodeARow &&
- matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
- mtrx_nd_res->id = lpi_items[pos].id;
- mtrx_nd_res->score = score;
- mtrx_nd_res->from = mtrx_nd;
- mtrx_nd_res->dmi_fr = dmi_fr;
- mtrx_nd_res->step = res_row;
- if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
- matrix_[res_row].mtrx_nd_num++;
- }
- }
- return matrix_[res_row].mtrx_nd_num;
-}
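extend_mtrx_nd() above keeps at most kMaxNodeARow nodes per matrix row, inserted in ascending score order; a candidate is pruned once its score trails the row's best (lowest) score by more than PRUMING_SCORE, and the worst node is dropped when the row is full. A stripped-down sketch of that bounded sorted insertion (the names, the standalone function and its capacity parameter are illustrative, not from the original source):

    #include <cstddef>

    struct Node { float score; int id; };

    // Insert n into row[] (sorted ascending by score, num valid entries,
    // capacity K). Returns the new number of valid entries; when the row is
    // full, the worst (largest-score) entry is dropped.
    size_t insert_bounded(Node row[], size_t num, size_t K, const Node &n) {
      if (num == K && n.score >= row[K - 1].score)
        return num;                             // not better than the current worst
      size_t pos = (num < K) ? num : K - 1;     // slot to fill
      while (pos > 0 && n.score < row[pos - 1].score) {
        row[pos] = row[pos - 1];                // shift worse entries down
        pos--;
      }
      row[pos] = n;
      return (num < K) ? num + 1 : K;
    }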
-
-PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[],
- uint16 spl_id_num) {
- if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
- return static_cast<PoolPosType>(-1);
- }
-
- for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
- DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
-
- if (dmi->dict_level != spl_id_num)
- continue;
-
- bool matched = true;
- for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
- if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
- matched = false;
- break;
- }
-
- dmi = dmi_pool_ + dmi->dmi_fr;
- }
- if (matched) {
- return matrix_[step_to].dmi_pos + dmi_pos;
- }
- }
-
- return static_cast<PoolPosType>(-1);
-}
-
-char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len,
- uint16 *retstr_len,
- bool only_unfixed) {
- if (pys_decoded_len_ == 0 ||
- matrix_[pys_decoded_len_].mtrx_nd_num == 0)
- return NULL;
-
- LemmaIdType idxs[kMaxRowNum];
- size_t id_num = 0;
-
- MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
-
- if (kPrintDebug0) {
- printf("--- sentence score: %f\n", mtrx_nd->score);
- }
-
- if (kPrintDebug1) {
- printf("==============Sentence DMI (reverse order) begin===========>>\n");
- }
-
- while (mtrx_nd != NULL) {
- idxs[id_num] = mtrx_nd->id;
- id_num++;
-
- if (kPrintDebug1) {
- printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n",
- mtrx_nd->step, mtrx_nd->id, mtrx_nd->score);
- debug_print_dmi(mtrx_nd->dmi_fr, 1);
- }
-
- mtrx_nd = mtrx_nd->from;
- }
-
- if (kPrintDebug1) {
- printf("<<==============Sentence DMI (reverse order) end=============\n");
- }
-
- size_t ret_pos = 0;
- do {
- id_num--;
- if (0 == idxs[id_num])
- continue;
-
- char16 str[kMaxLemmaSize + 1];
- uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1);
- if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) ||
- (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) {
- if (!only_unfixed)
- utf16_strncpy(cand_str + ret_pos, str, str_len);
- else if (ret_pos >= fixed_hzs_)
- utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len);
-
- ret_pos += str_len;
- } else {
- return NULL;
- }
- } while (id_num != 0);
-
- if (!only_unfixed) {
- if (NULL != retstr_len)
- *retstr_len = ret_pos;
- cand_str[ret_pos] = (char16)'\0';
- } else {
- if (NULL != retstr_len)
- *retstr_len = ret_pos - fixed_hzs_;
- cand_str[ret_pos - fixed_hzs_] = (char16)'\0';
- }
- return cand_str;
-}
-
-size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len,
- LmaPsbItem* lma_buf, size_t max_lma_buf,
- const char16 *pfullsent, bool sort_by_psb) {
- if (splid_str_len > kMaxLemmaSize)
- return 0;
-
- size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len,
- lma_buf, max_lma_buf);
- size_t num2 = 0;
- if (NULL != user_dict_) {
- num2 = user_dict_->get_lpis(splid_str, splid_str_len,
- lma_buf + num1, max_lma_buf - num1);
- }
-
- size_t num = num1 + num2;
-
- if (0 == num)
- return 0;
-
- // Remove repeated items.
- if (splid_str_len > 1) {
- LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num);
- size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) /
- sizeof(LmaPsbStrItem);
- //assert(lpsi_num > num);
- if (num > lpsi_num) num = lpsi_num;
- lpsi_num = num;
-
- for (size_t pos = 0; pos < lpsi_num; pos++) {
- lpsis[pos].lpi = lma_buf[pos];
- get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1);
- }
-
- myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str);
-
- size_t remain_num = 0;
- for (size_t pos = 0; pos < lpsi_num; pos++) {
- if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) {
- if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) {
- assert(remain_num > 0);
- lma_buf[remain_num - 1] = lpsis[pos].lpi;
- }
- continue;
- }
- if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0)
- continue;
-
- lma_buf[remain_num] = lpsis[pos].lpi;
- remain_num++;
- }
-
- // Update the result number
- num = remain_num;
- } else {
- // For single characters, some characters have more than one spelling; for
- // example, "de" and "di" are both valid for the same Chinese character, so
- // when the user inputs "d", repeated items are generated.
- // For single-character lemmas, the Hanzis are fetched below.
- for (size_t pos = 0; pos < num; pos++) {
- char16 hanzis[2];
- get_lemma_str(lma_buf[pos].id, hanzis, 2);
- lma_buf[pos].hanzi = hanzis[0];
- }
-
- myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi);
-
- size_t remain_num = 0;
- for (size_t pos = 0; pos < num; pos++) {
- if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) {
- if (NULL != pfullsent &&
- static_cast<char16>(0) == pfullsent[1] &&
- lma_buf[pos].hanzi == pfullsent[0])
- continue;
-
- if (lma_buf[pos].psb < lma_buf[pos - 1].psb) {
- assert(remain_num > 0);
- assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi);
- lma_buf[remain_num - 1] = lma_buf[pos];
- }
- continue;
- }
- if (NULL != pfullsent &&
- static_cast<char16>(0) == pfullsent[1] &&
- lma_buf[pos].hanzi == pfullsent[0])
- continue;
-
- lma_buf[remain_num] = lma_buf[pos];
- remain_num++;
- }
-
- num = remain_num;
- }
-
- if (sort_by_psb) {
- myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb);
- }
- return num;
-}
-
-uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
- uint16 str_max) {
- uint16 str_len = 0;
-
- if (is_system_lemma(id_lemma)) {
- str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max);
- } else if (is_user_lemma(id_lemma)) {
- if (NULL != user_dict_) {
- str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max);
- } else {
- str_len = 0;
- str_buf[0] = static_cast<char16>('\0');
- }
- } else if (is_composing_lemma(id_lemma)) {
- if (str_max <= 1)
- return 0;
- str_len = c_phrase_.sublma_start[c_phrase_.sublma_num];
- if (str_len > str_max - 1)
- str_len = str_max - 1;
- utf16_strncpy(str_buf, c_phrase_.chn_str, str_len);
- str_buf[str_len] = (char16)'\0';
- return str_len;
- }
-
- return str_len;
-}
-
-uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
- uint16 splids_max, bool arg_valid) {
- uint16 splid_num = 0;
-
- if (arg_valid) {
- for (splid_num = 0; splid_num < splids_max; splid_num++) {
- if (spl_trie_->is_half_id(splids[splid_num]))
- break;
- }
- if (splid_num == splids_max)
- return splid_num;
- }
-
- if (is_system_lemma(id_lemma)) {
- splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max,
- arg_valid);
- } else if (is_user_lemma(id_lemma)) {
- if (NULL != user_dict_) {
- splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max,
- arg_valid);
- } else {
- splid_num = 0;
- }
- } else if (is_composing_lemma(id_lemma)) {
- if (c_phrase_.length > splids_max) {
- return 0;
- }
- for (uint16 pos = 0; pos < c_phrase_.length; pos++) {
- splids[pos] = c_phrase_.spl_ids[pos];
- if (spl_trie_->is_half_id(splids[pos])) {
- return 0;
- }
- }
- }
- return splid_num;
-}
-
-size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len,
- char16 predict_buf[][kMaxPredictSize + 1],
- size_t buf_len) {
- size_t res_total = 0;
- memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_);
- // In order to shorten the comments, j-character candidates predicted by an
- // i-character prefix are called P(i,j). All candidates predicted by an
- // i-character prefix are called P(i,*).
- // Step 1. Get P(kMaxPredictSize, *) and sort them; here
- // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1)
- for (size_t len = fixed_len; len > 0; len--) {
- // How many blank items are available
- size_t this_max = npre_items_len_ - res_total;
- size_t res_this;
- // If the history is longer than 1 character and we cannot get predictions
- // from lemmas longer than 2, add the lemmas with the
- // highest scores as the prediction result.
- if (fixed_len > 1 && 1 == len && 0 == res_total) {
- // Check whether the most recent n (n > 1) characters form a valid lemma in
- // the system dictionary.
- bool nearest_n_word = false;
- for (size_t nlen = 2; nlen <= fixed_len; nlen++) {
- if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) {
- nearest_n_word = true;
- break;
- }
- }
- res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0,
- npre_items_ + res_total,
- this_max, res_total);
- res_total += res_this;
- }
-
- // How many blank items are available
- this_max = npre_items_len_ - res_total;
- res_this = 0;
- if (!kOnlyUserDictPredict) {
- res_this =
- dict_trie_->predict(fixed_buf + fixed_len - len, len,
- npre_items_ + res_total, this_max,
- res_total);
- }
-
- if (NULL != user_dict_) {
- res_this = res_this +
- user_dict_->predict(fixed_buf + fixed_len - len, len,
- npre_items_ + res_total + res_this,
- this_max - res_this, res_total + res_this);
- }
-
- if (kPredictLimitGt1) {
- myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem),
- cmp_npre_by_score);
-
- if (len > 3) {
- if (res_this > kMaxPredictNumByGt3)
- res_this = kMaxPredictNumByGt3;
- } else if (3 == len) {
- if (res_this > kMaxPredictNumBy3)
- res_this = kMaxPredictNumBy3;
- } else if (2 == len) {
- if (res_this > kMaxPredictNumBy2)
- res_this = kMaxPredictNumBy2;
- }
- }
-
- res_total += res_this;
- }
-
- res_total = remove_duplicate_npre(npre_items_, res_total);
-
- if (kPreferLongHistoryPredict) {
- myqsort(npre_items_, res_total, sizeof(NPredictItem),
- cmp_npre_by_hislen_score);
- } else {
- myqsort(npre_items_, res_total, sizeof(NPredictItem),
- cmp_npre_by_score);
- }
-
- if (buf_len < res_total) {
- res_total = buf_len;
- }
-
- if (kPrintDebug2) {
- printf("/////////////////Predicted Items Begin////////////////////>>\n");
- for (size_t i = 0; i < res_total; i++) {
- printf("---");
- for (size_t j = 0; j < kMaxPredictSize; j++) {
- printf("%d ", npre_items_[i].pre_hzs[j]);
- }
- printf("\n");
- }
- printf("<<///////////////Predicted Items End////////////////////////\n");
- }
-
- for (size_t i = 0; i < res_total; i++) {
- utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs,
- kMaxPredictSize);
- predict_buf[i][kMaxPredictSize] = '\0';
- }
-
- return res_total;
-}
-
-size_t MatrixSearch::get_predicts(const char16 fixed_buf[],
- char16 predict_buf[][kMaxPredictSize + 1],
- size_t buf_len) {
- size_t fixed_len = utf16_strlen(fixed_buf);
- if (0 == fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len)
- return 0;
-
- return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len);
-}
-
-} // namespace ime_pinyin
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/mystdlib.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/mystdlib.cpp
deleted file mode 100644
index 93bbcc9f..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/mystdlib.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-
-namespace ime_pinyin {
-
-// For debugging purposes. You can add a fixed version of the qsort and bsearch
-// functions here so that the output is exactly the same across platforms.
-
-void myqsort(void *p, size_t n, size_t es,
- int (*cmp)(const void *, const void *)) {
- qsort(p, n, es, cmp);
-}
-
-void *mybsearch(const void *k, const void *b,
- size_t n, size_t es,
- int (*cmp)(const void *, const void *)) {
- return bsearch(k, b, n, es, cmp);
-}
-} // namespace ime_pinyin
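The comment in mystdlib.cpp above invites a platform-independent replacement for qsort(); a minimal stable insertion sort with the same signature could serve that purpose (a sketch only, never part of the original file):

    #include <stdlib.h>
    #include <string.h>

    // Stable, deterministic O(n^2) sort with the same signature as myqsort().
    // Suitable for debugging, where identical output across platforms matters
    // more than speed.
    void my_stable_sort(void *p, size_t n, size_t es,
                        int (*cmp)(const void *, const void *)) {
      char *base = static_cast<char*>(p);
      char *tmp = static_cast<char*>(malloc(es));
      if (tmp == NULL)
        return;
      for (size_t i = 1; i < n; i++) {
        memcpy(tmp, base + i * es, es);
        size_t j = i;
        while (j > 0 && cmp(base + (j - 1) * es, tmp) > 0) {
          memcpy(base + j * es, base + (j - 1) * es, es);
          j--;
        }
        memcpy(base + j * es, tmp, es);
      }
      free(tmp);
    }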
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/ngram.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/ngram.cpp
deleted file mode 100644
index 6aec850b..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/ngram.cpp
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <time.h>
-#include "../include/mystdlib.h"
-#include "../include/ngram.h"
-
-namespace ime_pinyin {
-
-#define ADD_COUNT 0.3
-
-int comp_double(const void *p1, const void *p2) {
- if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2))
- return -1;
- if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2))
- return 1;
- return 0;
-}
-
-inline double distance(double freq, double code) {
- // return fabs(freq - code);
- return freq * fabs(log(freq) - log(code));
-}
-
-// Find the index of the code value which is nearest to the given freq
-int qsearch_nearest(double code_book[], double freq, int start, int end) {
- if (start == end)
- return start;
-
- if (start + 1 == end) {
- if (distance(freq, code_book[end]) > distance(freq, code_book[start]))
- return start;
- return end;
- }
-
- int mid = (start + end) / 2;
-
- if (code_book[mid] > freq)
- return qsearch_nearest(code_book, freq, start, mid);
- else
- return qsearch_nearest(code_book, freq, mid, end);
-}
-
-size_t update_code_idx(double freqs[], size_t num, double code_book[],
- CODEBOOK_TYPE *code_idx) {
- size_t changed = 0;
- for (size_t pos = 0; pos < num; pos++) {
- CODEBOOK_TYPE idx;
- idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
- if (idx != code_idx[pos])
- changed++;
- code_idx[pos] = idx;
- }
- return changed;
-}
-
-double recalculate_kernel(double freqs[], size_t num, double code_book[],
- CODEBOOK_TYPE *code_idx) {
- double ret = 0;
-
- size_t *item_num = new size_t[kCodeBookSize];
- assert(item_num);
- memset(item_num, 0, sizeof(size_t) * kCodeBookSize);
-
- double *cb_new = new double[kCodeBookSize];
- assert(cb_new);
- memset(cb_new, 0, sizeof(double) * kCodeBookSize);
-
- for (size_t pos = 0; pos < num; pos++) {
- ret += distance(freqs[pos], code_book[code_idx[pos]]);
-
- cb_new[code_idx[pos]] += freqs[pos];
- item_num[code_idx[pos]] += 1;
- }
-
- for (size_t code = 0; code < kCodeBookSize; code++) {
- assert(item_num[code] > 0);
- code_book[code] = cb_new[code] / item_num[code];
- }
-
- delete [] item_num;
- delete [] cb_new;
-
- return ret;
-}
-
-void iterate_codes(double freqs[], size_t num, double code_book[],
- CODEBOOK_TYPE *code_idx) {
- size_t iter_num = 0;
- double delta_last = 0;
- do {
- size_t changed = update_code_idx(freqs, num, code_book, code_idx);
-
- double delta = recalculate_kernel(freqs, num, code_book, code_idx);
-
- if (kPrintDebug0) {
- printf("---Unigram codebook iteration: %d : %d, %.9f\n",
- iter_num, changed, delta);
- }
- iter_num++;
-
- if (iter_num > 1 &&
- (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001))
- break;
- delta_last = delta;
- } while (true);
-}
-
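update_code_idx(), recalculate_kernel() and iterate_codes() above together implement a one-dimensional Lloyd (k-means style) quantizer: each frequency is assigned to its nearest codebook value, each code value is re-estimated from the frequencies assigned to it, and the loop repeats until the distortion stops improving. The same idea in a tiny self-contained form, using plain absolute distance instead of the freq * |log(freq) - log(code)| distortion above (illustrative values only, k = 2):

    #include <cmath>
    #include <cstdio>

    int main() {
      const int kCodes = 2, kNum = 6;
      double freqs[kNum] = {0.010, 0.020, 0.015, 0.300, 0.280, 0.330};
      double code[kCodes] = {0.010, 0.300};   // distinct initial code values
      int idx[kNum];

      for (int iter = 0; iter < 10; iter++) {
        // Assignment step: each frequency picks its nearest code value.
        for (int i = 0; i < kNum; i++)
          idx[i] = std::fabs(freqs[i] - code[0]) <= std::fabs(freqs[i] - code[1]) ? 0 : 1;
        // Update step: each code becomes the mean of its assigned frequencies.
        for (int c = 0; c < kCodes; c++) {
          double sum = 0.0;
          int n = 0;
          for (int i = 0; i < kNum; i++)
            if (idx[i] == c) { sum += freqs[i]; n++; }
          if (n > 0)
            code[c] = sum / n;
        }
      }
      std::printf("codes: %f %f\n", code[0], code[1]);   // ~0.015 and ~0.303
      return 0;
    }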
-
-NGram* NGram::instance_ = NULL;
-
-NGram::NGram() {
- initialized_ = false;
- idx_num_ = 0;
- lma_freq_idx_ = NULL;
- sys_score_compensation_ = 0;
-
-#ifdef ___BUILD_MODEL___
- freq_codes_df_ = NULL;
-#endif
- freq_codes_ = NULL;
-}
-
-NGram::~NGram() {
- if (NULL != lma_freq_idx_)
- free(lma_freq_idx_);
-
-#ifdef ___BUILD_MODEL___
- if (NULL != freq_codes_df_)
- free(freq_codes_df_);
-#endif
-
- if (NULL != freq_codes_)
- free(freq_codes_);
-}
-
-NGram& NGram::get_instance() {
- if (NULL == instance_)
- instance_ = new NGram();
- return *instance_;
-}
-
-bool NGram::save_ngram(FILE *fp) {
- if (!initialized_ || NULL == fp)
- return false;
-
- if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_)
- return false;
-
- if (fwrite(&idx_num_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
- kCodeBookSize)
- return false;
-
- if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
- return false;
-
- return true;
-}
-
-bool NGram::load_ngram(FILE *fp) {
- if (NULL == fp)
- return false;
-
- initialized_ = false;
-
- if (fread(&idx_num_, sizeof(uint32), 1, fp) != 1 )
- return false;
-
- if (NULL != lma_freq_idx_)
- free(lma_freq_idx_);
-
- if (NULL != freq_codes_)
- free(freq_codes_);
-
- lma_freq_idx_ = static_cast<CODEBOOK_TYPE*>
- (malloc(idx_num_ * sizeof(CODEBOOK_TYPE)));
- freq_codes_ = static_cast<LmaScoreType*>
- (malloc(kCodeBookSize * sizeof(LmaScoreType)));
-
- if (NULL == lma_freq_idx_ || NULL == freq_codes_)
- return false;
-
- if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
- kCodeBookSize)
- return false;
-
- if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
- return false;
-
- initialized_ = true;
-
- total_freq_none_sys_ = 0;
- return true;
-}
-
-void NGram::set_total_freq_none_sys(size_t freq_none_sys) {
- total_freq_none_sys_ = freq_none_sys;
- if (0 == total_freq_none_sys_) {
- sys_score_compensation_ = 0;
- } else {
- double factor = static_cast<double>(kSysDictTotalFreq) / (
- kSysDictTotalFreq + total_freq_none_sys_);
- sys_score_compensation_ = static_cast<float>(
- log(factor) * kLogValueAmplifier);
- }
-}
-
-// The caller makes sure this object is initialized.
-float NGram::get_uni_psb(LemmaIdType lma_id) {
- return static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) +
- sys_score_compensation_;
-}
-
-float NGram::convert_psb_to_score(double psb) {
- float score = static_cast<float>(
- log(psb) * static_cast<double>(kLogValueAmplifier));
- if (score > static_cast<float>(kMaxScore)) {
- score = static_cast<float>(kMaxScore);
- }
- return score;
-}
-
-#ifdef ___BUILD_MODEL___
-bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
- LemmaIdType next_idx_unused) {
- if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1)
- return false;
-
- double total_freq = 0;
- double *freqs = new double[next_idx_unused];
- if (NULL == freqs)
- return false;
-
- freqs[0] = ADD_COUNT;
- total_freq += freqs[0];
- LemmaIdType idx_now = 0;
- for (size_t pos = 0; pos < lemma_num; pos++) {
- if (lemma_arr[pos].idx_by_hz == idx_now)
- continue;
- idx_now++;
-
- assert(lemma_arr[pos].idx_by_hz == idx_now);
-
- freqs[idx_now] = lemma_arr[pos].freq;
- if (freqs[idx_now] <= 0)
- freqs[idx_now] = 0.3;
-
- total_freq += freqs[idx_now];
- }
-
- double max_freq = 0;
- idx_num_ = idx_now + 1;
- assert(idx_now + 1 == next_idx_unused);
-
- for (size_t pos = 0; pos < idx_num_; pos++) {
- freqs[pos] = freqs[pos] / total_freq;
- assert(freqs[pos] > 0);
- if (freqs[pos] > max_freq)
- max_freq = freqs[pos];
- }
-
- // calculate the code book
- if (NULL == freq_codes_df_)
- freq_codes_df_ = new double[kCodeBookSize];
- assert(freq_codes_df_);
- memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize);
-
- if (NULL == freq_codes_)
- freq_codes_ = new LmaScoreType[kCodeBookSize];
- assert(freq_codes_);
- memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize);
-
- size_t freq_pos = 0;
- for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
- bool found = true;
-
- while (found) {
- found = false;
- double cand = freqs[freq_pos];
- for (size_t i = 0; i < code_pos; i++)
- if (freq_codes_df_[i] == cand) {
- found = true;
- break;
- }
- if (found)
- freq_pos++;
- }
-
- freq_codes_df_[code_pos] = freqs[freq_pos];
- freq_pos++;
- }
-
- myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
-
- if (NULL == lma_freq_idx_)
- lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_];
- assert(lma_freq_idx_);
-
- iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);
-
- delete [] freqs;
-
- if (kPrintDebug0) {
- printf("\n------Language Model Unigram Codebook------\n");
- }
-
- for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
- double log_score = log(freq_codes_df_[code_pos]);
- float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
- if (kPrintDebug0) {
- printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n",
- code_pos, freq_codes_df_[code_pos], log_score, final_score);
- }
- freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
- }
-
- initialized_ = true;
- return true;
-}
-#endif
-
-} // namespace ime_pinyin
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/pinyinime.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/pinyinime.cpp
deleted file mode 100644
index 4d206a76..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/pinyinime.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include "../include/pinyinime.h"
-#include "../include/dicttrie.h"
-#include "../include/matrixsearch.h"
-#include "../include/spellingtrie.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- using namespace ime_pinyin;
-
- // The maximum number of the prediction items.
- static const size_t kMaxPredictNum = 500;
-
- // Used to search Pinyin string and give the best candidate.
- MatrixSearch* matrix_search = NULL;
-
- char16 predict_buf[kMaxPredictNum][kMaxPredictSize + 1];
-
- bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict) {
- if (NULL != matrix_search)
- delete matrix_search;
-
- matrix_search = new MatrixSearch();
- if (NULL == matrix_search) {
- return false;
- }
-
- return matrix_search->init(fn_sys_dict, fn_usr_dict);
- }
-
- bool im_open_decoder_fd(int sys_fd, long start_offset, long length,
- const char *fn_usr_dict) {
- if (NULL != matrix_search)
- delete matrix_search;
-
- matrix_search = new MatrixSearch();
- if (NULL == matrix_search)
- return false;
-
- return matrix_search->init_fd(sys_fd, start_offset, length, fn_usr_dict);
- }
-
- void im_close_decoder() {
- if (NULL != matrix_search) {
- matrix_search->close();
- delete matrix_search;
- }
- matrix_search = NULL;
- }
-
- void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len) {
- if (NULL != matrix_search) {
- matrix_search->set_max_lens(max_sps_len, max_hzs_len);
- }
- }
-
- void im_flush_cache() {
- if (NULL != matrix_search)
- matrix_search->flush_cache();
- }
-
- // To be updated.
- size_t im_search(const char* pybuf, size_t pylen) {
- if (NULL == matrix_search)
- return 0;
-
- matrix_search->search(pybuf, pylen);
- return matrix_search->get_candidate_num();
- }
-
- size_t im_delsearch(size_t pos, bool is_pos_in_splid,
- bool clear_fixed_this_step) {
- if (NULL == matrix_search)
- return 0;
- matrix_search->delsearch(pos, is_pos_in_splid, clear_fixed_this_step);
- return matrix_search->get_candidate_num();
- }
-
- void im_reset_search() {
- if (NULL == matrix_search)
- return;
-
- matrix_search->reset_search();
- }
-
- // To be removed
- size_t im_add_letter(char ch) {
- return 0;
- }
-
- const char* im_get_sps_str(size_t *decoded_len) {
- if (NULL == matrix_search)
- return NULL;
-
- return matrix_search->get_pystr(decoded_len);
- }
-
- char16* im_get_candidate(size_t cand_id, char16* cand_str,
- size_t max_len) {
- if (NULL == matrix_search)
- return NULL;
-
- return matrix_search->get_candidate(cand_id, cand_str, max_len);
- }
-
- size_t im_get_spl_start_pos(const uint16 *&spl_start) {
- if (NULL == matrix_search)
- return 0;
-
- return matrix_search->get_spl_start(spl_start);
- }
-
- size_t im_choose(size_t choice_id) {
- if (NULL == matrix_search)
- return 0;
-
- return matrix_search->choose(choice_id);
- }
-
- size_t im_cancel_last_choice() {
- if (NULL == matrix_search)
- return 0;
-
- return matrix_search->cancel_last_choice();
- }
-
- size_t im_get_fixed_len() {
- if (NULL == matrix_search)
- return 0;
-
- return matrix_search->get_fixedlen();
- }
-
- // To be removed
- bool im_cancel_input() {
- return true;
- }
-
-
- size_t im_get_predicts(const char16 *his_buf,
- char16 (*&pre_buf)[kMaxPredictSize + 1]) {
- if (NULL == his_buf)
- return 0;
-
- size_t fixed_len = utf16_strlen(his_buf);
- const char16 *fixed_ptr = his_buf;
- if (fixed_len > kMaxPredictSize) {
- fixed_ptr += fixed_len - kMaxPredictSize;
- fixed_len = kMaxPredictSize;
- }
-
- if (NULL == matrix_search)
- return 0;
-
- pre_buf = predict_buf;
- return matrix_search->get_predicts(fixed_ptr, pre_buf, kMaxPredictNum);
- }
-
- void im_enable_shm_as_szm(bool enable) {
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
- spl_trie.szm_enable_shm(enable);
- }
-
- void im_enable_ym_as_szm(bool enable) {
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
- spl_trie.szm_enable_ym(enable);
- }
-
- void im_init_user_dictionary(const char *fn_usr_dict) {
- if (!matrix_search)
- return;
- matrix_search->flush_cache();
- matrix_search->init_user_dictionary(fn_usr_dict);
- }
-
- bool im_is_user_dictionary_enabled(void) {
- return NULL != matrix_search ? matrix_search->is_user_dictionary_enabled() : false;
- }
-
-#ifdef __cplusplus
-}
-#endif
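pinyinime.cpp above is the C entry point of the decoder. A typical decode cycle with this API looks roughly like the sketch below; the dictionary file names are placeholders and error handling is omitted:

    #include "pinyinime.h"

    using namespace ime_pinyin;

    void decode_demo() {
      // Dictionary paths are placeholders for wherever the data files live.
      if (!im_open_decoder("dict_pinyin.dat", "user_dict.dat"))
        return;

      size_t num = im_search("nihao", 5);      // decode a Pinyin string
      char16 cand[32];
      for (size_t i = 0; i < num && i < 10; i++)
        im_get_candidate(i, cand, 32);         // fetch the i-th candidate

      im_choose(0);                            // fix the best candidate
      im_reset_search();
      im_close_decoder();
    }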
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/searchutility.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/searchutility.cpp
deleted file mode 100644
index 281da388..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/searchutility.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include "../include/mystdlib.h"
-#include "../include/searchutility.h"
-
-namespace ime_pinyin {
-
-bool is_system_lemma(LemmaIdType lma_id) {
- return (0 < lma_id && lma_id <= kSysDictIdEnd);
-}
-
-bool is_user_lemma(LemmaIdType lma_id) {
- return (kUserDictIdStart <= lma_id && lma_id <= kUserDictIdEnd);
-}
-
-bool is_composing_lemma(LemmaIdType lma_id) {
- return (kLemmaIdComposing == lma_id);
-}
-
-int cmp_lpi_with_psb(const void *p1, const void *p2) {
- if ((static_cast<const LmaPsbItem*>(p1))->psb >
- (static_cast<const LmaPsbItem*>(p2))->psb)
- return 1;
- if ((static_cast<const LmaPsbItem*>(p1))->psb <
- (static_cast<const LmaPsbItem*>(p2))->psb)
- return -1;
- return 0;
-}
-
-int cmp_lpi_with_unified_psb(const void *p1, const void *p2) {
- const LmaPsbItem *item1 = static_cast<const LmaPsbItem*>(p1);
- const LmaPsbItem *item2 = static_cast<const LmaPsbItem*>(p2);
-
- // The real unified psb is psb1 / lma_len1 and psb2 / lma_len2.
- // But we compare psb1 * lma_len2 with psb2 * lma_len1 to get better
- // precision.
- size_t up1 = item1->psb * (item2->lma_len);
- size_t up2 = item2->psb * (item1->lma_len);
- if (up1 < up2) {
- return -1;
- }
- if (up1 > up2) {
- return 1;
- }
- return 0;
-}
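The cross-multiplication in cmp_lpi_with_unified_psb() avoids division entirely: for positive values, psb1 / lma_len1 < psb2 / lma_len2 exactly when psb1 * lma_len2 < psb2 * lma_len1, and the integer products lose no precision to truncation. A worked check with made-up numbers:

    #include <cassert>

    int main() {
      unsigned psb1 = 200, len1 = 3;   // hypothetical 3-syllable lemma
      unsigned psb2 = 90,  len2 = 1;   // hypothetical 1-syllable lemma
      // Per-syllable scores: 200/3 ~ 66.7 versus 90/1 = 90.
      // The integer cross-products order them the same way: 200 < 270.
      assert(psb1 * len2 < psb2 * len1);
      return 0;
    }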
-
-int cmp_lpi_with_id(const void *p1, const void *p2) {
- if ((static_cast<const LmaPsbItem*>(p1))->id <
- (static_cast<const LmaPsbItem*>(p2))->id)
- return -1;
- if ((static_cast<const LmaPsbItem*>(p1))->id >
- (static_cast<const LmaPsbItem*>(p2))->id)
- return 1;
- return 0;
-}
-
-int cmp_lpi_with_hanzi(const void *p1, const void *p2) {
- if ((static_cast<const LmaPsbItem*>(p1))->hanzi <
- (static_cast<const LmaPsbItem*>(p2))->hanzi)
- return -1;
- if ((static_cast<const LmaPsbItem*>(p1))->hanzi >
- (static_cast<const LmaPsbItem*>(p2))->hanzi)
- return 1;
-
- return 0;
-}
-
-int cmp_lpsi_with_str(const void *p1, const void *p2) {
- return utf16_strcmp((static_cast<const LmaPsbStrItem*>(p1))->str,
- (static_cast<const LmaPsbStrItem*>(p2))->str);
-}
-
-
-int cmp_hanzis_1(const void *p1, const void *p2) {
- if (*static_cast<const char16*>(p1) <
- *static_cast<const char16*>(p2))
- return -1;
-
- if (*static_cast<const char16*>(p1) >
- *static_cast<const char16*>(p2))
- return 1;
- return 0;
-}
-
-int cmp_hanzis_2(const void *p1, const void *p2) {
- return utf16_strncmp(static_cast<const char16*>(p1),
- static_cast<const char16*>(p2), 2);
-}
-
-int cmp_hanzis_3(const void *p1, const void *p2) {
- return utf16_strncmp(static_cast<const char16*>(p1),
- static_cast<const char16*>(p2), 3);
-}
-
-int cmp_hanzis_4(const void *p1, const void *p2) {
- return utf16_strncmp(static_cast<const char16*>(p1),
- static_cast<const char16*>(p2), 4);
-}
-
-int cmp_hanzis_5(const void *p1, const void *p2) {
- return utf16_strncmp(static_cast<const char16*>(p1),
- static_cast<const char16*>(p2), 5);
-}
-
-int cmp_hanzis_6(const void *p1, const void *p2) {
- return utf16_strncmp(static_cast<const char16*>(p1),
- static_cast<const char16*>(p2), 6);
-}
-
-int cmp_hanzis_7(const void *p1, const void *p2) {
- return utf16_strncmp(static_cast<const char16*>(p1),
- static_cast<const char16*>(p2), 7);
-}
-
-int cmp_hanzis_8(const void *p1, const void *p2) {
- return utf16_strncmp(static_cast<const char16*>(p1),
- static_cast<const char16*>(p2), 8);
-}
-
-int cmp_npre_by_score(const void *p1, const void *p2) {
- if ((static_cast<const NPredictItem*>(p1))->psb >
- (static_cast<const NPredictItem*>(p2))->psb)
- return 1;
-
- if ((static_cast<const NPredictItem*>(p1))->psb <
- (static_cast<const NPredictItem*>(p2))->psb)
- return -1;
-
- return 0;
-}
-
-int cmp_npre_by_hislen_score(const void *p1, const void *p2) {
- if ((static_cast<const NPredictItem*>(p1))->his_len <
- (static_cast<const NPredictItem*>(p2))->his_len)
- return 1;
-
- if ((static_cast<const NPredictItem*>(p1))->his_len >
- (static_cast<const NPredictItem*>(p2))->his_len)
- return -1;
-
- if ((static_cast<const NPredictItem*>(p1))->psb >
- (static_cast<const NPredictItem*>(p2))->psb)
- return 1;
-
- if ((static_cast<const NPredictItem*>(p1))->psb <
- (static_cast<const NPredictItem*>(p2))->psb)
- return -1;
-
- return 0;
-}
-
-int cmp_npre_by_hanzi_score(const void *p1, const void *p2) {
- int ret_v = (utf16_strncmp((static_cast<const NPredictItem*>(p1))->pre_hzs,
- (static_cast<const NPredictItem*>(p2))->pre_hzs, kMaxPredictSize));
- if (0 != ret_v)
- return ret_v;
-
- if ((static_cast<const NPredictItem*>(p1))->psb >
- (static_cast<const NPredictItem*>(p2))->psb)
- return 1;
-
- if ((static_cast<const NPredictItem*>(p1))->psb <
- (static_cast<const NPredictItem*>(p2))->psb)
- return -1;
-
- return 0;
-}
-
-size_t remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num) {
- if (NULL == npre_items || 0 == npre_num)
- return 0;
-
- myqsort(npre_items, npre_num, sizeof(NPredictItem), cmp_npre_by_hanzi_score);
-
- size_t remain_num = 1; // The first one is reserved.
- for (size_t pos = 1; pos < npre_num; pos++) {
- if (utf16_strncmp(npre_items[pos].pre_hzs,
- npre_items[remain_num - 1].pre_hzs,
- kMaxPredictSize) != 0) {
- if (remain_num != pos) {
- npre_items[remain_num] = npre_items[pos];
- }
- remain_num++;
- }
- }
- return remain_num;
-}
-
-size_t align_to_size_t(size_t size) {
- size_t s = sizeof(size_t);
- return (size + s - 1) / s * s;
-}
-
-} // namespace ime_pinyin
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/spellingtable.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/spellingtable.cpp
deleted file mode 100644
index 6005e20d..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/spellingtable.cpp
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include "../include/spellingtable.h"
-
-namespace ime_pinyin {
-
-#ifdef ___BUILD_MODEL___
-
-const char SpellingTable::
- kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
-
-// "" is the biggest, so that all empty strings will be moved to the end.
-// The suffix _eb means "empty is biggest".
-int compare_raw_spl_eb(const void* p1, const void* p2) {
- if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
- return 1;
-
- if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
- return -1;
-
- return strcmp((static_cast<const RawSpelling*>(p1))->str,
- (static_cast<const RawSpelling*>(p2))->str);
-}
-
-size_t get_odd_next(size_t value) {
- size_t v_next = value;
- while (true) {
- size_t v_next_sqrt = (size_t)sqrt(v_next);
-
- bool is_odd = true;
- for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
- if (v_next % v_dv == 0) {
- is_odd = false;
- break;
- }
- }
-
- if (is_odd)
- return v_next;
-
- v_next++;
- }
-
- // never reach here
- return 0;
-}
-
-SpellingTable::SpellingTable() {
- need_score_ = false;
- raw_spellings_ = NULL;
- spelling_buf_ = NULL;
- spelling_num_ = 0;
- total_freq_ = 0;
- frozen_ = true;
-}
-
-SpellingTable::~SpellingTable() {
- free_resource();
-}
-
-size_t SpellingTable::get_hash_pos(const char* spelling_str) {
- size_t hash_pos = 0;
- for (size_t pos = 0; pos < spelling_size_; pos++) {
- if ('\0' == spelling_str[pos])
- break;
- hash_pos += (size_t)spelling_str[pos];
- }
-
- hash_pos = hash_pos % spelling_max_num_;
- return hash_pos;
-}
-
-size_t SpellingTable::hash_pos_next(size_t hash_pos) {
- hash_pos += 123;
- hash_pos = hash_pos % spelling_max_num_;
- return hash_pos;
-}
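hash_pos_next() probes with a fixed step of 123 = 3 x 41. Because the table size comes from get_odd_next(), which, despite its name, searches for the next number with no divisor up to its square root (effectively the next prime), the step is coprime with the size whenever that prime is not 3 or 41, so the probe sequence visits every slot before cycling back to its start; put_spelling() and contain() below therefore only give up when the table is genuinely full. A quick check of that property with an illustrative prime size:

    #include <cstdio>
    #include <vector>

    int main() {
      const size_t m = 1009;        // a prime table size, as get_odd_next() would produce
      const size_t start = 7;       // arbitrary starting hash position
      std::vector<bool> seen(m, false);
      size_t pos = start, visited = 0;
      do {
        if (!seen[pos]) { seen[pos] = true; visited++; }
        pos = (pos + 123) % m;      // same probe step as hash_pos_next()
      } while (pos != start);
      std::printf("visited %zu of %zu slots\n", visited, m);   // prints 1009 of 1009
      return 0;
    }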
-
-void SpellingTable::free_resource() {
- if (NULL != raw_spellings_)
- delete [] raw_spellings_;
- raw_spellings_ = NULL;
-
- if (NULL != spelling_buf_)
- delete [] spelling_buf_;
- spelling_buf_ = NULL;
-}
-
-bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num,
- bool need_score) {
- if (pure_spl_size == 0 || spl_max_num == 0)
- return false;
-
- need_score_ = need_score;
-
- free_resource();
-
- spelling_size_ = pure_spl_size + 1;
- if (need_score)
- spelling_size_ += 1;
- spelling_max_num_ = get_odd_next(spl_max_num);
- spelling_num_ = 0;
-
- raw_spellings_ = new RawSpelling[spelling_max_num_];
- spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
- if (NULL == raw_spellings_ || NULL == spelling_buf_) {
- free_resource();
- return false;
- }
-
- memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
- memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
- frozen_ = false;
- total_freq_ = 0;
- return true;
-}
-
-bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
- if (frozen_ || NULL == spelling_str)
- return false;
-
- for (size_t pos = 0; pos < kNotSupportNum; pos++) {
- if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
- return false;
- }
- }
-
- total_freq_ += freq;
-
- size_t hash_pos = get_hash_pos(spelling_str);
-
- raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
-
- if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
- spelling_size_ - 1) == 0) {
- raw_spellings_[hash_pos].freq += freq;
- return true;
- }
-
- size_t hash_pos_ori = hash_pos;
-
- while (true) {
- if (strncmp(raw_spellings_[hash_pos].str,
- spelling_str, spelling_size_ - 1) == 0) {
- raw_spellings_[hash_pos].freq += freq;
- return true;
- }
-
- if ('\0' == raw_spellings_[hash_pos].str[0]) {
- raw_spellings_[hash_pos].freq += freq;
- strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
- raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
- spelling_num_++;
- return true;
- }
-
- hash_pos = hash_pos_next(hash_pos);
- if (hash_pos_ori == hash_pos)
- return false;
- }
-
- // never reach here
- return false;
-}
-
-bool SpellingTable::contain(const char* spelling_str) {
- if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
- return false;
-
- size_t hash_pos = get_hash_pos(spelling_str);
-
- if ('\0' == raw_spellings_[hash_pos].str[0])
- return false;
-
- if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
- == 0)
- return true;
-
- size_t hash_pos_ori = hash_pos;
-
- while (true) {
- hash_pos = hash_pos_next(hash_pos);
- if (hash_pos_ori == hash_pos)
- return false;
-
- if ('\0' == raw_spellings_[hash_pos].str[0])
- return false;
-
- if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
- == 0)
- return true;
- }
-
- // never reach here
- return false;
-}
-
-const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) {
- if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
- NULL == item_size || NULL == spl_num)
- return NULL;
-
- qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
- compare_raw_spl_eb);
-
- // After sorting, only the first spelling_num_ items are valid.
- // Copy them to the destination buffer.
- for (size_t pos = 0; pos < spelling_num_; pos++) {
- strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
- spelling_size_);
- }
-
- if (need_score_) {
- if (kPrintDebug0)
- printf("------------Spelling Possibilities--------------\n");
-
- double max_score = 0;
- double min_score = 0;
-
- // After sorting, only the first spelling_num_ items are valid.
- for (size_t pos = 0; pos < spelling_num_; pos++) {
- raw_spellings_[pos].freq /= total_freq_;
- if (need_score_) {
- if (0 == pos) {
- max_score = raw_spellings_[0].freq;
- min_score = max_score;
- } else {
- if (raw_spellings_[pos].freq > max_score)
- max_score = raw_spellings_[pos].freq;
- if (raw_spellings_[pos].freq < min_score)
- min_score = raw_spellings_[pos].freq;
- }
- }
- }
-
- if (kPrintDebug0)
- printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
-
- max_score = log(max_score);
- min_score = log(min_score);
-
- if (kPrintDebug0)
- printf("-----max log value: %f, min log value: %f\n",
- max_score, min_score);
-
- // The absolute value of min_score is bigger than that of max_score because
- // both of them are negative after the log function.
- score_amplifier_ = 1.0 * 255 / min_score;
-
- double average_score = 0;
- for (size_t pos = 0; pos < spelling_num_; pos++) {
- double score = log(raw_spellings_[pos].freq) * score_amplifier_;
- assert(score >= 0);
-
- average_score += score;
-
- // Because of calculation precision issues, the score might be a little
- // bigger than 255 after being amplified.
- if (score > 255)
- score = 255;
- char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
- this_spl_buf[spelling_size_ - 1] =
- static_cast<char>((unsigned char)score);
-
- if (kPrintDebug0) {
- printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
- (unsigned char)this_spl_buf[spelling_size_ -1]);
- }
- }
- average_score /= spelling_num_;
- assert(average_score <= 255);
- average_score_ = static_cast<uint8>(average_score);
-
- if (kPrintDebug0)
- printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
- average_score_);
- }
-
- *item_size = spelling_size_;
- *spl_num = spelling_num_;
- frozen_ = true;
- return spelling_buf_;
-}
-
-float SpellingTable::get_score_amplifier() {
- return static_cast<float>(score_amplifier_);
-}
-
-unsigned char SpellingTable::get_average_score() {
- return average_score_;
-}
-
-#endif // ___BUILD_MODEL___
-} // namespace ime_pinyin
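The arrange() code above compresses each spelling's probability into a single byte: frequencies are normalized by total_freq_, passed through log(), and scaled by score_amplifier_ so that the rarest spelling (the most negative log value) maps to 255. The standalone sketch below only illustrates that quantization; it is not part of the library, and the frequency counts are made up.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical raw spelling frequencies (stand-ins for RawSpelling::freq).
  std::vector<double> freqs = {120.0, 30.0, 5.0};
  double total = 0;
  for (double f : freqs) total += f;

  // The rarest spelling defines the amplifier: its log value maps to 255.
  double min_prob = 1.0;
  for (double f : freqs) min_prob = std::min(min_prob, f / total);
  double amplifier = 255.0 / std::log(min_prob);    // negative, since log(min_prob) < 0

  for (double f : freqs) {
    double score = std::log(f / total) * amplifier; // >= 0; the rarest spelling hits 255
    if (score > 255) score = 255;                   // guard against precision overshoot
    std::printf("p=%.3f -> byte score %u\n", f / total,
                static_cast<unsigned>(score));
  }
  return 0;
}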
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/spellingtrie.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/spellingtrie.cpp
deleted file mode 100644
index e01c89a5..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/spellingtrie.cpp
+++ /dev/null
@@ -1,832 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "../include/dictdef.h"
-
-#ifdef _WIN32
-#define snprintf _snprintf
-#endif
-
-#ifdef ___BUILD_MODEL___
-#include "../include/spellingtable.h"
-#endif
-
-#include "../include/spellingtrie.h"
-
-namespace ime_pinyin {
-
-SpellingTrie* SpellingTrie::instance_ = NULL;
-
-// z/c/s is for Zh/Ch/Sh
-const char SpellingTrie::kHalfId2Sc_[kFullSplIdStart + 1] =
- "0ABCcDEFGHIJKLMNOPQRSsTUVWXYZz";
-
-// Bit 0 : is it a Shengmu char?
-// Bit 1 : is it a Yunmu char? (one char is a Yunmu)
-// Bit 2 : is it enabled in ShouZiMu(first char) mode?
-unsigned char SpellingTrie::char_flags_[] = {
- // a b c d e f g
- 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01,
- // h i j k l m n
- 0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01,
- // o p q r s t
- 0x02, 0x01, 0x01, 0x01, 0x01, 0x01,
- // u v w x y z
- 0x00, 0x00, 0x01, 0x01, 0x01, 0x01
-};
-
-int compare_spl(const void* p1, const void* p2) {
- return strcmp((const char*)(p1), (const char*)(p2));
-}
-
-SpellingTrie::SpellingTrie() {
- spelling_buf_ = NULL;
- spelling_size_ = 0;
- spelling_num_ = 0;
- spl_ym_ids_ = NULL;
- splstr_queried_ = NULL;
- splstr16_queried_ = NULL;
- root_ = NULL;
- dumb_node_ = NULL;
- splitter_node_ = NULL;
- instance_ = NULL;
- ym_buf_ = NULL;
- f2h_ = NULL;
-
- szm_enable_shm(true);
- szm_enable_ym(true);
-
-#ifdef ___BUILD_MODEL___
- node_num_ = 0;
-#endif
-}
-
-SpellingTrie::~SpellingTrie() {
- if (NULL != spelling_buf_)
- delete [] spelling_buf_;
-
- if (NULL != splstr_queried_)
- delete [] splstr_queried_;
-
- if (NULL != splstr16_queried_)
- delete [] splstr16_queried_;
-
- if (NULL != spl_ym_ids_)
- delete [] spl_ym_ids_;
-
- if (NULL != root_) {
- free_son_trie(root_);
- delete root_;
- }
-
- if (NULL != dumb_node_) {
- delete [] dumb_node_;
- }
-
- if (NULL != splitter_node_) {
- delete [] splitter_node_;
- }
-
- if (NULL != instance_) {
- delete instance_;
- instance_ = NULL;
- }
-
- if (NULL != ym_buf_)
- delete [] ym_buf_;
-
- if (NULL != f2h_)
- delete [] f2h_;
-}
-
-bool SpellingTrie::if_valid_id_update(uint16 *splid) const {
- if (NULL == splid || 0 == *splid)
- return false;
-
- if (*splid >= kFullSplIdStart)
- return true;
- if (*splid < kFullSplIdStart) {
- char ch = kHalfId2Sc_[*splid];
- if (ch > 'Z') {
- return true;
- } else {
- if (szm_is_enabled(ch)) {
- return true;
- } else if (is_yunmu_char(ch)) {
- assert(h2f_num_[*splid] > 0);
- *splid = h2f_start_[*splid];
- return true;
- }
- }
- }
- return false;
-}
-
-bool SpellingTrie::is_half_id(uint16 splid) const {
- if (0 == splid || splid >= kFullSplIdStart)
- return false;
-
- return true;
-}
-
-bool SpellingTrie::is_full_id(uint16 splid) const {
- if (splid < kFullSplIdStart || splid >= kFullSplIdStart + spelling_num_)
- return false;
- return true;
-}
-
-bool SpellingTrie::half_full_compatible(uint16 half_id, uint16 full_id) const {
- uint16 half_fr_full = full_to_half(full_id);
-
- if (half_fr_full == half_id)
- return true;
-
- // &~0x20 is used to conver the char to upper case.
- // So that Zh/Ch/Sh(whose char is z/c/s) can be matched with Z/C/S.
- char ch_f = (kHalfId2Sc_[half_fr_full] & (~0x20));
- char ch_h = kHalfId2Sc_[half_id];
- if (ch_f == ch_h)
- return true;
-
- return false;
-}
-
-bool SpellingTrie::is_half_id_yunmu(uint16 splid) const {
- if (0 == splid || splid >= kFullSplIdStart)
- return false;
-
- char ch = kHalfId2Sc_[splid];
- // If ch >= 'a', that means the half id is one of Zh/Ch/Sh
- if (ch >= 'a') {
- return false;
- }
-
- return char_flags_[ch - 'A'] & kHalfIdYunmuMask;
-}
-
-bool SpellingTrie::is_shengmu_char(char ch) const {
- return char_flags_[ch - 'A'] & kHalfIdShengmuMask;
-}
-
-bool SpellingTrie::is_yunmu_char(char ch) const {
- return char_flags_[ch - 'A'] & kHalfIdYunmuMask;
-}
-
-bool SpellingTrie::is_szm_char(char ch) const {
- return is_shengmu_char(ch) || is_yunmu_char(ch);
-}
-
-bool SpellingTrie::szm_is_enabled(char ch) const {
- return char_flags_[ch - 'A'] & kHalfIdSzmMask;
-}
-
-void SpellingTrie::szm_enable_shm(bool enable) {
- if (enable) {
- for (char ch = 'A'; ch <= 'Z'; ch++) {
- if (is_shengmu_char(ch))
- char_flags_[ch - 'A'] = char_flags_[ch - 'A'] | kHalfIdSzmMask;
- }
- } else {
- for (char ch = 'A'; ch <= 'Z'; ch++) {
- if (is_shengmu_char(ch))
- char_flags_[ch - 'A'] = char_flags_[ch - 'A'] & (kHalfIdSzmMask ^ 0xff);
- }
- }
-}
-
-void SpellingTrie::szm_enable_ym(bool enable) {
- if (enable) {
- for (char ch = 'A'; ch <= 'Z'; ch++) {
- if (is_yunmu_char(ch))
- char_flags_[ch - 'A'] = char_flags_[ch - 'A'] | kHalfIdSzmMask;
- }
- } else {
- for (char ch = 'A'; ch <= 'Z'; ch++) {
- if (is_yunmu_char(ch))
- char_flags_[ch - 'A'] = char_flags_[ch - 'A'] & (kHalfIdSzmMask ^ 0xff);
- }
- }
-}
-
-bool SpellingTrie::is_szm_enabled(char ch) const {
- return char_flags_[ch - 'A'] & kHalfIdSzmMask;
-}
-
-const SpellingTrie* SpellingTrie::get_cpinstance() {
- return &get_instance();
-}
-
-SpellingTrie& SpellingTrie::get_instance() {
- if (NULL == instance_)
- instance_ = new SpellingTrie();
-
- return *instance_;
-}
-
-uint16 SpellingTrie::half2full_num(uint16 half_id) const {
- if (NULL == root_ || half_id >= kFullSplIdStart)
- return 0;
- return h2f_num_[half_id];
-}
-
-uint16 SpellingTrie::half_to_full(uint16 half_id, uint16 *spl_id_start) const {
- if (NULL == spl_id_start || NULL == root_ || half_id >= kFullSplIdStart)
- return 0;
-
- *spl_id_start = h2f_start_[half_id];
- return h2f_num_[half_id];
-}
-
-uint16 SpellingTrie::full_to_half(uint16 full_id) const {
- if (NULL == root_ || full_id < kFullSplIdStart ||
- full_id > spelling_num_ + kFullSplIdStart)
- return 0;
-
- return f2h_[full_id - kFullSplIdStart];
-}
-
-void SpellingTrie::free_son_trie(SpellingNode* node) {
- if (NULL == node)
- return;
-
- for (size_t pos = 0; pos < node->num_of_son; pos++) {
- free_son_trie(node->first_son + pos);
- }
-
- if (NULL != node->first_son)
- delete [] node->first_son;
-}
-
-bool SpellingTrie::construct(const char* spelling_arr, size_t item_size,
- size_t item_num, float score_amplifier,
- unsigned char average_score) {
- if (spelling_arr == NULL)
- return false;
-
- memset(h2f_start_, 0, sizeof(uint16) * kFullSplIdStart);
- memset(h2f_num_, 0, sizeof(uint16) * kFullSplIdStart);
-
- // If the arr is the same as the buf, this function is being called by
- // load_spl_trie() and the table data are already in place; otherwise the
- // array should be saved.
- if (spelling_arr != spelling_buf_) {
- if (NULL != spelling_buf_)
- delete [] spelling_buf_;
- spelling_buf_ = new char[item_size * item_num];
- if (NULL == spelling_buf_)
- return false;
- memcpy(spelling_buf_, spelling_arr, sizeof(char) * item_size * item_num);
- }
-
- spelling_size_ = item_size;
- spelling_num_ = item_num;
-
- score_amplifier_ = score_amplifier;
- average_score_ = average_score;
-
- if (NULL != splstr_queried_)
- delete [] splstr_queried_;
- splstr_queried_ = new char[spelling_size_];
- if (NULL == splstr_queried_)
- return false;
-
- if (NULL != splstr16_queried_)
- delete [] splstr16_queried_;
- splstr16_queried_ = new char16[spelling_size_];
- if (NULL == splstr16_queried_)
- return false;
-
- // First, sort the buf to ensure the spellings are in ascending order
- qsort(spelling_buf_, spelling_num_, spelling_size_, compare_spl);
-
-#ifdef ___BUILD_MODEL___
- node_num_ = 1;
-#endif
-
- root_ = new SpellingNode();
- memset(root_, 0, sizeof(SpellingNode));
-
- dumb_node_ = new SpellingNode();
- memset(dumb_node_, 0, sizeof(SpellingNode));
- dumb_node_->score = average_score_;
-
- splitter_node_ = new SpellingNode();
- memset(splitter_node_, 0, sizeof(SpellingNode));
- splitter_node_->score = average_score_;
-
- memset(level1_sons_, 0, sizeof(SpellingNode*) * kValidSplCharNum);
-
- root_->first_son = construct_spellings_subset(0, spelling_num_, 0, root_);
-
- // Root's score should be cleared.
- root_->score = 0;
-
- if (NULL == root_->first_son)
- return false;
-
- h2f_start_[0] = h2f_num_[0] = 0;
-
- if (!build_f2h())
- return false;
-
-#ifdef ___BUILD_MODEL___
- if (kPrintDebug0) {
- printf("---SpellingTrie Nodes: %d\n", (int)node_num_);
- }
- return build_ym_info();
-#else
- return true;
-#endif
-}
-
-#ifdef ___BUILD_MODEL___
-const char* SpellingTrie::get_ym_str(const char *spl_str) {
- bool start_ZCS = false;
- if (is_shengmu_char(*spl_str)) {
- if ('Z' == *spl_str || 'C' == *spl_str || 'S' == *spl_str)
- start_ZCS = true;
- spl_str += 1;
- if (start_ZCS && 'h' == *spl_str)
- spl_str += 1;
- }
- return spl_str;
-}
-
-bool SpellingTrie::build_ym_info() {
- bool success;
- SpellingTable *spl_table = new SpellingTable();
-
- sucess = spl_table->init_table(kMaxPinyinSize - 1, 2 * kMaxYmNum, false);
- assert(sucess);
-
- for (uint16 pos = 0; pos < spelling_num_; pos++) {
- const char *spl_str = spelling_buf_ + spelling_size_ * pos;
- spl_str = get_ym_str(spl_str);
- if ('\0' != spl_str[0]) {
- sucess = spl_table->put_spelling(spl_str, 0);
- assert(sucess);
- }
- }
-
- size_t ym_item_size; // '\0' is included
- size_t ym_num;
- const char* ym_buf;
- ym_buf = spl_table->arrange(&ym_item_size, &ym_num);
-
- if (NULL != ym_buf_)
- delete [] ym_buf_;
- ym_buf_ = new char[ym_item_size * ym_num];
- if (NULL == ym_buf_) {
- delete spl_table;
- return false;
- }
-
- memcpy(ym_buf_, ym_buf, sizeof(char) * ym_item_size * ym_num);
- ym_size_ = ym_item_size;
- ym_num_ = ym_num;
-
- delete spl_table;
-
- // Generate the mapping from the spelling ids to the Yunmu ids.
- if (spl_ym_ids_)
- delete [] spl_ym_ids_;
- spl_ym_ids_ = new uint8[spelling_num_ + kFullSplIdStart];
- if (NULL == spl_ym_ids_)
- return false;
-
- memset(spl_ym_ids_, 0, sizeof(uint8) * (spelling_num_ + kFullSplIdStart));
-
- for (uint16 id = 1; id < spelling_num_ + kFullSplIdStart; id++) {
- const char *str = get_spelling_str(id);
-
- str = get_ym_str(str);
- if ('\0' != str[0]) {
- uint8 ym_id = get_ym_id(str);
- spl_ym_ids_[id] = ym_id;
- assert(ym_id > 0);
- } else {
- spl_ym_ids_[id] = 0;
- }
- }
- return true;
-}
-#endif
-
-SpellingNode* SpellingTrie::construct_spellings_subset(
- size_t item_start, size_t item_end, size_t level, SpellingNode* parent) {
- if (level >= spelling_size_ || item_end <= item_start || NULL == parent)
- return NULL;
-
- SpellingNode *first_son = NULL;
- uint16 num_of_son = 0;
- unsigned char min_son_score = 255;
-
- const char *spelling_last_start = spelling_buf_ + spelling_size_ * item_start;
- char char_for_node = spelling_last_start[level];
- assert((char_for_node >= 'A' && char_for_node <= 'Z') ||
- 'h' == char_for_node);
-
- // Scan the array to find how many sons
- for (size_t i = item_start + 1; i < item_end; i++) {
- const char *spelling_current = spelling_buf_ + spelling_size_ * i;
- char char_current = spelling_current[level];
- if (char_current != char_for_node) {
- num_of_son++;
- char_for_node = char_current;
- }
- }
- num_of_son++;
-
- // Allocate memory
-#ifdef ___BUILD_MODEL___
- node_num_ += num_of_son;
-#endif
- first_son = new SpellingNode[num_of_son];
- memset(first_son, 0, sizeof(SpellingNode)*num_of_son);
-
- // Now begin construct tree
- size_t son_pos = 0;
-
- spelling_last_start = spelling_buf_ + spelling_size_ * item_start;
- char_for_node = spelling_last_start[level];
-
- bool spelling_endable = true;
- if (spelling_last_start[level + 1] != '\0')
- spelling_endable = false;
-
- size_t item_start_next = item_start;
-
- for (size_t i = item_start + 1; i < item_end; i++) {
- const char *spelling_current = spelling_buf_ + spelling_size_ * i;
- char char_current = spelling_current[level];
- assert(is_valid_spl_char(char_current));
-
- if (char_current != char_for_node) {
- // Construct a node
- SpellingNode *node_current = first_son + son_pos;
- node_current->char_this_node = char_for_node;
-
- // For quick search in the first level
- if (0 == level)
- level1_sons_[char_for_node - 'A'] = node_current;
-
- if (spelling_endable) {
- node_current->spelling_idx = kFullSplIdStart + item_start_next;
- }
-
- if (spelling_last_start[level + 1] != '\0' || i - item_start_next > 1) {
- size_t real_start = item_start_next;
- if (spelling_last_start[level + 1] == '\0')
- real_start++;
-
- node_current->first_son =
- construct_spellings_subset(real_start, i, level + 1,
- node_current);
-
- if (real_start == item_start_next + 1) {
- uint16 score_this = static_cast<unsigned char>(
- spelling_last_start[spelling_size_ - 1]);
- if (score_this < node_current->score)
- node_current->score = score_this;
- }
- } else {
- node_current->first_son = NULL;
- node_current->score = static_cast<unsigned char>(
- spelling_last_start[spelling_size_ - 1]);
- }
-
- if (node_current->score < min_son_score)
- min_son_score = node_current->score;
-
- bool is_half = false;
- if (level == 0 && is_szm_char(char_for_node)) {
- node_current->spelling_idx =
- static_cast<uint16>(char_for_node - 'A' + 1);
-
- if (char_for_node > 'C')
- node_current->spelling_idx++;
- if (char_for_node > 'S')
- node_current->spelling_idx++;
-
- h2f_num_[node_current->spelling_idx] = i - item_start_next;
- is_half = true;
- } else if (level == 1 && char_for_node == 'h') {
- char ch_level0 = spelling_last_start[0];
- uint16 part_id = 0;
- if (ch_level0 == 'C')
- part_id = 'C' - 'A' + 1 + 1;
- else if (ch_level0 == 'S')
- part_id = 'S' - 'A' + 1 + 2;
- else if (ch_level0 == 'Z')
- part_id = 'Z' - 'A' + 1 + 3;
- if (0 != part_id) {
- node_current->spelling_idx = part_id;
- h2f_num_[node_current->spelling_idx] = i - item_start_next;
- is_half = true;
- }
- }
-
- if (is_half) {
- if (h2f_num_[node_current->spelling_idx] > 0)
- h2f_start_[node_current->spelling_idx] =
- item_start_next + kFullSplIdStart;
- else
- h2f_start_[node_current->spelling_idx] = 0;
- }
-
- // for next sibling
- spelling_last_start = spelling_current;
- char_for_node = char_current;
- item_start_next = i;
- spelling_endable = true;
- if (spelling_current[level + 1] != '\0')
- spelling_endable = false;
-
- son_pos++;
- }
- }
-
- // the last one
- SpellingNode *node_current = first_son + son_pos;
- node_current->char_this_node = char_for_node;
-
- // For quick search in the first level
- if (0 == level)
- level1_sons_[char_for_node - 'A'] = node_current;
-
- if (spelling_endable) {
- node_current->spelling_idx = kFullSplIdStart + item_start_next;
- }
-
- if (spelling_last_start[level + 1] != '\0' ||
- item_end - item_start_next > 1) {
- size_t real_start = item_start_next;
- if (spelling_last_start[level + 1] == '\0')
- real_start++;
-
- node_current->first_son =
- construct_spellings_subset(real_start, item_end, level + 1,
- node_current);
-
- if (real_start == item_start_next + 1) {
- uint16 score_this = static_cast<unsigned char>(
- spelling_last_start[spelling_size_ - 1]);
- if (score_this < node_current->score)
- node_current->score = score_this;
- }
- } else {
- node_current->first_son = NULL;
- node_current->score = static_cast<unsigned char>(
- spelling_last_start[spelling_size_ - 1]);
- }
-
- if (node_current->score < min_son_score)
- min_son_score = node_current->score;
-
- assert(son_pos + 1 == num_of_son);
-
- bool is_half = false;
- if (level == 0 && szm_is_enabled(char_for_node)) {
- node_current->spelling_idx = static_cast<uint16>(char_for_node - 'A' + 1);
-
- if (char_for_node > 'C')
- node_current->spelling_idx++;
- if (char_for_node > 'S')
- node_current->spelling_idx++;
-
- h2f_num_[node_current->spelling_idx] = item_end - item_start_next;
- is_half = true;
- } else if (level == 1 && char_for_node == 'h') {
- char ch_level0 = spelling_last_start[0];
- uint16 part_id = 0;
- if (ch_level0 == 'C')
- part_id = 'C' - 'A' + 1 + 1;
- else if (ch_level0 == 'S')
- part_id = 'S' - 'A' + 1 + 2;
- else if (ch_level0 == 'Z')
- part_id = 'Z' - 'A' + 1 + 3;
- if (0 != part_id) {
- node_current->spelling_idx = part_id;
- h2f_num_[node_current->spelling_idx] = item_end - item_start_next;
- is_half = true;
- }
- }
- if (is_half) {
- if (h2f_num_[node_current->spelling_idx] > 0)
- h2f_start_[node_current->spelling_idx] =
- item_start_next + kFullSplIdStart;
- else
- h2f_start_[node_current->spelling_idx] = 0;
- }
-
- parent->num_of_son = num_of_son;
- parent->score = min_son_score;
- return first_son;
-}
-
-bool SpellingTrie::save_spl_trie(FILE *fp) {
- if (NULL == fp || NULL == spelling_buf_)
- return false;
-
- if (fwrite(&spelling_size_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fwrite(&spelling_num_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fwrite(&score_amplifier_, sizeof(float), 1, fp) != 1)
- return false;
-
- if (fwrite(&average_score_, sizeof(unsigned char), 1, fp) != 1)
- return false;
-
- if (fwrite(spelling_buf_, sizeof(char) * spelling_size_,
- spelling_num_, fp) != spelling_num_)
- return false;
-
- return true;
-}
-
-bool SpellingTrie::load_spl_trie(FILE *fp) {
- if (NULL == fp)
- return false;
-
- if (fread(&spelling_size_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fread(&spelling_num_, sizeof(uint32), 1, fp) != 1)
- return false;
-
- if (fread(&score_amplifier_, sizeof(float), 1, fp) != 1)
- return false;
-
- if (fread(&average_score_, sizeof(unsigned char), 1, fp) != 1)
- return false;
-
- if (NULL != spelling_buf_)
- delete [] spelling_buf_;
-
- spelling_buf_ = new char[spelling_size_ * spelling_num_];
- if (NULL == spelling_buf_)
- return false;
-
- if (fread(spelling_buf_, sizeof(char) * spelling_size_,
- spelling_num_, fp) != spelling_num_)
- return false;
-
- return construct(spelling_buf_, spelling_size_, spelling_num_,
- score_amplifier_, average_score_);
-}
-
-bool SpellingTrie::build_f2h() {
- if (NULL != f2h_)
- delete [] f2h_;
- f2h_ = new uint16[spelling_num_];
- if (NULL == f2h_)
- return false;
-
- for (uint16 hid = 0; hid < kFullSplIdStart; hid++) {
- for (uint16 fid = h2f_start_[hid];
- fid < h2f_start_[hid] + h2f_num_[hid]; fid++)
- f2h_[fid - kFullSplIdStart] = hid;
- }
-
- return true;
-}
-
-size_t SpellingTrie::get_spelling_num() {
- return spelling_num_;
-}
-
-uint8 SpellingTrie::get_ym_id(const char *ym_str) {
- if (NULL == ym_str || NULL == ym_buf_)
- return 0;
-
- for (uint8 pos = 0; pos < ym_num_; pos++)
- if (strcmp(ym_buf_ + ym_size_ * pos, ym_str) == 0)
- return pos + 1;
-
- return 0;
-}
-
-const char* SpellingTrie::get_spelling_str(uint16 splid) {
- splstr_queried_[0] = '\0';
-
- if (splid >= kFullSplIdStart) {
- splid -= kFullSplIdStart;
- snprintf(splstr_queried_, spelling_size_, "%s",
- spelling_buf_ + splid * spelling_size_);
- } else {
- if (splid == 'C' - 'A' + 1 + 1) {
- snprintf(splstr_queried_, spelling_size_, "%s", "Ch");
- } else if (splid == 'S' - 'A' + 1 + 2) {
- snprintf(splstr_queried_, spelling_size_, "%s", "Sh");
- } else if (splid == 'Z' - 'A' + 1 + 3) {
- snprintf(splstr_queried_, spelling_size_, "%s", "Zh");
- } else {
- if (splid > 'C' - 'A' + 1)
- splid--;
- if (splid > 'S' - 'A' + 1)
- splid--;
- splstr_queried_[0] = 'A' + splid - 1;
- splstr_queried_[1] = '\0';
- }
- }
- return splstr_queried_;
-}
-
-const char16* SpellingTrie::get_spelling_str16(uint16 splid) {
- splstr16_queried_[0] = '\0';
-
- if (splid >= kFullSplIdStart) {
- splid -= kFullSplIdStart;
- for (size_t pos = 0; pos < spelling_size_; pos++) {
- splstr16_queried_[pos] = static_cast<char16>
- (spelling_buf_[splid * spelling_size_ + pos]);
- }
- } else {
- if (splid == 'C' - 'A' + 1 + 1) {
- splstr16_queried_[0] = static_cast<char16>('C');
- splstr16_queried_[1] = static_cast<char16>('h');
- splstr16_queried_[2] = static_cast<char16>('\0');
- } else if (splid == 'S' - 'A' + 1 + 2) {
- splstr16_queried_[0] = static_cast<char16>('S');
- splstr16_queried_[1] = static_cast<char16>('h');
- splstr16_queried_[2] = static_cast<char16>('\0');
- } else if (splid == 'Z' - 'A' + 1 + 3) {
- splstr16_queried_[0] = static_cast<char16>('Z');
- splstr16_queried_[1] = static_cast<char16>('h');
- splstr16_queried_[2] = static_cast<char16>('\0');
- } else {
- if (splid > 'C' - 'A' + 1)
- splid--;
- if (splid > 'S' - 'A' + 1)
- splid--;
- splstr16_queried_[0] = 'A' + splid - 1;
- splstr16_queried_[1] = '\0';
- }
- }
- return splstr16_queried_;
-}
-
-size_t SpellingTrie::get_spelling_str16(uint16 splid, char16 *splstr16,
- size_t splstr16_len) {
- if (NULL == splstr16 || splstr16_len < kMaxPinyinSize + 1) return 0;
-
- if (splid >= kFullSplIdStart) {
- splid -= kFullSplIdStart;
- for (size_t pos = 0; pos <= kMaxPinyinSize; pos++) {
- splstr16[pos] = static_cast<char16>
- (spelling_buf_[splid * spelling_size_ + pos]);
- if (static_cast<char16>('\0') == splstr16[pos]) {
- return pos;
- }
- }
- } else {
- if (splid == 'C' - 'A' + 1 + 1) {
- splstr16[0] = static_cast<char16>('C');
- splstr16[1] = static_cast<char16>('h');
- splstr16[2] = static_cast<char16>('\0');
- return 2;
- } else if (splid == 'S' - 'A' + 1 + 2) {
- splstr16[0] = static_cast<char16>('S');
- splstr16[1] = static_cast<char16>('h');
- splstr16[2] = static_cast<char16>('\0');
- return 2;
- } else if (splid == 'Z' - 'A' + 1 + 3) {
- splstr16[0] = static_cast<char16>('Z');
- splstr16[1] = static_cast<char16>('h');
- splstr16[2] = static_cast<char16>('\0');
- return 2;
- } else {
- if (splid > 'C' - 'A' + 1)
- splid--;
- if (splid > 'S' - 'A' + 1)
- splid--;
- splstr16[0] = 'A' + splid - 1;
- splstr16[1] = '\0';
- return 1;
- }
- }
-
- // Not reachable.
- return 0;
-}
-
-} // namespace ime_pinyin
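In the trie code above, half ids 1..kFullSplIdStart-1 cover the single initials A..Z plus dedicated slots for Ch, Sh and Zh (see kHalfId2Sc_), while full spelling ids start at kFullSplIdStart. The standalone sketch below mirrors the index arithmetic used in construct_spellings_subset() and get_spelling_str(); half_id_for_initial is a hypothetical helper written only for illustration.

#include <cstdio>

// Hypothetical helper mirroring the half-id arithmetic above: a letter maps
// to ch - 'A' + 1, shifted past the extra slots reserved for "Ch" and "Sh".
static unsigned half_id_for_initial(char ch) {
  unsigned id = ch - 'A' + 1;
  if (ch > 'C') ++id;  // skip the "Ch" slot (id 4)
  if (ch > 'S') ++id;  // skip the "Sh" slot (id 21)
  return id;
}

int main() {
  std::printf("A -> %u\n", half_id_for_initial('A'));  // 1
  std::printf("C -> %u\n", half_id_for_initial('C'));  // 3, "Ch" takes 4
  std::printf("S -> %u\n", half_id_for_initial('S'));  // 20, "Sh" takes 21
  std::printf("Z -> %u\n", half_id_for_initial('Z'));  // 28, "Zh" takes 29
  return 0;
}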
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/splparser.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/splparser.cpp
deleted file mode 100644
index d75aec6a..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/splparser.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include "../include/splparser.h"
-
-namespace ime_pinyin {
-
-SpellingParser::SpellingParser() {
- spl_trie_ = SpellingTrie::get_cpinstance();
-}
-
-bool SpellingParser::is_valid_to_parse(char ch) {
- return SpellingTrie::is_valid_spl_char(ch);
-}
-
-uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
- uint16 spl_idx[], uint16 start_pos[],
- uint16 max_size, bool &last_is_pre) {
- if (NULL == splstr || 0 == max_size || 0 == str_len)
- return 0;
-
- if (!SpellingTrie::is_valid_spl_char(splstr[0]))
- return 0;
-
- last_is_pre = false;
-
- const SpellingNode *node_this = spl_trie_->root_;
-
- uint16 str_pos = 0;
- uint16 idx_num = 0;
- if (NULL != start_pos)
- start_pos[0] = 0;
- bool last_is_splitter = false;
-
- while (str_pos < str_len) {
- char char_this = splstr[str_pos];
- // all characters outside of [a, z] are considered as splitters
- if (!SpellingTrie::is_valid_spl_char(char_this)) {
- // test if the current node is endable
- uint16 id_this = node_this->spelling_idx;
- if (spl_trie_->if_valid_id_update(&id_this)) {
- spl_idx[idx_num] = id_this;
-
- idx_num++;
- str_pos++;
- if (NULL != start_pos)
- start_pos[idx_num] = str_pos;
- if (idx_num >= max_size)
- return idx_num;
-
- node_this = spl_trie_->root_;
- last_is_splitter = true;
- continue;
- } else {
- if (last_is_splitter) {
- str_pos++;
- if (NULL != start_pos)
- start_pos[idx_num] = str_pos;
- continue;
- } else {
- return idx_num;
- }
- }
- }
-
- last_is_splitter = false;
-
- SpellingNode *found_son = NULL;
-
- if (0 == str_pos) {
- if (char_this >= 'a')
- found_son = spl_trie_->level1_sons_[char_this - 'a'];
- else
- found_son = spl_trie_->level1_sons_[char_this - 'A'];
- } else {
- SpellingNode *first_son = node_this->first_son;
- // Zh/Ch/Sh nodes are stored last in the buffer and are frequently used,
- // but here we simply scan the sons in order for a match.
- for (int i = 0; i < node_this->num_of_son; i++) {
- SpellingNode *this_son = first_son + i;
- if (SpellingTrie::is_same_spl_char(
- this_son->char_this_node, char_this)) {
- found_son = this_son;
- break;
- }
- }
- }
-
- // found, just move the current node pointer to that son
- if (NULL != found_son) {
- node_this = found_son;
- } else {
- // not found, test if it is endable
- uint16 id_this = node_this->spelling_idx;
- if (spl_trie_->if_valid_id_update(&id_this)) {
- // endable, remember the index
- spl_idx[idx_num] = id_this;
-
- idx_num++;
- if (NULL != start_pos)
- start_pos[idx_num] = str_pos;
- if (idx_num >= max_size)
- return idx_num;
- node_this = spl_trie_->root_;
- continue;
- } else {
- return idx_num;
- }
- }
-
- str_pos++;
- }
-
- uint16 id_this = node_this->spelling_idx;
- if (spl_trie_->if_valid_id_update(&id_this)) {
- // endable, remember the index
- spl_idx[idx_num] = id_this;
-
- idx_num++;
- if (NULL != start_pos)
- start_pos[idx_num] = str_pos;
- }
-
- last_is_pre = !last_is_splitter;
-
- return idx_num;
-}
-
-uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
- uint16 spl_idx[], uint16 start_pos[],
- uint16 max_size, bool &last_is_pre) {
- uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
- max_size, last_is_pre);
- for (uint16 pos = 0; pos < idx_num; pos++) {
- if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
- spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
- if (pos == idx_num - 1) {
- last_is_pre = false;
- }
- }
- }
- return idx_num;
-}
-
-uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
- uint16 spl_idx[], uint16 start_pos[],
- uint16 max_size, bool &last_is_pre) {
- if (NULL == splstr || 0 == max_size || 0 == str_len)
- return 0;
-
- if (!SpellingTrie::is_valid_spl_char(splstr[0]))
- return 0;
-
- last_is_pre = false;
-
- const SpellingNode *node_this = spl_trie_->root_;
-
- uint16 str_pos = 0;
- uint16 idx_num = 0;
- if (NULL != start_pos)
- start_pos[0] = 0;
- bool last_is_splitter = false;
-
- while (str_pos < str_len) {
- char16 char_this = splstr[str_pos];
- // all characters outside of [a, z] are considered as splitters
- if (!SpellingTrie::is_valid_spl_char(char_this)) {
- // test if the current node is endable
- uint16 id_this = node_this->spelling_idx;
- if (spl_trie_->if_valid_id_update(&id_this)) {
- spl_idx[idx_num] = id_this;
-
- idx_num++;
- str_pos++;
- if (NULL != start_pos)
- start_pos[idx_num] = str_pos;
- if (idx_num >= max_size)
- return idx_num;
-
- node_this = spl_trie_->root_;
- last_is_splitter = true;
- continue;
- } else {
- if (last_is_splitter) {
- str_pos++;
- if (NULL != start_pos)
- start_pos[idx_num] = str_pos;
- continue;
- } else {
- return idx_num;
- }
- }
- }
-
- last_is_splitter = false;
-
- SpellingNode *found_son = NULL;
-
- if (0 == str_pos) {
- if (char_this >= 'a')
- found_son = spl_trie_->level1_sons_[char_this - 'a'];
- else
- found_son = spl_trie_->level1_sons_[char_this - 'A'];
- } else {
- SpellingNode *first_son = node_this->first_son;
- // Zh/Ch/Sh nodes are stored last in the buffer and are frequently used,
- // but here we simply scan the sons in order for a match.
- for (int i = 0; i < node_this->num_of_son; i++) {
- SpellingNode *this_son = first_son + i;
- if (SpellingTrie::is_same_spl_char(
- this_son->char_this_node, char_this)) {
- found_son = this_son;
- break;
- }
- }
- }
-
- // found, just move the current node pointer to that son
- if (NULL != found_son) {
- node_this = found_son;
- } else {
- // not found, test if it is endable
- uint16 id_this = node_this->spelling_idx;
- if (spl_trie_->if_valid_id_update(&id_this)) {
- // endable, remember the index
- spl_idx[idx_num] = id_this;
-
- idx_num++;
- if (NULL != start_pos)
- start_pos[idx_num] = str_pos;
- if (idx_num >= max_size)
- return idx_num;
- node_this = spl_trie_->root_;
- continue;
- } else {
- return idx_num;
- }
- }
-
- str_pos++;
- }
-
- uint16 id_this = node_this->spelling_idx;
- if (spl_trie_->if_valid_id_update(&id_this)) {
- // endable, remember the index
- spl_idx[idx_num] = id_this;
-
- idx_num++;
- if (NULL != start_pos)
- start_pos[idx_num] = str_pos;
- }
-
- last_is_pre = !last_is_splitter;
-
- return idx_num;
-}
-
-uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
- uint16 spl_idx[], uint16 start_pos[],
- uint16 max_size, bool &last_is_pre) {
- uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
- max_size, last_is_pre);
- for (uint16 pos = 0; pos < idx_num; pos++) {
- if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
- spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
- if (pos == idx_num - 1) {
- last_is_pre = false;
- }
- }
- }
- return idx_num;
-}
-
-uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
- bool *is_pre) {
- if (NULL == is_pre)
- return 0;
-
- uint16 spl_idx[2];
- uint16 start_pos[3];
-
- if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
- return 0;
-
- if (start_pos[1] != str_len)
- return 0;
- return spl_idx[0];
-}
-
-uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
- bool *is_pre) {
- if (NULL == is_pre)
- return 0;
-
- uint16 spl_idx[2];
- uint16 start_pos[3];
-
- if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
- return 0;
-
- if (start_pos[1] != str_len)
- return 0;
- if (spl_trie_->is_half_id_yunmu(spl_idx[0])) {
- spl_trie_->half_to_full(spl_idx[0], spl_idx);
- *is_pre = false;
- }
-
- return spl_idx[0];
-}
-
-uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
- uint16 splidx[], uint16 max_size,
- uint16 &full_id_num, bool &is_pre) {
- if (max_size <= 0 || !is_valid_to_parse(splstr[0]))
- return 0;
-
- splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
- full_id_num = 0;
- if (0 != splidx[0]) {
- if (splidx[0] >= kFullSplIdStart)
- full_id_num = 1;
- return 1;
- }
- return 0;
-}
-
-} // namespace ime_pinyin
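splstr_to_idxs() above walks the SpellingNode trie and, whenever the next character cannot extend the current node, closes the spelling at that point and restarts from the root, which in effect gives a greedy longest-prefix segmentation of the input string. The toy sketch below only illustrates that behaviour with a tiny hard-coded spelling set instead of the real trie and spelling ids; it is not the library's algorithm verbatim.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

int main() {
  // Tiny stand-in for the spelling trie; the real parser matches against
  // SpellingNode children and also produces half/full spelling ids.
  const std::set<std::string> spellings = {"ni", "hao", "zh", "ch", "sh",
                                           "zha", "chang", "ma"};
  const std::string input = "nihaoma";

  std::vector<std::string> pieces;
  size_t pos = 0;
  while (pos < input.size()) {
    size_t best = 0;
    // Greedy: take the longest known spelling starting at pos.
    for (size_t len = 1; len <= input.size() - pos; ++len) {
      if (spellings.count(input.substr(pos, len)))
        best = len;
    }
    if (best == 0) break;  // unparsable tail: stop, like returning idx_num early
    pieces.push_back(input.substr(pos, best));
    pos += best;
  }

  for (const std::string &p : pieces)
    std::printf("%s ", p.c_str());
  std::printf("\n");  // prints: ni hao ma
  return 0;
}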
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/sync.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/sync.cpp
deleted file mode 100644
index 91e27b88..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/sync.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/sync.h"
-#include <assert.h>
-#include <string.h>
-
-#ifdef ___SYNC_ENABLED___
-
-namespace ime_pinyin {
-
-Sync::Sync()
- : userdict_(NULL),
- dictfile_(NULL),
- last_count_(0) {
-};
-
-Sync::~Sync() {
-}
-
-
-bool Sync::begin(const char * filename) {
- if (userdict_) {
- finish();
- }
-
- if (!filename) {
- return false;
- }
-
- dictfile_ = strdup(filename);
- if (!dictfile_) {
- return false;
- }
-
- userdict_ = new UserDict();
- if (!userdict_) {
- free(dictfile_);
- dictfile_ = NULL;
- return false;
- }
-
- if (userdict_->load_dict((const char*)dictfile_, kUserDictIdStart,
- kUserDictIdEnd) == false) {
- delete userdict_;
- userdict_ = NULL;
- free(dictfile_);
- dictfile_ = NULL;
- return false;
- }
-
- userdict_->set_limit(kUserDictMaxLemmaCount, kUserDictMaxLemmaSize, kUserDictRatio);
-
- return true;
-}
-
-int Sync::put_lemmas(char16 * lemmas, int len) {
- return userdict_->put_lemmas_no_sync_from_utf16le_string(lemmas, len);
-}
-
-int Sync::get_lemmas(char16 * str, int size) {
- return userdict_->get_sync_lemmas_in_utf16le_string_from_beginning(str, size, &last_count_);
-}
-
-int Sync::get_last_got_count() {
- return last_count_;
-}
-
-int Sync::get_total_count() {
- return userdict_->get_sync_count();
-}
-
-void Sync::clear_last_got() {
- if (last_count_ < 0) {
- return;
- }
- userdict_->clear_sync_lemmas(0, last_count_);
- last_count_ = 0;
-}
-
-void Sync::finish() {
- if (userdict_) {
- userdict_->close_dict();
- delete userdict_;
- userdict_ = NULL;
- free(dictfile_);
- dictfile_ = NULL;
- last_count_ = 0;
- }
-}
-
-int Sync::get_capacity() {
- UserDict::UserDictStat stat;
- userdict_->state(&stat);
- return stat.limit_lemma_count - stat.lemma_count;
-}
-
-}
-#endif
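The Sync helper above wraps a UserDict opened from a dictionary file and exposes a small upload/download cycle over the sync lemma list. The sketch below shows the intended call order only; it compiles solely inside this source tree with ___SYNC_ENABLED___ defined, and the file path and buffer size are made-up placeholders.

#include "../include/sync.h"

using namespace ime_pinyin;

void sync_roundtrip_sketch() {
  Sync sync;
  if (!sync.begin("/path/to/usr_dict.dat"))  // placeholder user-dict path
    return;

  // Upload: fetch locally added lemmas as a UTF-16LE string, then mark
  // them as synced once the transfer has succeeded.
  char16 out_buf[1024];
  int got = sync.get_lemmas(out_buf, 1024);
  if (got > 0) {
    // ... send out_buf (got characters) to the server here ...
    sync.clear_last_got();
  }

  // Download: merge lemmas received from the server (not shown) with
  // sync.put_lemmas(remote_buf, remote_len);

  sync.finish();  // closes the user dictionary, flushing pending changes
}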
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/userdict.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/userdict.cpp
deleted file mode 100644
index 4687da2d..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/userdict.cpp
+++ /dev/null
@@ -1,2286 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/userdict.h"
-#include "../include/splparser.h"
-#include "../include/ngram.h"
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#ifdef ___DEBUG_PERF___
-#include <cutils/log.h>
-#endif
-#ifdef _WIN32
-#include <io.h>
-#else
-#include <unistd.h>
-#endif
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <assert.h>
-#include <ctype.h>
-#include <sys/types.h>
-#ifdef _WIN32
-#undef max
-#undef min
-#include <QDateTime>
-#include <QMutex>
-#else
-#include <pthread.h>
-#endif
-#include <math.h>
-
-namespace ime_pinyin {
-
-#ifdef _WIN32
-static int gettimeofday(struct timeval *tp, void *) {
- const qint64 current_msecs_since_epoch = QDateTime::currentMSecsSinceEpoch();
- tp->tv_sec = (long)(current_msecs_since_epoch / 1000);
- tp->tv_usec = (long)((current_msecs_since_epoch % 1000) * 1000);
- return 0;
-}
-#endif
-
-#ifdef ___DEBUG_PERF___
-static uint64 _ellapse_ = 0;
-static struct timeval _tv_start_, _tv_end_;
-#define DEBUG_PERF_BEGIN \
- do { \
- gettimeofday(&_tv_start_, NULL); \
- } while (0)
-#define DEBUG_PERF_END \
- do { \
- gettimeofday(&_tv_end_, NULL); \
- _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
- (_tv_end_.tv_usec - _tv_start_.tv_usec); \
- } while (0)
-#define LOGD_PERF(message) \
- ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
-#else
-#define DEBUG_PERF_BEGIN
-#define DEBUG_PERF_END
-#define LOGD_PERF(message)
-#endif
-
-// XXX File load and write are thread-safe by g_mutex_
-#ifdef _WIN32
-static QMutex g_mutex_;
-#define pthread_mutex_lock(MUTEX) ((MUTEX)->lock())
-#define pthread_mutex_unlock(MUTEX) ((MUTEX)->unlock())
-#define pthread_mutex_trylock(MUTEX) (!(MUTEX)->tryLock(0))
-#else
-static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
-#endif
-static struct timeval g_last_update_ = {0, 0};
-
-inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
- return (4 + info->lemma_size + (info->lemma_count << 3)
-#ifdef ___PREDICT_ENABLED___
- + (info->lemma_count << 2)
-#endif
-#ifdef ___SYNC_ENABLED___
- + (info->sync_count << 2)
-#endif
- + sizeof(*info));
-}
-
-inline LmaScoreType UserDict::translate_score(int raw_score) {
- // 1) ori_freq: original user frequency
- uint32 ori_freq = extract_score_freq(raw_score);
- // 2) lmt_off: lmt index (week offset for example)
- uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
- if (kUserDictLMTBitWidth < 16) {
- uint64 mask = ~(1 << kUserDictLMTBitWidth);
- lmt_off &= mask;
- }
- // 3) now_off: current time index (current week offset for example)
- // assuming load_time_ is around current time
- uint64 now_off = load_time_.tv_sec;
- now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
- now_off = (now_off << (64 - kUserDictLMTBitWidth));
- now_off = (now_off >> (64 - kUserDictLMTBitWidth));
- // 4) factor: decide expand-factor
- int delta = now_off - lmt_off;
- if (delta > 4)
- delta = 4;
- int factor = 80 - (delta << 4);
-
- double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
- return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
- * NGram::kLogValueAmplifier);
-}
-
-inline int UserDict::extract_score_freq(int raw_score) {
- // Frequency is stored in the lowest 16 bits
- int freq = (raw_score & 0x0000ffff);
- return freq;
-}
-
-inline uint64 UserDict::extract_score_lmt(int raw_score) {
- uint64 lmt = ((raw_score & 0xffff0000) >> 16);
- if (kUserDictLMTBitWidth < 16) {
- uint64 mask = ~(1 << kUserDictLMTBitWidth);
- lmt &= mask;
- }
- lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
- return lmt;
-}
-
-inline int UserDict::build_score(uint64 lmt, int freq) {
- lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
- lmt = (lmt << (64 - kUserDictLMTBitWidth));
- lmt = (lmt >> (64 - kUserDictLMTBitWidth));
- uint16 lmt16 = (uint16)lmt;
- int s = freq;
- s &= 0x0000ffff;
- s = (lmt16 << 16) | s;
- return s;
-}
-
-inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
- int64 ret = 0;
- if (len <= 0)
- return ret;
-
- int flag = 1;
- const uint16 * endp = s + len;
- if (*s == '-') {
- flag = -1;
- s++;
- } else if (*s == '+') {
- s++;
- }
-
- while (*s >= '0' && *s <= '9' && s < endp) {
- ret += ret * 10 + (*s) - '0';
- s++;
- }
- return ret * flag;
-}
-
-inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
- if (!s || size <= 0)
- return 0;
- uint16 *endp = s + size;
- int ret_len = 0;
- if (v < 0) {
- *(s++) = '-';
- ++ret_len;
- v *= -1;
- }
-
- uint16 *b = s;
- while (s < endp && v != 0) {
- *(s++) = '0' + (v % 10);
- v = v / 10;
- ++ret_len;
- }
-
- if (v != 0)
- return 0;
-
- --s;
-
- while (b < s) {
- uint16 tmp = *b;
- *b = *s;
- *s = tmp;
- ++b, --s;
- }
-
- return ret_len;
-}
-
-inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
- offset &= kUserDictOffsetMask;
- lemmas_[offset] |= flag;
-}
-
-inline char UserDict::get_lemma_flag(uint32 offset) {
- offset &= kUserDictOffsetMask;
- return (char)(lemmas_[offset]);
-}
-
-inline char UserDict::get_lemma_nchar(uint32 offset) {
- offset &= kUserDictOffsetMask;
- return (char)(lemmas_[offset + 1]);
-}
-
-inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
- offset &= kUserDictOffsetMask;
- return (uint16 *)(lemmas_ + offset + 2);
-}
-
-inline uint16 * UserDict::get_lemma_word(uint32 offset) {
- offset &= kUserDictOffsetMask;
- uint8 nchar = get_lemma_nchar(offset);
- return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
-}
-
-inline LemmaIdType UserDict::get_max_lemma_id() {
- // When a lemma is deleted, we do not reclaim its id, for
- // simplicity and performance.
- return start_id_ + dict_info_.lemma_count - 1;
-}
-
-inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
- if (id >= start_id_ && id <= get_max_lemma_id())
- return true;
- return false;
-}
-
-inline bool UserDict::is_valid_state() {
- if (state_ == USER_DICT_NONE)
- return false;
- return true;
-}
-
-UserDict::UserDict()
- : start_id_(0),
- version_(0),
- lemmas_(NULL),
- offsets_(NULL),
- scores_(NULL),
- ids_(NULL),
-#ifdef ___PREDICT_ENABLED___
- predicts_(NULL),
-#endif
-#ifdef ___SYNC_ENABLED___
- syncs_(NULL),
- sync_count_size_(0),
-#endif
- offsets_by_id_(NULL),
- lemma_count_left_(0),
- lemma_size_left_(0),
- dict_file_(NULL),
- state_(USER_DICT_NONE) {
- memset(&dict_info_, 0, sizeof(dict_info_));
- memset(&load_time_, 0, sizeof(load_time_));
-#ifdef ___CACHE_ENABLED___
- cache_init();
-#endif
-}
-
-UserDict::~UserDict() {
- close_dict();
-}
-
-bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
- LemmaIdType end_id) {
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_BEGIN;
-#endif
- dict_file_ = strdup(file_name);
- if (!dict_file_)
- return false;
-
- start_id_ = start_id;
-
- if (false == validate(file_name) && false == reset(file_name)) {
- goto error;
- }
- if (false == load(file_name, start_id)) {
- goto error;
- }
-
- state_ = USER_DICT_SYNC;
-
- gettimeofday(&load_time_, NULL);
-
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_END;
- LOGD_PERF("load_dict");
-#endif
- return true;
- error:
- free((void*)dict_file_);
- dict_file_ = NULL;
- start_id_ = 0;
- return false;
-}
-
-bool UserDict::close_dict() {
- if (state_ == USER_DICT_NONE)
- return true;
- if (state_ == USER_DICT_SYNC)
- goto out;
-
- // If the dictionary has been written back by others,
- // we cannot simply write back here.
- // To do a safe flush, we have to discard all newly added
- // lemmas and try to reload the dict file.
- pthread_mutex_lock(&g_mutex_);
- if (load_time_.tv_sec > g_last_update_.tv_sec ||
- (load_time_.tv_sec == g_last_update_.tv_sec &&
- load_time_.tv_usec > g_last_update_.tv_usec)) {
- write_back();
- gettimeofday(&g_last_update_, NULL);
- }
- pthread_mutex_unlock(&g_mutex_);
-
- out:
- free((void*)dict_file_);
- free(lemmas_);
- free(offsets_);
- free(offsets_by_id_);
- free(scores_);
- free(ids_);
-#ifdef ___PREDICT_ENABLED___
- free(predicts_);
-#endif
-
- version_ = 0;
- dict_file_ = NULL;
- lemmas_ = NULL;
-#ifdef ___SYNC_ENABLED___
- syncs_ = NULL;
- sync_count_size_ = 0;
-#endif
- offsets_ = NULL;
- offsets_by_id_ = NULL;
- scores_ = NULL;
- ids_ = NULL;
-#ifdef ___PREDICT_ENABLED___
- predicts_ = NULL;
-#endif
-
- memset(&dict_info_, 0, sizeof(dict_info_));
- lemma_count_left_ = 0;
- lemma_size_left_ = 0;
- state_ = USER_DICT_NONE;
-
- return true;
-}
-
-size_t UserDict::number_of_lemmas() {
- return dict_info_.lemma_count;
-}
-
-void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
- return;
-}
-
-MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
- const DictExtPara *dep,
- LmaPsbItem *lpi_items,
- size_t lpi_max, size_t *lpi_num) {
- if (is_valid_state() == false)
- return 0;
-
- bool need_extend = false;
-
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_BEGIN;
-#endif
- *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
- lpi_items, lpi_max, &need_extend);
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_END;
- LOGD_PERF("extend_dict");
-#endif
- return ((*lpi_num > 0 || need_extend) ? 1 : 0);
-}
-
-int UserDict::is_fuzzy_prefix_spell_id(
- const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
- if (len1 < searchable->splids_len)
- return 0;
-
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
- uint32 i = 0;
- for (i = 0; i < searchable->splids_len; i++) {
- const char py1 = *spl_trie.get_spelling_str(id1[i]);
- uint16 off = 8 * (i % 4);
- const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
- if (py1 == py2)
- continue;
- return 0;
- }
- return 1;
-}
-
-int UserDict::fuzzy_compare_spell_id(
- const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
- if (len1 < searchable->splids_len)
- return -1;
- if (len1 > searchable->splids_len)
- return 1;
-
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
- uint32 i = 0;
- for (i = 0; i < len1; i++) {
- const char py1 = *spl_trie.get_spelling_str(id1[i]);
- uint16 off = 8 * (i % 4);
- const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
- if (py1 == py2)
- continue;
- if (py1 > py2)
- return 1;
- return -1;
- }
- return 0;
-}
-
-bool UserDict::is_prefix_spell_id(
- const uint16 * fullids, uint16 fulllen,
- const UserDictSearchable *searchable) {
- if (fulllen < searchable->splids_len)
- return false;
-
- uint32 i = 0;
- for (; i < searchable->splids_len; i++) {
- uint16 start_id = searchable->splid_start[i];
- uint16 count = searchable->splid_count[i];
- if (fullids[i] >= start_id && fullids[i] < start_id + count)
- continue;
- else
- return false;
- }
- return true;
-}
-
-bool UserDict::equal_spell_id(
- const uint16 * fullids, uint16 fulllen,
- const UserDictSearchable *searchable) {
- if (fulllen != searchable->splids_len)
- return false;
-
- uint32 i = 0;
- for (; i < fulllen; i++) {
- uint16 start_id = searchable->splid_start[i];
- uint16 count = searchable->splid_count[i];
- if (fullids[i] >= start_id && fullids[i] < start_id + count)
- continue;
- else
- return false;
- }
- return true;
-}
-
-int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
- int32 begin = 0;
- int32 end = dict_info_.lemma_count - 1;
- int32 middle = -1;
-
- int32 first_prefix = middle;
- int32 last_matched = middle;
-
- while (begin <= end) {
- middle = (begin + end) >> 1;
- uint32 offset = offsets_[middle];
- uint8 nchar = get_lemma_nchar(offset);
- const uint16 * splids = get_lemma_spell_ids(offset);
- int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
- int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
-
- if (pre)
- first_prefix = middle;
-
- if (cmp < 0) {
- begin = middle + 1;
- } else if (cmp > 0) {
- end = middle - 1;
- } else {
- end = middle - 1;
- last_matched = middle;
- }
- }
-
- return first_prefix;
-}
-
-void UserDict::prepare_locate(UserDictSearchable *searchable,
- const uint16 *splid_str,
- uint16 splid_str_len) {
- searchable->splids_len = splid_str_len;
- memset(searchable->signature, 0, sizeof(searchable->signature));
-
- SpellingTrie &spl_trie = SpellingTrie::get_instance();
- uint32 i = 0;
- for (; i < splid_str_len; i++) {
- if (spl_trie.is_half_id(splid_str[i])) {
- searchable->splid_count[i] =
- spl_trie.half_to_full(splid_str[i],
- &(searchable->splid_start[i]));
- } else {
- searchable->splid_count[i] = 1;
- searchable->splid_start[i] = splid_str[i];
- }
- const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
- searchable->signature[i>>2] |= (py << (8 * (i % 4)));
- }
-}
-
-size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
- LmaPsbItem *lpi_items, size_t lpi_max) {
- return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
-}
-
-size_t UserDict::_get_lpis(const uint16 *splid_str,
- uint16 splid_str_len, LmaPsbItem *lpi_items,
- size_t lpi_max, bool * need_extend) {
- bool tmp_extend;
- if (!need_extend)
- need_extend = &tmp_extend;
-
- *need_extend = false;
-
- if (is_valid_state() == false)
- return 0;
- if (lpi_max <= 0)
- return 0;
-
- if (0 == pthread_mutex_trylock(&g_mutex_)) {
- if (load_time_.tv_sec < g_last_update_.tv_sec ||
- (load_time_.tv_sec == g_last_update_.tv_sec &&
- load_time_.tv_usec < g_last_update_.tv_usec)) {
- // Others updated disk file, have to reload
- pthread_mutex_unlock(&g_mutex_);
- flush_cache();
- } else {
- pthread_mutex_unlock(&g_mutex_);
- }
- } else {
- }
-
- UserDictSearchable searchable;
- prepare_locate(&searchable, splid_str, splid_str_len);
-
- uint32 max_off = dict_info_.lemma_count;
-#ifdef ___CACHE_ENABLED___
- int32 middle;
- uint32 start, count;
- bool cached = cache_hit(&searchable, &start, &count);
- if (cached) {
- middle = start;
- max_off = start + count;
- } else {
- middle = locate_first_in_offsets(&searchable);
- start = middle;
- }
-#else
- int32 middle = locate_first_in_offsets(&searchable);
-#endif
-
- if (middle == -1) {
-#ifdef ___CACHE_ENABLED___
- if (!cached)
- cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
-#endif
- return 0;
- }
-
- size_t lpi_current = 0;
-
- bool fuzzy_break = false;
- bool prefix_break = false;
- while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
- if (lpi_current >= lpi_max)
- break;
- uint32 offset = offsets_[middle];
- // Ignore deleted lemmas
- if (offset & kUserDictOffsetFlagRemove) {
- middle++;
- continue;
- }
- uint8 nchar = get_lemma_nchar(offset);
- uint16 * splids = get_lemma_spell_ids(offset);
-#ifdef ___CACHE_ENABLED___
- if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
-#else
- if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
-#endif
- fuzzy_break = true;
- }
-
- if (prefix_break == false) {
- if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
- if (*need_extend == false &&
- is_prefix_spell_id(splids, nchar, &searchable)) {
- *need_extend = true;
- }
- } else {
- prefix_break = true;
- }
- }
-
- if (equal_spell_id(splids, nchar, &searchable) == true) {
- lpi_items[lpi_current].psb = translate_score(scores_[middle]);
- lpi_items[lpi_current].id = ids_[middle];
- lpi_items[lpi_current].lma_len = nchar;
- lpi_current++;
- }
- middle++;
- }
-
-#ifdef ___CACHE_ENABLED___
- if (!cached) {
- count = middle - start;
- cache_push(USER_DICT_CACHE, &searchable, start, count);
- }
-#endif
-
- return lpi_current;
-}
-
-uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
- uint16 str_max) {
- if (is_valid_state() == false)
- return 0;
- if (is_valid_lemma_id(id_lemma) == false)
- return 0;
- uint32 offset = offsets_by_id_[id_lemma - start_id_];
- uint8 nchar = get_lemma_nchar(offset);
- char16 * str = get_lemma_word(offset);
- uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
- int i = 0;
- for (; i < m; i++) {
- str_buf[i] = str[i];
- }
- str_buf[i] = 0;
- return m;
-}
-
-uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
- uint16 splids_max, bool arg_valid) {
- if (is_valid_lemma_id(id_lemma) == false)
- return 0;
- uint32 offset = offsets_by_id_[id_lemma - start_id_];
- uint8 nchar = get_lemma_nchar(offset);
- const uint16 * ids = get_lemma_spell_ids(offset);
- int i = 0;
- for (; i < nchar && i < splids_max; i++)
- splids[i] = ids[i];
- return i;
-}
-
-size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
- NPredictItem *npre_items, size_t npre_max,
- size_t b4_used) {
- uint32 new_added = 0;
-#ifdef ___PREDICT_ENABLED___
- int32 end = dict_info_.lemma_count - 1;
- int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
- if (j == -1)
- return 0;
-
- while (j <= end) {
- uint32 offset = predicts_[j];
- // Ignore deleted lemmas
- if (offset & kUserDictOffsetFlagRemove) {
- j++;
- continue;
- }
- uint32 nchar = get_lemma_nchar(offset);
- uint16 * words = get_lemma_word(offset);
- uint16 * splids = get_lemma_spell_ids(offset);
-
- if (nchar <= hzs_len) {
- j++;
- continue;
- }
-
- if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
- if (new_added >= npre_max) {
- return new_added;
- }
- uint32 cpy_len =
- (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
- - (hzs_len << 1);
- npre_items[new_added].his_len = hzs_len;
- npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
- memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
- if ((cpy_len >> 1) < kMaxPredictSize) {
- npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
- }
- new_added++;
- } else {
- break;
- }
-
- j++;
- }
-#endif
- return new_added;
-}
-
-int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
- uint16 lemma_len) {
- int32 max_off = dict_info_.lemma_count;
-
- UserDictSearchable searchable;
- prepare_locate(&searchable, splid_str, lemma_len);
-#ifdef ___CACHE_ENABLED___
- int32 off;
- uint32 start, count;
- bool cached = load_cache(&searchable, &start, &count);
- if (cached) {
- off = start;
- max_off = start + count;
- } else {
- off = locate_first_in_offsets(&searchable);
- start = off;
- }
-#else
- int32 off = locate_first_in_offsets(&searchable);
-#endif
-
- if (off == -1) {
- return off;
- }
-
- while (off < max_off) {
- uint32 offset = offsets_[off];
- if (offset & kUserDictOffsetFlagRemove) {
- off++;
- continue;
- }
- uint16 * splids = get_lemma_spell_ids(offset);
-#ifdef ___CACHE_ENABLED___
- if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
- break;
-#else
- if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
- break;
-#endif
- if (equal_spell_id(splids, lemma_len, &searchable) == true) {
- uint16 * str = get_lemma_word(offset);
- uint32 i = 0;
- for (i = 0; i < lemma_len; i++) {
- if (str[i] == lemma_str[i])
- continue;
- break;
- }
- if (i < lemma_len) {
- off++;
- continue;
- }
-#ifdef ___CACHE_ENABLED___
- // No need to save_cache here, since this function is invoked by
- // put_lemma and a user rarely inputs the same lemma twice in a row:
- // the first time a new lemma is typed it is added to the user
- // dictionary, and only later might the same lemma be typed again.
- // Another reason save_cache cannot be invoked here is that this
- // function returns as soon as the lemma is found, so it never knows
- // the count.
-#endif
- return off;
- }
- off++;
- }
-
- return -1;
-}
-
-#ifdef ___PREDICT_ENABLED___
-uint32 UserDict::locate_where_to_insert_in_predicts(
- const uint16 * words, int lemma_len) {
- int32 begin = 0;
- int32 end = dict_info_.lemma_count - 1;
- int32 middle = end;
-
- uint32 last_matched = middle;
-
- while (begin <= end) {
- middle = (begin + end) >> 1;
- uint32 offset = offsets_[middle];
- uint8 nchar = get_lemma_nchar(offset);
- const uint16 * ws = get_lemma_word(offset);
-
- uint32 minl = nchar < lemma_len ? nchar : lemma_len;
- uint32 k = 0;
- int cmp = 0;
-
- for (; k < minl; k++) {
- if (ws[k] < words[k]) {
- cmp = -1;
- break;
- } else if (ws[k] > words[k]) {
- cmp = 1;
- break;
- }
- }
- if (cmp == 0) {
- if (nchar < lemma_len)
- cmp = -1;
- else if (nchar > lemma_len)
- cmp = 1;
- }
-
- if (cmp < 0) {
- begin = middle + 1;
- last_matched = middle;
- } else if (cmp > 0) {
- end = middle - 1;
- } else {
- end = middle - 1;
- last_matched = middle;
- }
- }
-
- return last_matched;
-}
-
-int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
- int32 begin = 0;
- int32 end = dict_info_.lemma_count - 1;
- int32 middle = -1;
-
- int32 last_matched = middle;
-
- while (begin <= end) {
- middle = (begin + end) >> 1;
- uint32 offset = offsets_[middle];
- uint8 nchar = get_lemma_nchar(offset);
- const uint16 * ws = get_lemma_word(offset);
-
- uint32 minl = nchar < lemma_len ? nchar : lemma_len;
- uint32 k = 0;
- int cmp = 0;
-
- for (; k < minl; k++) {
- if (ws[k] < words[k]) {
- cmp = -1;
- break;
- } else if (ws[k] > words[k]) {
- cmp = 1;
- break;
- }
- }
- if (cmp == 0) {
- if (nchar >= lemma_len)
- last_matched = middle;
- if (nchar < lemma_len)
- cmp = -1;
- else if (nchar > lemma_len)
- cmp = 1;
- }
-
- if (cmp < 0) {
- begin = middle + 1;
- } else if (cmp > 0) {
- end = middle - 1;
- } else {
- end = middle - 1;
- }
- }
-
- return last_matched;
-}
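-
-// Illustrative sketch, not part of the original sources: both binary
-// searches above order the predict entries by UTF-16 code unit and, on a
-// common prefix, by length. A standalone comparator (hypothetical name)
-// with the assumed equivalent behaviour would be:
-static int compare_predict_entry(const uint16 *ws, uint8 nchar,
-                                 const uint16 *words, int lemma_len) {
-  int minl = nchar < lemma_len ? nchar : lemma_len;
-  for (int k = 0; k < minl; k++) {
-    if (ws[k] != words[k])
-      return ws[k] < words[k] ? -1 : 1;
-  }
-  if (nchar == lemma_len)
-    return 0;
-  return nchar < lemma_len ? -1 : 1;
-}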
-
-#endif
-
-LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len) {
- int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
- if (off == -1) {
- return 0;
- }
-
- return ids_[off];
-}
-
-LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
- if (is_valid_state() == false)
- return 0;
- if (is_valid_lemma_id(lemma_id) == false)
- return 0;
-
- return translate_score(_get_lemma_score(lemma_id));
-}
-
-LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len) {
- if (is_valid_state() == false)
- return 0;
- return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
-}
-
-int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
- if (is_valid_state() == false)
- return 0;
- if (is_valid_lemma_id(lemma_id) == false)
- return 0;
-
- uint32 offset = offsets_by_id_[lemma_id - start_id_];
-
- uint32 nchar = get_lemma_nchar(offset);
- uint16 * spl = get_lemma_spell_ids(offset);
- uint16 * wrd = get_lemma_word(offset);
-
- int32 off = locate_in_offsets(wrd, spl, nchar);
- if (off == -1) {
- return 0;
- }
-
- return scores_[off];
-}
-
-int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len) {
- if (is_valid_state() == false)
- return 0;
-
- int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
- if (off == -1) {
- return 0;
- }
-
- return scores_[off];
-}
-
-#ifdef ___SYNC_ENABLED___
-void UserDict::remove_lemma_from_sync_list(uint32 offset) {
- offset &= kUserDictOffsetMask;
- uint32 i = 0;
- for (; i < dict_info_.sync_count; i++) {
- unsigned int off = (syncs_[i] & kUserDictOffsetMask);
- if (off == offset)
- break;
- }
- if (i < dict_info_.sync_count) {
- syncs_[i] = syncs_[dict_info_.sync_count - 1];
- dict_info_.sync_count--;
- }
-}
-#endif
-
-#ifdef ___PREDICT_ENABLED___
-void UserDict::remove_lemma_from_predict_list(uint32 offset) {
- offset &= kUserDictOffsetMask;
- uint32 i = 0;
- for (; i < dict_info_.lemma_count; i++) {
- unsigned int off = (predicts_[i] & kUserDictOffsetMask);
- if (off == offset) {
- predicts_[i] |= kUserDictOffsetFlagRemove;
- break;
- }
- }
-}
-#endif
-
-bool UserDict::remove_lemma_by_offset_index(int offset_index) {
- if (is_valid_state() == false)
- return false;
-
- int32 off = offset_index;
- if (off == -1) {
- return false;
- }
-
- uint32 offset = offsets_[off];
- uint32 nchar = get_lemma_nchar(offset);
-
- offsets_[off] |= kUserDictOffsetFlagRemove;
-
-#ifdef ___SYNC_ENABLED___
- // Remove corresponding sync item
- remove_lemma_from_sync_list(offset);
-#endif
-
-#ifdef ___PREDICT_ENABLED___
- remove_lemma_from_predict_list(offset);
-#endif
- dict_info_.free_count++;
- dict_info_.free_size += (2 + (nchar << 2));
-
- if (state_ < USER_DICT_OFFSET_DIRTY)
- state_ = USER_DICT_OFFSET_DIRTY;
- return true;
-}
-
-bool UserDict::remove_lemma(LemmaIdType lemma_id) {
- if (is_valid_state() == false)
- return false;
- if (is_valid_lemma_id(lemma_id) == false)
- return false;
- uint32 offset = offsets_by_id_[lemma_id - start_id_];
-
- uint32 nchar = get_lemma_nchar(offset);
- uint16 * spl = get_lemma_spell_ids(offset);
- uint16 * wrd = get_lemma_word(offset);
-
- int32 off = locate_in_offsets(wrd, spl, nchar);
-
- return remove_lemma_by_offset_index(off);
-}
-
-void UserDict::flush_cache() {
- LemmaIdType start_id = start_id_;
- if (!dict_file_)
- return;
- const char * file = strdup(dict_file_);
- if (!file)
- return;
- close_dict();
- load_dict(file, start_id, kUserDictIdEnd);
- free((void*)file);
-#ifdef ___CACHE_ENABLED___
- cache_init();
-#endif
- return;
-}
-
-bool UserDict::reset(const char *file) {
- FILE *fp = fopen(file, "w+");
- if (!fp) {
- return false;
- }
- uint32 version = kUserDictVersion;
- size_t wred = fwrite(&version, 1, 4, fp);
- UserDictInfo info;
- memset(&info, 0, sizeof(info));
- // By default there is no limit on lemma count or size,
- // so reclaim_ratio is never used.
- wred += fwrite(&info, 1, sizeof(info), fp);
- if (wred != sizeof(info) + sizeof(version)) {
- fclose(fp);
- unlink(file);
- return false;
- }
- fclose(fp);
- return true;
-}
-
-bool UserDict::validate(const char *file) {
- // The "b" flag is ignored on POSIX-compliant systems (including Linux),
- // but on Windows it is required to open the file in binary mode.
- FILE *fp = fopen(file, "rb");
- if (!fp) {
- return false;
- }
-
- size_t size;
- size_t readed;
- uint32 version;
- UserDictInfo dict_info;
-
- // validate
- int err = fseek(fp, 0, SEEK_END);
- if (err) {
- goto error;
- }
-
- size = ftell(fp);
- if (size < 4 + sizeof(dict_info)) {
- goto error;
- }
-
- err = fseek(fp, 0, SEEK_SET);
- if (err) {
- goto error;
- }
-
- readed = fread(&version, 1, sizeof(version), fp);
- if (readed < sizeof(version)) {
- goto error;
- }
- if (version != kUserDictVersion) {
- goto error;
- }
-
- err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
- if (err) {
- goto error;
- }
-
- readed = fread(&dict_info, 1, sizeof(dict_info), fp);
- if (readed != sizeof(dict_info)) {
- goto error;
- }
-
- if (size != get_dict_file_size(&dict_info)) {
- goto error;
- }
-
- fclose(fp);
- return true;
-
- error:
- fclose(fp);
- return false;
-}
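-
-// Illustrative sketch, not part of the original sources: the file layout
-// assumed by validate() above and by load() and the write_back_*() routines
-// below is
-//
-//   [version: 4 bytes][lemmas: lemma_size bytes]
-//   [offsets: lemma_count * 4][predicts: lemma_count * 4, if enabled]
-//   [scores: lemma_count * 4][syncs: sync_count * 4, if enabled]
-//   [UserDictInfo]
-//
-// so the size check done via get_dict_file_size (defined elsewhere) is
-// presumably equivalent to:
-static size_t expected_dict_file_size(const UserDictInfo *info) {
-  size_t size = 4 + info->lemma_size + (info->lemma_count << 2);
-#ifdef ___PREDICT_ENABLED___
-  size += (info->lemma_count << 2);
-#endif
-  size += (info->lemma_count << 2);  // scores
-#ifdef ___SYNC_ENABLED___
-  size += (info->sync_count << 2);
-#endif
-  return size + sizeof(UserDictInfo);
-}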
-
-bool UserDict::load(const char *file, LemmaIdType start_id) {
- if (0 != pthread_mutex_trylock(&g_mutex_)) {
- return false;
- }
- // The "b" flag is ignored on POSIX-compliant systems (including Linux),
- // but on Windows it is required to open the file in binary mode.
- FILE *fp = fopen(file, "rb");
- if (!fp) {
- pthread_mutex_unlock(&g_mutex_);
- return false;
- }
-
- size_t readed, toread;
- UserDictInfo dict_info;
- uint8 *lemmas = NULL;
- uint32 *offsets = NULL;
-#ifdef ___SYNC_ENABLED___
- uint32 *syncs = NULL;
-#endif
- uint32 *scores = NULL;
- uint32 *ids = NULL;
- uint32 *offsets_by_id = NULL;
-#ifdef ___PREDICT_ENABLED___
- uint32 *predicts = NULL;
-#endif
- size_t i;
- int err;
-
- err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
- if (err) goto error;
-
- readed = fread(&dict_info, 1, sizeof(dict_info), fp);
- if (readed != sizeof(dict_info)) goto error;
-
- lemmas = (uint8 *)malloc(
- dict_info.lemma_size +
- (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
-
- if (!lemmas) goto error;
-
- offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
- if (!offsets) goto error;
-
-#ifdef ___PREDICT_ENABLED___
- predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
- if (!predicts) goto error;
-#endif
-
-#ifdef ___SYNC_ENABLED___
- syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
- if (!syncs) goto error;
-#endif
-
- scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
- if (!scores) goto error;
-
- ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
- if (!ids) goto error;
-
- offsets_by_id = (uint32 *)malloc(
- (dict_info.lemma_count + kUserDictPreAlloc) << 2);
- if (!offsets_by_id) goto error;
-
- err = fseek(fp, 4, SEEK_SET);
- if (err) goto error;
-
- readed = 0;
- while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
- readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
- }
- if (readed < dict_info.lemma_size)
- goto error;
-
- toread = (dict_info.lemma_count << 2);
- readed = 0;
- while (readed < toread && !ferror(fp) && !feof(fp)) {
- readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
- }
- if (readed < toread)
- goto error;
-
-#ifdef ___PREDICT_ENABLED___
- toread = (dict_info.lemma_count << 2);
- readed = 0;
- while (readed < toread && !ferror(fp) && !feof(fp)) {
- readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
- }
- if (readed < toread)
- goto error;
-#endif
-
- readed = 0;
- while (readed < toread && !ferror(fp) && !feof(fp)) {
- readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
- }
- if (readed < toread)
- goto error;
-
-#ifdef ___SYNC_ENABLED___
- toread = (dict_info.sync_count << 2);
- readed = 0;
- while (readed < toread && !ferror(fp) && !feof(fp)) {
- readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
- }
- if (readed < toread)
- goto error;
-#endif
-
- for (i = 0; i < dict_info.lemma_count; i++) {
- ids[i] = start_id + i;
- offsets_by_id[i] = offsets[i];
- }
-
- lemmas_ = lemmas;
- offsets_ = offsets;
-#ifdef ___SYNC_ENABLED___
- syncs_ = syncs;
- sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
-#endif
- offsets_by_id_ = offsets_by_id;
- scores_ = scores;
- ids_ = ids;
-#ifdef ___PREDICT_ENABLED___
- predicts_ = predicts;
-#endif
- lemma_count_left_ = kUserDictPreAlloc;
- lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
- memcpy(&dict_info_, &dict_info, sizeof(dict_info));
- state_ = USER_DICT_SYNC;
-
- fclose(fp);
-
- pthread_mutex_unlock(&g_mutex_);
- return true;
-
- error:
- if (lemmas) free(lemmas);
- if (offsets) free(offsets);
-#ifdef ___SYNC_ENABLED___
- if (syncs) free(syncs);
-#endif
- if (scores) free(scores);
- if (ids) free(ids);
- if (offsets_by_id) free(offsets_by_id);
-#ifdef ___PREDICT_ENABLED___
- if (predicts) free(predicts);
-#endif
- fclose(fp);
- pthread_mutex_unlock(&g_mutex_);
- return false;
-}
-
-void UserDict::write_back() {
- // XXX Write-back is only allowed from close_dict, for thread-safety reasons.
- if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
- return;
- int fd = open(dict_file_, O_WRONLY);
- if (fd == -1)
- return;
- switch (state_) {
- case USER_DICT_DEFRAGMENTED:
- write_back_all(fd);
- break;
- case USER_DICT_LEMMA_DIRTY:
- write_back_lemma(fd);
- break;
- case USER_DICT_OFFSET_DIRTY:
- write_back_offset(fd);
- break;
- case USER_DICT_SCORE_DIRTY:
- write_back_score(fd);
- break;
-#ifdef ___SYNC_ENABLED___
- case USER_DICT_SYNC_DIRTY:
- write_back_sync(fd);
- break;
-#endif
- default:
- break;
- }
- // Truncation does not seem to be needed on Linux or Windows, only on Mac,
- // but it is done here anyway for safety.
- off_t cur = lseek(fd, 0, SEEK_CUR);
-#ifndef _WIN32
- ftruncate(fd, cur);
-#endif
- close(fd);
- state_ = USER_DICT_SYNC;
-}
-
-#ifdef ___SYNC_ENABLED___
-void UserDict::write_back_sync(int fd) {
- int err = lseek(fd, 4 + dict_info_.lemma_size
- + (dict_info_.lemma_count << 3)
-#ifdef ___PREDICT_ENABLED___
- + (dict_info_.lemma_count << 2)
-#endif
- , SEEK_SET);
- if (err == -1)
- return;
- write(fd, syncs_, dict_info_.sync_count << 2);
- write(fd, &dict_info_, sizeof(dict_info_));
-}
-#endif
-
-void UserDict::write_back_offset(int fd) {
- int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
- if (err == -1)
- return;
- write(fd, offsets_, dict_info_.lemma_count << 2);
-#ifdef ___PREDICT_ENABLED___
- write(fd, predicts_, dict_info_.lemma_count << 2);
-#endif
- write(fd, scores_, dict_info_.lemma_count << 2);
-#ifdef ___SYNC_ENABLED___
- write(fd, syncs_, dict_info_.sync_count << 2);
-#endif
- write(fd, &dict_info_, sizeof(dict_info_));
-}
-
-void UserDict::write_back_score(int fd) {
- int err = lseek(fd, 4 + dict_info_.lemma_size
- + (dict_info_.lemma_count << 2)
-#ifdef ___PREDICT_ENABLED___
- + (dict_info_.lemma_count << 2)
-#endif
- , SEEK_SET);
- if (err == -1)
- return;
- write(fd, scores_, dict_info_.lemma_count << 2);
-#ifdef ___SYNC_ENABLED___
- write(fd, syncs_, dict_info_.sync_count << 2);
-#endif
- write(fd, &dict_info_, sizeof(dict_info_));
-}
-
-void UserDict::write_back_lemma(int fd) {
- int err = lseek(fd, 4, SEEK_SET);
- if (err == -1)
- return;
- // New lemmas are always appended, no need to write whole lemma block
- size_t need_write = kUserDictPreAlloc *
- (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
- err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
- if (err == -1)
- return;
- write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
-
- write(fd, offsets_, dict_info_.lemma_count << 2);
-#ifdef ___PREDICT_ENABLED___
- write(fd, predicts_, dict_info_.lemma_count << 2);
-#endif
- write(fd, scores_, dict_info_.lemma_count << 2);
-#ifdef ___SYNC_ENABLED___
- write(fd, syncs_, dict_info_.sync_count << 2);
-#endif
- write(fd, &dict_info_, sizeof(dict_info_));
-}
-
-void UserDict::write_back_all(int fd) {
- // XXX lemma_size is handled differently in write_back_all and
- // write_back_lemma: lemma_size and lemma_count are updated in different
- // places for these two cases. This should be made consistent.
- int err = lseek(fd, 4, SEEK_SET);
- if (err == -1)
- return;
- write(fd, lemmas_, dict_info_.lemma_size);
- write(fd, offsets_, dict_info_.lemma_count << 2);
-#ifdef ___PREDICT_ENABLED___
- write(fd, predicts_, dict_info_.lemma_count << 2);
-#endif
- write(fd, scores_, dict_info_.lemma_count << 2);
-#ifdef ___SYNC_ENABLED___
- write(fd, syncs_, dict_info_.sync_count << 2);
-#endif
- write(fd, &dict_info_, sizeof(dict_info_));
-}
-
-#ifdef ___CACHE_ENABLED___
-bool UserDict::load_cache(UserDictSearchable *searchable,
- uint32 *offset, uint32 *length) {
- UserDictCache *cache = &caches_[searchable->splids_len - 1];
- if (cache->head == cache->tail)
- return false;
-
- uint16 j, sig_len = kMaxLemmaSize / 4;
- uint16 i = cache->head;
- while (1) {
- j = 0;
- for (; j < sig_len; j++) {
- if (cache->signatures[i][j] != searchable->signature[j])
- break;
- }
- if (j < sig_len) {
- i++;
- if (i >= kUserDictCacheSize)
- i -= kUserDictCacheSize;
- if (i == cache->tail)
- break;
- continue;
- }
- *offset = cache->offsets[i];
- *length = cache->lengths[i];
- return true;
- }
- return false;
-}
-
-void UserDict::save_cache(UserDictSearchable *searchable,
- uint32 offset, uint32 length) {
- UserDictCache *cache = &caches_[searchable->splids_len - 1];
- uint16 next = cache->tail;
-
- cache->offsets[next] = offset;
- cache->lengths[next] = length;
- uint16 sig_len = kMaxLemmaSize / 4;
- uint16 j = 0;
- for (; j < sig_len; j++) {
- cache->signatures[next][j] = searchable->signature[j];
- }
-
- if (++next >= kUserDictCacheSize) {
- next -= kUserDictCacheSize;
- }
- if (next == cache->head) {
- cache->head++;
- if (cache->head >= kUserDictCacheSize) {
- cache->head -= kUserDictCacheSize;
- }
- }
- cache->tail = next;
-}
-
-void UserDict::reset_cache() {
- memset(caches_, 0, sizeof(caches_));
-}
-
-bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
- UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
- if (cache->head == cache->tail)
- return false;
-
- uint16 j, sig_len = kMaxLemmaSize / 4;
- uint16 i = cache->head;
- while (1) {
- j = 0;
- for (; j < sig_len; j++) {
- if (cache->signatures[i][j] != searchable->signature[j])
- break;
- }
- if (j < sig_len) {
- i++;
- if (i >= kUserDictMissCacheSize)
- i -= kUserDictMissCacheSize;
- if (i == cache->tail)
- break;
- continue;
- }
- return true;
- }
- return false;
-}
-
-void UserDict::save_miss_cache(UserDictSearchable *searchable) {
- UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
- uint16 next = cache->tail;
-
- uint16 sig_len = kMaxLemmaSize / 4;
- uint16 j = 0;
- for (; j < sig_len; j++) {
- cache->signatures[next][j] = searchable->signature[j];
- }
-
- if (++next >= kUserDictMissCacheSize) {
- next -= kUserDictMissCacheSize;
- }
- if (next == cache->head) {
- cache->head++;
- if (cache->head >= kUserDictMissCacheSize) {
- cache->head -= kUserDictMissCacheSize;
- }
- }
- cache->tail = next;
-}
-
-void UserDict::reset_miss_cache() {
- memset(miss_caches_, 0, sizeof(miss_caches_));
-}
-
-void UserDict::cache_init() {
- reset_cache();
- reset_miss_cache();
-}
-
-bool UserDict::cache_hit(UserDictSearchable *searchable,
- uint32 *offset, uint32 *length) {
- bool hit = load_miss_cache(searchable);
- if (hit) {
- *offset = 0;
- *length = 0;
- return true;
- }
- hit = load_cache(searchable, offset, length);
- if (hit) {
- return true;
- }
- return false;
-}
-
-void UserDict::cache_push(UserDictCacheType type,
- UserDictSearchable *searchable,
- uint32 offset, uint32 length) {
- switch (type) {
- case USER_DICT_MISS_CACHE:
- save_miss_cache(searchable);
- break;
- case USER_DICT_CACHE:
- save_cache(searchable, offset, length);
- break;
- default:
- break;
- }
-}
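-
-// Illustrative sketch, not part of the original sources: each cache above is
-// a small ring buffer (one per spelling-id count) keyed by a fixed-size
-// signature of the spelling ids. The head/tail discipline in save_cache and
-// save_miss_cache is the usual "overwrite the oldest entry when full"
-// scheme; reduced to its core (hypothetical type and names) it is:
-struct TinyRingCache {
-  uint16 head, tail;                       // empty when head == tail
-  uint32 values[kUserDictCacheSize];
-  TinyRingCache() : head(0), tail(0) {}    // the real caches are memset to 0
-  void push(uint32 value) {
-    values[tail] = value;
-    uint16 next = tail + 1;
-    if (next >= kUserDictCacheSize)
-      next -= kUserDictCacheSize;
-    if (next == head) {                    // full: drop the oldest entry
-      head++;
-      if (head >= kUserDictCacheSize)
-        head -= kUserDictCacheSize;
-    }
-    tail = next;
-  }
-};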
-
-#endif
-
-void UserDict::defragment(void) {
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_BEGIN;
-#endif
- if (is_valid_state() == false)
- return;
- // Fix up offsets_, copying the REMOVE flag into the lemma's own flag byte where needed
- size_t first_freed = 0;
- size_t first_inuse = 0;
- while (first_freed < dict_info_.lemma_count) {
- // Find first freed offset
- while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
- first_freed < dict_info_.lemma_count) {
- first_freed++;
- }
- if (first_freed < dict_info_.lemma_count) {
- // Save REMOVE flag to lemma flag
- int off = offsets_[first_freed];
- set_lemma_flag(off, kUserDictLemmaFlagRemove);
- } else {
- break;
- }
- // Find the first in-use offset after first_freed
- first_inuse = first_freed + 1;
- while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
- (first_inuse < dict_info_.lemma_count)) {
- // Save REMOVE flag to lemma flag
- int off = offsets_[first_inuse];
- set_lemma_flag(off, kUserDictLemmaFlagRemove);
- first_inuse++;
- }
- if (first_inuse >= dict_info_.lemma_count) {
- break;
- }
- // Swap offsets_
- int tmp = offsets_[first_inuse];
- offsets_[first_inuse] = offsets_[first_freed];
- offsets_[first_freed] = tmp;
- // Move scores_, no need to swap
- tmp = scores_[first_inuse];
- scores_[first_inuse] = scores_[first_freed];
- scores_[first_freed] = tmp;
- // Swap ids_
- LemmaIdType tmpid = ids_[first_inuse];
- ids_[first_inuse] = ids_[first_freed];
- ids_[first_freed] = tmpid;
- // Go on
- first_freed++;
- }
-#ifdef ___PREDICT_ENABLED___
- // Fixup predicts_
- first_freed = 0;
- first_inuse = 0;
- while (first_freed < dict_info_.lemma_count) {
- // Find first freed offset
- while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
- first_freed < dict_info_.lemma_count) {
- first_freed++;
- }
- if (first_freed >= dict_info_.lemma_count)
- break;
- // Find the first in-use offset after first_freed
- first_inuse = first_freed + 1;
- while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
- && (first_inuse < dict_info_.lemma_count)) {
- first_inuse++;
- }
- if (first_inuse >= dict_info_.lemma_count) {
- break;
- }
- // Swap offsets_
- int tmp = predicts_[first_inuse];
- predicts_[first_inuse] = predicts_[first_freed];
- predicts_[first_freed] = tmp;
- // Go on
- first_freed++;
- }
-#endif
- dict_info_.lemma_count = first_freed;
- // Fixup lemmas_
- size_t begin = 0;
- size_t end = 0;
- size_t dst = 0;
- int total_size = dict_info_.lemma_size + lemma_size_left_;
- int total_count = dict_info_.lemma_count + lemma_count_left_;
- size_t real_size = total_size - lemma_size_left_;
- while (dst < real_size) {
- unsigned char flag = get_lemma_flag(dst);
- unsigned char nchr = get_lemma_nchar(dst);
- if ((flag & kUserDictLemmaFlagRemove) == 0) {
- dst += nchr * 4 + 2;
- continue;
- }
- break;
- }
- if (dst >= real_size)
- return;
-
- end = dst;
- while (end < real_size) {
- begin = end + get_lemma_nchar(end) * 4 + 2;
- repeat:
- // No more in-use lemmas beyond this point
- if (begin >= real_size)
- break;
- unsigned char flag = get_lemma_flag(begin);
- unsigned char nchr = get_lemma_nchar(begin);
- if (flag & kUserDictLemmaFlagRemove) {
- begin += nchr * 4 + 2;
- goto repeat;
- }
- end = begin + nchr * 4 + 2;
- while (end < real_size) {
- unsigned char eflag = get_lemma_flag(end);
- unsigned char enchr = get_lemma_nchar(end);
- if ((eflag & kUserDictLemmaFlagRemove) == 0) {
- end += enchr * 4 + 2;
- continue;
- }
- break;
- }
- memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
- for (size_t j = 0; j < dict_info_.lemma_count; j++) {
- if (offsets_[j] >= begin && offsets_[j] < end) {
- offsets_[j] -= (begin - dst);
- offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
- }
-#ifdef ___PREDICT_ENABLED___
- if (predicts_[j] >= begin && predicts_[j] < end) {
- predicts_[j] -= (begin - dst);
- }
-#endif
- }
-#ifdef ___SYNC_ENABLED___
- for (size_t j = 0; j < dict_info_.sync_count; j++) {
- if (syncs_[j] >= begin && syncs_[j] < end) {
- syncs_[j] -= (begin - dst);
- }
- }
-#endif
- dst += (end - begin);
- }
-
- dict_info_.free_count = 0;
- dict_info_.free_size = 0;
- dict_info_.lemma_size = dst;
- lemma_size_left_ = total_size - dict_info_.lemma_size;
- lemma_count_left_ = total_count - dict_info_.lemma_count;
-
- // XXX Without the following code, offsets_by_id_ is not reordered;
- // that is, the ids of removed lemmas are never reclaimed. There may
- // then be no room to add new lemmas because of offsets_by_id_, even
- // though lemma_size_left_ has been fixed up. By default we want
- // defragmentation to be as fast as possible, because while it runs
- // other peers cannot write new lemmas to the user dictionary file.
- // XXX If write-back is invoked immediately after this defragmentation,
- // there is no need to fix up the following in-memory data.
- for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
- ids_[i] = start_id_ + i;
- offsets_by_id_[i] = offsets_[i];
- }
-
- state_ = USER_DICT_DEFRAGMENTED;
-
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_END;
- LOGD_PERF("defragment");
-#endif
-}
-
-#ifdef ___SYNC_ENABLED___
-void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
- if (is_valid_state() == false)
- return;
- if (end > dict_info_.sync_count)
- end = dict_info_.sync_count;
- memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
- dict_info_.sync_count -= (end - start);
- if (state_ < USER_DICT_SYNC_DIRTY)
- state_ = USER_DICT_SYNC_DIRTY;
-}
-
-int UserDict::get_sync_count() {
- if (is_valid_state() == false)
- return 0;
- return dict_info_.sync_count;
-}
-
-LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count, uint64 lmt) {
- int again = 0;
- begin:
- LemmaIdType id;
- uint32 * syncs_bak = syncs_;
- syncs_ = NULL;
- id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
- syncs_ = syncs_bak;
- if (id == 0 && again == 0) {
- if ((dict_info_.limit_lemma_count > 0 &&
- dict_info_.lemma_count >= dict_info_.limit_lemma_count)
- || (dict_info_.limit_lemma_size > 0 &&
- dict_info_.lemma_size + (2 + (lemma_len << 2))
- > dict_info_.limit_lemma_size)) {
- // XXX Always reclaim and defragment in the sync code path; the sync
- // thread runs in the background and can afford the heavy work.
- reclaim();
- defragment();
- flush_cache();
- again = 1;
- goto begin;
- }
- }
- return id;
-}
-
-int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
- int newly_added = 0;
-
- SpellingParser * spl_parser = new SpellingParser();
- if (!spl_parser) {
- return 0;
- }
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_BEGIN;
-#endif
- char16 *ptr = lemmas;
-
- // Extract pinyin, words, frequency and last_mod_time
- char16 * p = ptr, * py16 = ptr;
- char16 * hz16 = NULL;
- int py16_len = 0;
- uint16 splid[kMaxLemmaSize];
- int splid_len = 0;
- int hz16_len = 0;
- char16 * fr16 = NULL;
- int fr16_len = 0;
-
- while (p - ptr < len) {
- // Pinyin
- py16 = p;
- splid_len = 0;
- while (*p != 0x2c && (p - ptr) < len) {
- if (*p == 0x20)
- splid_len++;
- p++;
- }
- splid_len++;
- if (p - ptr == len)
- break;
- py16_len = p - py16;
- if (kMaxLemmaSize < splid_len) {
- break;
- }
- bool is_pre;
- int splidl = spl_parser->splstr16_to_idxs_f(
- py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
- if (splidl != splid_len)
- break;
- // Phrase
- hz16 = ++p;
- while (*p != 0x2c && (p - ptr) < len) {
- p++;
- }
- hz16_len = p - hz16;
- if (hz16_len != splid_len)
- break;
- // Frequency
- fr16 = ++p;
- fr16_len = 0;
- while (*p != 0x2c && (p - ptr) < len) {
- p++;
- }
- fr16_len = p - fr16;
- uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
- // Last modified time
- fr16 = ++p;
- fr16_len = 0;
- while (*p != 0x3b && (p - ptr) < len) {
- p++;
- }
- fr16_len = p - fr16;
- uint64 last_mod = utf16le_atoll(fr16, fr16_len);
-
- put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
- newly_added++;
-
- p++;
- }
-
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_END;
- LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
-#endif
- delete spl_parser;
- return newly_added;
-}
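-
-// For reference (inferred from the parser above, not from any original
-// documentation): each record in the incoming UTF-16LE string has the form
-//
-//   <pinyin syllables separated by ' '>,<phrase>,<frequency>,<last_mod_time>;
-//
-// e.g. a two-record input reads like "ni hao,你好,123,99;ma,吗,45,100;".
-// A minimal, hypothetical call would be:
-//
-//   char16 buf[] = { 'n', 'i', ' ', 'h', 'a', 'o', 0x2c, 0x4f60, 0x597d,
-//                    0x2c, '1', '2', '3', 0x2c, '9', '9', 0x3b };
-//   int added = user_dict->put_lemmas_no_sync_from_utf16le_string(buf, 17);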
-
-int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
- char16 * str, int size, int * count) {
- int len = 0;
- *count = 0;
-
- int left_len = size;
-
- if (is_valid_state() == false)
- return len;
-
- SpellingTrie * spl_trie = &SpellingTrie::get_instance();
- if (!spl_trie) {
- return 0;
- }
-
- uint32 i;
- for (i = 0; i < dict_info_.sync_count; i++) {
- int offset = syncs_[i];
- uint32 nchar = get_lemma_nchar(offset);
- uint16 *spl = get_lemma_spell_ids(offset);
- uint16 *wrd = get_lemma_word(offset);
- int score = _get_lemma_score(wrd, spl, nchar);
-
- static char score_temp[32], *pscore_temp = score_temp;
- static char16 temp[256], *ptemp = temp;
-
- pscore_temp = score_temp;
- ptemp = temp;
-
- uint32 j;
- // Add pinyin
- for (j = 0; j < nchar; j++) {
- int ret_len = spl_trie->get_spelling_str16(
- spl[j], ptemp, temp + sizeof(temp) - ptemp);
- if (ret_len <= 0)
- break;
- ptemp += ret_len;
- if (ptemp < temp + sizeof(temp) - 1) {
- *(ptemp++) = ' ';
- } else {
- j = 0;
- break;
- }
- }
- if (j < nchar) {
- continue;
- }
- ptemp--;
- if (ptemp < temp + sizeof(temp) - 1) {
- *(ptemp++) = ',';
- } else {
- continue;
- }
- // Add phrase
- for (j = 0; j < nchar; j++) {
- if (ptemp < temp + sizeof(temp) - 1) {
- *(ptemp++) = wrd[j];
- } else {
- break;
- }
- }
- if (j < nchar) {
- continue;
- }
- if (ptemp < temp + sizeof(temp) - 1) {
- *(ptemp++) = ',';
- } else {
- continue;
- }
- // Add frequency
- uint32 intf = extract_score_freq(score);
- int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
- if (ret_len <= 0)
- continue;
- ptemp += ret_len;
- if (ptemp < temp + sizeof(temp) - 1) {
- *(ptemp++) = ',';
- } else {
- continue;
- }
- // Add last modified time
- uint64 last_mod = extract_score_lmt(score);
- ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
- if (ret_len <= 0)
- continue;
- ptemp += ret_len;
- if (ptemp < temp + sizeof(temp) - 1) {
- *(ptemp++) = ';';
- } else {
- continue;
- }
-
- // Write to string
- int need_len = ptemp - temp;
- if (need_len > left_len)
- break;
- memcpy(str + len, temp, need_len * 2);
- left_len -= need_len;
-
- len += need_len;
- (*count)++;
- }
-
- if (len > 0) {
- if (state_ < USER_DICT_SYNC_DIRTY)
- state_ = USER_DICT_SYNC_DIRTY;
- }
- return len;
-}
-
-#endif
-
-bool UserDict::state(UserDictStat * stat) {
- if (is_valid_state() == false)
- return false;
- if (!stat)
- return false;
- stat->version = version_;
- stat->file_name = dict_file_;
- stat->load_time.tv_sec = load_time_.tv_sec;
- stat->load_time.tv_usec = load_time_.tv_usec;
- pthread_mutex_lock(&g_mutex_);
- stat->last_update.tv_sec = g_last_update_.tv_sec;
- stat->last_update.tv_usec = g_last_update_.tv_usec;
- pthread_mutex_unlock(&g_mutex_);
- stat->disk_size = get_dict_file_size(&dict_info_);
- stat->lemma_count = dict_info_.lemma_count;
- stat->lemma_size = dict_info_.lemma_size;
- stat->delete_count = dict_info_.free_count;
- stat->delete_size = dict_info_.free_size;
-#ifdef ___SYNC_ENABLED___
- stat->sync_count = dict_info_.sync_count;
-#endif
- stat->limit_lemma_count = dict_info_.limit_lemma_count;
- stat->limit_lemma_size = dict_info_.limit_lemma_size;
- stat->reclaim_ratio = dict_info_.reclaim_ratio;
- return true;
-}
-
-void UserDict::set_limit(uint32 max_lemma_count,
- uint32 max_lemma_size, uint32 reclaim_ratio) {
- dict_info_.limit_lemma_count = max_lemma_count;
- dict_info_.limit_lemma_size = max_lemma_size;
- if (reclaim_ratio > 100)
- reclaim_ratio = 100;
- dict_info_.reclaim_ratio = reclaim_ratio;
-}
-
-void UserDict::reclaim() {
- if (is_valid_state() == false)
- return;
-
- switch (dict_info_.reclaim_ratio) {
- case 0:
- return;
- case 100:
- // TODO: CLEAR to be implemented
- assert(false);
- return;
- default:
- break;
- }
-
- // XXX Reclaim is only based on count, not size
- uint32 count = dict_info_.lemma_count;
- int rc = count * dict_info_.reclaim_ratio / 100;
-
- UserDictScoreOffsetPair * score_offset_pairs = NULL;
- score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
- sizeof(UserDictScoreOffsetPair) * rc);
- if (score_offset_pairs == NULL) {
- return;
- }
-
- for (int i = 0; i < rc; i++) {
- int s = scores_[i];
- score_offset_pairs[i].score = s;
- score_offset_pairs[i].offset_index = i;
- }
-
- for (int i = (rc + 1) / 2; i >= 0; i--)
- shift_down(score_offset_pairs, i, rc);
-
- for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
- int s = scores_[i];
- if (s < score_offset_pairs[0].score) {
- score_offset_pairs[0].score = s;
- score_offset_pairs[0].offset_index = i;
- shift_down(score_offset_pairs, 0, rc);
- }
- }
-
- for (int i = 0; i < rc; i++) {
- int off = score_offset_pairs[i].offset_index;
- remove_lemma_by_offset_index(off);
- }
- if (rc > 0) {
- if (state_ < USER_DICT_OFFSET_DIRTY)
- state_ = USER_DICT_OFFSET_DIRTY;
- }
-
- free(score_offset_pairs);
-}
-
-inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
- int s = sop[i].score;
- int p = sop[i].offset_index;
- sop[i].score = sop[j].score;
- sop[i].offset_index = sop[j].offset_index;
- sop[j].score = s;
- sop[j].offset_index = p;
-}
-
-void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
- int par = i;
- while (par < n) {
- int left = par * 2 + 1;
- int right = left + 1;
- if (left >= n && right >= n)
- break;
- if (right >= n) {
- if (sop[left].score > sop[par].score) {
- swap(sop, left, par);
- par = left;
- continue;
- }
- } else if (sop[left].score > sop[right].score &&
- sop[left].score > sop[par].score) {
- swap(sop, left, par);
- par = left;
- continue;
- } else if (sop[right].score > sop[left].score &&
- sop[right].score > sop[par].score) {
- swap(sop, right, par);
- par = right;
- continue;
- }
- break;
- }
-}
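-
-// Illustrative note, not part of the original sources: reclaim() keeps the
-// rc lowest-scoring lemmas in score_offset_pairs as a max-heap of size rc
-// (shift_down restores the heap property). While scanning the remaining
-// scores, any score smaller than the heap root replaces the root, so after
-// the scan the heap holds the rc smallest scores and those lemmas are
-// removed. For example, with scores {7, 3, 9, 1, 5} and rc = 2, the heap
-// starts as {7, 3}; 9 is skipped, 1 replaces 7, 5 is skipped, and the
-// lemmas scoring 3 and 1 are removed.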
-
-LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count) {
- return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
-}
-
-LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count, uint64 lmt) {
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_BEGIN;
-#endif
- if (is_valid_state() == false)
- return 0;
- int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
- if (off != -1) {
- int delta_score = count - scores_[off];
- dict_info_.total_nfreq += delta_score;
- scores_[off] = build_score(lmt, count);
- if (state_ < USER_DICT_SCORE_DIRTY)
- state_ = USER_DICT_SCORE_DIRTY;
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_END;
- LOGD_PERF("_put_lemma(update)");
-#endif
- return ids_[off];
- } else {
- if ((dict_info_.limit_lemma_count > 0 &&
- dict_info_.lemma_count >= dict_info_.limit_lemma_count)
- || (dict_info_.limit_lemma_size > 0 &&
- dict_info_.lemma_size + (2 + (lemma_len << 2))
- > dict_info_.limit_lemma_size)) {
- // XXX Don't defragment here, it's too time-consuming.
- return 0;
- }
- int flushed = 0;
- if (lemma_count_left_ == 0 ||
- lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
-
- // XXX When there is no space for a new lemma, flush to disk.
- // flush_cache() may also be called by the upper layer; a better
- // place than here should be found for this call.
- flush_cache();
- flushed = 1;
- // Or simply return and do nothing
- // return 0;
- }
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_END;
- LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
-#endif
- LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
-#ifdef ___SYNC_ENABLED___
- if (syncs_ && id != 0) {
- queue_lemma_for_sync(id);
- }
-#endif
- return id;
- }
- return 0;
-}
-
-#ifdef ___SYNC_ENABLED___
-void UserDict::queue_lemma_for_sync(LemmaIdType id) {
- if (dict_info_.sync_count < sync_count_size_) {
- syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
- } else {
- uint32 * syncs = (uint32*)realloc(
- syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
- if (syncs) {
- sync_count_size_ += kUserDictPreAlloc;
- syncs_ = syncs;
- syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
- }
- }
-}
-#endif
-
-LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
- bool selected) {
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_BEGIN;
-#endif
- if (is_valid_state() == false)
- return 0;
- if (is_valid_lemma_id(lemma_id) == false)
- return 0;
- uint32 offset = offsets_by_id_[lemma_id - start_id_];
- uint8 lemma_len = get_lemma_nchar(offset);
- char16 * lemma_str = get_lemma_word(offset);
- uint16 * splids = get_lemma_spell_ids(offset);
-
- int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
- if (off != -1) {
- int score = scores_[off];
- int count = extract_score_freq(score);
- uint64 lmt = extract_score_lmt(score);
- if (count + delta_count > kUserDictMaxFrequency ||
- count + delta_count < count) {
- delta_count = kUserDictMaxFrequency - count;
- }
- count += delta_count;
- dict_info_.total_nfreq += delta_count;
- if (selected) {
- lmt = time(NULL);
- }
- scores_[off] = build_score(lmt, count);
- if (state_ < USER_DICT_SCORE_DIRTY)
- state_ = USER_DICT_SCORE_DIRTY;
-#ifdef ___DEBUG_PERF___
- DEBUG_PERF_END;
- LOGD_PERF("update_lemma");
-#endif
-#ifdef ___SYNC_ENABLED___
- queue_lemma_for_sync(ids_[off]);
-#endif
- return ids_[off];
- }
- return 0;
-}
-
-size_t UserDict::get_total_lemma_count() {
- return dict_info_.total_nfreq;
-}
-
-void UserDict::set_total_lemma_count_of_others(size_t count) {
- total_other_nfreq_ = count;
-}
-
-LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
- uint16 lemma_len, uint16 count, uint64 lmt) {
- LemmaIdType id = get_max_lemma_id() + 1;
- size_t offset = dict_info_.lemma_size;
- if (offset > kUserDictOffsetMask)
- return 0;
-
- lemmas_[offset] = 0;
- lemmas_[offset + 1] = (uint8)lemma_len;
- for (size_t i = 0; i < lemma_len; i++) {
- *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
- *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
- = lemma_str[i];
- }
- uint32 off = dict_info_.lemma_count;
- offsets_[off] = offset;
- scores_[off] = build_score(lmt, count);
- ids_[off] = id;
-#ifdef ___PREDICT_ENABLED___
- predicts_[off] = offset;
-#endif
-
- offsets_by_id_[id - start_id_] = offset;
-
- dict_info_.lemma_count++;
- dict_info_.lemma_size += (2 + (lemma_len << 2));
- lemma_count_left_--;
- lemma_size_left_ -= (2 + (lemma_len << 2));
-
- // Keep offsets_ (and the parallel scores_/ids_ arrays) sorted by spelling ids
-
- UserDictSearchable searchable;
- prepare_locate(&searchable, splids, lemma_len);
-
- size_t i = 0;
- while (i < off) {
- offset = offsets_[i];
- uint32 nchar = get_lemma_nchar(offset);
- uint16 * spl = get_lemma_spell_ids(offset);
-
- if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
- break;
- i++;
- }
- if (i != off) {
- uint32 temp = offsets_[off];
- memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
- offsets_[i] = temp;
-
- temp = scores_[off];
- memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
- scores_[i] = temp;
-
- temp = ids_[off];
- memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
- ids_[i] = temp;
- }
-
-#ifdef ___PREDICT_ENABLED___
- uint32 j = 0;
- uint16 * words_new = get_lemma_word(predicts_[off]);
- j = locate_where_to_insert_in_predicts(words_new, lemma_len);
- if (j != off) {
- uint32 temp = predicts_[off];
- memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
- predicts_[j] = temp;
- }
-#endif
-
- if (state_ < USER_DICT_LEMMA_DIRTY)
- state_ = USER_DICT_LEMMA_DIRTY;
-
-#ifdef ___CACHE_ENABLED___
- cache_init();
-#endif
-
- dict_info_.total_nfreq += count;
- return id;
-}
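-
-// Illustrative sketch, not part of the original sources: append_a_lemma
-// above lays each lemma record out in lemmas_ as
-//
-//   [flag: 1 byte][nchar: 1 byte][spell ids: nchar * 2 bytes][word: nchar * 2 bytes]
-//
-// i.e. 2 + (nchar << 2) bytes, which is the record size used throughout this
-// file. The accessors used above but defined elsewhere are presumably
-// equivalent to:
-//
-//   uint8   flag  = lemmas_[offset];                                 // get_lemma_flag
-//   uint8   nchar = lemmas_[offset + 1];                             // get_lemma_nchar
-//   uint16 *spl   = (uint16 *)(lemmas_ + offset + 2);                // get_lemma_spell_ids
-//   char16 *word  = (char16 *)(lemmas_ + offset + 2 + (nchar << 1)); // get_lemma_word
-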
-}
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/utf16char.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/utf16char.cpp
deleted file mode 100644
index fadb6cf2..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/utf16char.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include "../include/utf16char.h"
-
-namespace ime_pinyin {
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- char16* utf16_strtok(char16 *utf16_str, size_t *token_size,
- char16 **utf16_str_next) {
- if (NULL == utf16_str || NULL == token_size || NULL == utf16_str_next) {
- return NULL;
- }
-
- // Skip leading separators (spaces, newlines, tabs)
- size_t pos = 0;
- while ((char16)' ' == utf16_str[pos] || (char16)'\n' == utf16_str[pos]
- || (char16)'\t' == utf16_str[pos])
- pos++;
-
- utf16_str += pos;
- pos = 0;
-
- while ((char16)'\0' != utf16_str[pos] && (char16)' ' != utf16_str[pos]
- && (char16)'\n' != utf16_str[pos]
- && (char16)'\t' != utf16_str[pos]) {
- pos++;
- }
-
- char16 *ret_val = utf16_str;
- if ((char16)'\0' == utf16_str[pos]) {
- *utf16_str_next = NULL;
- if (0 == pos)
- return NULL;
- } else {
- *utf16_str_next = utf16_str + pos + 1;
- }
-
- utf16_str[pos] = (char16)'\0';
- *token_size = pos;
-
- return ret_val;
- }
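-
- // Illustrative usage sketch, not from the original sources: utf16_strtok
- // destructively splits on spaces, tabs and newlines, returning one token
- // per call together with the position to continue from, e.g.:
- //
- //   char16 line[] = { 'a', ' ', 'b', 'c', 0 };
- //   size_t token_size;
- //   char16 *next = line;
- //   char16 *tok;
- //   while ((tok = utf16_strtok(next, &token_size, &next)) != NULL) {
- //     // 1st call: tok = "a"  (token_size == 1)
- //     // 2nd call: tok = "bc" (token_size == 2), next becomes NULL
- //   }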
-
- int utf16_atoi(const char16 *utf16_str) {
- if (NULL == utf16_str)
- return 0;
-
- int value = 0;
- int sign = 1;
- size_t pos = 0;
-
- if ((char16)'-' == utf16_str[pos]) {
- sign = -1;
- pos++;
- }
-
- while ((char16)'0' <= utf16_str[pos] &&
- (char16)'9' >= utf16_str[pos]) {
- value = value * 10 + static_cast<int>(utf16_str[pos] - (char16)'0');
- pos++;
- }
-
- return value*sign;
- }
-
- float utf16_atof(const char16 *utf16_str) {
- // A temporary implementation.
- char char8[256];
- if (utf16_strlen(utf16_str) >= 256) return 0;
-
- utf16_strcpy_tochar(char8, utf16_str);
- return atof(char8);
- }
-
- size_t utf16_strlen(const char16 *utf16_str) {
- if (NULL == utf16_str)
- return 0;
-
- size_t size = 0;
- while ((char16)'\0' != utf16_str[size])
- size++;
- return size;
- }
-
- int utf16_strcmp(const char16* str1, const char16* str2) {
- size_t pos = 0;
- while (str1[pos] == str2[pos] && (char16)'\0' != str1[pos])
- pos++;
-
- return static_cast<int>(str1[pos]) - static_cast<int>(str2[pos]);
- }
-
- int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size) {
- size_t pos = 0;
- while (pos < size && str1[pos] == str2[pos] && (char16)'\0' != str1[pos])
- pos++;
-
- if (pos == size)
- return 0;
-
- return static_cast<int>(str1[pos]) - static_cast<int>(str2[pos]);
- }
-
- // We do not handle overlapping buffers.
- char16* utf16_strcpy(char16 *dst, const char16 *src) {
- if (NULL == src || NULL == dst)
- return NULL;
-
- char16* cp = dst;
-
- while ((char16)'\0' != *src) {
- *cp = *src;
- cp++;
- src++;
- }
-
- *cp = *src;
-
- return dst;
- }
-
- char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size) {
- if (NULL == src || NULL == dst || 0 == size)
- return NULL;
-
- if (src == dst)
- return dst;
-
- char16* cp = dst;
-
- if (dst < src || (dst > src && dst >= src + size)) {
- // dst starts before src, or the ranges do not overlap: copy forwards.
- while (size-- && (*cp++ = *src++))
- ;
- } else {
- // dst overlaps the tail of src: copy backwards so that source code
- // units are not overwritten before they are read.
- cp += size - 1;
- src += size - 1;
- while (size-- && (*cp-- = *src--))
- ;
- }
- return dst;
- }
-
- // We do not handle complicated cases such as overlapping buffers,
- // because they are not needed in this codebase.
- char* utf16_strcpy_tochar(char *dst, const char16 *src) {
- if (NULL == src || NULL == dst)
- return NULL;
-
- char* cp = dst;
-
- while ((char16)'\0' != *src) {
- *cp = static_cast<char>(*src);
- cp++;
- src++;
- }
- *cp = *src;
-
- return dst;
- }
-
-#ifdef __cplusplus
-}
-#endif
-} // namespace ime_pinyin
diff --git a/src/virtualkeyboard/3rdparty/pinyin/share/utf16reader.cpp b/src/virtualkeyboard/3rdparty/pinyin/share/utf16reader.cpp
deleted file mode 100644
index d8e5de59..00000000
--- a/src/virtualkeyboard/3rdparty/pinyin/share/utf16reader.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/utf16reader.h"
-
-namespace ime_pinyin {
-
-#define MIN_BUF_LEN 128
-#define MAX_BUF_LEN 65535
-
-Utf16Reader::Utf16Reader() {
- fp_ = NULL;
- buffer_ = NULL;
- buffer_total_len_ = 0;
- buffer_next_pos_ = 0;
- buffer_valid_len_ = 0;
-}
-
-Utf16Reader::~Utf16Reader() {
- if (NULL != fp_)
- fclose(fp_);
-
- if (NULL != buffer_)
- delete [] buffer_;
-}
-
-
-bool Utf16Reader::open(const char* filename, size_t buffer_len) {
- if (filename == NULL)
- return false;
-
- if (buffer_len < MIN_BUF_LEN)
- buffer_len = MIN_BUF_LEN;
- else if (buffer_len > MAX_BUF_LEN)
- buffer_len = MAX_BUF_LEN;
-
- buffer_total_len_ = buffer_len;
-
- if (NULL != buffer_)
- delete [] buffer_;
- buffer_ = new char16[buffer_total_len_];
- if (NULL == buffer_)
- return false;
-
- if ((fp_ = fopen(filename, "rb")) == NULL)
- return false;
-
- // Read and validate the UTF-16LE byte-order mark (0xFEFF) at the start of the file
- char16 header;
- if (fread(&header, sizeof(header), 1, fp_) != 1 || header != 0xfeff) {
- fclose(fp_);
- fp_ = NULL;
- return false;
- }
-
- return true;
-}
-
-char16* Utf16Reader::readline(char16* read_buf, size_t max_len) {
- if (NULL == fp_ || NULL == read_buf || 0 == max_len)
- return NULL;
-
- size_t ret_len = 0;
-
- do {
- if (buffer_valid_len_ == 0) {
- buffer_next_pos_ = 0;
- buffer_valid_len_ = fread(buffer_, sizeof(char16),
- buffer_total_len_, fp_);
- if (buffer_valid_len_ == 0) {
- if (0 == ret_len)
- return NULL;
- read_buf[ret_len] = (char16)'\0';
- return read_buf;
- }
- }
-
- for (size_t i = 0; i < buffer_valid_len_; i++) {
- if (i == max_len - 1 ||
- buffer_[buffer_next_pos_ + i] == (char16)'\n') {
- if (ret_len + i > 0 && read_buf[ret_len + i - 1] == (char16)'\r') {
- read_buf[ret_len + i - 1] = (char16)'\0';
- } else {
- read_buf[ret_len + i] = (char16)'\0';
- }
-
- i++;
- buffer_next_pos_ += i;
- buffer_valid_len_ -= i;
- if (buffer_next_pos_ == buffer_total_len_) {
- buffer_next_pos_ = 0;
- buffer_valid_len_ = 0;
- }
- return read_buf;
- } else {
- read_buf[ret_len + i] = buffer_[buffer_next_pos_ + i];
- }
- }
-
- ret_len += buffer_valid_len_;
- buffer_valid_len_ = 0;
- } while (true);
-
- // Never reach here
- return NULL;
-}
-
-bool Utf16Reader::close() {
- if (NULL != fp_)
- fclose(fp_);
- fp_ = NULL;
-
- if (NULL != buffer_)
- delete [] buffer_;
- buffer_ = NULL;
- return true;
-}
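-
-// Illustrative usage sketch, not from the original sources: typical use is
-// to open a UTF-16LE file (which must begin with a 0xFEFF byte-order mark),
-// read it line by line, then close it. The file name below is hypothetical:
-//
-//   Utf16Reader reader;
-//   char16 line[256];
-//   if (reader.open("some_dict_utf16le.txt", 1024)) {
-//     while (reader.readline(line, 256) != NULL) {
-//       // process one line; readline strips the trailing CR/LF
-//     }
-//     reader.close();
-//   }
-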
-} // namespace ime_pinyin