aboutsummaryrefslogtreecommitdiffstats
path: root/src/plugins/pinyin/3rdparty/pinyin/include/userdict.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/pinyin/3rdparty/pinyin/include/userdict.h')
-rw-r--r--src/plugins/pinyin/3rdparty/pinyin/include/userdict.h434
1 files changed, 434 insertions, 0 deletions
diff --git a/src/plugins/pinyin/3rdparty/pinyin/include/userdict.h b/src/plugins/pinyin/3rdparty/pinyin/include/userdict.h
new file mode 100644
index 00000000..db010912
--- /dev/null
+++ b/src/plugins/pinyin/3rdparty/pinyin/include/userdict.h
@@ -0,0 +1,434 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_USERDICT_H__
+#define PINYINIME_INCLUDE_USERDICT_H__
+
+#define ___CACHE_ENABLED___
+#define ___SYNC_ENABLED___
+#define ___PREDICT_ENABLED___
+
+// Debug performance for operations
+// #define ___DEBUG_PERF___
+
+#ifdef _WIN32
+#include <time.h>
+#include <winsock.h> // timeval
+#else
+#include <pthread.h>
+#include <sys/time.h>
+#endif
+#include "atomdictbase.h"
+
+namespace ime_pinyin {
+
+class UserDict : public AtomDictBase {
+ public:
+ UserDict();
+ ~UserDict();
+
+ bool load_dict(const char *file_name, LemmaIdType start_id,
+ LemmaIdType end_id);
+
+ bool close_dict();
+
+ size_t number_of_lemmas();
+
+ void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
+
+ MileStoneHandle extend_dict(MileStoneHandle from_handle,
+ const DictExtPara *dep, LmaPsbItem *lpi_items,
+ size_t lpi_max, size_t *lpi_num);
+
+ size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
+ LmaPsbItem *lpi_items, size_t lpi_max);
+
+ uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
+ uint16 str_max);
+
+ uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
+ uint16 splids_max, bool arg_valid);
+
+ size_t predict(const char16 last_hzs[], uint16 hzs_len,
+ NPredictItem *npre_items, size_t npre_max,
+ size_t b4_used);
+
+ // Full spelling ids are required
+ LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len, uint16 count);
+
+ LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
+ bool selected);
+
+ LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len);
+
+ LmaScoreType get_lemma_score(LemmaIdType lemma_id);
+
+ LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len);
+
+ bool remove_lemma(LemmaIdType lemma_id);
+
+ size_t get_total_lemma_count();
+ void set_total_lemma_count_of_others(size_t count);
+
+ void flush_cache();
+
+ void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
+ uint32 reclaim_ratio);
+
+ void reclaim();
+
+ void defragment();
+
+#ifdef ___SYNC_ENABLED___
+ void clear_sync_lemmas(unsigned int start, unsigned int end);
+
+ int get_sync_count();
+
+ LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len, uint16 count, uint64 lmt);
+ /**
+ * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
+ *
+ * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
+ * @param len length of lemmas string in UTF-16LE
+ * @return newly added lemma count
+ */
+ int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
+
+ /**
+ * Get lemmas need sync to a UTF-16LE string of above format.
+ * Note: input buffer (str) must not be too small. If str is too small to
+ * contain single one lemma, there might be a dead loop.
+ *
+ * @param str buffer to write lemmas
+ * @param size buffer size in UTF-16LE
+ * @param count output value of lemma returned
+ * @return UTF-16LE string length
+ */
+ int get_sync_lemmas_in_utf16le_string_from_beginning(
+ char16 * str, int size, int * count);
+
+#endif
+
+ struct UserDictStat {
+ uint32 version;
+ const char * file_name;
+ struct timeval load_time;
+ struct timeval last_update;
+ uint32 disk_size;
+ uint32 lemma_count;
+ uint32 lemma_size;
+ uint32 delete_count;
+ uint32 delete_size;
+#ifdef ___SYNC_ENABLED___
+ uint32 sync_count;
+#endif
+ uint32 reclaim_ratio;
+ uint32 limit_lemma_count;
+ uint32 limit_lemma_size;
+ };
+
+ bool state(UserDictStat * stat);
+
+ private:
+ uint32 total_other_nfreq_;
+ struct timeval load_time_;
+ LemmaIdType start_id_;
+ uint32 version_;
+ uint8 * lemmas_;
+
+ // In-Memory-Only flag for each lemma
+ static const uint8 kUserDictLemmaFlagRemove = 1;
+ // Inuse lemmas' offset
+ uint32 * offsets_;
+ // Highest bit in offset tells whether corresponding lemma is removed.
+ // The literal is unsigned so the shift never moves a bit into the sign bit
+ // of a signed int; the resulting value (0x80000000) is unchanged.
+ static const uint32 kUserDictOffsetFlagRemove = (1u << 31);
+ // Maximum possible for the offset
+ static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
+ // Bit width for last modified time, from 1 to 16
+ static const uint32 kUserDictLMTBitWidth = 16;
+ // Granularity for last modified time in second
+ static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
+ // Maximum frequency count
+ static const uint16 kUserDictMaxFrequency = 0xFFFF;
+
+// Approximate number of seconds between 1970-01-01 00:00:00 and the given
+// calendar date. Coarse by construction: every year counts as 365 days and
+// every month as 30 days, so the result is not an exact UTC timestamp.
+// Each parameter is parenthesized so that expression arguments (for example
+// a conditional expression) expand with the intended precedence.
+#define COARSE_UTC(year, month, day, hour, minute, second) \
+ ( \
+ ((year) - 1970) * 365 * 24 * 60 * 60 + \
+ ((month) - 1) * 30 * 24 * 60 * 60 + \
+ ((day) - 1) * 24 * 60 * 60 + \
+ ((hour) - 0) * 60 * 60 + \
+ ((minute) - 0) * 60 + \
+ ((second) - 0) \
+ )
+ static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
+
+ // Correspond to offsets_
+ uint32 * scores_;
+ // Following two fields are only valid in memory
+ uint32 * ids_;
+#ifdef ___PREDICT_ENABLED___
+ uint32 * predicts_;
+#endif
+#ifdef ___SYNC_ENABLED___
+ uint32 * syncs_;
+ size_t sync_count_size_;
+#endif
+ uint32 * offsets_by_id_;
+
+ size_t lemma_count_left_;
+ size_t lemma_size_left_;
+
+ const char * dict_file_;
+
+ // Bookkeeping record for the dictionary; it corresponds to the
+ // "Dict Info (4x)" entry in the on-disk format diagram at the end of this
+ // class. Keep the struct size a multiple of 4 bytes (4xN) when adding
+ // fields.
+ struct UserDictInfo {
+ // When a limit is reached, the percentage that will be reclaimed (1 ~ 100)
+ uint32 reclaim_ratio;
+ // Maximum lemma count, 0 means no limitation
+ uint32 limit_lemma_count;
+ // Maximum lemma size; this is different from the
+ // whole disk file size or the in-memory dict size.
+ // 0 means no limitation
+ uint32 limit_lemma_size;
+ // Total lemma count, including both deleted and in-use lemmas.
+ // Also indicates the size of offsets_
+ uint32 lemma_count;
+ // Total size of lemmas, including used and freed
+ uint32 lemma_size;
+ // Freed lemma count
+ uint32 free_count;
+ // Freed lemma size in bytes
+ uint32 free_size;
+#ifdef ___SYNC_ENABLED___
+ // NOTE(review): presumably the number of lemmas queued for sync -- confirm
+ // against the implementation file.
+ uint32 sync_count;
+#endif
+ // NOTE(review): presumably the accumulated frequency of all lemmas in this
+ // dictionary -- confirm against the implementation file.
+ int32 total_nfreq;
+ } dict_info_;
+
+ static const uint32 kUserDictVersion = 0x0ABCDEF0;
+
+ static const uint32 kUserDictPreAlloc = 32;
+ static const uint32 kUserDictAverageNchar = 8;
+
+ enum UserDictState {
+ // Keep in order
+ USER_DICT_NONE = 0,
+ USER_DICT_SYNC,
+#ifdef ___SYNC_ENABLED___
+ USER_DICT_SYNC_DIRTY,
+#endif
+ USER_DICT_SCORE_DIRTY,
+ USER_DICT_OFFSET_DIRTY,
+ USER_DICT_LEMMA_DIRTY,
+
+ USER_DICT_DEFRAGMENTED,
+ } state_;
+
+ // Pre-processed form of a spelling id string. It is filled in by
+ // prepare_locate() and then passed to the compare, locate and cache helpers
+ // declared in this class.
+ struct UserDictSearchable {
+ // NOTE(review): presumably the number of spelling ids described by the
+ // arrays below -- confirm against prepare_locate().
+ uint16 splids_len;
+ uint16 splid_start[kMaxLemmaSize];
+ uint16 splid_count[kMaxLemmaSize];
+ // Compact initial letters for both FuzzyCompareSpellId and cache system
+ uint32 signature[kMaxLemmaSize / 4];
+ };
+
+#ifdef ___CACHE_ENABLED___
+ enum UserDictCacheType {
+ USER_DICT_CACHE,
+ USER_DICT_MISS_CACHE,
+ };
+
+ static const int kUserDictCacheSize = 4;
+ static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
+
+ struct UserDictMissCache {
+ uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
+ uint16 head, tail;
+ } miss_caches_[kMaxLemmaSize];
+
+ struct UserDictCache {
+ uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
+ uint32 offsets[kUserDictCacheSize];
+ uint32 lengths[kUserDictCacheSize];
+ // Ring buffer
+ uint16 head, tail;
+ } caches_[kMaxLemmaSize];
+
+ void cache_init();
+
+ void cache_push(UserDictCacheType type,
+ UserDictSearchable *searchable,
+ uint32 offset, uint32 length);
+
+ bool cache_hit(UserDictSearchable *searchable,
+ uint32 *offset, uint32 *length);
+
+ bool load_cache(UserDictSearchable *searchable,
+ uint32 *offset, uint32 *length);
+
+ void save_cache(UserDictSearchable *searchable,
+ uint32 offset, uint32 length);
+
+ void reset_cache();
+
+ bool load_miss_cache(UserDictSearchable *searchable);
+
+ void save_miss_cache(UserDictSearchable *searchable);
+
+ void reset_miss_cache();
+#endif
+
+ LmaScoreType translate_score(int f);
+
+ int extract_score_freq(int raw_score);
+
+ uint64 extract_score_lmt(int raw_score);
+
+ inline int build_score(uint64 lmt, int freq);
+
+ inline int64 utf16le_atoll(uint16 *s, int len);
+
+ inline int utf16le_lltoa(int64 v, uint16 *s, int size);
+
+ LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len, uint16 count, uint64 lmt);
+
+ size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
+ LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend);
+
+ int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
+
+ int _get_lemma_score(LemmaIdType lemma_id);
+
+ int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
+ const UserDictSearchable *searchable);
+
+ bool is_prefix_spell_id(const uint16 * fullids,
+ uint16 fulllen, const UserDictSearchable *searchable);
+
+ uint32 get_dict_file_size(UserDictInfo * info);
+
+ bool reset(const char *file);
+
+ bool validate(const char *file);
+
+ bool load(const char *file, LemmaIdType start_id);
+
+ bool is_valid_state();
+
+ bool is_valid_lemma_id(LemmaIdType id);
+
+ LemmaIdType get_max_lemma_id();
+
+ void set_lemma_flag(uint32 offset, uint8 flag);
+
+ char get_lemma_flag(uint32 offset);
+
+ char get_lemma_nchar(uint32 offset);
+
+ uint16 * get_lemma_spell_ids(uint32 offset);
+
+ uint16 * get_lemma_word(uint32 offset);
+
+ // Prepare searchable to fasten locate process
+ void prepare_locate(UserDictSearchable *searchable,
+ const uint16 * splids, uint16 len);
+
+ // Compare initial letters only
+ int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
+ const UserDictSearchable *searchable);
+
+ // Compare exactly two spell ids
+ // First argument must be a full id spell id
+ bool equal_spell_id(const uint16 * fullids,
+ uint16 fulllen, const UserDictSearchable *searchable);
+
+ // Find first item by initial letters
+ int32 locate_first_in_offsets(const UserDictSearchable *searchable);
+
+ LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len, uint16 count, uint64 lmt);
+
+ // Check if a lemma is in dictionary
+ int32 locate_in_offsets(char16 lemma_str[],
+ uint16 splid_str[], uint16 lemma_len);
+
+ bool remove_lemma_by_offset_index(int offset_index);
+#ifdef ___PREDICT_ENABLED___
+ uint32 locate_where_to_insert_in_predicts(const uint16 * words,
+ int lemma_len);
+
+ int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
+
+ void remove_lemma_from_predict_list(uint32 offset);
+#endif
+#ifdef ___SYNC_ENABLED___
+ void queue_lemma_for_sync(LemmaIdType id);
+
+ void remove_lemma_from_sync_list(uint32 offset);
+
+ void write_back_sync(int fd);
+#endif
+ void write_back_score(int fd);
+ void write_back_offset(int fd);
+ void write_back_lemma(int fd);
+ void write_back_all(int fd);
+ void write_back();
+
+ struct UserDictScoreOffsetPair {
+ int score;
+ uint32 offset_index;
+ };
+
+ inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
+
+ void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
+
+ // On-disk format for each lemma
+ // +-------------+
+ // | Version (4) |
+ // +-------------+
+ // +-----------+-----------+--------------------+-------------------+
+ // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
+ // +-----------+-----------+--------------------+-------------------+
+ // ...
+ // +-----------------------+ +-------------+ <---Offset of offset
+ // | Offset1 by_splids (4) | ... | OffsetN (4) |
+ // +-----------------------+ +-------------+
+#ifdef ___PREDICT_ENABLED___
+ // +----------------------+ +-------------+
+ // | Offset1 by_lemma (4) | ... | OffsetN (4) |
+ // +----------------------+ +-------------+
+#endif
+ // +------------+ +------------+
+ // | Score1 (4) | ... | ScoreN (4) |
+ // +------------+ +------------+
+#ifdef ___SYNC_ENABLED___
+ // +-------------+ +-------------+
+ // | NewAdd1 (4) | ... | NewAddN (4) |
+ // +-------------+ +-------------+
+#endif
+ // +----------------+
+ // | Dict Info (4x) |
+ // +----------------+
+};
+}
+
+#endif