src/plugins/pinyin/3rdparty/pinyin/include/matrixsearch.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
#define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__

#include <stdlib.h>
#include "./atomdictbase.h"
#include "./dicttrie.h"
#include "./searchutility.h"
#include "./spellingtrie.h"
#include "./splparser.h"

namespace ime_pinyin {

static const size_t kMaxRowNum = kMaxSearchSteps;

typedef struct {
  // MileStoneHandle objects for the system and user dictionaries.
  MileStoneHandle dict_handles[2];
  // From which DMI node. -1 means it's from root.
  PoolPosType dmi_fr;
  // The spelling id for the Pinyin string from the previous DMI to this node.
  // If it is a half id like Shengmu, the node pointed by dict_node is the first
  // node with this Shengmu,
  uint16 spl_id;
  // What's the level of the dict node. Level of root is 0, but root is never
  // recorded by dict_node.
  unsigned char dict_level:7;
  // If this node is for composing phrase, this bit is 1.
  unsigned char c_phrase:1;
  // Whether the spl_id is parsed with a split character at the end.
  unsigned char splid_end_split:1;
  // What's the length of the spelling string for this match, for the whole
  // word.
  unsigned char splstr_len:7;
  // Used to indicate whether all spelling ids from the root are full spelling
  // ids. This information is useful for keymapping mode(not finished). Because
  // in this mode, there is no clear boundaries, we prefer those results which
  // have full spelling ids.
  unsigned char all_full_id:1;
} DictMatchInfo, *PDictMatchInfo;

typedef struct MatrixNode {
  LemmaIdType id;
  float score;
  MatrixNode *from;
  // From which DMI node. Used to trace the spelling segmentation.
  PoolPosType dmi_fr;
  uint16 step;
} MatrixNode, *PMatrixNode;

typedef struct {
  // The MatrixNode position in the matrix pool
  PoolPosType mtrx_nd_pos;
  // The DictMatchInfo position in the DictMatchInfo pool.
  PoolPosType dmi_pos;
  uint16 mtrx_nd_num;
  uint16 dmi_num:15;
  // Used to indicate whether there are dmi nodes in this step with full
  // spelling id. This information is used to decide whether a substring of a
  // valid Pinyin should be extended.
  //
  // Example1: shoudao
  // When the last char 'o' is added, the parser will find "dao" is a valid
  // Pinyin, and because all dmi nodes at location 'd' (including those for
  // "shoud", and those for "d") have Shengmu id only, so it is not necessary
  // to extend "ao", otherwise the result may be "shoud ao", that is not
  // reasonable.
  //
  // Example2: hengao
  // When the last 'o' is added, the parser finds "gao" is a valid Pinyin.
  // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi
  // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus
  // "heng ao" can also be the result.
  //
  // Similarly, "ganga" is expanded to "gang a".
  //
  // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi
  // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it
  // is not valid either). If the parser uses break in the loop, the result
  // always be "xian"; but if the parser uses continue in the loop, "xi an" will
  // also be tried. This behaviour can be set via the function
  // set_xi_an_switch().
  uint16 dmi_has_full_id:1;
  // Points to a MatrixNode of the current step to indicate which choice the
  // user selects.
  MatrixNode *mtrx_nd_fixed;
} MatrixRow, *PMatrixRow;

// When user inputs and selects candidates, the fixed lemma ids are stored in
// lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many
// lemmas from the beginning are fixed. If user deletes Pinyin characters one
// by one from the end, these fixed lemmas can be unlocked one by one when
// necessary. Whenever user deletes a Chinese character and its spelling string
// in these fixed lemmas, all fixed lemmas will be merged together into a unit
// named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing
// phrase will be the first lemma in the sentence. Because it contains some
// modified lemmas (by deleting a character), these merged lemmas are called
// sub lemmas (sublma), and each of them are represented individually, so that
// when user deletes Pinyin characters from the end, these sub lemmas can also
// be unlocked one by one.
typedef struct {
  uint16 spl_ids[kMaxRowNum];
  uint16 spl_start[kMaxRowNum];
  char16 chn_str[kMaxRowNum];       // Chinese string.
  uint16 sublma_start[kMaxRowNum];  // Counted in Chinese characters.
  size_t sublma_num;
  uint16 length;                    // Counted in Chinese characters.
} ComposingPhrase, *TComposingPhrase;

class MatrixSearch {
 private:
  // If it is true, prediction list by string whose length is greater than 1
  // will be limited to a reasonable number.
  static const bool kPredictLimitGt1 = false;

  // If it is true, the engine will prefer long history based prediction,
  // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are
  // based on the two-character history.
  static const bool kPreferLongHistoryPredict = true;

  // If it is true, prediction will only be based on user dictionary. this flag
  // is for debug purpose.
  static const bool kOnlyUserDictPredict = false;

  // The maximum buffer to store LmaPsbItems.
  static const size_t kMaxLmaPsbItems = 1450;

  // How many rows for each step.
  static const size_t kMaxNodeARow = 5;

  // The maximum length of the sentence candidates counted in chinese
  // characters
  static const size_t kMaxSentenceLength = 16;

  // The size of the matrix node pool.
  static const size_t kMtrxNdPoolSize = 200;

  // The size of the DMI node pool.
  static const size_t kDmiPoolSize = 800;

  // Used to indicate whether this object has been initialized.
  bool inited_;

  // Spelling trie.
  const SpellingTrie *spl_trie_;

  // Used to indicate this switcher status: when "xian" is parseed, should
  // "xi an" also be extended. Default is false.
  // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string
  // should be valid for a FULL spelling, or a combination of two spellings,
  // first of which is a FULL id too. So even it is true, "da" will never be
  // split into "d a", because "d" is not a full spelling id.
  bool xi_an_enabled_;

  // System dictionary.
  DictTrie* dict_trie_;

  // User dictionary.
  AtomDictBase* user_dict_;

  // Spelling parser.
  SpellingParser* spl_parser_;

  // The maximum allowed length of spelling string (such as a Pinyin string).
  size_t max_sps_len_;

  // The maximum allowed length of a result Chinese string.
  size_t max_hzs_len_;

  // Pinyin string. Max length: kMaxRowNum - 1
  char pys_[kMaxRowNum];

  // The length of the string that has been decoded successfully.
  size_t pys_decoded_len_;

  // Shared buffer for multiple purposes.
  size_t *share_buf_;

  MatrixNode *mtrx_nd_pool_;
  PoolPosType mtrx_nd_pool_used_;    // How many nodes used in the pool
  DictMatchInfo *dmi_pool_;
  PoolPosType dmi_pool_used_;        // How many items used in the pool

  MatrixRow *matrix_;                // The first row is for starting

  DictExtPara *dep_;                 // Parameter used to extend DMI nodes.

  NPredictItem *npre_items_;         // Used to do prediction
  size_t npre_items_len_;

  // The starting positions and lemma ids for the full sentence candidate.
  size_t lma_id_num_;
  uint16 lma_start_[kMaxRowNum];     // Counted in spelling ids.
  LemmaIdType lma_id_[kMaxRowNum];
  size_t fixed_lmas_;

  // If fixed_lmas_ is bigger than i,  Element i is used to indicate whether
  // the i'th lemma id in lma_id_ is the first candidate for that step.
  // If all candidates are the first one for that step, the whole string can be
  // decoded by the engine automatically, so no need to add it to user
  // dictionary. (We are considering to add it to user dictionary in the
  // future).
  uint8 fixed_lmas_no1_[kMaxRowNum];

  // Composing phrase
  ComposingPhrase c_phrase_;

  // If dmi_c_phrase_ is true, the decoder will try to match the
  // composing phrase (And definitely it will match successfully). If it
  // is false, the decoder will try to match lemmas items in dictionaries.
  bool dmi_c_phrase_;

  // The starting positions and spelling ids for the first full sentence
  // candidate.
  size_t spl_id_num_;                // Number of splling ids
  uint16 spl_start_[kMaxRowNum];     // Starting positions
  uint16 spl_id_[kMaxRowNum];        // Spelling ids
  // Used to remember the last fixed position, counted in Hanzi.
  size_t fixed_hzs_;

  // Lemma Items with possibility score, two purposes:
  // 1. In Viterbi decoding, this buffer is used to get all possible candidates
  // for current step;
  // 2. When the search is done, this buffer is used to get candiates from the
  // first un-fixed step and show them to the user.
  LmaPsbItem lpi_items_[kMaxLmaPsbItems];
  size_t lpi_total_;

  // Assign the pointers with NULL. The caller makes sure that all pointers are
  // not valid before calling it. This function only will be called in the
  // construction function and free_resource().
  void reset_pointers_to_null();

  bool alloc_resource();

  void free_resource();

  // Reset the search space totally.
  bool reset_search0();

  // Reset the search space from ch_pos step. For example, if the original
  // input Pinyin is "an", reset_search(1) will reset the search space to the
  // result of "a". If the given position is out of range, return false.
  // if clear_fixed_this_step is true, and the ch_pos step is a fixed step,
  // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes.
  // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step.
  // The DMI nodes will be kept.
  //
  // Note: this function should not destroy content of pys_.
  bool reset_search(size_t ch_pos, bool clear_fixed_this_step,
                    bool clear_dmi_this_step, bool clear_mtrx_this_step);

  // Delete a part of the content in pys_.
  void del_in_pys(size_t start, size_t len);

  // Delete a spelling id and its corresponding Chinese character, and merge
  // the fixed lemmas into the composing phrase.
  // del_spl_pos indicates which spelling id needs to be delete.
  // This function will update the lemma and spelling segmentation information.
  // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within
  // the fixed lemmas.
  void merge_fixed_lmas(size_t del_spl_pos);

  // Get spelling start posistions and ids. The result will be stored in
  // spl_id_num_, spl_start_[], spl_id_[].
  // fixed_hzs_ will be also assigned.
  void get_spl_start_id();

  // Get all lemma ids with match the given spelling id stream(shorter than the
  // maximum length of a word).
  // If pfullsent is not NULL, means the full sentence candidate may be the
  // same with the coming lemma string, if so, remove that lemma.
  // The result is sorted in descendant order by the frequency score.
  size_t get_lpis(const uint16* splid_str, size_t splid_str_len,
                  LmaPsbItem* lma_buf, size_t max_lma_buf,
                  const char16 *pfullsent, bool sort_by_psb);

  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);

  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                          uint16 splids_max, bool arg_valid);


  // Extend a DMI node with a spelling id. ext_len is the length of the rows
  // to extend, actually, it is the size of the spelling string of splid.
  // return value can be 1 or 0.
  // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in
  // the pool).
  // 0 means either the dmi node can not be extended with splid, or the splid
  // is a Shengmu id, which is only used to get lpi_items, or the result node
  // in DictTrie has no son, it is not nccessary to keep the new DMI.
  //
  // This function modifies the content of lpi_items_ and lpi_total_.
  // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size.
  // The function's returned value has no relation with the value of lpi_num.
  //
  // If dmi == NULL, this function will extend the root node of DictTrie
  //
  // This function will not change dmi_nd_pool_used_. Please change it after
  // calling this function if necessary.
  //
  // The caller should guarantees that NULL != dep.
  size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s);

  // Extend dmi for the composing phrase.
  size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s);

  // Extend a MatrixNode with the give LmaPsbItem list.
  // res_row is the destination row number.
  // This function does not change mtrx_nd_pool_used_. Please change it after
  // calling this function if necessary.
  // return 0 always.
  size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
                        size_t lpi_num, PoolPosType dmi_fr, size_t res_row);


  // Try to find a dmi node at step_to position, and the found dmi node should
  // match the given spelling id strings.
  PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num);

  bool add_char(char ch);
  bool prepare_add_char(char ch);

  // Called after prepare_add_char, so the input char has been saved.
  bool add_char_qwerty();

  // Prepare candidates from the last fixed hanzi position.
  void prepare_candidates();

  // Is the character in step pos a splitter character?
  // The caller guarantees that the position is valid.
  bool is_split_at(uint16 pos);

  void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
                PoolPosType dmi_fr,
                uint16 spl_id, uint16 node_num, unsigned char dict_level,
                bool splid_end_split, unsigned char splstr_len,
                unsigned char all_full_id);

  size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num,
                       char16 predict_buf[][kMaxPredictSize + 1],
                       size_t buf_len);

  // Add the first candidate to the user dictionary.
  bool try_add_cand0_to_userdict();

  // Add a user lemma to the user dictionary. This lemma is a subset of
  // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the
  // number of lemmas to be combined together as a new lemma. The caller
  // gurantees that the combined new lemma's length is less or equal to
  // kMaxLemmaSize.
  bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score);

  // Update dictionary frequencies.
  void update_dict_freq();

  void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level);

 public:
  MatrixSearch();
  ~MatrixSearch();

  bool init(const char *fn_sys_dict, const char *fn_usr_dict);

  bool init_fd(int sys_fd, long start_offset, long length,
               const char *fn_usr_dict);

  void init_user_dictionary(const char *fn_usr_dict);

  bool is_user_dictionary_enabled() const;

  void set_max_lens(size_t max_sps_len, size_t max_hzs_len);

  void close();

  void flush_cache();

  void set_xi_an_switch(bool xi_an_enabled);

  bool get_xi_an_switch();

  // Reset the search space. Equivalent to reset_search(0).
  // If inited, always return true;
  bool reset_search();

  // Search a Pinyin string.
  // Return value is the position successfully parsed.
  size_t search(const char *py, size_t py_len);

  // Used to delete something in the Pinyin string kept by the engine, and do
  // a re-search.
  // Return value is the new length of Pinyin string kept by the engine which
  // is parsed successfully.
  // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin
  // character needs to be deleted. If is_pos_in_splid is true, all Pinyin
  // characters for pos-th spelling id needs to be deleted.
  // If the deleted character(s) is just after a fixed lemma or sub lemma in
  // composing phrase, clear_fixed_this_step indicates whether we needs to
  // unlock the last fixed lemma or sub lemma.
  // If is_pos_in_splid is false, and pos-th character is in the range for the
  // fixed lemmas or composing string, this function will do nothing and just
  // return the result of the previous search.
  size_t delsearch(size_t pos, bool is_pos_in_splid,
                   bool clear_fixed_this_step);

  // Get the number of candiates, called after search().
  size_t get_candidate_num();

  // Get the Pinyin string stored by the engine.
  // *decoded_len returns the length of the successfully decoded string.
  const char* get_pystr(size_t *decoded_len);

  // Get the spelling boundaries for the first sentence candidate.
  // Number of spellings will be returned. The number of valid elements in
  // spl_start is one more than the return value because the last one is used
  // to indicate the beginning of the next un-input speling.
  // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] .
  size_t get_spl_start(const uint16 *&spl_start);

  // Get one candiate string. If full sentence candidate is available, it will
  // be the first one.
  char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len);

  // Get the first candiate, which is a "full sentence".
  // retstr_len is not NULL, it will be used to return the string length.
  // If only_unfixed is true, only unfixed part will be fetched.
  char16* get_candidate0(char16* cand_str, size_t max_len,
                         uint16 *retstr_len, bool only_unfixed);

  // Choose a candidate. The decoder will do a search after the fixed position.
  size_t choose(size_t cand_id);

  // Cancel the last choosing operation, and return the new number of choices.
  size_t cancel_last_choice();

  // Get the length of fixed Hanzis.
  size_t get_fixedlen();

  size_t get_predicts(const char16 fixed_buf[],
                      char16 predict_buf[][kMaxPredictSize + 1],
                      size_t buf_len);
};
}

#endif  // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__