summaryrefslogtreecommitdiffstats
path: root/src/linguist/shared/simtexth.h
blob: 1d0ebcb077e3b39561deaa18468cd37f79eada3d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0

#ifndef SIMTEXTH_H
#define SIMTEXTH_H

/*
  Similarity score cutoff. The documentation of getSimilarityScore() in this
  header states that Linguist considers a score over 190 to be a good match.
  NOTE(review): the exact comparison (> versus >=) is made by code outside
  this header - confirm there.
*/
const int textSimilarityThreshold = 190;

#include <QString>
#include <QList>

QT_BEGIN_NAMESPACE

class Translator;

/**
 * Plain record holding four strings: context, source, disambiguation and
 * translation.
 * NOTE(review): presumably describes one translated message offered as a
 * suggestion - confirm against the code that fills CandidateList.
 */
struct Candidate
{
    /* Leaves all four strings default-constructed. */
    Candidate() {}
    /*
      Copies c, s, d and t into context, source, disambiguation and
      translation, respectively.
    */
    Candidate(const QString &c, const QString &s, const QString &d, const QString &t)
        : context(c), source(s), disambiguation(d), translation(t)
    {}

    QString context;
    QString source;
    QString disambiguation;
    QString translation;
};

/**
 * Two candidates are equal when all four of their strings are equal.
 * The fields are compared in the order translation, source, context,
 * disambiguation, stopping at the first mismatch.
 */
inline bool operator==(const Candidate &c, const Candidate &d)
{
    if (c.translation != d.translation)
        return false;
    if (c.source != d.source)
        return false;
    if (c.context != d.context)
        return false;
    return c.disambiguation == d.disambiguation;
}
/**
 * Negation of operator==() for Candidate: true when at least one of the
 * four strings differs.
 */
inline bool operator!=(const Candidate &c, const Candidate &d)
{
    const bool same = (c == d);
    return !same;
}

/* A list of Candidate records; the return type of similarTextHeuristicCandidates(). */
typedef QList<Candidate> CandidateList;

/**
 * Fixed-size matrix stored in 52 bytes that can also be accessed as 13
 * 32-bit words through the anonymous union below.
 * NOTE(review): presumably a co-occurrence matrix derived from the characters
 * of a string (suggested by the name and the QString constructor) - confirm
 * in the implementation file.
 */
struct CoMatrix
{
    /* Builds the matrix from str; defined out of line. */
    CoMatrix(const QString &str);
    /*
      Empty body and no member initializers: this constructor does not
      initialize b/w, so the contents must be written before they are read.
    */
    CoMatrix() {}

    /*
      The matrix has 20 * 20 = 400 entries.  This requires 50 bytes, or 13
      words.  Some operations are performed on words for more efficiency.

      400 entries in 50 bytes means one bit per entry.  13 32-bit words
      occupy 52 bytes, so b is sized 52 to cover the same storage as w.
    */
    union {
        quint8 b[52];
        quint32 w[13];
    };
};

/**
 * Matches one fixed string against many candidate strings.
 *
 * This class is more efficient for searching through a large array of
 * candidate strings, since the CoMatrix for the \a stringToMatch only has to
 * be constructed once; after that, getSimilarityScore(strCandidate) is called
 * for each candidate.
 * \sa getSimilarityScore
 */
class StringSimilarityMatcher {
public:
    /* Prepares the matcher for \a stringToMatch; defined out of line. */
    StringSimilarityMatcher(const QString &stringToMatch);
    /*
      Returns the similarity score of \a strCandidate; defined out of line.
      Per the free function's documentation in this header, a higher score
      means more similar.
    */
    int getSimilarityScore(const QString &strCandidate);

private:
    /* Matrix kept by the matcher; per the class description, built once for stringToMatch. */
    CoMatrix m_cm;
    /* NOTE(review): presumably the length of stringToMatch - set out of line, confirm there. */
    int m_length;
};

/**
 * Checks how similar two strings are.
 * The return value is the score, and a higher score is more similar
 * than one with a low score.
 * Linguist considers a score over 190 to be a good match.
 * \sa StringSimilarityMatcher
 */
static inline int getSimilarityScore(const QString &str1, const QString &str2)
{
    return StringSimilarityMatcher(str1).getSimilarityScore(str2);
}

/**
 * Returns a CandidateList computed from the translator \a tor, the string
 * \a text and the limit \a maxCandidates.  Defined out of line.
 * NOTE(review): presumably yields at most \a maxCandidates entries of \a tor
 * whose text is similar to \a text - confirm against the implementation.
 */
CandidateList similarTextHeuristicCandidates( const Translator *tor,
                                              const QString &text,
                                              int maxCandidates );

QT_END_NAMESPACE

#endif