src/bm/dataqualitystats.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179

/****************************************************************************
**
** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
** Contact: Qt Software Information (qt-info@nokia.com)
**
** This file is part of the BM project on Qt Labs.
**
** This file may be used under the terms of the GNU General Public
** License version 2.0 or 3.0 as published by the Free Software Foundation
** and appearing in the file LICENSE.GPL included in the packaging of
** this file.  Please review the following information to ensure GNU
** General Public Licensing requirements will be met:
** http://www.fsf.org/licensing/licenses/info/GPLv2.html and
** http://www.gnu.org/copyleft/gpl.html.
**
** If you are unsure which license is appropriate for your use, please
** contact the sales department at qt-sales@nokia.com.
**
** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
**
****************************************************************************/

#include "dataqualitystats.h"
#include <QList>
#include <QMap>
#include <QDebug>

/* NOTES

Params:

- diffTolerance (in percent: 0 <= x <= 100)
- stabTolerance (a positive integer: x >= 2)

-------------------------------------------------

Def: An equality subsequence (ESS) is a subsequence v1, v2, ..., vn of a result history
     for which the following condition holds:

         ∀ i >= 1 : 100 * (max(vi, v1) / min(vi, v1) - 1) <= diffTolerance


Def: A maximal equality subsequence (MaxESS) is one of the subsequences formed by
     partitioning a result history into the smallest possible number of ESS'es.


Def: The stability fraction (SF) of a result history is the fraction (given as a
     percentage: 0 <= SF <= 100) of its MaxESS'es that are stable.
     More precisely,

         SF = 100 * (stableMaxESS / totalMaxESS),

     where stableMaxESS is the the number of MaxESS'es that have a length of at least
     stabTolerance and totalMaxESS is the total number of MaxESS'es.

-------

Stats 1: Percentile distribution of the SF values for the contributing result histories.

P_95 = x -> x is the smallest SF value that is larger than or equal to that
            of 95% of the result histories (i.e. 95% of the result histories have an SF value
            that is smaller than or equal to x)

Example:

100 100 100 100 100 (good)
100  80  30  20  10 (bad)

The following 10 values: 95, 90, 80, 50, 40, 40, 40, 40, 5, 0
gives the following percentile distribution:

P_100 =  95   (the worst 100 % of the RHs have a SF of 95 or worse, and 95 is also the max SF)
P_90  =  90   (           90                           90         )
P_80  =  80   (           80                           80         )
P_70  =  50   (           70                           50         )
P_60  =  40   (           60                           40         )
P_50  =  40   (           50                           40         )
P_40  =  40   (           40                           40         )
P_30  =  40   (           30                           40         )
P_20  =   5   (           20                            5         )
P_10  =   0   (           10                            0         )

Note: The quality of a RH is proportional to its SF value, so we want the percentile distribution
      to start (at P_100) as high as possible (ideally at 100), and end (at P_10) as high
      as possible.

*/


// ### 2 B DOCUMENTED!
void DataQualityStats::compute(const QList<ResultHistoryInfo *> &rhInfos)
{
    // Step 1: Compute the MaxESSTotalCount and MaxESSStableCount for each RH
    //         (compute for the exact median-smoothed values that formed the
    //         basis for computing the index, i.e. simply ignore the outliers)
    //
    // Step 2: Compute the complete distribution of MaxESSTotalCount
    //         (note that the number of distinct counts are likely to be only a small
    //         fraction of the number of RHs):
    //
    //           TC(c1) = <# of RHs with a MaxESSTotalCount of c1>
    //           TC(c2) = <# of RHs with a MaxESSTotalCount of c2>
    //           ...
    //           TC(cN) = <# of RHs with a MaxESSTotalCount of cN>
    //
    //         (TC = Total Count, and N is number of distinct counts)
    //          
    // Step 3: Compute the percentile distribution (for the 10 levels 10%, 20%, ..., 100%)
    //         of the SF (stability fraction) values (where the SF for a given
    //         RH is MaxESSStableCount / MaxESSTotalCount):
    //
    //           SFP(100) = <the max SF value for the worst 100% of the RHs (i.e. all RHs!)>
    //           SFP(90)  = <the max SF value for the worst  90% of the RHs>
    //           SFP(80)  = <the max SF value for the worst  80% of the RHs>
    //           ...
    //           SFP(10)  = <the max SF value for the worst  10% of the RHs>
    //
    //           (SFP = Stability Fraction Percentile)


    // *** Step 1: Extract total and stable MaxESS counts for each result history ***

    Q_ASSERT(diffTolerance >= 0.0);
    Q_ASSERT(stabTolerance >= 2);

    QList<int> totalMaxESS;
    QList<int> stableMaxESS;

    for (int i = 0; i < rhInfos.size(); ++i) {
        int total = 0;
        int stable = 0;
        rhInfos.at(i)->computeMaxESSStats(diffTolerance, stabTolerance, &total, &stable);
        totalMaxESS.append(total);
        stableMaxESS.append(stable);
    }


    // *** Step 2: Compute the frequency distribution of the total MaxESS counts ***
    totalMaxESSFreq_.clear();
    for (int i = 0; i < totalMaxESS.size(); ++i)
        ++(totalMaxESSFreq_[totalMaxESS.at(i)]);


    // *** Step 3: Compute the percentile distribution of the stability fractions ***

    // All subsequence counts (subsequence counts >= 0):
    QList<qreal> stabFractions0;

    // Subsequence counts >= 2 (i.e. result histories with actual changes in them and thus
    //                          the ones that are really interesting):
    QList<qreal> stabFractions2;

    for (int i = 0; i < totalMaxESS.size(); ++i) {
        if (totalMaxESS.at(i) > 0) {
            const qreal sf = stableMaxESS.at(i) / static_cast<qreal>(totalMaxESS.at(i));
            stabFractions0.append(sf);
            if (totalMaxESS.at(i) >= 2)
                stabFractions2.append(sf);
        }
    }
    qSort(stabFractions0);
    qSort(stabFractions2);


    // --------
    const int hiPos0 = stabFractions0.size() - 1;
    const int hiPos2 = stabFractions2.size() - 1;
    for (int p = 10; p <= 100; p += 10) {
        const int i0 = qMin(hiPos0, static_cast<int>(hiPos0 * (p / 100.0)));
        const int i2 = qMin(hiPos2, static_cast<int>(hiPos2 * (p / 100.0)));
        stabFracPercentiles_.insert(
            p, qMakePair(
                (i0 >= 0) ? (100 * stabFractions0.at(i0)) : -1,
                (i2 >= 0) ? (100 * stabFractions2.at(i2)) : -1)
            );
    }
}