diff options
Diffstat (limited to 'scripts/misc.py')
-rw-r--r-- | scripts/misc.py | 429 |
1 files changed, 367 insertions, 62 deletions
diff --git a/scripts/misc.py b/scripts/misc.py index b3a2274..8bd8516 100644 --- a/scripts/misc.py +++ b/scripts/misc.py @@ -45,6 +45,33 @@ def textToId(table, text): # ### 2 B DOCUMENTED! +def findOrInsertId(table, value, *args): + + query_result = execQuery( + "SELECT id FROM " + table + " WHERE value = %s", (value,)) + if len(query_result) == 1: + # Found, so return ID: + return query_result[0][0] + + # Not found, so insert: + query = "INSERT INTO " + table + " (value" + for i in range(0, len(args), 2): + query += ", " + args[i] + query += ") VALUES (%s" + values = [value] + for i in range(0, len(args), 2): + query += ", %s" + values.append(args[i + 1]) + + # ... and retrieve ID: + query += ") RETURNING id" + query_result = execQuery(query, values) + + assert len(query_result) == 1 + return query_result[0][0] + + +# ### 2 B DOCUMENTED! # Maybe also rename to lowerIsBetter() ? (but note that a global function with # that name already exists in uploadresults.py) def metricIdToLowerIsBetter(metric_id): @@ -64,6 +91,13 @@ def metricIdToLowerIsBetter(metric_id): # Returns the non-negative ID of the given context, or -1 if not found. 
def getContext(host_id, platform_id, branch_id, sha1_id): + global contextIdCache + if not 'contextIdCache' in globals(): + contextIdCache = {} + + if (host_id, platform_id, branch_id, sha1_id) in contextIdCache: + return contextIdCache[host_id, platform_id, branch_id, sha1_id] + result = execQuery( "SELECT id FROM context" " WHERE hostId = %s" @@ -72,9 +106,10 @@ def getContext(host_id, platform_id, branch_id, sha1_id): " AND sha1Id = %s" "LIMIT 1", (host_id, platform_id, branch_id, sha1_id)) - if len(result): - return result[0][0] - return -1 + result = result[0][0] if len(result) else -1 + contextIdCache[host_id, platform_id, branch_id, sha1_id] = result + + return result # Returns the test case, test function, and data tag components of @@ -100,32 +135,46 @@ def getTimestampFromContext(context_id): # Finds snapshots that match a host/platform/branch combination and that # lie within the range -# [sha11, sha12] if sha12_id >= 0, or -# [sha11, +inf) if sha12_ is < 0. +# [sha11, sha12] if both sha11_id and sha12_id are >= 0, or +# ( -inf, sha12] if only sha11_id is < 0, or +# [sha11, +inf) if only sha12_id is < 0, or +# ( -inf, +inf) if both sha11_id and sha12_id are < 0 +# # Returns a chronologically order n-tuple of 2-tuples: # (sha1, first upload timestamp). 
def getSnapshots(host_id, platform_id, branch_id, sha11_id, sha12_id): - timestamp1 = execQuery( + + timestamp1 = (execQuery( "SELECT EXTRACT(EPOCH FROM timestamp)::INT FROM context" " WHERE hostId = %s" " AND platformId = %s" " AND branchId = %s" " AND sha1Id = %s", (host_id, platform_id, branch_id, sha11_id))[0][0] - if sha12_id >= 0: - timestamp2 = execQuery( - "SELECT EXTRACT(EPOCH FROM timestamp)::INT FROM context" - " WHERE hostId = %s" - " AND platformId = %s" - " AND branchId = %s" - " AND sha1Id = %s", - (host_id, platform_id, branch_id, sha12_id))[0][0] + if (sha11_id >= 0) else -1) + + timestamp2 = (execQuery( + "SELECT EXTRACT(EPOCH FROM timestamp)::INT FROM context" + " WHERE hostId = %s" + " AND platformId = %s" + " AND branchId = %s" + " AND sha1Id = %s", + (host_id, platform_id, branch_id, sha12_id))[0][0] + if (sha12_id >= 0) else -1) + + prefix = "AND EXTRACT(EPOCH FROM timestamp)::INT" + + if (timestamp1 == -1) and (timestamp2 == -1): + range_expr = "" + elif (timestamp1 >= 0) and (timestamp2 < 0): + range_expr = ("%s >= %d" % (prefix, timestamp1)) + elif (timestamp1 < 0) and (timestamp2 >= 0): + range_expr = ("%s <= %d" % (prefix, timestamp2)) + else: # Ensure chronological order: if timestamp1 > timestamp2: timestamp1, timestamp2 = timestamp2, timestamp1 - range_expr = "BETWEEN %d AND %d" % (timestamp1, timestamp2) - else: - range_expr = ">= %d" % timestamp1 + range_expr = ("%s BETWEEN %d AND %d" % (prefix, timestamp1, timestamp2)) # Each distinct SHA-1 that occurs for this host/platform/branch # combination may occur multiple times with different upload times. 
@@ -138,8 +187,7 @@ def getSnapshots(host_id, platform_id, branch_id, sha11_id, sha12_id): " FROM context" " WHERE hostId = %s" " AND platformId = %s" - " AND branchId = %s" - " AND EXTRACT(EPOCH FROM timestamp)::INT " + range_expr + + " AND branchId = %s " + range_expr + " ORDER BY timestamp ASC", (host_id, platform_id, branch_id)) @@ -169,48 +217,8 @@ def getAllSnapshots(host_id, platform_id, branch_id, reverse = False): return tuple(snapshots) -# Returns the (SHA-1 ID, timestamp) pair associated with the most recent -# rankings computed for the given host/platform/branch combination, or -# (-1, -1) if no match is found. -def getLastRankingSnapshot(host_id, platform_id, branch_id): - result = execQuery( - "SELECT matchingcontext.sha1id, EXTRACT(EPOCH FROM timestamp)::INT" - " FROM ranking," - " (SELECT id, sha1Id, timestamp" - " FROM context" - " WHERE hostId = %s" - " AND platformId = %s" - " AND branchId = %s) AS matchingContext" - " WHERE context2Id = matchingContext.id" - " ORDER BY timestamp DESC LIMIT 1", - (host_id, platform_id, branch_id)) - if len(result): - return result[0] - return -1, -1 - - -# For the given host/platform/branch combination, this function returns -# all contexts for which rankings exist. The return value is a list of -# (context ID, timestamp) pairs sorted in descending order on timestamp -# (latest timestamp first). -def getRankingContexts(host_id, platform_id, branch_id): - result = execQuery( - "SELECT DISTINCT matchingcontext.id," - " EXTRACT(EPOCH FROM timestamp)::INT AS etimestamp" - " FROM ranking," - " (SELECT id, sha1Id, timestamp" - " FROM context" - " WHERE hostId = %s" - " AND platformId = %s" - " AND branchId = %s) AS matchingContext" - " WHERE context2Id = matchingContext.id" - " ORDER BY etimestamp DESC", - (host_id, platform_id, branch_id)) - return result - - -# Retrieves the time series of valid median results for the given -# benchmark/metric combination. 
Only the part of the time series that +# Retrieves the time series + additional stats of valid median results for +# the given benchmark/metric combination. Only the part of the time series that # is within the selected snapshot interval is considered. # # Returns a 7-tuple: @@ -355,6 +363,61 @@ def getTimeSeries( ms, lsd) +# Retrieves the time series of valid median results for the given +# benchmark/metric combination within the given contexts. +# Only the part of the time series that is within the selected snapshot +# interval is considered. +# +# Returns an n-tuple of 2-tuples: +# +# ( +# <corresponding index in the 'contexts' list>, +# <median of valid observations or -1 if all obs. are invalid> +# ) +# +def getBasicTimeSeries(contexts, benchmark_id, metric_id): + + # Fetch raw values: + assert len(contexts) > 0 + raw_values = (execQuery( + "SELECT value, valid, contextId FROM result" + " WHERE contextId IN (%s" + ", %s"*(len(contexts) - 1) + ")" + + " AND benchmarkId = %s" + " AND metricId = %s" + " ORDER BY contextId", + tuple(contexts) + (benchmark_id, metric_id)) + + [(-1, -1, -1)]) # Note sentinel item + + # Compute per-sample stats: + curr_context_id = -1 + valid_and_positive_sample = [] + median_obs_map = {} + # Loop over all observations (which are grouped on sample; + # note the 1-1 correspondence between samples and contexts): + for obs, valid, context_id in raw_values: + if context_id != curr_context_id: + # A new sample has been collected, so register it and + # prepare for the next one: + median_obs = stats.medianscore(valid_and_positive_sample) if ( + len(valid_and_positive_sample) > 0) else -1 + median_obs_map[curr_context_id] = median_obs + valid_and_positive_sample = [] + curr_context_id = context_id + # Append a valid and positive observation to the current sample: + if valid and (obs > 0): + valid_and_positive_sample.append(obs) + + # Order chronologically: + ts = [] + index = 0 + for context in contexts: + if context in median_obs_map: + 
ts.append((index, median_obs_map[context])) + index = index + 1 + + return ts + + # Returns the factor by which val improves over base_val by taking the # lower_is_better property into consideration. # Example: base_val = 10 and val = 20 results in 0.5 if lower_is_better is true, @@ -369,7 +432,7 @@ def metricAdjustedRatio(base_val, val, lower_is_better): # Whether a change is significant depends on the difftol argument. # Only positive values are considered. # -# The output is an n-tuple of 7-tuples, one per change: +# The output is a list of 7-tuples, one per change: # # 1: Base index, i.e. the index in the time series that contains the base # value used to compute the change. @@ -545,6 +608,84 @@ def getChanges(time_series, lower_is_better, difftol, durtolmin, durtolmax): return tuple(changes) +# Extracts the (significant) changes of all time series in a given +# host/platform/branch combination. +# +# The output is a list of 3-tuples, one per time series having at least one +# change: +# +# 1: Benchmark ID. +# 2: Metric ID. +# 3: The changes as a list of 9-tuples. The first 7 elements correspond to +# the output from getChanges() (documented elsewhere). Element 8 +# and 9 are the SHA1 ID and timestamp corresponding to the change. 
+# +def getAllChanges( + host_id, platform_id, branch_id, difftol, durtolmin, durtolmax, + progress_func = None, progress_arg = None): + + if progress_func != None: + progress_func(0.0, progress_arg) + + # Get all snapshots matching the host/platform/branch combination: + snapshots = getAllSnapshots(host_id, platform_id, branch_id) + + # Get the list of contexts that corresponds to these snapshots in this + # host/platform/branch combination: + contexts = [] + for sha1_id, timestamp in snapshots: + contexts.append(getContext(host_id, platform_id, branch_id, sha1_id)) + + if len(contexts) == 0: + return [] + + # Get all distinct benchmark/metric combinations that match the + # host/platform/branch context and are within the selected snapshot + # interval. Each such combination corresponds to a time series. + tseries_list = execQuery( + "SELECT DISTINCT benchmarkId, metricId FROM result" + " WHERE contextId IN (%s" + ", %s"*(len(contexts) - 1) + ")", +# " WHERE contextId IN (%s" + ", %s"*(len(contexts) - 1) + ") LIMIT 10", + contexts) + + changes = [] + + # Loop over time series: + if progress_func != None: + i = 0 + for benchmark_id, metric_id in tseries_list: + + # Get the time series (without extra stats): + time_series = getBasicTimeSeries(contexts, benchmark_id, metric_id) + + # Extract the significant changes: + basic_tschanges = getChanges( + time_series, metricIdToLowerIsBetter(metric_id), difftol, + durtolmin, durtolmax) + # ... 
add the SHA-1 and timestamp to each item: + tschanges = [] + for change in basic_tschanges: + index = time_series[change[1]][0] + sha1_id = snapshots[index][0] + timestamp = snapshots[index][1] + tschanges.append(change + (sha1_id, timestamp)) + + if len(tschanges) > 0: + changes.append((benchmark_id, metric_id, tschanges)) + + if progress_func != None: + i = i + 1 + divisor = len(tseries_list) // 100 # report at most 100 times + if (divisor > 0) and ((i % divisor) == 0): + perc_done = (i / float(len(tseries_list))) * 100.0 + progress_func(perc_done, progress_arg) + + if progress_func != None: + progress_func(100.0, progress_arg) + + return changes + + # ### 2 B DOCUMENTED! def getTimeSeriesMiscStats(time_series, changes, snapshots, stats): if len(changes) > 0: @@ -654,6 +795,170 @@ def getBMTimeSeriesStatsList( return tuple(bmstats_list) +# Returns the score for the most recent change in 'changes'. +# If 'regression' is true, only regressions are considered, and otherwise +# only improvements. +# If 'premature' is true, the premature score (not considering post-change +# durability) is returned instead of the regular score. +# +# Returns -1 if no score is found. +# + + +# OBSOLETE ??? + + +def getLastChangeScore(changes, regression, premature): + pass + + +# Returns the highest score for any change in a certain time interval. +# +# The time interval is defined like this: <2 B DONE!> +# +# If 'regression' is true, only regressions are considered, and otherwise +# only improvements. +# If 'premature' is true, the premature score (not considering post-change +# durability) is returned instead of the regular score. +# +# Returns -1 if no score is found. +# + + +# +# OBSOLETE ??? + + +def getHighestChangeScore(changes, regression, premature, days): + pass + + +# Computes change scores for each time series (benchmark/metric combination) +# of the given host/platform/branch combination. +# +# Change types: +# - most recent change (i.e. 
significant regression or improvement) +# - strongest change last n days, for n in {7, 30, 180, and -1 (infinite)} +# ... and all of these with and without inclusion of premature changes. +# +# ADD MORE DOCS HERE ... 2 B DONE! +# + + +# OBSOLETE ??? + + +# def getChangeScores( +# host_id, platform_id, branch_id, snapshots, test_case_filter, +# difftol, durtolmin, durtolmax, progress_func = None, progress_arg = None): + +# if progress_func != None: +# progress_func(0.0, progress_arg) + +# contexts = [] +# for sha1_id, timestamp in snapshots: +# contexts.append(getContext(host_id, platform_id, branch_id, sha1_id)) + +# # Get all distinct benchmark/metric combinations that match the +# # host/platform/branch context and are within the selected snapshot +# # interval. Each such combination corresponds to a time series. +# assert len(contexts) > 0 +# tseries = execQuery( +# "SELECT DISTINCT benchmarkId, metricId FROM result" +# " WHERE contextId IN (%s" + ", %s"*(len(contexts) - 1) + ")", +# contexts) + +# scores = [] + +# # Loop over time series: +# if progress_func != None: +# i = 0 +# #for benchmark_id, metric_id in tseries[800:810]: +# for benchmark_id, metric_id in tseries: + +# benchmark = idToText("benchmark", benchmark_id) +# #if benchmark != "tst_qmetaobject:indexOfMethod(_q_columnsAboutToBeRemoved(QModelIndex,int,int))": +# # continue + +# test_case, test_function, data_tag = ( +# benchmarkToComponents(benchmark)) + +# # Skip this time series if it doesn't match the test case filter: +# if ((test_case_filter != None) +# and (not test_case in test_case_filter)): +# continue + +# # Get the time series (without extra stats): +# time_series = getBasicTimeSeries( +# host_id, platform_id, branch_id, snapshots, benchmark_id, metric_id) + +# # Extract the significant changes: +# changes = getChanges( +# time_series, metricIdToLowerIsBetter(metric_id), difftol, +# durtolmin, durtolmax) + +# tsscores = {} + +# tsscores["benchmark_id"] = benchmark_id +# 
tsscores["metric_id"] = metric_id + +# tsscores["regr_last"] = getLastChangeScore(changes, True, False) +# tsscores["regr_last_pmt"] = getLastChangeScore(changes, True, True) +# tsscores["impr_last"] = getLastChangeScore(changes, False, False) +# tsscores["impr_last_pmt"] = getLastChangeScore(changes, False, True) + +# tsscores["regr_7"] = getHighestChangeScore( +# changes, True, False, 7) +# tsscores["regr_7_pmt"] = getHighestChangeScore( +# changes, True, True, 7) +# tsscores["impr_7"] = getHighestChangeScore( +# changes, False, False, 7) +# tsscores["impr_7_pmt"] = getHighestChangeScore( +# changes, False, True, 7) + +# tsscores["regr_30"] = getHighestChangeScore( +# changes, True, False, 30) +# tsscores["regr_30_pmt"] = getHighestChangeScore( +# changes, True, True, 30) +# tsscores["impr_30"] = getHighestChangeScore( +# changes, False, False, 30) +# tsscores["impr_30_pmt"] = getHighestChangeScore( +# changes, False, True, 30) + +# tsscores["regr_180"] = getHighestChangeScore( +# changes, True, False, 180) +# tsscores["regr_180_pmt"] = getHighestChangeScore( +# changes, True, True, 180) +# tsscores["impr_180"] = getHighestChangeScore( +# changes, False, False, 180) +# tsscores["impr_180_pmt"] = getHighestChangeScore( +# changes, False, True, 180) + +# tsscores["regr_all"] = getHighestChangeScore( +# changes, True, False, -1) +# tsscores["regr_all_pmt"] = getHighestChangeScore( +# changes, True, True, -1) +# tsscores["impr_all"] = getHighestChangeScore( +# changes, False, False, -1) +# tsscores["impr_all_pmt"] = getHighestChangeScore( +# changes, False, True, -1) + +# scores.append(tsscores) + +# if progress_func != None: +# i = i + 1 +# divisor = len(tseries) // 100 # report at most 100 times +# if (divisor > 0) and ((i % divisor) == 0): +# perc_done = (i / float(len(tseries))) * 100.0 +# progress_func(perc_done, progress_arg) + +# if progress_func != None: +# progress_func(100.0, progress_arg) + +# return scores + + + # Returns True iff s is a valid SHA-1 
string. def isValidSHA1(s): def containsOnlyHexDigits(s): |