diff options
Diffstat (limited to 'scripts/misc.py')
-rw-r--r-- | scripts/misc.py | 429 |
1 files changed, 367 insertions, 62 deletions
diff --git a/scripts/misc.py b/scripts/misc.py index b3a2274..8bd8516 100644 --- a/scripts/misc.py +++ b/scripts/misc.py @@ -45,6 +45,33 @@ def textToId(table, text): # ### 2 B DOCUMENTED! +def findOrInsertId(table, value, *args): + + query_result = execQuery( + "SELECT id FROM " + table + " WHERE value = %s", (value,)) + if len(query_result) == 1: + # Found, so return ID: + return query_result[0][0] + + # Not found, so insert: + query = "INSERT INTO " + table + " (value" + for i in range(0, len(args), 2): + query += ", " + args[i] + query += ") VALUES (%s" + values = [value] + for i in range(0, len(args), 2): + query += ", %s" + values.append(args[i + 1]) + + # ... and retrieve ID: + query += ") RETURNING id" + query_result = execQuery(query, values) + + assert len(query_result) == 1 + return query_result[0][0] + + +# ### 2 B DOCUMENTED! # Maybe also rename to lowerIsBetter() ? (but note that a global function with # that name already exists in uploadresults.py) def metricIdToLowerIsBetter(metric_id): @@ -64,6 +91,13 @@ def metricIdToLowerIsBetter(metric_id): # Returns the non-negative ID of the given context, or -1 if not found. 
def getContext(host_id, platform_id, branch_id, sha1_id): + global contextIdCache + if not 'contextIdCache' in globals(): + contextIdCache = {} + + if (host_id, platform_id, branch_id, sha1_id) in contextIdCache: + return contextIdCache[host_id, platform_id, branch_id, sha1_id] + result = execQuery( "SELECT id FROM context" " WHERE hostId = %s" @@ -72,9 +106,10 @@ def getContext(host_id, platform_id, branch_id, sha1_id): " AND sha1Id = %s" "LIMIT 1", (host_id, platform_id, branch_id, sha1_id)) - if len(result): - return result[0][0] - return -1 + result = result[0][0] if len(result) else -1 + contextIdCache[host_id, platform_id, branch_id, sha1_id] = result + + return result # Returns the test case, test function, and data tag components of @@ -100,32 +135,46 @@ def getTimestampFromContext(context_id): # Finds snapshots that match a host/platform/branch combination and that # lie within the range -# [sha11, sha12] if sha12_id >= 0, or -# [sha11, +inf) if sha12_ is < 0. +# [sha11, sha12] if both sha11_id and sha12_id are >= 0, or +# ( -inf, sha12] if only sha11_id is < 0, or +# [sha11, +inf) if only sha12_id is < 0, or +# ( -inf, +inf) if both sha11_id and sha12_id are < 0 +# # Returns a chronologically order n-tuple of 2-tuples: # (sha1, first upload timestamp). 
def getSnapshots(host_id, platform_id, branch_id, sha11_id, sha12_id): - timestamp1 = execQuery( + + timestamp1 = (execQuery( "SELECT EXTRACT(EPOCH FROM timestamp)::INT FROM context" " WHERE hostId = %s" " AND platformId = %s" " AND branchId = %s" " AND sha1Id = %s", (host_id, platform_id, branch_id, sha11_id))[0][0] - if sha12_id >= 0: - timestamp2 = execQuery( - "SELECT EXTRACT(EPOCH FROM timestamp)::INT FROM context" - " WHERE hostId = %s" - " AND platformId = %s" - " AND branchId = %s" - " AND sha1Id = %s", - (host_id, platform_id, branch_id, sha12_id))[0][0] + if (sha11_id >= 0) else -1) + + timestamp2 = (execQuery( + "SELECT EXTRACT(EPOCH FROM timestamp)::INT FROM context" + " WHERE hostId = %s" + " AND platformId = %s" + " AND branchId = %s" + " AND sha1Id = %s", + (host_id, platform_id, branch_id, sha12_id))[0][0] + if (sha12_id >= 0) else -1) + + prefix = "AND EXTRACT(EPOCH FROM timestamp)::INT" + + if (timestamp1 == -1) and (timestamp2 == -1): + range_expr = "" + elif (timestamp1 >= 0) and (timestamp2 < 0): + range_expr = ("%s >= %d" % (prefix, timestamp1)) + elif (timestamp1 < 0) and (timestamp2 >= 0): + range_expr = ("%s <= %d" % (prefix, timestamp2)) + else: # Ensure chronological order: if timestamp1 > timestamp2: timestamp1, timestamp2 = timestamp2, timestamp1 - range_expr = "BETWEEN %d AND %d" % (timestamp1, timestamp2) - else: - range_expr = ">= %d" % timestamp1 + range_expr = ("%s BETWEEN %d AND %d" % (prefix, timestamp1, timestamp2)) # Each distinct SHA-1 that occurs for this host/platform/branch # combination may occur multiple times with different upload times. 
@@ -138,8 +187,7 @@ def getSnapshots(host_id, platform_id, branch_id, sha11_id, sha12_id): " FROM context" " WHERE hostId = %s" " AND platformId = %s" - " AND branchId = %s" - " AND EXTRACT(EPOCH FROM timestamp)::INT " + range_expr + + " AND branchId = %s " + range_expr + " ORDER BY timestamp ASC", (host_id, platform_id, branch_id)) @@ -169,48 +217,8 @@ def getAllSnapshots(host_id, platform_id, branch_id, reverse = False): return tuple(snapshots) -# Returns the (SHA-1 ID, timestamp) pair associated with the most recent -# rankings computed for the given host/platform/branch combination, or -# (-1, -1) if no match is found. -def getLastRankingSnapshot(host_id, platform_id, branch_id): - result = execQuery( - "SELECT matchingcontext.sha1id, EXTRACT(EPOCH FROM timestamp)::INT" - " FROM ranking," - " (SELECT id, sha1Id, timestamp" - " FROM context" - " WHERE hostId = %s" - " AND platformId = %s" - " AND branchId = %s) AS matchingContext" - " WHERE context2Id = matchingContext.id" - " ORDER BY timestamp DESC LIMIT 1", - (host_id, platform_id, branch_id)) - if len(result): - return result[0] - return -1, -1 - - -# For the given host/platform/branch combination, this function returns -# all contexts for which rankings exist. The return value is a list of -# (context ID, timestamp) pairs sorted in descending order on timestamp -# (latest timestamp first). -def getRankingContexts(host_id, platform_id, branch_id): - result = execQuery( - "SELECT DISTINCT matchingcontext.id," - " EXTRACT(EPOCH FROM timestamp)::INT AS etimestamp" - " FROM ranking," - " (SELECT id, sha1Id, timestamp" - " FROM context" - " WHERE hostId = %s" - " AND platformId = %s" - " AND branchId = %s) AS matchingContext" - " WHERE context2Id = matchingContext.id" - " ORDER BY etimestamp DESC", - (host_id, platform_id, branch_id)) - return result - - -# Retrieves the time series of valid median results for the given -# benchmark/metric combination. 
Only the part of the time series that +# Retrieves the time series + additional stats of valid median results for +# the given benchmark/metric combination. Only the part of the time series that # is within the selected snapshot interval is considered. # # Returns a 7-tuple: @@ -355,6 +363,61 @@ def getTimeSeries( ms, lsd) +# Retrieves the time series of valid median results for the given +# benchmark/metric combination within the given contexts. +# Only the part of the time series that is within the selected snapshot +# interval is considered. +# +# Returns an n-tuple of 2-tuples: +# +# ( +# <corresponding index in the 'contexts' list>, +# <median of valid observations or -1 if all obs. are invalid> +# ) +# +def getBasicTimeSeries(contexts, benchmark_id, metric_id): + + # Fetch raw values: + assert len(contexts) > 0 + raw_values = (execQuery( + "SELECT value, valid, contextId FROM result" + " WHERE contextId IN (%s" + ", %s"*(len(contexts) - 1) + ")" + + " AND benchmarkId = %s" + " AND metricId = %s" + " ORDER BY contextId", + tuple(contexts) + (benchmark_id, metric_id)) + + [(-1, -1, -1)]) # Note sentinel item + + # Compute per-sample stats: + curr_context_id = -1 + valid_and_positive_sample = [] + median_obs_map = {} + # Loop over all observations (which are grouped on sample; + # note the 1-1 correspondence between samples and contexts): + for obs, valid, context_id in raw_values: + if context_id != curr_context_id: + # A new sample has been collected, so register it and + # prepare for the next one: + median_obs = stats.medianscore(valid_and_positive_sample) if ( + len(valid_and_positive_sample) > 0) else -1 + median_obs_map[curr_context_id] = median_obs + valid_and_positive_sample = [] + curr_context_id = context_id + # Append a valid and positive observation to the current sample: + if valid and (obs > 0): + valid_and_positive_sample.append(obs) + + # Order chronologically: + ts = [] + index = 0 + for context in contexts: + if context in median_obs_map: + 
ts.append((index, median_obs_map[context])) + index = index + 1 + + return ts + + # Returns the factor by which val improves over base_val by taking the # lower_is_better property into consideration. # Example: base_val = 10 and val = 20 results in 0.5 if lower_is_better is true, @@ -369,7 +432,7 @@ def metricAdjustedRatio(base_val, val, lower_is_better): # Whether a change is significant depends on the difftol argument. # Only positive values are considered. # -# The output is an n-tuple of 7-tuples, one per change: +# The output is a list of 7-tuples, one per change: # # 1: Base index, i.e. the index in the time series that contains the base # value used to compute the change. @@ -545,6 +608,84 @@ def getChanges(time_series, lower_is_better, difftol, durtolmin, durtolmax): return tuple(changes) +# Extracts the (significant) changes of all time series in a given +# host/platform/branch combination. +# +# The output is a list of 3-tuples, one per time series having at least one +# change: +# +# 1: Benchmark ID. +# 2: Metric ID. +# 3: The changes as a list of 9-tuples. The first 7 elements correspond to +# the output from getChanges() (documented elsewhere). Element 8 +# and 9 are the SHA1 ID and timestamp corresponding to the change. 
+# +def getAllChanges( + host_id, platform_id, branch_id, difftol, durtolmin, durtolmax, + progress_func = None, progress_arg = None): + + if progress_func != None: + progress_func(0.0, progress_arg) + + # Get all snapshots matching the host/platform/branch combination: + snapshots = getAllSnapshots(host_id, platform_id, branch_id) + + # Get the list of contexts that corresponds to these snapshots in this + # host/platform/branch combination: + contexts = [] + for sha1_id, timestamp in snapshots: + contexts.append(getContext(host_id, platform_id, branch_id, sha1_id)) + + if len(contexts) == 0: + return [] + + # Get all distinct benchmark/metric combinations that match the + # host/platform/branch context and are within the selected snapshot + # interval. Each such combination corresponds to a time series. + tseries_list = execQuery( + "SELECT DISTINCT benchmarkId, metricId FROM result" + " WHERE contextId IN (%s" + ", %s"*(len(contexts) - 1) + ")", +# " WHERE contextId IN (%s" + ", %s"*(len(contexts) - 1) + ") LIMIT 10", + contexts) + + changes = [] + + # Loop over time series: + if progress_func != None: + i = 0 + for benchmark_id, metric_id in tseries_list: + + # Get the time series (without extra stats): + time_series = getBasicTimeSeries(contexts, benchmark_id, metric_id) + + # Extract the significant changes: + basic_tschanges = getChanges( + time_series, metricIdToLowerIsBetter(metric_id), difftol, + durtolmin, durtolmax) + # ... 
add the SHA-1 and timestamp to each item: + tschanges = [] + for change in basic_tschanges: + index = time_series[change[1]][0] + sha1_id = snapshots[index][0] + timestamp = snapshots[index][1] + tschanges.append(change + (sha1_id, timestamp)) + + if len(tschanges) > 0: + changes.append((benchmark_id, metric_id, tschanges)) + + if progress_func != None: + i = i + 1 + divisor = len(tseries_list) // 100 # report at most 100 times + if (divisor > 0) and ((i % divisor) == 0): + perc_done = (i / float(len(tseries_list))) * 100.0 + progress_func(perc_done, progress_arg) + + if progress_func != None: + progress_func(100.0, progress_arg) + + return changes + + # ### 2 B DOCUMENTED! def getTimeSeriesMiscStats(time_series, changes, snapshots, stats): if len(changes) > 0: @@ -654,6 +795,170 @@ def getBMTimeSeriesStatsList( return tuple(bmstats_list) +# Returns the score for the most recent change in 'changes'. +# If 'regression' is true, only regressions are considered, and otherwise +# only improvements. +# If 'premature' is true, the premature score (not considering post-change +# durability) is returned instead of the regular score. +# +# Returns -1 if no score is found. +# + + +# OBSOLETE ??? + + +def getLastChangeScore(changes, regression, premature): + pass + + +# Returns the highest score for any change in a certain time interval. +# +# The time interval is defined like this: <2 B DONE!> +# +# If 'regression' is true, only regressions are considered, and otherwise +# only improvements. +# If 'premature' is true, the premature score (not considering post-change +# durability) is returned instead of the regular score. +# +# Returns -1 if no score is found. +# + + +# +# OBSOLETE ??? + + +def getHighestChangeScore(changes, regression, premature, days): + pass + + +# Computes change scores for each time series (benchmark/metric combination) +# of the given host/platform/branch combination. +# +# Change types: +# - most recent change (i.e. 
significant regression or improvement) +# - strongest change last n days, for n in {7, 30, 180, and -1 (infinite)} +# ... and all of these with and without inclusion of premature changes. +# +# ADD MORE DOCS HERE ... 2 B DONE! +# + + +# OBSOLETE ??? + + +# def getChangeScores( +# host_id, platform_id, branch_id, snapshots, test_case_filter, +# difftol, durtolmin, durtolmax, progress_func = None, progress_arg = None): + +# if progress_func != None: +# progress_func(0.0, progress_arg) + +# contexts = [] +# for sha1_id, timestamp in snapshots: +# contexts.append(getContext(host_id, platform_id, branch_id, sha1_id)) + +# # Get all distinct benchmark/metric combinations that match the +# # host/platform/branch context and are within the selected snapshot +# # interval. Each such combination corresponds to a time series. +# assert len(contexts) > 0 +# tseries = execQuery( +# "SELECT DISTINCT benchmarkId, metricId FROM result" +# " WHERE contextId IN (%s" + ", %s"*(len(contexts) - 1) + ")", +# contexts) + +# scores = [] + +# # Loop over time series: +# if progress_func != None: +# i = 0 +# #for benchmark_id, metric_id in tseries[800:810]: +# for benchmark_id, metric_id in tseries: + +# benchmark = idToText("benchmark", benchmark_id) +# #if benchmark != "tst_qmetaobject:indexOfMethod(_q_columnsAboutToBeRemoved(QModelIndex,int,int))": +# # continue + +# test_case, test_function, data_tag = ( +# benchmarkToComponents(benchmark)) + +# # Skip this time series if it doesn't match the test case filter: +# if ((test_case_filter != None) +# and (not test_case in test_case_filter)): +# continue + +# # Get the time series (without extra stats): +# time_series = getBasicTimeSeries( +# host_id, platform_id, branch_id, snapshots, benchmark_id, metric_id) + +# # Extract the significant changes: +# changes = getChanges( +# time_series, metricIdToLowerIsBetter(metric_id), difftol, +# durtolmin, durtolmax) + +# tsscores = {} + +# tsscores["benchmark_id"] = benchmark_id +# 
tsscores["metric_id"] = metric_id + +# tsscores["regr_last"] = getLastChangeScore(changes, True, False) +# tsscores["regr_last_pmt"] = getLastChangeScore(changes, True, True) +# tsscores["impr_last"] = getLastChangeScore(changes, False, False) +# tsscores["impr_last_pmt"] = getLastChangeScore(changes, False, True) + +# tsscores["regr_7"] = getHighestChangeScore( +# changes, True, False, 7) +# tsscores["regr_7_pmt"] = getHighestChangeScore( +# changes, True, True, 7) +# tsscores["impr_7"] = getHighestChangeScore( +# changes, False, False, 7) +# tsscores["impr_7_pmt"] = getHighestChangeScore( +# changes, False, True, 7) + +# tsscores["regr_30"] = getHighestChangeScore( +# changes, True, False, 30) +# tsscores["regr_30_pmt"] = getHighestChangeScore( +# changes, True, True, 30) +# tsscores["impr_30"] = getHighestChangeScore( +# changes, False, False, 30) +# tsscores["impr_30_pmt"] = getHighestChangeScore( +# changes, False, True, 30) + +# tsscores["regr_180"] = getHighestChangeScore( +# changes, True, False, 180) +# tsscores["regr_180_pmt"] = getHighestChangeScore( +# changes, True, True, 180) +# tsscores["impr_180"] = getHighestChangeScore( +# changes, False, False, 180) +# tsscores["impr_180_pmt"] = getHighestChangeScore( +# changes, False, True, 180) + +# tsscores["regr_all"] = getHighestChangeScore( +# changes, True, False, -1) +# tsscores["regr_all_pmt"] = getHighestChangeScore( +# changes, True, True, -1) +# tsscores["impr_all"] = getHighestChangeScore( +# changes, False, False, -1) +# tsscores["impr_all_pmt"] = getHighestChangeScore( +# changes, False, True, -1) + +# scores.append(tsscores) + +# if progress_func != None: +# i = i + 1 +# divisor = len(tseries) // 100 # report at most 100 times +# if (divisor > 0) and ((i % divisor) == 0): +# perc_done = (i / float(len(tseries))) * 100.0 +# progress_func(perc_done, progress_arg) + +# if progress_func != None: +# progress_func(100.0, progress_arg) + +# return scores + + + # Returns True iff s is a valid SHA-1 
string. def isValidSHA1(s): def containsOnlyHexDigits(s): |