Diffstat (limited to 'sources/pyside6/tests/registry/scrape_testresults.py')
-rw-r--r-- | sources/pyside6/tests/registry/scrape_testresults.py | 338 |
1 file changed, 338 insertions, 0 deletions
diff --git a/sources/pyside6/tests/registry/scrape_testresults.py b/sources/pyside6/tests/registry/scrape_testresults.py
new file mode 100644
index 000000000..b7b6b58aa
--- /dev/null
+++ b/sources/pyside6/tests/registry/scrape_testresults.py
@@ -0,0 +1,338 @@
# Copyright (C) 2022 The Qt Company Ltd.
# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

"""
scrape_testresults.py

Read the COIN testresults website and find the pages that contain an
embedded exists_{platform}_{version}_ci.py file.

The found pages are then sorted by date/time and put into the registry.

This program uses the multiprocessing package to fetch the web pages in
parallel. A complete run typically takes less than half an hour.

Once the cache has been created, subsequent runs are substantially faster.
"""

import sys
if sys.version_info[:2] < (3, 6):
    print("This program is written for Python 3.6 or higher.")
    sys.exit(1)

DEMO_URL = ("https://testresults.qt.io/coin/api/results/pyside/pyside-setup/"
            # The above URL part is fixed.
            "30c1193ec56a86b8d0920c325185b9870f96941e/"
            "MacOSMacOS_10_12x86_64MacOSMacOS_10_12x86_64Clangqtci-macos-"
            "10.12-x86_64-8-425364DebugAndRelease_Release/"
            "d80c5d4547ea2b3d74188bd458955aae39cb32b4/"
            "test_1535865484/"
            "log.txt.gz")

from bs4 import BeautifulSoup
from datetime import datetime
from multiprocessing import Pool
from textwrap import dedent
import requests
import os
import time
import re
import json
import argparse

my_name = __file__ if __file__.endswith(".py") else __file__[:-1]
test_path = os.path.join(os.path.dirname(__file__), "testresults", "embedded")
if not os.path.exists(test_path):
    os.makedirs(test_path)
cache_path = os.path.dirname(test_path)
target_path = os.path.dirname(__file__)
start_time = time.time()


def get_name(url):
    """
    Return the last piece of a URL, including a trailing slash.

    In effect, this undoes the accumulation of URL pieces.
    """
    name = url.rstrip("/").rsplit("/", 1)[-1]
    if url.endswith("/"):
        name += "/"
    return name


def rel_url(url):
    """
    Throw the top URL part away.
    """
    return url[len(top_url):]


stop_all = False


def find_all_links(text, url, ignore=()):
    """
    Find all links in a page.

    Only simple links are allowed. That means safe characters and
    at most one "/" at the end for directories.
    """
    global stop_all
    soup = BeautifulSoup(text, "html.parser")
    lis = soup.find_all("a")
    names = list(row["href"] for row in lis)
    names = list(name for name in names if name not in ignore)
    for name in names:
        if not re.match(r"^[A-Za-z0-9_\-.]+/?$", name):
            print("Unexpected character in link:", name)
            # It is not clear how to terminate the pool quickly and cleanly.
            # We crash badly in handle_suburl_tup, ugly but it works.
            stop_all = True
            return []
    urls = list(url + name for name in names)
    return urls


def read_url(url):
    # We intentionally let things fail, because we re-run things on failure.
    try:
        response = requests.get(url)
    except requests.exceptions.ContentDecodingError as e:
        # This is a permanent error which is in the data. We ignore that.
        print(os.getpid(), "Decoding Error:", e)
        print(os.getpid(), "Cannot fix this, ignored.")
        return None
    except requests.exceptions.RequestException as e:
        print("Read error:", e)
        raise
    else:
        return response


def get_timestamp(text):
    # Example line in a COIN log: "agent:2018/06/29 15:02:15"
    global stop_all
    prefix = "\nagent:"
    try:
        startpos = text.index(prefix)
    except ValueError:
        print("This is not the usual format of COIN log files.")
        stop_all = True
        raise
    startpos += len(prefix)
    text = text[startpos : startpos + 80]
    ts = text[:19]
    ts = re.sub(r'[^0-9]', '_', ts)
    # check that it is a valid time stamp
    try:
        datetime.strptime(ts, "%Y_%m_%d_%H_%M_%S")
    except ValueError as e:
        print("Unexpected time stamp", e)
        stop_all = True
        raise
    return ts


def write_data(name, text):
    try:
        ts = get_timestamp(text)
    except ValueError:
        print()
        print(name)
        print()
        print(text)
        raise
    lines = text.split("\n")
    for idx, line in enumerate(lines):
        if "BEGIN_FILE" in line:
            start = idx + 1
            offset = line.index("BEGIN_FILE")
        if "END_FILE" in line:
            stop = idx
    lines = lines[start : stop]
    if offset:
        lines = list(line[offset:] for line in lines)
    # fix the lines - the original has no empty line after "# eof"
    while lines[-1] == "":
        lines.pop()
    text = "\n".join(lines) + "\n"
    modname = re.search(r"'(..*?)'", text).group(1)
    fn = os.path.join(test_path, f"{ts}-{name}-{modname}.py")
    if os.path.exists(fn):
        # do not change the file, we want to skip it
        return
    with open(fn, "w") as f:
        f.write(text)


def eval_data(force=False):
    """
    Read all found files, sort them and keep the latest version.
    """
    files = []
    for entry in os.scandir(test_path):
        if "exists_" in entry.name and entry.name.endswith(".py"):
            if force or os.path.getmtime(entry.path) >= start_time:
                # this file is newly created
                files.append(entry.path)
    files.sort()
    # read the files and update in chronological order
    results = {}
    for fn in files:
        with open(fn) as f:
            text = f.read()
        modname = re.search("'(..*?)'", text).group(1)
        results[modname] = text
    for fn in results:
        name = os.path.join(target_path, fn + ".py")
        with open(name, "w") as f:
            f.write(results[fn])
        print("+++ generated:", name)
    return len(results)


def handle_suburl(idx, n, url, level):
    if level == 1:
        print(os.getpid(), "Reading", idx + 1, "of", n, rel_url(url))
    response = read_url(url)
    if response is None:
        # A permanent decoding error was already reported; skip this page.
        return
    urls = find_all_links(response.text, url)
    for sub_url in urls:
        name = get_name(sub_url)
        if name.endswith("/"):
            if name.startswith("build_"):
                continue
            if name == "tasks/":
                continue
            handle_suburl(0, 0, sub_url, level + 1)
        else:
            if name.startswith("log.txt"):
                test_name = sub_url.split("/")[-2]
                print(os.getpid(), test_name)
                response = read_url(sub_url)
                txt = response.text if response else ''
                if "BEGIN_FILE" in txt and "'BEGIN_FILE'" not in txt:
                    # find the text, but not a traceback containing that text
                    print(os.getpid(), test_name, "FOUND!")
                    write_data(test_name, response.text)
                else:
                    print(os.getpid(), test_name)


def handle_suburl_tup(idx_n_url_level):
    if stop_all:
        return  # bad solution, but it stops fast
    idx, n, url, level = idx_n_url_level
    try:
        ret = handle_suburl(idx, n, url, level)
        return url, None
    except requests.exceptions.RequestException as e:
        return url, e


def handle_batch(urls, level):
    n = len(urls)
    args = ((idx, n, url, level) for (idx, url) in enumerate(urls))
    with Pool(10) as p:
        records = list(p.imap_unordered(handle_suburl_tup, args))
    # re-read the failed ones
    runs = [n]
    for idx in range(10):
        urls = list(x[0] for x in records if x[-1])
        if not urls:
            break
        print("Pausing 5 seconds")
        time.sleep(5)
        n = len(urls)
        runs.append(n)
        args = ((idx, n, url, level) for (idx, url) in enumerate(urls))
        with Pool(10) as p:
            records = list(p.imap_unordered(handle_suburl_tup, args))
    # Return success when no URLs remain.
    print("Runs:", ", ".join(map(str, runs)))
    return not urls


def handle_topurl(url):
    """
    Find all links to directories.

    We maintain a cache of these links. The cache is only updated
    when all URLs have been successfully processed.
    """
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException as e:
        print("Skipped", e)
        return
    global top_url
    top_url = url
    urls = find_all_links(response.text, url, ignore=("tasks/",))
    work_urls = set(urls)
    cache_file = os.path.join(cache_path, "known_urls.json")
    if os.path.exists(cache_file):
        with open(cache_file, 'r') as fp:
            known_urls = json.load(fp)
        work_urls -= set(known_urls)
    level = 1
    for sub_url in work_urls:
        name = get_name(sub_url)
        if name.endswith("/"):
            if name.startswith("build_"):
                continue
            work_urls.add(sub_url)
    success = handle_batch(work_urls, level)
    if success:
        with open(cache_file, 'w') as fp:
            json.dump(urls, fp, sort_keys=True, indent=4)
    return success


def get_test_results(starturl):
    ok = handle_topurl(starturl)
    stop_time = time.time()
    runtime = stop_time - start_time
    hours, remainder = divmod(runtime, 3600)
    minutes, seconds = divmod(remainder, 60)
    runtime_formatted = f'{int(hours)}:{int(minutes):02d}:{seconds:06.3f}'
    print(f"Run time: {runtime_formatted}")
    if ok:
        found = eval_data()
        print(f"Successful scan, {found} new files.")
        if found:
            print("Please check if a git push is necessary.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=dedent(f"""\
            {os.path.basename(my_name)} [-h] scan

                Scan the COIN testresults website for embedded
                exists_{{platf}}_{{version}}_ci.py files.

                Warning: On the first call, this script may take almost
                30 minutes to run. Subsequent calls are *much* faster
                due to caching.

            {os.path.basename(my_name)} [-h] eval

                Force evaluation even if a scan has not completed yet.

            For more information, see the file
                sources/shiboken6/libshiboken/signature_doc.rst
            """))
    subparsers = parser.add_subparsers(dest="command", metavar="", title="required argument")
    # create the parsers for the "scan" and "eval" commands
    parser_scan = subparsers.add_parser("scan", help="run the scan")
    parser_eval = subparsers.add_parser("eval", help="force evaluation")
    args = parser.parse_args()
    if not args.command:
        parser.print_usage()
        sys.exit(1)
    if args.command == "scan":
        # Using this from the intranet requires an internal URL.
        get_test_results("https://testresults.qt.io/coin/api/results/pyside/pyside-setup/")
    elif args.command == "eval":
        eval_data(force=True)
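
For completeness, a sketch of how the two subcommands wired up in the __main__ block map to the functions in this file. It assumes the script is importable as a module named scrape_testresults, which is an assumption for illustration only; normally the file is simply run as "scrape_testresults.py scan" or "scrape_testresults.py eval".

    import scrape_testresults as sr   # assumption: the file is importable as a module

    # Equivalent of "scrape_testresults.py scan": crawl the COIN results tree,
    # update the known_urls.json cache on success, and evaluate newly found files.
    sr.get_test_results("https://testresults.qt.io/coin/api/results/pyside/pyside-setup/")

    # Equivalent of "scrape_testresults.py eval": re-evaluate everything already
    # downloaded into testresults/embedded/, even if the last scan was incomplete.
    sr.eval_data(force=True)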
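
As a side note on the file naming used by write_data(): each cached file is prefixed with a timestamp that get_timestamp() extracts from the first "agent:" line of a COIN log. A minimal standalone illustration of that normalisation, using the example line quoted in the code; the surrounding log text is made up for the example.

    import re
    from datetime import datetime

    # Hypothetical log fragment; only the "agent:" line format comes from the code above.
    log = "\nagent:2018/06/29 15:02:15 some build output follows"
    start = log.index("\nagent:") + len("\nagent:")
    ts = re.sub(r'[^0-9]', '_', log[start:start + 19])
    datetime.strptime(ts, "%Y_%m_%d_%H_%M_%S")   # raises ValueError on a malformed stamp
    print(ts)   # 2018_06_29_15_02_15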