Diffstat (limited to 'sources/pyside6/tests/registry/scrape_testresults.py')
-rw-r--r--  sources/pyside6/tests/registry/scrape_testresults.py  338
1 file changed, 338 insertions, 0 deletions
diff --git a/sources/pyside6/tests/registry/scrape_testresults.py b/sources/pyside6/tests/registry/scrape_testresults.py
new file mode 100644
index 000000000..b7b6b58aa
--- /dev/null
+++ b/sources/pyside6/tests/registry/scrape_testresults.py
@@ -0,0 +1,338 @@
+# Copyright (C) 2022 The Qt Company Ltd.
+# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
+
+"""
+scrape_testresults.py
+
+Read the testresults website of COIN and find the pages that contain an
+embedded exists_{platform}_{version}_ci.py file.
+
+The found pages will then be sorted by date/time and put into the registry.
+
+This program utilizes the multiprocessing package for speedy access to
+the web pages. A full run typically completes in less than half an hour.
+
+After the cache has been created, the runtime is substantially smaller.
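+
+Typical invocation (see the argument parser at the end of this file):
+
+    python scrape_testresults.py scan   # scrape COIN and update the registry
+    python scrape_testresults.py eval   # force evaluation of cached results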
+
+"""
+
+import sys
+if sys.version_info[:2] < (3, 6):
+ print("This program is written for Python 3.6 or higher.")
+ sys.exit(1)
+
+DEMO_URL = ("https://testresults.qt.io/coin/api/results/pyside/pyside-setup/"
+ # The above URL part is fixed.
+ "30c1193ec56a86b8d0920c325185b9870f96941e/"
+ "MacOSMacOS_10_12x86_64MacOSMacOS_10_12x86_64Clangqtci-macos-"
+ "10.12-x86_64-8-425364DebugAndRelease_Release/"
+ "d80c5d4547ea2b3d74188bd458955aae39cb32b4/"
+ "test_1535865484/"
+ "log.txt.gz")
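+
+# The DEMO_URL above illustrates the path structure of a single test log:
+# two hash-like components, a platform/configuration string, a
+# "test_<number>/" directory and finally "log.txt.gz".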
+
+from bs4 import BeautifulSoup
+from datetime import datetime
+from multiprocessing import Pool
+from textwrap import dedent
+import requests
+import os
+import time
+import re
+import json
+import argparse
+
+my_name = __file__ if __file__.endswith(".py") else __file__[:-1]
+test_path = os.path.join(os.path.dirname(__file__), "testresults", "embedded")
+if not os.path.exists(test_path):
+ os.makedirs(test_path)
+cache_path = os.path.dirname(test_path)
+target_path = os.path.dirname(__file__)
+start_time = time.time()
+
+
+def get_name(url):
+ """
+    Return the last piece of a URL, including a trailing slash.
+
+ In effect, this undoes the accumulation of URL pieces.
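+
+    Examples (illustrative URLs)::
+
+        get_name("https://host/path/dir/")  ->  "dir/"
+        get_name("https://host/path/file")  ->  "file"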
+ """
+ name = url.rstrip("/").rsplit("/", 1)[-1]
+ if url.endswith("/"):
+ name += "/"
+ return name
+
+
+def rel_url(url):
+ """
+    Strip the top URL prefix and return the relative part.
+ """
+ return url[len(top_url):]
+
+
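+# stop_all is a per-process flag: once an unexpected link or time stamp is
+# encountered, the workers stop doing useful work so the pool drains quickly
+# (see handle_suburl_tup below).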
+stop_all = False
+
+
+def find_all_links(text, url, ignore=()):
+ """
+ Find all links in a page.
+
+ Only simple links are allowed. That means safe characters and
+ at most one "/" at the end for directories.
+ """
+ global stop_all
+ soup = BeautifulSoup(text, "html.parser")
+ lis = soup.find_all("a")
+ names = list(row["href"] for row in lis)
+ names = list(name for name in names if name not in ignore)
+ for name in names:
+ if not re.match(r"^[A-Za-z0-9_\-.]+/?$", name):
+ print("Unexpected character in link:", name)
+            # It is not clear how to terminate the pool quickly and cleanly.
+            # We crash badly in handle_suburl_tup; ugly, but it works.
+ stop_all = True
+ return []
+ urls = list(url + name for name in names)
+ return urls
+
+
+def read_url(url):
+ # We intentionally let things fail, because we re-run things on failure.
+ try:
+ response = requests.get(url)
+ except requests.exceptions.ContentDecodingError as e:
+        # This is a permanent error in the data; we ignore it.
+ print(os.getpid(), "Decoding Error:", e)
+ print(os.getpid(), "Cannot fix this, ignored.")
+ return None
+ except requests.exceptions.RequestException as e:
+ print("Read error:", e)
+ raise
+ else:
+ return response
+
+
+def get_timestamp(text):
+ # agent:2018/06/29 15:02:15
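+    # Illustrative example: a log line "agent:2018/06/29 15:02:15" yields the
+    # time stamp string "2018_06_29_15_02_15" after the substitution below.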
+ global stop_all
+ prefix = "\nagent:"
+ try:
+ startpos = text.index(prefix)
+ except ValueError:
+        print("This is not the usual format of COIN log files.")
+ stop_all = True
+ raise
+ startpos += len(prefix)
+ text = text[startpos : startpos + 80]
+ ts = text[:19]
+ ts = re.sub(r'[^0-9]', '_', ts)
+ # check that it is a valid time stamp
+ try:
+ datetime.strptime(ts, "%Y_%m_%d_%H_%M_%S")
+ except ValueError as e:
+ print("Unexpected time stamp", e)
+ stop_all = True
+ raise
+ return ts
+
+
+def write_data(name, text):
+ try:
+ ts = get_timestamp(text)
+ except ValueError:
+ print()
+ print(name)
+ print()
+ print(text)
+ raise
+ lines = text.split("\n")
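+    # The embedded file is expected between "BEGIN_FILE" and "END_FILE" marker
+    # lines in the log; the extracted lines are re-aligned using the column
+    # offset of the "BEGIN_FILE" marker.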
+ for idx, line in enumerate(lines):
+ if "BEGIN_FILE" in line:
+ start = idx + 1
+ offset = line.index("BEGIN_FILE")
+ if "END_FILE" in line:
+ stop = idx
+ lines = lines[start : stop]
+ if offset:
+ lines = list(line[offset:] for line in lines)
+ # fix the lines - the original has no empty line after "# eof"
+ while lines[-1] == "":
+ lines.pop()
+ text = "\n".join(lines) + "\n"
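+    # The module name (an exists_{platform}_{version}_ci module) is taken from
+    # the first single-quoted string in the extracted text.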
+ modname = re.search(r"'(..*?)'", text).group(1)
+ fn = os.path.join(test_path, f"{ts}-{name}-{modname}.py")
+ if os.path.exists(fn):
+ # do not change the file, we want to skip it
+ return
+ with open(fn, "w") as f:
+ f.write(text)
+
+
+def eval_data(force=False):
+ """
+ Read all found files, sort them and keep the latest version.
+ """
+ files = []
+ for entry in os.scandir(test_path):
+ if "exists_" in entry.name and entry.name.endswith(".py"):
+ if force or os.path.getmtime(entry.path) >= start_time:
+ # this file is newly created
+ files.append(entry.path)
+ files.sort()
+ # read the files and update in chronological order
+ results = {}
+ for fn in files:
+ with open(fn) as f:
+ text = f.read()
+ modname = re.search("'(..*?)'", text).group(1)
+ results[modname] = text
+ for fn in results:
+ name = os.path.join(target_path, fn + ".py")
+ with open(name, "w") as f:
+ f.write(results[fn])
+ print("+++ generated:", name)
+ return len(results)
+
+
+def handle_suburl(idx, n, url, level):
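+    """
+    Recursively walk one result directory.
+
+    "level" only limits the progress output to the first directory level.
+    "build_*" and "tasks/" sub-directories are skipped; a "log.txt*" file
+    that embeds a BEGIN_FILE marker is written out via write_data().
+    """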
+ if level == 1:
+ print(os.getpid(), "Reading", idx + 1, "of", n, rel_url(url))
+ response = read_url(url)
+ urls = find_all_links(response.text, url)
+ for sub_url in urls:
+ name = get_name(sub_url)
+ if name.endswith("/"):
+ if name.startswith("build_"):
+ continue
+ if name == "tasks/":
+ continue
+ handle_suburl(0, 0, sub_url, level + 1)
+ else:
+ if name.startswith("log.txt"):
+ test_name = sub_url.split("/")[-2]
+ print(os.getpid(), test_name)
+ response = read_url(sub_url)
+ txt = response.text if response else ''
+ if "BEGIN_FILE" in txt and "'BEGIN_FILE'" not in txt:
+ # find the text, but not a traceback with that text
+ print(os.getpid(), test_name, "FOUND!")
+ write_data(test_name, response.text)
+ else:
+ print(os.getpid(), test_name)
+
+
+def handle_suburl_tup(idx_n_url_level):
+ if stop_all:
+ return # bad solution, but it stops fast
+ idx, n, url, level = idx_n_url_level
+ try:
+        handle_suburl(idx, n, url, level)
+        return url, None
+ except requests.exceptions.RequestException as e:
+ return url, e
+
+
+def handle_batch(urls, level):
+ n = len(urls)
+ args = ((idx, n, url, level) for (idx, url) in enumerate(urls))
+ with Pool(10) as p:
+ records = list(p.imap_unordered(handle_suburl_tup, args))
+ # re-read the failed ones
+ runs = [n]
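+    # "runs" records how many URLs each pass processed; e.g. "Runs: 120, 7, 2"
+    # (illustrative numbers) means that 7 of 120 URLs failed and were retried,
+    # and 2 of those had to be retried once more.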
+ for idx in range(10):
+ urls = list(x[0] for x in records if x[-1])
+ if not urls:
+ break
+ print("Pausing 5 seconds")
+ time.sleep(5)
+ n = len(urls)
+ runs.append(n)
+ args = ((idx, n, url, level) for (idx, url) in enumerate(urls))
+ with Pool(10) as p:
+ records = list(p.imap_unordered(handle_suburl_tup, args))
+ # Return success when the remaining URLs are empty.
+ print("Runs:", ", ".join(map(str, runs)))
+ return not urls
+
+
+def handle_topurl(url):
+ """
+ Find all links to directories.
+
+ We maintain a cache of these links. The cache is only updated
+ when all URLs have been successfully processed.
+ """
+ try:
+ response = requests.get(url)
+ except requests.exceptions.RequestException as e:
+ print("Skipped", e)
+ return
+ global top_url
+ top_url = url
+ urls = find_all_links(response.text, url, ignore=("tasks/",))
+ work_urls = set(urls)
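+    # URLs listed in known_urls.json were already processed completely in an
+    # earlier run and are skipped here.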
+ cache_file = os.path.join(cache_path, "known_urls.json")
+ if os.path.exists(cache_file):
+ with open(cache_file, 'r') as fp:
+ known_urls = json.load(fp)
+ work_urls -= set(known_urls)
+ level = 1
+ for sub_url in work_urls:
+ name = get_name(sub_url)
+ if name.endswith("/"):
+ if name.startswith("build_"):
+ continue
+ work_urls.add(sub_url)
+ success = handle_batch(work_urls, 1)
+ if success:
+ with open(cache_file, 'w') as fp:
+ json.dump(urls, fp, sort_keys=True, indent=4)
+ return success
+
+
+def get_test_results(starturl):
+ ok = handle_topurl(starturl)
+ stop_time = time.time()
+ runtime = stop_time - start_time
+ hours, remainder = divmod(runtime, 3600)
+ minutes, seconds = divmod(remainder, 60)
+
+    runtime_formatted = f'{int(hours)}:{int(minutes):02d}:{seconds:06.3f}'
+    print(f"Run time: {runtime_formatted}")
+ if ok:
+ found = eval_data()
+ print(f"Successful scan, {found} new files.")
+ if found:
+ print("Please check if a git push is necessary.")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ usage=dedent(f"""\
+ {os.path.basename(my_name)} [-h] scan
+
+ Scan the COIN testresults website for embedded exists_{{platf}}_{{version}}_ci.py files.
+
+ Warning: On the first call, this script may take almost 30 minutes to run.
+ Subsequent calls are *much* faster due to caching.
+
+ {os.path.basename(my_name)} [-h] eval
+
+            Forces evaluation even when a scan has not completed yet.
+
+ For more information, see the file
+ sources/shiboken6/libshiboken/signature_doc.rst
+ """))
+ subparsers = parser.add_subparsers(dest="command", metavar="", title="required argument")
+    # create the parsers for the "scan" and "eval" commands
+ parser_scan = subparsers.add_parser("scan", help="run the scan")
+ parser_eval = subparsers.add_parser("eval", help="force evaluation")
+ args = parser.parse_args()
+ if not args.command:
+ parser.print_usage()
+ exit(1)
+ if args.command == "scan":
+        # Using this from the intranet requires an internal URL
+ get_test_results("https://testresults.qt.io/coin/api/results/pyside/pyside-setup/")
+ elif args.command == "eval":
+ eval_data(force=True)