diff options
Diffstat (limited to 'data/get_data.py')
-rw-r--r-- | data/get_data.py | 165 |
1 files changed, 165 insertions, 0 deletions
# Copyright (C) 2021 The Qt Company Ltd.
# Contact: https://www.qt.io/licensing/
#
# You may use this file under the terms of the CC0 license.
# See the file LICENSE.CC0 from this package for details.

import argparse
import logging
import os
import re
import subprocess
import sys
from pathlib import Path

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("get_data")

# Email domains that are all normalized to the 'qt' domain / '@qt.io' address.
_QT_DOMAINS = ("theqtcompany", "qt", "nokia", "nokiamail", "digia")

# Commits whose log line contains any of these names are skipped.
_IGNORED_AUTHORS = (
    "Qt by Nokia",
    "Qt Forward Merge Bot",
    "Qt Submodule Update Bot",
)

# Markers injected into the 'git log' format so that each commit, together
# with its '--shortstat' summary, can be folded onto a single line.
_COMMIT_START = "µ"
_COMMIT_END = "‽"

_RE_CHANGED = re.compile(r"(\d+) files? changed")
_RE_INSERTIONS = re.compile(r"(\d+) insertions?")
_RE_DELETIONS = re.compile(r"(\d+) deletions?")


class cd:
    """Context manager that temporarily changes the working directory.

    The previous working directory is restored on exit, even when the body
    of the ``with`` statement raises.
    """

    def __init__(self, path):
        self.path = path

    def __enter__(self):
        self.saved = os.getcwd()
        os.chdir(self.path)

    def __exit__(self, etype, value, traceback):
        os.chdir(self.saved)


def get_qt_modules(qt_path):
    """Return the sorted names of the Qt module directories inside *qt_path*.

    An entry qualifies when it is a directory whose name starts with ``qt``
    or contains ``pyside-setup``.
    """

    def is_qt(name):
        valid_dir = name.startswith("qt") or "pyside-setup" in name
        return valid_dir and os.path.isdir(os.path.join(qt_path, name))

    return sorted(name for name in os.listdir(qt_path) if is_qt(name))


def get_email_domain(x):
    """Return ``(email, domain)`` for the raw email field *x*.

    Double quotes are removed from *x* first. The domain is the host part
    without its last dot-separated component (``a@kde.org`` -> ``kde``).
    For values without an ``@`` the second-to-last dot-separated component
    is used, or an empty string when there is none.

    Addresses whose domain is one of the known Qt-related domains are
    rewritten to ``@qt.io`` and reported with the domain ``qt``.
    """
    x = x.replace('"', "")
    if "@" not in x:
        parts = x.split(".")
        # No dot at all means there is no recognizable domain.
        v = parts[-2] if len(parts) >= 2 else ""
    else:
        host = x.split("@")[1]
        v = ".".join(host.split(".")[:-1]).replace("\\", "")

    if v in _QT_DOMAINS:
        return re.sub("@.*", "@qt.io", x), "qt"
    return x, v


def _first_group(pattern, text):
    """Return group 1 of the first *pattern* match in *text*, or 0 if none."""
    match = pattern.search(text)
    return match.group(1) if match else 0


def process_git_log_line(line):
    """Turn one folded 'git log' line into a CSV row.

    *line* is expected to look like
    ``"date";"sha";"name";"email"‽ N files changed, N insertions(+), ...``.
    The part before the interrobang is kept verbatim, and the normalized
    email, its domain and the three shortstat counters (0 when absent) are
    appended as additional quoted fields.
    """
    changed = _first_group(_RE_CHANGED, line)
    insertions = _first_group(_RE_INSERTIONS, line)
    deletions = _first_group(_RE_DELETIONS, line)

    original_line = line.split(_COMMIT_END)[0]
    # the last field is the 'email'
    original_email = original_line.split(";")[-1]
    email, domain = get_email_domain(original_email)

    return f'{original_line};"{email}";"{domain}";"{changed}";"{insertions}";"{deletions}"'


def _is_valid_line(line):
    """Return True for non-blank lines that do not mention an ignored author."""
    return bool(line.strip()) and not any(name in line for name in _IGNORED_AUTHORS)


def git_log():
    """Return the processed 'git log' of the current directory as CSV rows.

    Runs ``git log`` in the current working directory and returns one
    ';'-separated row per commit (joined with newlines), as produced by
    ``process_git_log_line``. An empty string is returned when git prints
    nothing to stdout.
    """
    # We can do this process with:
    #   git log --all --no-merges --date=format:'%Y-%m-%d'
    #       --pretty=format:'µ"%cd";"%h";"%an";"%ae"'
    #       --shortstat | tr '\n' ' ' | tr 'µ' '\n' |
    #       sed 's/\ *\(\d+\)\ files\{0,1\}/\1/g'
    # But we will use only Python to perform those pipe operations.

    # This command has a trick to get the 'shortstats' on the same line
    # when processing the lines. Notice the 'µ' character that depicts the
    # beginning of the line. Additionally we use an interrobang '‽'
    # to depict the end of the git log, so we can add the 'files changed',
    # 'insertions', and 'deletions' at the end.

    # The arguments are passed as a list without shell quoting: no shell is
    # involved, so quote characters would reach git literally and would then
    # have to be stripped from the output (mangling names with apostrophes).
    command = [
        "git",
        "log",
        "--all",
        "--no-merges",
        "--date=format:%Y-%m-%d",
        f'--pretty=format:{_COMMIT_START}"%cd";"%h";"%an";"%ae"{_COMMIT_END}',
        "--shortstat",
    ]
    output = subprocess.run(
        command,
        capture_output=True,
        universal_newlines=True,
        encoding="utf-8",
        errors="ignore",
        check=False,
    ).stdout
    # Fold everything onto one line, then start a new line at each commit.
    output = output.replace("\n", " ").replace(_COMMIT_START, "\n")
    return "\n".join(
        process_git_log_line(line) for line in output.splitlines() if _is_valid_line(line)
    )


def check_arguments(options):
    """Return True when ``options.qt_dir`` is an existing directory.

    Logs an error and returns False otherwise.
    """
    qt_path = Path(options.qt_dir)
    if qt_path.is_dir():
        return True
    log.error(f"'{qt_path}' is not a directory.")
    return False


def is_valid_module(m):
    """Return True when path *m* is a directory named ``qt*`` or ``pyside-setup``."""
    return (m.name.startswith("qt") or m.name == "pyside-setup") and m.is_dir()


def process_qt_src(options):
    """Write one ``<module>.csv`` next to this script for each Qt module.

    Every valid module directory directly inside ``options.qt_dir`` is
    visited, its git log is collected, and the result is written with a
    header row. Modules with an empty log are reported and skipped.
    """
    qt_path = Path(options.qt_dir)
    HEADER = "date;sha;name;original_email;email;domain;files_changed;insertions;deletions\n"
    output_dir = Path(__file__).parent
    for i in qt_path.glob("*"):
        if not is_valid_module(i):
            continue
        log.info(f"Processing {i}...")
        output_csv = output_dir / f"{i.name}.csv"
        with cd(i):
            out = git_log()
        if not out:
            log.error(f"Empty 'git log' for {i}")
            continue
        # The git output was decoded as UTF-8; write it back the same way
        # instead of relying on the platform default encoding.
        with open(output_csv, "w", encoding="utf-8") as f:
            f.write(HEADER)
            f.write(out)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog="get_data")

    parser.add_argument(
        "--qt",
        action="store",
        dest="qt_dir",
        required=True,
        help="Path to a directory containing Qt modules, like the 'qt5' meta repository",
    )

    options = parser.parse_args()
    if not check_arguments(options):
        parser.print_help()
        sys.exit(-1)

    # main process
    process_qt_src(options)