about summary refs log tree commit diff stats
path: root/data/get_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'data/get_data.py')
-rw-r--r-- data/get_data.py 165
1 files changed, 165 insertions, 0 deletions
diff --git a/data/get_data.py b/data/get_data.py
new file mode 100644
index 0000000..35cf548
--- /dev/null
+++ b/data/get_data.py
@@ -0,0 +1,165 @@
+# Copyright (C) 2021 The Qt Company Ltd.
+# Contact: https://www.qt.io/licensing/
+#
+# You may use this file under the terms of the CC0 license.
+# See the file LICENSE.CC0 from this package for details.
+
+import argparse
+import logging
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
# Module-wide logger; the root logger is configured to show INFO and above.
log = logging.getLogger("get_data")
logging.basicConfig(level=logging.INFO)
+
+
class cd:
    """Context manager that temporarily switches the working directory.

    On entry the current working directory is remembered in ``saved`` and
    the process changes into ``path``; on exit (normal or via an exception)
    the process changes back to ``saved``. Exceptions are not suppressed.
    """

    def __init__(self, path):
        # Directory to switch into when the ``with`` block starts.
        self.path = path

    def __enter__(self):
        # Remember where we are *before* moving, so __exit__ can go back.
        self.saved = os.getcwd()
        os.chdir(self.path)

    def __exit__(self, etype, value, traceback):
        # The exception details are ignored: we always restore the directory.
        os.chdir(self.saved)
+
+
def get_qt_modules(qt_path):
    """Return the sorted names of the Qt module directories inside *qt_path*.

    An entry qualifies when its name starts with ``qt`` or contains
    ``pyside-setup``, and it is a directory.
    """
    modules = []
    for entry in os.listdir(qt_path):
        if not (entry.startswith("qt") or "pyside-setup" in entry):
            continue
        # Plain files with a matching name are not modules.
        if os.path.isdir(os.path.join(qt_path, entry)):
            modules.append(entry)
    modules.sort()
    return modules
+
+
def get_email_domain(x):
    """Return ``(email, domain)`` for the e-mail field *x*.

    Double quotes are stripped from *x* first. The domain is the host part
    without its last dot-separated label (for a value without ``@`` it is
    the second-to-last label, or ``""`` when there is none). When the domain
    is one of the known Qt-related names, the address is rewritten to end in
    ``@qt.io`` and the domain is reported as ``"qt"``.
    """
    address = x.replace('"', "")

    if "@" in address:
        # Everything between the first and second '@', minus the last label.
        host_labels = address.split("@")[1].split(".")
        domain = ".".join(host_labels[:-1]).replace("\\", "")
    else:
        labels = address.split(".")
        domain = labels[-2] if len(labels) >= 2 else ""

    if domain not in ("theqtcompany", "qt", "nokia", "nokiamail", "digia"):
        return address, domain

    # Unify all the historical company domains under a single one.
    return re.sub("@.*", "@qt.io", address), "qt"
+
+
def process_git_log_line(line):
    """Turn one flattened ``git log`` record into a CSV row.

    *line* is expected to hold the ``;``-separated commit fields, then the
    interrobang marker, then the ``--shortstat`` text. The returned row is
    the part before the marker followed by the normalised e-mail, its
    domain, and the files-changed / insertions / deletions counters (``0``
    when a counter is not present in the line).
    """

    def counter(pattern):
        # First captured number for `pattern`, or 0 when it does not occur.
        found = re.search(pattern, line)
        return found.group(1) if found else 0

    changed = counter(r"(\d+) files? changed")
    insertions = counter(r"(\d+) insertions?")
    deletions = counter(r"(\d+) deletions?")

    original_line = line.split("‽")[0]
    # the last field is the 'email'
    original_email = original_line.split(";")[-1]
    email, domain = get_email_domain(original_email)

    extra_fields = (email, domain, changed, insertions, deletions)
    return original_line + "".join(f';"{field}"' for field in extra_fields)
+
+
def git_log():
    """Return CSV rows for the commits of the git repository in the cwd.

    Runs ``git log --all --no-merges --shortstat`` with a custom format,
    flattens the output to one record per commit and converts every record
    with ``process_git_log_line``. Records that are blank or that mention
    one of the automated Qt accounts are dropped.

    Returns:
        The rows joined by newlines; an empty string when git printed
        nothing usable.
    """

    def is_valid_line(x):
        # Skip blank records and commits attributed to automated accounts.
        ignored = ("Qt by Nokia", "Qt Forward Merge Bot", "Qt Submodule Update Bot")
        return bool(x.strip()) and not any(name in x for name in ignored)

    # We can do this process with:
    #   git log --all --no-merges --date=format:'%Y-%m-%d'
    #       --pretty=format:'µ"%cd";"%h";"%an";"%ae"‽'
    #       --shortstat | tr '\n' ' ' | tr 'µ' '\n'
    # But we will use only Python to perform those pipe operations.

    # This command has a trick to get the 'shortstat' on the same line as the
    # commit fields: the 'µ' character marks the beginning of a record and an
    # interrobang '‽' marks the end of the commit fields, so the
    # 'files changed', 'insertions' and 'deletions' can be appended after it.
    #
    # The arguments are passed as a list and no shell is involved, so they
    # must NOT carry shell quotes: quotes would reach git literally and then
    # have to be stripped from the output, which would also remove genuine
    # apostrophes from author names and e-mail addresses.
    o = subprocess.run(
        [
            "git",
            "log",
            "--all",
            "--no-merges",
            "--date=format:%Y-%m-%d",
            '--pretty=format:µ"%cd";"%h";"%an";"%ae"‽',
            "--shortstat",
        ],
        capture_output=True,
        universal_newlines=True,
        encoding="utf-8",
        errors="ignore",
    ).stdout
    # One record per line: drop git's own newlines, then break on the marker.
    o = o.replace("\n", " ").replace("µ", "\n")
    return "\n".join(process_git_log_line(line) for line in o.splitlines() if is_valid_line(line))
+
+
def check_arguments(options):
    """Validate the parsed command-line options.

    Returns ``True`` when ``options.qt_dir`` is an existing directory;
    otherwise logs an error and returns ``False``.
    """
    qt_path = Path(options.qt_dir)
    if not qt_path.is_dir():
        log.error(f"'{qt_path}' is not a directory.")
        return False
    return True
+
+
def is_valid_module(m):
    """Tell whether the path *m* is a module directory to process.

    *m* must be a directory whose name starts with ``qt`` or is exactly
    ``pyside-setup``.
    """
    name = m.name
    has_module_name = name.startswith("qt") or str(name) == "pyside-setup"
    return has_module_name and m.is_dir()
+
+
def process_qt_src(options):
    """Write one ``<module>.csv`` file per Qt module found in ``options.qt_dir``.

    Every entry of the directory accepted by ``is_valid_module`` is entered,
    its history is collected with ``git_log`` and the result is written,
    preceded by a header row, next to this script. Modules for which
    ``git_log`` returns nothing are reported and skipped.
    """
    qt_path = Path(options.qt_dir)
    HEADER = "date;sha;name;original_email;email;domain;files_changed;insertions;deletions\n"
    output_dir = Path(__file__).parent
    for i in qt_path.glob("*"):
        if not is_valid_module(i):
            continue
        log.info(f"Processing {i}...")
        with cd(i):
            out = git_log()
        if not out:
            # Name the module so the message is actionable.
            log.error(f"Empty 'git log' for {i}")
            continue
        output_csv = output_dir / f"{i.name}.csv"
        # The git output is decoded as UTF-8, so write it back as UTF-8
        # instead of relying on the platform's default encoding.
        with open(output_csv, "w", encoding="utf-8") as f:
            f.write(HEADER)
            f.write(out)
+
+
if __name__ == "__main__":
    # Command-line entry point: a single mandatory option, the Qt checkout.
    parser = argparse.ArgumentParser(prog="get_data")
    parser.add_argument(
        "--qt",
        dest="qt_dir",
        action="store",
        required=True,
        help="Path to a directory containing Qt modules, like the 'qt5' meta repository",
    )
    options = parser.parse_args()

    if check_arguments(options):
        # main process
        process_qt_src(options)
    else:
        # Invalid input: show the usage and report failure to the caller.
        parser.print_help()
        sys.exit(-1)