From 66c19491a3d2ebc2ea5209480db14560602c4942 Mon Sep 17 00:00:00 2001
From: Laurent Monin <github@norz.org>
Date: Fri, 21 Feb 2020 12:53:19 +0100
Subject: [PATCH] Add a script to maintain license in source files

It uses existing copyrights and authors from git log.
---
 scripts/fix_header.py | 277 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100755 scripts/fix_header.py

diff --git a/scripts/fix_header.py b/scripts/fix_header.py
new file mode 100755
index 000000000..3bbe5300d
--- /dev/null
+++ b/scripts/fix_header.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+#
+# Picard, the next-generation MusicBrainz tagger
+#
+# Copyright (C) 2020 Laurent Monin
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+import argparse
+from collections import defaultdict
+import glob
+import itertools
+import os
+import re
+import subprocess
+
+
+ALIASES = {  # name as found in git log or in a copyright line -> name to use instead
+    'vishichoudhary': 'Vishal Choudhary',
+    'vishal choudhary': 'Vishal Choudhary',
+    'Lukas Lalinsky <lalinsky@gmail.com>': 'Lukáš Lalinský',
+    'yvanzo': 'Yvan Rivière',
+}
+
+
+# https://stackoverflow.com/a/4629241
+def ranges(i):
+    """Yield the (first, last) values of each run of consecutive integers in *i*."""
+    for run in (list(g) for _, g in itertools.groupby(enumerate(i), lambda pair: pair[1] - pair[0])):
+        yield run[0][1], run[-1][1]
+
+
+def extract_authors_from_gitlog(path):
+    """Return ``{author: [years]}`` from ``git log -- path``; empty if git exits non-zero."""
+    years_by_author = {}
+    cmd = ['git', 'log', r'--pretty=format:%ad %aN', r'--date=format:%Y', r'--', path]
+    result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=30)
+    if result.returncode != 0:
+        return years_by_author
+    pattern = re.compile(r'^(\d+) (.*)$')
+    for line in result.stdout.decode('utf-8').split("\n"):
+        match = pattern.search(line)
+        if not match:
+            continue
+        year, name = int(match.group(1)), match.group(2)
+        # Names go through ALIASES; years stay distinct, in git log order.
+        years = years_by_author.setdefault(ALIASES.get(name, name), [])
+        if year not in years:
+            years.append(year)
+    return years_by_author
+
+
+def parse_copyright_text(text):
+    """Extract authors and their copyright years from header comment text.
+
+    Only lines shaped like ``# Copyright (C) 2006-2008, 2011 Name`` are used,
+    anything else is ignored. Years may be single years or ``first-last``
+    ranges, separated by commas and/or spaces. Author names are stripped,
+    then mapped through ALIASES.
+
+    Returns a dict mapping each author to the list of distinct years (ints)
+    found for them, in order of first appearance.
+    """
+    authors = {}
+    pattern_copyright = re.compile(r'^# Copyright \D*((?:\d{4}(?:,? *|-))+) (.+)\s*$')
+    # A 4-digit year, optionally followed by "-" and the last year of a range.
+    pattern_years = re.compile(r'(\d{4})(?:\s*-\s*(\d{4}))?')
+
+    for line in text.split("\n"):
+        match = pattern_copyright.search(line)
+        if not match:
+            continue
+        # The greedy ``.+`` also captures trailing whitespace: strip it so
+        # the alias lookup and the merge of identical authors still work.
+        author = match.group(2).strip()
+        author = ALIASES.get(author, author)
+        years = authors.setdefault(author, [])
+        # Scanning for year tokens, rather than splitting on commas, also
+        # handles space-separated years ("2006 2008"), which used to end up
+        # in int() as one string and raise ValueError.
+        for found in pattern_years.finditer(match.group(1)):
+            first = int(found.group(1))
+            last = int(found.group(2) or first)
+            # A reversed range ("2008-2006") is read like the ordered one.
+            for year in range(min(first, last), max(first, last) + 1):
+                if year not in years:
+                    years.append(year)
+    return authors
+
+
+EMPTY_LINE = ("\n", "#\n")  # a blank line or a bare "#" comment line, newline included
+
+
+def parse_file(path):
+    """Split the source file at *path* around its license header.
+
+    Returns ``(found, authors_from_file, authors_from_log, before, after)``:
+    *found* maps 'autogenerated' and 'license' to the index of the matching
+    line (None when absent), the author dicts map names to lists of years,
+    and *before* / *after* are the text surrounding the header, without the
+    coding line. A file marked "# Automatically generated" is returned
+    untouched in *after*, with empty author dicts.
+    """
+    found = defaultdict(lambda: None)
+    with open(path) as f:
+        lines = f.readlines()
+
+    for num, line in enumerate(lines):
+        if line.startswith("# Automatically generated"):
+            found['autogenerated'] = num
+            return (found, {}, {}, '', "".join(lines))
+
+    # Only ask git once we know the file is going to be processed.
+    authors_from_log = extract_authors_from_gitlog(path)
+
+    # Remove the coding line and the empty lines that follow it.
+    for num, line in enumerate(lines):
+        if not line.startswith("#") and line not in EMPTY_LINE:
+            break
+        if "coding: utf-8" in line:
+            del lines[num]
+            # The line that followed is now at index num, not num + 1.
+            while num < len(lines) and lines[num] in EMPTY_LINE:
+                del lines[num]
+            break
+
+    # The license text must sit in the leading block of comments.
+    for num, line in enumerate(lines):
+        if not line.startswith("#") and line not in EMPTY_LINE:
+            break
+        if "GNU General Public License" in line:
+            found['license'] = num
+            break
+    if found['license'] is None:
+        return found, {}, authors_from_log, "", "".join(lines)
+
+    # Walk up to the top of the header comment, stopping at its "# Picard"
+    # title. Never step onto a non-comment line: everything between start
+    # and end is discarded, so that would silently delete code.
+    start = found['license']
+    while start > 0 and lines[start-1].startswith("#") and not lines[start].startswith("# Picard"):
+        start -= 1
+    # Empty lines around the header belong to it.
+    while start > 0 and lines[start-1] in EMPTY_LINE:
+        start -= 1
+
+    # Same walk downwards, stopping at the " USA." line closing the license.
+    end = found['license']
+    last = len(lines) - 1
+    while end < last and lines[end+1].startswith("#") and not lines[end].endswith(" USA.\n"):
+        end += 1
+    while end < last and lines[end+1] in EMPTY_LINE:
+        end += 1
+
+    # NOTE: the slice below leaves out lines[end], the last line of the header.
+    authors_from_file = parse_copyright_text("".join(lines[start:end]))
+    before = "".join(lines[:start])
+    after = "".join(lines[end+1:])
+    return found, authors_from_file, authors_from_log, before, after
+
+
+CODING_TEXT = """# -*- coding: utf-8 -*-
+#
+"""  # fix_header() puts this (stripped) first in its output
+
+LICENSE_TOP = """# Picard, the next-generation MusicBrainz tagger
+#
+"""  # fix_header() puts this (stripped) right before the copyright lines
+
+LICENSE_BOTTOM = """#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+"""  # fix_header() puts this (stripped) right after the copyright lines
+
+
+def fix_header(path):
+    """Return the content of *path* with a regenerated license header.
+
+    Copyright years from the existing header and from git log are merged per
+    author, and consecutive years are collapsed into ``first-last`` ranges.
+    Returns None when the file is marked as automatically generated.
+    """
+    found, authors_from_file, authors_from_log, before, after = parse_file(path)
+    if found['autogenerated'] is not None:
+        return None
+
+    # Merge both sources into fresh sets: authors from the header must be
+    # kept even when git log knows nothing about the file.
+    authors = defaultdict(set)
+    for source in (authors_from_log, authors_from_file):
+        for author, years in source.items():
+            authors[author].update(years)
+
+    new_authors = {}
+    for author, years in authors.items():
+        spans = ranges(sorted(years))
+        new_authors[author] = [str(y1) if y1 == y2 else "%d-%d" % (y1, y2) for y1, y2 in spans]
+
+    # One line per author, ordered by year strings first, then by name.
+    ordered = sorted(new_authors.items(), key=lambda x: (sorted(x[1]), x[0]))
+    new_copyright = "\n".join(
+        "# Copyright (C) %s %s" % (", ".join(years), author) for author, years in ordered
+    )
+
+    before = before.strip()
+    after = after.strip()
+    parts = [
+        CODING_TEXT.strip(),
+        LICENSE_TOP.strip(),
+        new_copyright.strip(),
+        # Extra newlines separate the header from the content that follows.
+        LICENSE_BOTTOM.strip() + ("\n\n" if before or after else ""),
+        before,
+        after,
+    ]
+    return "\n".join(filter(None, parts))
+
+
+def main():
+    """Command line entry point: print each fixed file to stdout, or rewrite it with --in-place."""
+    parser = argparse.ArgumentParser(
+        description='Generate source file header with copyrights & license from existing header and git log',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument('path', nargs='+', help='Path of a file or a folder of files')
+    parser.add_argument('-e', '--extension', default='.py', help='File extension to filter by')
+    parser.add_argument('-i', '--in-place', action='store_true', default=False, help='Edit files in place')
+    parser.add_argument('-r', '--recursive', action='store_true', default=False, help='Search through subfolders')
+    args = parser.parse_args()
+
+    files = set()
+    pending = list(args.path)
+    while pending:
+        path = pending.pop()
+        if os.path.isfile(path):
+            if args.extension in ('', os.path.splitext(path)[1]):
+                files.add(path)
+        elif args.recursive:
+            pending.extend(glob.glob(path + '/*'))
+
+    # Set iteration order is arbitrary: sort for reproducible output.
+    for path in sorted(files):
+        new_content = fix_header(path)
+        if new_content is None:
+            continue
+        if args.in_place:
+            with open(path, 'w') as f:
+                print(new_content, file=f)
+        else:
+            print(new_content)
+
+
+if __name__ == '__main__':  # run main() only when executed as a script, not on import
+    main()