Last active
December 1, 2022 02:23
-
-
Save josephlou5/6a2db33b82f9608a435c016106e2144c to your computer and use it in GitHub Desktop.
Gets the history of a specific file in all the commits of a repository
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
get_file_history.py | |
Gets the history of a specific file in all the commits of a repo. | |
GitPython: https://gitpython.readthedocs.io/en/stable/index.html | |
""" | |
# ============================================================================== | |
import json | |
from pathlib import Path | |
import git | |
# ============================================================================== | |
# Location of the git repository folder to inspect.
REPO_PATH = '.'

# Paths used to filter commits: only commits that change at least one of
# these paths are returned. An empty list returns every commit.
FILE_PATHS = ['data/']

# Upper bound on the number of commits to process; None processes them all.
NUM_COMMITS = None

# strftime format used when a commit's datetime is recorded.
DATETIME_FMT = '%Y-%m-%d %H:%M:%S'

# ==============================================================================

# JSON file that records the commits which have already been processed.
PROCESSED_COMMITS = Path('processed_commits.json')
# ============================================================================== | |
def process_file(commit_sha, commit_datetime, filename, file_contents):
    """Handle a single file from a single commit (for example, save it).

    `main()` invokes this hook once per file of every commit it walks, and
    that includes files that are not under `FILE_PATHS`. Filter unwanted
    files out here before doing any real work.

    Args:
        commit_sha: Hex SHA of the commit the file belongs to.
        commit_datetime: Committed datetime of that commit.
        filename: Path of the file inside the commit's tree.
        file_contents: Contents read from the file's blob data stream
            (presumably bytes; confirm against GitPython).
    """
# ============================================================================== | |
def _resolve_renamed_path(path):
    """Return the post-rename path for one key of `commit.stats.files`.

    Renames are reported in git's summary notation: either the whole path
    (`old/path => new/path`) or only the changed part wrapped in braces
    (`prefix/{old => new}/suffix`). Paths without ` => ` are returned as-is.
    """
    if ' => ' not in path:
        return path
    left, right = path.split(' => ')
    # Only treat this as the brace form when the braces actually wrap the
    # arrow. A brace that is merely part of one file name (for example
    # `old.txt => new{1}.txt`) is a whole-path rename.
    if '{' in left and '}' in right:
        # drop the '{' and the old part
        unchanged_left, _old_name = left.split('{')
        # drop the '}'
        new_name, unchanged_right = right.split('}')
        # When a directory level is removed the new part is empty
        # (`a/{b => }/c`); collapse the resulting `a//c` into `a/c`.
        return (unchanged_left + new_name + unchanged_right).replace(
            '//', '/')
    # the entire path was renamed
    return right


def main():
    """Process every not-yet-recorded commit that touches `FILE_PATHS`.

    Loads the record of processed commits from `PROCESSED_COMMITS` (starting
    with an empty record if the file does not exist yet), walks
    `repo.iter_commits(paths=FILE_PATHS)`, and for each commit whose SHA is
    not in the record calls `process_file` for every file listed in
    `commit.stats.files` that is still present in the commit's tree. The
    commit's author, email, datetime, message and file list are then added to
    the record. Stops once `NUM_COMMITS` new commits have been processed
    (when it is not None).

    The record is written back in a `finally` block, so commits completed
    before an error are still saved.
    """
    try:
        processed_commits = json.loads(
            PROCESSED_COMMITS.read_text(encoding='utf-8'))
    except FileNotFoundError:
        # First run: nothing has been processed yet.
        processed_commits = {}

    repo = git.Repo(REPO_PATH)
    try:
        count = 0
        for commit in repo.iter_commits(paths=FILE_PATHS):
            commit_sha = commit.hexsha
            if commit_sha in processed_commits:
                # do not re-process this commit
                continue
            commit_datetime = commit.committed_datetime

            commit_files = []
            for stat_path in commit.stats.files:
                file = _resolve_renamed_path(stat_path)
                try:
                    file_data = commit.tree / file
                except KeyError:
                    # file was deleted (probably)
                    continue
                commit_files.append(file)
                file_contents = file_data.data_stream.read()
                process_file(commit_sha, commit_datetime, file, file_contents)

            processed_commits[commit_sha] = {
                'author': commit.author.name,
                'email': commit.author.email,
                'datetime': commit_datetime.strftime(DATETIME_FMT),
                'message': commit.message,
                'files': commit_files,
            }

            count += 1
            if NUM_COMMITS is not None and count >= NUM_COMMITS:
                break
    finally:
        # Always persist progress, even if processing raised part-way.
        PROCESSED_COMMITS.write_text(json.dumps(processed_commits, indent=2),
                                     encoding='utf-8')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: If you are using a local git repository (as I am), you must run `git pull` before running this script so that it sees the latest commit data.