Last active
October 22, 2015 17:34
-
-
Save sveinungkb/20c0677433fce43df11c to your computer and use it in GitHub Desktop.
Simple script that uses GitHub's API to read the commit history of a number of repos into .csv files so it can be processed in other tools (Excel, R, MATLAB).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
import os | |
import datetime | |
# --- Configuration ---------------------------------------------------------
# Token sent as "Authorization: token <OAUTH_TOKEN>" with every API request.
# Replace the placeholder with a real token before running.
OAUTH_TOKEN = 'YOUR-TOKEN'
# Organization whose repositories are read.
ORGANIZATION = "org"
# Repositories to export; an empty list means "look up every repo of the
# organization" (see the commented-out alternative below).
REPOS = ["repo1", "repo2"]
# Uncomment to get all repos for org
# REPOS = []
# Commits older than this, measured back from the time of the run, are
# discarded and end the export of a repository.
HISTORY_WINDOW = datetime.timedelta(days=30)
import requests | |
import re | |
import os | |
import datetime | |
import time | |
# Start from a clean slate: remove every .csv file in the current working
# directory, because the export below appends to <repo>.csv files.
for entry in os.listdir('.'):
    if not entry.endswith('.csv'):
        continue
    print("Cleaned up file %s" % entry)
    os.remove(entry)

# Running total of commits written to CSV, updated by processCommits.
commitsProcessed = 0
def processCommits(repo, commits):
    """Append one page of commits to ``<repo>.csv``.

    The CSV header is written first when the file is still empty. Commits
    are handled in the order given; entries whose top-level ``committer``
    is empty are skipped because the login and avatar columns are read
    from it.

    Args:
        repo: Repository name; ``repo + '.csv'`` is the output file.
        commits: Iterable of commit dicts (the decoded JSON of a commits
            listing response).

    Returns:
        True when a commit older than ``HISTORY_WINDOW`` was reached (that
        commit is discarded and the caller should stop paging), False when
        the whole page was consumed.
    """
    global commitsProcessed
    path = repo + '.csv'
    # Commit dates are parsed from a "...Z" (UTC) timestamp, so the cutoff
    # must be derived from UTC too; local time would shift the window by the
    # machine's UTC offset.
    cutoff = datetime.datetime.utcnow() - HISTORY_WINDOW
    # The context manager closes the file even if a malformed commit raises.
    with open(path, 'a') as target:
        if os.stat(path).st_size == 0:
            target.write("sha,date,user,avatar\n")
        for commit in commits:
            committer = commit['committer']
            if not committer:
                continue
            dateString = commit['commit']['committer']['date']
            date = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
            if date < cutoff:
                print("Reached the end for %s at: %s (discarded)" % (repo, date))
                return True
            commitsProcessed += 1
            line = "%s,%s,%s,%s" % (
                commit['sha'],
                dateString,
                committer['login'],
                committer['avatar_url'],
            )
            target.write(line + "\n")
            print("Adding commit: %s" % line)
    return False
def getCommitsPage(repo, url):
    """Download the commit listing at ``url`` and record it for ``repo``.

    Each page is handed to processCommits; paging continues through the
    response's "next" link until processCommits reports that the history
    window was exhausted or no further page is advertised.

    Args:
        repo: Repository name, passed through to processCommits.
        url: URL of the first page of the commit listing.
    """
    headers = {"Authorization": "token " + OAUTH_TOKEN}
    # Iterate instead of recursing once per page, so long histories cannot
    # hit the interpreter's recursion limit.
    while url:
        print("Get commit history at %s" % url)
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            # An error payload is a JSON object, not a list of commits;
            # feeding it to processCommits would fail with a TypeError.
            print("Skipping %s: HTTP %s from %s" % (repo, response.status_code, url))
            return
        finished = processCommits(repo, response.json())
        if 'Link' not in response.headers:
            return
        print("API calls remaining: %s/%s" % (
            response.headers.get('X-RateLimit-Remaining'),
            response.headers.get('X-RateLimit-Limit'),
        ))
        if finished:
            return
        # response.links is the parsed Link header keyed by rel. A greedy
        # '<(.*)>' regex over the raw header would span several links and
        # produce a corrupt URL.
        url = response.links.get('next', {}).get('url')
def getHistory(repo):
    """Export the commit history of ``repo`` in ORGANIZATION via getCommitsPage."""
    first_page = "/".join(['https://api.github.com/repos', ORGANIZATION, repo, 'commits'])
    getCommitsPage(repo, first_page)
def getReposForUrl(url, repos):
    """Append the name of every repository listed at ``url`` to ``repos``.

    Follows the "next" pagination link of each response until none is left.

    Args:
        url: URL of the first page of a repository listing.
        repos: List that receives the repository names (mutated in place).

    Raises:
        requests.HTTPError: if a page request returns an error status.
    """
    headers = {"Authorization": "token " + OAUTH_TOKEN}
    # Iterate instead of recursing once per page.
    while url:
        response = requests.get(url, headers=headers)
        # Fail with the HTTP error itself rather than a TypeError from
        # iterating over an error payload.
        response.raise_for_status()
        for repo in response.json():
            repos.append(repo['name'])
            print("Added repo: %d %s" % (len(repos), repo['name']))
        # A listing that fits on one page has no Link header, so indexing
        # response.headers['Link'] would raise KeyError. response.links is
        # simply empty in that case and also yields the exact "next" URL.
        url = response.links.get('next', {}).get('url')
def getRepos(org):
    """Return a list with the names of the repositories listed for ``org``."""
    print("Getting all repos for %s" % org)
    names = []
    listing_url = "/".join(['https://api.github.com/orgs', org, 'repos'])
    getReposForUrl(listing_url, names)
    return names
# Script body: export the history of the configured repositories (or of
# every repository of the organization when REPOS is empty) and report the
# number of commits written and the elapsed wall-clock time.
start = time.time()
repos = REPOS if len(REPOS) != 0 else getRepos(ORGANIZATION)
for repo in repos:
    getHistory(repo)
print("Processed %s commits in %d seconds" % (commitsProcessed, time.time() - start))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment