Created
July 6, 2022 04:51
-
-
Save jennynz/6b7e31c7a140ed4c33b1b5f395b8e635 to your computer and use it in GitHub Desktop.
Python script for getting GitHub data (e.g. PRs, comments, reviews) about a public repository via their GraphQL API, with pagination & rate limit handling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
import time
from datetime import datetime, timedelta

import boto3
import requests
# Example configuration: fetch data for the vuejs/vue repository.
# NOTE(review): the window below is 30 days, not 90 as the original comment said.
ORG_NAME = "vuejs"
REPO_NAME = "vue"
# Only PRs created inside [CREATED_DATE_START, CREATED_DATE_END] are of interest.
# NOTE(review): CREATED_DATE_END is not referenced by the visible code — verify
# whether the end bound is intentionally unused.
CREATED_DATE_START = datetime.now() - timedelta(days=30)
CREATED_DATE_END = datetime.now()
BUCKET_NAME = "my-cool-s3-bucket-query-results"  # Add your output S3 bucket here
PERSONAL_ACCESS_TOKEN = ""  # Add your token from GitHub here
# Shared headers for both the REST and GraphQL requests below.
HEADERS = {
    "Authorization": f"bearer {PERSONAL_ACCESS_TOKEN}",
    "Accept": "application/vnd.github.machine-man-preview+json",
}
# The types of queries you want to run for each PR. Each entry names the query,
# points at a GraphQL template file on disk, and names the nested connection
# (under repository.pullRequest) whose pageInfo drives pagination.
QUERIES = [
    {
        "name": "pull-requests",
        "query_file": "pull-requests.graphql",  # Paths to graphQL queries
        "pagination_key": "comments",
    },
    {
        "name": "pull-requests-reviews",
        "query_file": "pull-requests-reviews.graphql",
        "pagination_key": "reviews",
    },
    {
        "name": "pull-requests-review-comments",
        "query_file": "pull-requests-review-comments.graphql",
        "pagination_key": "reviews",
    },
]
# First get the numbers of the PRs in the desired date range, using the REST
# API: https://docs.github.com/en/rest/reference/pulls#list-pull-requests
# PRs are returned newest-first, so we stop paginating once the oldest PR on a
# page was created before CREATED_DATE_START (or the results run out).
all_prs_retrieved = False
# GitHub REST pagination is 1-indexed; page=0 returns the same results as
# page=1, so starting at 0 fetched the first page twice (duplicate PR numbers).
page_num = 1
n_items_per_page = 100
pr_numbers = []
while not all_prs_retrieved:
    response = requests.get(
        f"https://api.github.com/repos/{ORG_NAME}/{REPO_NAME}/pulls",
        params={
            # owner/repo are path parameters and were redundant as query params.
            "sort": "created",
            "direction": "desc",
            "per_page": n_items_per_page,
            "page": page_num,
            "state": "all",
        },
        headers=HEADERS,
    )
    # Fail loudly on auth / rate-limit / not-found errors instead of crashing
    # later on an unexpected JSON payload.
    response.raise_for_status()
    data = response.json()
    if not data:
        # Past the last page: data[-1] below would raise IndexError.
        break
    # The last PR on a date-descending page is the oldest; if it predates the
    # window, or the page came back short, this is the final page we need.
    created_at = datetime.strptime(data[-1]["created_at"], "%Y-%m-%dT%H:%M:%SZ")
    all_prs_retrieved = (
        created_at < CREATED_DATE_START or len(data) < n_items_per_page
    )
    for pr in data:
        # The boundary pages can straddle the window; keep only PRs whose
        # created date actually falls inside it.
        pr_created = datetime.strptime(pr["created_at"], "%Y-%m-%dT%H:%M:%SZ")
        if CREATED_DATE_START <= pr_created <= CREATED_DATE_END:
            pr_numbers.append(pr["number"])
    page_num += 1
n_pr_numbers = len(pr_numbers)
# Then query the GraphQL API for data on each PR, paginating each query's
# nested connection and writing every page of results to S3 as its own object.
s3_client = boto3.client("s3")
for query in QUERIES:
    print(f"Running {query['name']} query")
    output_dir = os.path.join(ORG_NAME, REPO_NAME, query["name"])
    # The template is invariant across PRs and pages: read it once per query
    # instead of once per pagination iteration.
    with open(query["query_file"], "r") as q_file:
        query_string = q_file.read()
    for i, pr_number in enumerate(pr_numbers):
        print(f"PR number {pr_number} ({i} of {n_pr_numbers} PRs)")
        next_cursor = "null"  # GraphQL literal meaning "first page"
        items_left_to_paginate = True
        page_num = 1
        while items_left_to_paginate:
            # Populate the template with variables for the given PR/page.
            formatted_query_string = query_string % (
                ORG_NAME,
                REPO_NAME,
                pr_number,
                next_cursor,
            )
            response = requests.post(
                "https://api.github.com/graphql",
                json={"query": formatted_query_string},
                headers=HEADERS,
            )
            results = response.json()
            # "data" is null when the request is rejected outright, and
            # "errors" is a LIST of dicts — the original indexed it like a
            # dict and OR-ed with bitwise `|`, which evaluates both operands
            # eagerly and crashed whenever either branch was relevant.
            data = results.get("data")
            rate_limited = any(
                "api limit" in str(err.get("type", "")).lower()
                for err in results.get("errors", [])
            )
            if rate_limited or data is None:
                # The original computed a reset time here but never slept or
                # retried, silently dropping the page. Back off, then retry
                # the same page.
                time.sleep(300)
                continue
            if data["rateLimit"]["remaining"] <= 100:
                # Nearly out of points: this page's results arrived fine, so
                # process them, but first sleep until the advertised reset
                # time (resetAt is UTC) plus a safety margin.
                rate_limit_reset = datetime.strptime(
                    data["rateLimit"]["resetAt"], "%Y-%m-%dT%H:%M:%SZ"
                ) + timedelta(minutes=5)
                sleep_seconds = (rate_limit_reset - datetime.utcnow()).total_seconds()
                if sleep_seconds > 0:
                    time.sleep(sleep_seconds)
            # Write this page of query results to S3.
            bucket_key = f"{output_dir}/PR{pr_number}_page{page_num}.json"
            s3_client.put_object(
                Body=json.dumps(data), Bucket=BUCKET_NAME, Key=bucket_key
            )
            # Advance the cursor on the nested connection (comments/reviews).
            current_level_item_key = data["repository"]["pullRequest"][
                query["pagination_key"]
            ]
            if current_level_item_key["pageInfo"]["hasNextPage"]:
                # The template's initial cursor is the bare literal `null`, so
                # string cursors must carry their own quotes — the original
                # substituted the cursor unquoted, producing an invalid
                # document on page 2+. TODO(review): confirm against the
                # .graphql templates.
                end_cursor = current_level_item_key["pageInfo"]["endCursor"]
                next_cursor = f'"{end_cursor}"'
                page_num += 1
                continue
            items_left_to_paginate = False
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment