Index GitHub data in Elasticsearch
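The script below pulls Issues, Pull Requests, Releases and Commits for a GitHub organization from the REST API and bulk-indexes them into Elasticsearch, one index per document type (gh-issue, gh-pull, gh-release, gh-commit).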
#!/usr/bin/env python
import functools
import itertools

import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

ELASTICSEARCH_IP = '<set>'
GITHUB_TOKEN = '<set>'
GITHUB_ORGANIZATION = '<set>'
GITHUB_PAGE_SIZE = 100

GITHUB_AUTH_HEADERS = {'Authorization': 'token ' + GITHUB_TOKEN}
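# NOTE: the '<set>' placeholders above must be filled in before running.
# The token typically needs the 'repo' scope to read private repositories.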

def grouper(n, iterable):
    """Divide an iterable into groups of size n.

    The grouper pattern is documented in the itertools recipes;
    the last group is padded with None when the iterable runs out.
    """
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args)
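
# Resolve a dotted field path (e.g. 'assignee.login') against a nested
# dict, returning None if any intermediate key is missing.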
def get_doc_field(doc, field):
    field_parts = field.split('.')
    subdoc = doc
    for part in field_parts:
        if not subdoc:
            return None
        subdoc = subdoc.get(part)
    return subdoc
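
# Turn a GitHub API document into an Elasticsearch bulk action. Values in
# `extra` are looked up as dotted field paths, except names wrapped in
# underscores (e.g. '__issue__'), which are used as literal values.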
def doc2es_action(doc, index_name, extra=None):
    extra_data = {}
    if extra:
        for action_name, field_name in extra.items():
            if field_name.startswith('_'):
                extra_data[action_name] = field_name.strip('_')
            else:
                extra_data[action_name] = get_doc_field(doc, field_name)

    action = {
        '_type': 'document',
        '_id': doc.get('id', doc.get('sha')),
        '_index': index_name,
    }
    action.update(doc)
    action.update(extra_data)
    return action
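
# Generator that walks GitHub's paginated responses, yielding one page of
# results at a time until an empty page is returned.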
def gh_request_next_page(url):
    print('.', end='', flush=True)
    page = requests.get(url, headers=GITHUB_AUTH_HEADERS)
    page_num = 1
    results = page.json()
    while len(results):
        yield results
        page_num += 1
        print('.', end='', flush=True)
        page = requests.get(url + '&page=' + str(page_num),
                            headers=GITHUB_AUTH_HEADERS)
        results = page.json()

def gh_issues_to_actions():
    print('Getting Issues from {} organization'.format(GITHUB_ORGANIZATION))

    url = 'https://api.github.com/orgs/{}/issues'.format(
        GITHUB_ORGANIZATION,
    )
    url += '?filter=all&state=all&per_page={}'.format(GITHUB_PAGE_SIZE)

    actions = []
    extra = {
        '@timestamp': 'updated_at',
        '@user': 'assignee.login',
        '@type': '__issue__',
    }
    for page in gh_request_next_page(url):
        for issue in page:
            # The issues endpoint also returns pull requests; skip them
            # here since they are indexed separately.
            if issue.get('pull_request'):
                continue
            actions.append(doc2es_action(issue, 'gh-issue', extra))

    print('\n{} Issues recovered'.format(len(actions)))
    return actions

@functools.lru_cache()
def gh_get_repos():
    url = 'https://api.github.com/orgs/{}/repos'.format(
        GITHUB_ORGANIZATION,
    )
    url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
    # Fetch every repository page; a single request would miss any
    # repositories beyond the first page of results.
    repos = []
    for page in gh_request_next_page(url):
        repos += page
    return repos

def gh_commits_to_actions():
    print('Getting Commits from {} organization'.format(
        GITHUB_ORGANIZATION,
    ))

    all_actions = []
    extra = {
        '@timestamp': 'commit.author.date',
        '@user': 'author.login',
        '@type': '__commit__',
    }
    for repo in gh_get_repos():
        actions = []
        url = 'https://api.github.com/repos/{}/{}/commits'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for commit in page:
                actions.append(doc2es_action(commit, 'gh-commit', extra))
        all_actions += actions

    print('\n{} Commits recovered'.format(len(all_actions)))
    return all_actions

def gh_pulls_to_actions():
    print('Getting Pull Requests from {} organization'.format(
        GITHUB_ORGANIZATION,
    ))

    all_actions = []
    extra = {
        '@timestamp': 'created_at',
        '@user': 'user.login',
        '@type': '__pull-request__',
    }
    for repo in gh_get_repos():
        actions = []
        url = 'https://api.github.com/repos/{}/{}/pulls'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        url += '?state=all&per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for pull in page:
                actions.append(doc2es_action(pull, 'gh-pull', extra))
        all_actions += actions

    print('\n{} Pull Requests recovered'.format(len(all_actions)))
    return all_actions

def gh_releases_to_actions():
    print('Getting Releases from {} organization'.format(
        GITHUB_ORGANIZATION,
    ))

    all_actions = []
    extra = {
        '@timestamp': 'published_at',
        '@user': 'author.login',
        '@type': '__release__',
    }
    for repo in gh_get_repos():
        actions = []
        url = 'https://api.github.com/repos/{}/{}/releases'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        # The releases endpoint takes no 'state' parameter, only pagination.
        url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for release in page:
                actions.append(doc2es_action(release, 'gh-release', extra))
        all_actions += actions

    print('\n{} Releases recovered'.format(len(all_actions)))
    return all_actions
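
# Send the actions to Elasticsearch in bulk batches. grouper() pads the
# final batch with None fill values, so those are filtered out first.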
def batch_index(actions, batch_size=50):
    print('Indexing documents')
    es = Elasticsearch([ELASTICSEARCH_IP], timeout=180)
    grouped_actions = grouper(batch_size, actions)
    for action_group in grouped_actions:
        print('.', end='', flush=True)
        action_group = list(filter(None, action_group))
        bulk(es, action_group)
    print()

def index_gh():
    issues_actions = gh_issues_to_actions()
    batch_index(issues_actions)

    pulls_actions = gh_pulls_to_actions()
    batch_index(pulls_actions)

    release_actions = gh_releases_to_actions()
    batch_index(release_actions)

    commits_actions = gh_commits_to_actions()
    batch_index(commits_actions)


if __name__ == '__main__':
    index_gh()
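
After a run, a quick query against the cluster can confirm that documents landed in the expected indices. Below is a minimal sketch using the same elasticsearch-py client, assuming the gh-* indices created above and your own ELASTICSEARCH_IP in place of the placeholder:

from elasticsearch import Elasticsearch

es = Elasticsearch(['<set>'], timeout=180)  # same host as ELASTICSEARCH_IP
for index in ('gh-issue', 'gh-pull', 'gh-release', 'gh-commit'):
    # count() returns a dict such as {'count': 1234, ...}
    print(index, es.count(index=index)['count'])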
This version indexes Issues, Pull Requests, Releases and Commits.