Created
March 25, 2021 23:22
-
-
Save VirenMohindra/ced3318325e8a4ea7dae65b6eb486baa to your computer and use it in GitHub Desktop.
Scraping GitHub issues from https://github.com/headllines/hackernews-daily/issues
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape daily Hacker News digest issues from a GitHub repo into a CSV.

Walks the open issues of headllines/hackernews-daily, parses each daily
digest body for story metadata (title, link, submitter, points, comments),
looks up each submitter's account-creation time via the official Hacker
News Firebase API, flags recently created accounts, and writes one row per
story to hackernews-daily.csv.
"""
from datetime import datetime, timedelta
import csv
import re

import requests
from github import Github

# PyGithub client; "access_token" is a placeholder — supply a real
# personal access token before running.
g = Github("access_token")

REPO_NAME = 'headllines/hackernews-daily'
FILE_NAME = "hackernews-daily.csv"
HN_API_URL = 'https://hacker-news.firebaseio.com/v0/user/'
# Accounts created within this many days of the submission are flagged "new".
NOOB_ACCOUNT_THRESHOLD = 15

# Compiled once — these patterns run for every line of every issue body.
SQUARE_BRACKETS_RE = re.compile(r"\[(.*?)\]")   # [title], [userID], [N comments]
LINK_RE = re.compile(r"(?P<url>https?://[^\s]+)")  # story link, user link, HN link
POINTS_RE = re.compile(r"(\w+ ){1}point")          # "NNN points"

repo = g.get_repo(REPO_NAME)
issues = repo.get_issues(state='open')

fields = ['Date', 'Title', 'Link', 'userID', 'userID Age', 'New Account?',
          'Number of Points', 'Number of Comments', 'HN Link']
rows = []

for issue in issues:
    if issue.title == 'npm ci':  # automation noise, not a digest
        continue

    # Titles look like "... @ 2021-03-25"; some use the Chinese marker 之.
    try:
        date = issue.title.split('@')[1].strip(" ")
    except IndexError:
        date = issue.title.split('之')[1].strip(" ")

    post = issue.body.split('\n\n')
    for line in post[:-1]:  # last chunk is footer, not a story
        all_square_brackets = SQUARE_BRACKETS_RE.findall(line)
        all_links = LINK_RE.findall(line.lower())
        title = all_square_brackets[0]
        link = all_links[0].strip("**").strip(" )")
        user_id = all_square_brackets[1]

        # Account-creation timestamp from the official HN Firebase API.
        # timeout added so a stalled request cannot hang the whole run.
        r = requests.get(HN_API_URL + user_id + '.json', timeout=10)
        user_id_age = r.json()['created']
        account_created_date = datetime.fromtimestamp(user_id_age)

        # Two date formats appear: ISO "2021-03-25" and the JS-style
        # "Thu Mar 25 2021 23:22:00 GMT+0000 (Coordinated Universal Time)".
        try:
            submission_date = datetime.strptime(date, '%Y-%m-%d')
        except ValueError:
            cleaned_date = date.replace(
                ' GMT+0000 (Coordinated Universal Time)', '')
            submission_date = datetime.strptime(
                cleaned_date, '%a %b %d %Y %H:%M:%S')

        # BUG FIX: the original computed a threshold date but never used it
        # (and one branch dropped `submission_date` from the subtraction).
        # An account is "new" when it was created within
        # NOOB_ACCOUNT_THRESHOLD days before the submission (or after it).
        is_new = account_created_date > submission_date - timedelta(
            days=NOOB_ACCOUNT_THRESHOLD)

        # Normalise all dates to the YYYY-MM-DD standard.
        date = submission_date.strftime('%Y-%m-%d')

        user_id_link = all_links[1].strip(" )").replace(
            'https://news.ycombinator.com/user?id=', '')
        number_of_points = POINTS_RE.findall(line)[0].strip(" ")
        number_of_comments = all_square_brackets[2].split(' ')[0]

        try:
            hn_link = all_links[2].strip(" )")
        except IndexError:
            # Self-posts (Ask/Show HN) have no external link: the first URL
            # is the user page and the second is the HN item itself.
            link = ''
            user_id_link = all_links[0].strip(" )")
            hn_link = all_links[1].strip(" )")

        rows.append([date, title, link, user_id, user_id_age, is_new,
                     number_of_points, number_of_comments, hn_link])

# newline='' is required for csv.writer — without it Windows emits a blank
# row after every record; utf-8-sig keeps Excel happy with the BOM.
with open(FILE_NAME, 'w', encoding='utf-8-sig', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment