Skip to content

Instantly share code, notes, and snippets.

@thinkjrs
Forked from scrapehero/nasdaq_finance.py
Created October 3, 2018 01:40
Show Gist options
  • Save thinkjrs/63c0133ae4c2b6cda109d709a1e58b83 to your computer and use it in GitHub Desktop.
Script to scrape financial data from NASDAQ
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml import html
import requests
from time import sleep
import json
import argparse
from random import randint
def parse_finance_page(ticker):
    """
    Scrape summary financial data for a stock from its NASDAQ symbol page.

    Retries the HTTP request up to 5 times before giving up.

    Args:
        ticker (str): Stock symbol (e.g. "AAPL").

    Returns:
        dict: Scraped data with keys ``company_name``, ``ticker``, ``url``,
            ``open_price`` (plus legacy alias ``"open price"``), ``open_date``,
            ``close_price``, ``close_date`` and ``key_stock_data`` (a dict of
            the key-stock table rows), or ``None`` if every attempt failed.
    """
    key_stock_dict = {}
    # Browser-like headers so the request is not rejected as a bot.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "Connection": "keep-alive",
        "Host": "www.nasdaq.com",
        "Referer": "http://www.nasdaq.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36"
    }
    # Retrying for failed request
    for retries in range(5):
        try:
            url = "http://www.nasdaq.com/symbol/%s" % (ticker)
            # SECURITY: verify=False disables TLS certificate validation.
            # Kept for parity with the original scraper, but this should be
            # removed (and the URL switched to https) in production use.
            response = requests.get(url, headers=headers, verify=False)
            if response.status_code != 200:
                raise ValueError("Invalid Response Received From Webserver")
            print("Parsing %s" % (url))
            # Adding random delay so repeated requests look less automated.
            sleep(randint(1, 3))
            parser = html.fromstring(response.text)

            # XPaths for the page header and the summary/key-stock sections.
            xpath_head = "//div[contains(@id,'pageheader')]//h1//text()"
            xpath_key_stock_table = '//div[contains(@class,"overview-results")]//div[contains(@class,"table-table")]/div'
            xpath_open_price = '//b[contains(text(),"Open Price:")]/following-sibling::span/text()'
            xpath_open_date = '//b[contains(text(),"Open Date:")]/following-sibling::span/text()'
            xpath_close_price = '//b[contains(text(),"Close Price:")]/following-sibling::span/text()'
            xpath_close_date = '//b[contains(text(),"Close Date:")]/following-sibling::span/text()'
            xpath_key = './/div[@class="table-cell"]/b/text()'
            xpath_value = './/div[@class="table-cell"]/text()'

            raw_name = parser.xpath(xpath_head)
            key_stock_table = parser.xpath(xpath_key_stock_table)
            raw_open_price = parser.xpath(xpath_open_price)
            raw_open_date = parser.xpath(xpath_open_date)
            raw_close_price = parser.xpath(xpath_close_price)
            raw_close_date = parser.xpath(xpath_close_date)

            # The <h1> embeds boilerplate after the company name; strip it off.
            company_name = raw_name[0].replace("Common Stock Quote & Summary Data", "").strip() if raw_name else ''
            open_price = raw_open_price[0].strip() if raw_open_price else None
            open_date = raw_open_date[0].strip() if raw_open_date else None
            close_price = raw_close_price[0].strip() if raw_close_price else None
            close_date = raw_close_date[0].strip() if raw_close_date else None

            # Grabbing and cleaning keystock data: each row is a cell pair of
            # <b>label</b> / value text; collapse internal whitespace runs.
            for row in key_stock_table:
                key = ''.join(row.xpath(xpath_key)).strip()
                value = ' '.join(''.join(row.xpath(xpath_value)).split())
                key_stock_dict[key] = value

            nasdaq_data = {
                "company_name": company_name,
                "ticker": ticker,
                "url": url,
                # Fixed key name to match the underscore style of the other
                # keys; the old space-separated key is kept as a deprecated
                # alias so existing consumers do not break.
                "open_price": open_price,
                "open price": open_price,
                "open_date": open_date,
                "close_price": close_price,
                "close_date": close_date,
                "key_stock_data": key_stock_dict
            }
            return nasdaq_data
        except Exception as e:
            print("Failed to process the request, Exception:%s" % (e))
    # All retries exhausted — make the failure explicit instead of silently
    # falling off the end of the function.
    print("Giving up on %s after 5 failed attempts" % (ticker))
    return None
if __name__=="__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument('ticker',help = 'Company stock symbol')
args = argparser.parse_args()
ticker = args.ticker
print("Fetching data for %s"%(ticker))
scraped_data = parse_finance_page(ticker)
print("Writing scraped data to output file")
with open('%s-summary.json'%(ticker),'w') as fp:
json.dump(scraped_data,fp,indent = 4,ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment