Created
December 3, 2015 22:19
-
-
Save Zabanaa/599168f0a0fc274f675e to your computer and use it in GitHub Desktop.
First stab at creating a web crawler using requests and beautiful soup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script will attempt to fetch data from Yahoo finance for a particular | |
# stock, get the prices for the last 6 months and save the info to a csv file | |
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
import time | |
def get_prices(url):
    """Fetch a Yahoo Finance historical-prices page and extract (date, adj close) pairs.

    Parameters:
        url: full URL of one page of the Yahoo historical-prices table.

    Returns:
        A list of [date, adjusted_close] string pairs, one per data row.

    Raises:
        requests.RequestException: on network failure or a non-2xx response.
    """
    print("Fetching Prices ...")
    time.sleep(1)
    # Timeout so a stalled connection cannot hang the script forever;
    # raise_for_status so an error page is not silently parsed as data.
    initial_request = requests.get(url, timeout=10)
    initial_request.raise_for_status()
    soup = BeautifulSoup(initial_request.text, "html.parser")
    # The price table has only generic attributes, so we match on
    # cellpadding="2" and take the 5th such table on the page.
    # NOTE(review): brittle — breaks if Yahoo changes the page layout.
    tables = soup.find_all("table", {"cellpadding": "2"})
    stock_table = tables[4]
    # Skip the first row (column headings) and the last (footer/nav row).
    rows = stock_table.find_all("tr")[1:-1]
    prices = []
    for row in rows:
        cells = row.find_all("td")
        # First cell is the date, last cell is the adjusted close price.
        prices.append([cells[0].string, cells[-1].string])
    print("Prices stored, preparing the info ...")
    return prices
def copy_to_csv(price_list, file_name):
    """Append rows of price data to ``<file_name>.csv``.

    Parameters:
        price_list: iterable of row sequences (e.g. [date, adj_close] pairs).
        file_name: output path WITHOUT the ".csv" extension; the file is
            created if missing and appended to otherwise.
    """
    print("Writing prices to csv file ...")
    time.sleep(1)
    # `with` guarantees the file is closed even if writing fails;
    # newline="" is required by the csv module to avoid blank lines
    # between rows on Windows.
    with open(file_name + ".csv", "a", newline="") as prices_file:
        csv.writer(prices_file).writerows(price_list)
    print("Prices saved to the csv file, open it !")
if __name__ == "__main__": | |
page = 0 | |
""" | |
The stock prices table has a row of headings | |
One problem I came across is that when fetching | |
and appending them to the prices list (in the get_prices function) | |
they would repeat in the outputed csv file. | |
Example: The script would fetch the data for page 1 and write it | |
to the file, then it would go on to page 2 and add the headings again and so on | |
for every page. | |
The short term solution I came up with is to open the file and manually add | |
the heading titles myself before calling the functions, that way | |
it doesn't append a new row of headings for each new fetched page. | |
""" | |
with open("apple-prices.csv", "w") as apple_prices: | |
apple_prices_writer = csv.writer(apple_prices) | |
apple_prices_writer.writerow(["Date", "Adj Close"]) | |
apple_prices.close() | |
while page < 198: | |
apple_stock_url = "https://uk.finance.yahoo.com/q/hp?s=AAPL&a=03&b=6&c=2015&d=11&e=2&f=2015&g=d&z=66&y=" + str(page) | |
print(apple_stock_url) | |
apple_prices = get_prices(apple_stock_url) | |
copy_to_csv(apple_prices, "apple-prices") | |
page += 66 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment