-
-
Save emir-munoz/c8281e6efc78fd9d8012deee772d0380 to your computer and use it in GitHub Desktop.
Scrape a table from wikipedia using python. Allows for cells spanning multiple rows and/or columns. Outputs csv files for each table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Scrape tables from Wikipedia using Python. Handles cells that span multiple rows and/or columns.
Writes one CSV file per table.
"""
import codecs
import os

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 provides Request/urlopen in urllib.request; alias it so the
    # rest of the script can keep using the ``urllib2`` name.
    import urllib.request as urllib2

from bs4 import BeautifulSoup
wiki = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia


def cell_span(cell, attr):
    """Return the integer ``rowspan``/``colspan`` value of *cell*.

    cell: any object with a dict-style ``get(name, default)`` (a bs4 tag
        when called from ``main``).
    attr: attribute name to read, e.g. ``'rowspan'`` or ``'colspan'``.

    A missing, non-numeric or non-positive value yields 1, so one sloppy
    attribute can neither drop the cell nor abort the whole run.
    """
    try:
        span = int(cell.get(attr, 1))
    except (TypeError, ValueError):
        return 1
    return span if span > 0 else 1


def expand_spans(rows):
    """Lay table cells out on a rectangular grid, repeating spanned text.

    rows: list of table rows in document order; each row is a list of
        ``(text, rowspan, colspan)`` tuples, one per cell.

    Returns a list with one entry per input row. Every entry is a list of
    the same width, and positions that no cell covers hold ``''``.
    """
    nrows = len(rows)
    filled = {}  # (row, column) -> text of the cell covering that position
    for r, cells in enumerate(rows):
        col = 0
        for text, rowspan, colspan in cells:
            # A cell spanning down from an earlier row may already occupy
            # this column; the next cell goes into the first free one.
            while (r, col) in filled:
                col += 1
            # Clip row spans that run past the last row of the table.
            for rr in range(r, min(r + rowspan, nrows)):
                for cc in range(col, col + colspan):
                    # If two spans overlap, the earlier cell keeps the slot.
                    filled.setdefault((rr, cc), text)
            col += colspan
    # Width is the right-most occupied column, not the largest cell count,
    # so column spans on the widest row still fit.
    ncols = (max(c for _, c in filled) + 1) if filled else 0
    return [[filled.get((r, c), '') for c in range(ncols)] for r in range(nrows)]


def format_csv_row(values):
    """Join *values* into one CSV line (without the trailing newline).

    Newlines inside a value are removed. Values containing a comma, a double
    quote or a carriage return are wrapped in double quotes, with embedded
    quotes doubled, so they stay in a single column.
    """
    # Quoting is done by hand instead of with the csv module because the
    # Python 2 csv writer does not accept non-ASCII unicode text.
    fields = []
    for value in values:
        value = value.replace('\n', '')
        if ',' in value or '"' in value or '\r' in value:
            value = '"' + value.replace('"', '""') + '"'
        fields.append(value)
    return ','.join(fields)


def main():
    """Fetch the page at ``wiki`` and write each ``wikitable`` to a CSV file.

    Prints a short preview of every table found, then for table number ``n``
    prints each output row and writes it to ``output_<page>_t<n>.csv`` in the
    current directory, encoded as UTF-8.
    """
    req = urllib2.Request(wiki, headers=header)
    page = urllib2.urlopen(req)
    try:
        # Name the parser explicitly so the parse does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(page, "html.parser")
    finally:
        page.close()
    tables = soup.find_all("table", {"class": "wikitable"})

    # show tables
    for table in tables:
        print("###############")
        print(table.text[:100])

    page_name = os.path.split(wiki)[1]
    for tn, table in enumerate(tables):
        # Reduce the HTML to plain (text, rowspan, colspan) tuples per row.
        rows = [
            [(cell.text, cell_span(cell, 'rowspan'), cell_span(cell, 'colspan'))
             for cell in row.find_all(["td", "th"])]
            for row in table.find_all("tr")
        ]
        data = expand_spans(rows)

        # write data out
        fname = 'output_{}_t{}.csv'.format(page_name, tn)
        # An explicit encoding is required: the cell text is non-ASCII.
        with codecs.open(fname, 'w', encoding='utf-8') as f:
            for values in data:
                row_str = format_csv_row(values)
                print(row_str)
                f.write(row_str + '\n')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment