Skip to content

Instantly share code, notes, and snippets.

@VictorGarritano
Created June 17, 2018 17:07
Show Gist options
  • Save VictorGarritano/fc73bdf71e57223c932ec24f858c5a2d to your computer and use it in GitHub Desktop.
Save VictorGarritano/fc73bdf71e57223c932ec24f858c5a2d to your computer and use it in GitHub Desktop.
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Page whose tables are fetched and parsed when this file is run as a script.
url = "http://www.maisbolao.com.br/bolao/classificacao/31858/bolao-da-twist"
class HTMLTableParser:
    """Turn the ``<table>`` elements of an HTML page into pandas DataFrames."""

    def parse_url(self, url: str, timeout: float = 30, skip_columns: int = 3):
        """Download *url* and parse every ``<table>`` element found in it.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the caller indefinitely.
            skip_columns: Number of leading columns to drop from each table;
                forwarded to :meth:`parse_html_table`.

        Returns:
            A list with one DataFrame per ``<table>`` element of the page.

        Raises:
            requests.RequestException: If the request times out, fails, or
                the server answers with an HTTP error status.
        """
        response = requests.get(url, timeout=timeout)
        # Do not try to parse the body of an error page as if it were data.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        return [
            self.parse_html_table(table, skip_columns)
            for table in soup.find_all("table")
        ]

    def parse_html_table(self, table, skip_columns: int = 3):
        """Convert one parsed ``<table>`` element into a DataFrame.

        Every ``<tr>`` that contains at least one ``<td>`` is a data row.
        Column titles come from the first ``<tr>`` that contains ``<th>``
        cells; without any, columns are labelled by their integer position.
        The table width is the number of ``<td>`` cells in the first data row.

        Args:
            table: Element exposing ``find_all("tr")``, whose rows expose
                ``find_all("td")`` / ``find_all("th")`` and whose cells
                expose ``get_text()`` (e.g. a BeautifulSoup tag).
            skip_columns: Number of leading columns left out of the result.

        Returns:
            A DataFrame indexed from 1 with one row per data row. Cells that
            a short row does not provide keep the ``"<UNK>"`` placeholder.
            Columns whose values all convert to ``int`` are converted; the
            others are left as text.

        Raises:
            ValueError: If ``skip_columns`` is negative, or if the number of
                column titles differs from the table width (which is 0 when
                the table has titles but no data rows).
            IndexError: If a data row has more cells than the table width.
        """
        if skip_columns < 0:
            raise ValueError("skip_columns must be non-negative")

        column_names = []
        data_rows = []
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if cells:
                data_rows.append(cells)
            if not column_names:
                column_names = [th.get_text() for th in row.find_all("th")]

        n_columns = len(data_rows[0]) if data_rows else 0
        if column_names and len(column_names) != n_columns:
            raise ValueError("column titles do not match the number of columns")

        columns = (column_names or range(n_columns))[skip_columns:]
        df = pd.DataFrame(
            "<UNK>", columns=columns, index=range(1, len(data_rows) + 1)
        )

        # Fill from exactly the rows that were counted above. Skipping the
        # first <tr> unconditionally would drop the first record of a table
        # that has no header row and leave a placeholder row at the end.
        for row_position, cells in enumerate(data_rows):
            for column_position, cell in enumerate(cells[skip_columns:]):
                df.iat[row_position, column_position] = cell.get_text()

        # Best-effort numeric conversion: non-integer columns stay as text.
        for col in df:
            try:
                df[col] = df[col].astype(int)
            except ValueError:
                pass
        return df
if __name__ == '__main__':
    # Fetch the configured page and print the first table found on it.
    tables = HTMLTableParser().parse_url(url)
    if not tables:
        # Exit with a readable message instead of an IndexError traceback.
        raise SystemExit("no <table> elements found at " + url)
    print(tables[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment