Last active
May 16, 2023 00:51
-
-
Save zepadovani/a06e8f1b83abee59de958c0c90da09c0 to your computer and use it in GitHub Desktop.
Make a DataFrame (and a CSV file) with bird species names and their Portuguese common names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
import pandas as pd | |
def getWikiAvesDF(filename=None,savecsv=False):
    """
    Scrape the WikiAves species table and return a DataFrame with 'Espécie' and 'Nome Comum' columns.

    The page is loaded through a Selenium Chrome webdriver (the matching
    driver must be installed) and the first <table> element of the rendered
    page source is parsed with BeautifulSoup.

    Parameters:
    - filename (str): Path of the CSV file to write. Required when
      ``savecsv`` is True.
    - savecsv (bool): Whether to also write the scraped table to ``filename``.
      The CSV contains every scraped column, not only the two returned ones.

    Returns:
    - DataFrame: DataFrame with 'Espécie' and 'Nome Comum' columns.

    Raises:
    - ValueError: If ``savecsv`` is True but no ``filename`` was given, or if
      the loaded page contains no <table> element.
    """
    # Fail fast, before launching a browser: DataFrame.to_csv(None) would
    # return the CSV text as a string and silently write nothing.
    if savecsv and filename is None:
        raise ValueError("filename must be provided when savecsv=True")

    url = "https://www.wikiaves.com.br/especies.php?t=t"

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        content = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed.
        driver.quit()

    soup = BeautifulSoup(content, "html.parser")
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    # Column names come from the header cells.
    headers = [th.text for th in table.find_all('th')]

    # One list of cell texts per row; rows without <td> cells (e.g. the
    # header row) are skipped.
    data = []
    for tr in table.find_all('tr'):
        row = [td.text for td in tr.find_all('td')]
        if row:
            data.append(row)

    df = pd.DataFrame(data, columns=headers)

    def _strip_common_name(row):
        """Remove the first occurrence of the common name from the species cell."""
        species = row['Espécie']
        common = row['Nome Comum']
        if pd.notnull(species) and pd.notnull(common):
            return species.replace(common, '', 1)
        return species

    # The scraped species cell repeats the common name in front of the
    # species name, so strip that repetition.
    df['Espécie'] = df.apply(_strip_common_name, axis=1)
    df = df.dropna(subset=['Espécie'])
    df = df.reset_index(drop=True)

    # Copy so the returned frame is independent of the full scraped table.
    outdf = df[["Espécie", "Nome Comum"]].copy()

    if savecsv:
        df.to_csv(filename, index=False)

    return outdf
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment