zepadovani · May 16, 2023 00:51
diff --git a/wikiaves_species_table_to_df_and_csv.py b/wikiaves_species_table_to_df_and_csv.py
 import requests
 from bs4 import BeautifulSoup
 from selenium import webdriver
 import pandas as pd

 def getWikiAvesDF(filename=None,savecsv=False):
    """
    Scrape the WikiAves species table and return its species and common names.

    The page is loaded in a Selenium-driven Chrome browser and the first
    <table> element found in the resulting HTML is parsed.

    Parameters:
        - filename (str): Path of the CSV file to write; required when savecsv is True.
        - savecsv (bool): When True, write the cleaned table to filename. The CSV
          holds every scraped column, not only the two that are returned.

    Returns:
        - DataFrame: DataFrame with 'Espécie' and 'Nome Comum' columns.

    Raises:
        - ValueError: If savecsv is True but no filename was given.
        - RuntimeError: If the loaded page contains no <table> element.
    """
    # DataFrame.to_csv(None) returns the CSV text instead of writing a file, so
    # a missing filename would otherwise be ignored silently. Fail before the
    # browser is launched.
    if savecsv and filename is None:
        raise ValueError("filename is required when savecsv=True")

    url = "https://www.wikiaves.com.br/especies.php?t=t"

    # Requires a Chrome driver that Selenium can locate.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        content = driver.page_source    # HTML as rendered by the browser
    finally:
        # Always close the browser, even if loading the page failed.
        driver.quit()

    soup = BeautifulSoup(content, "html.parser")
    table = soup.find('table')
    if table is None:
        raise RuntimeError("No <table> element found at " + url)

    # Column names come from the header cells.
    headers = [th.text for th in table.find_all('th')]

    # Keep only rows that contain data cells; rows without <td> are skipped.
    data = []
    for tr in table.find_all('tr'):
        cells = [td.text for td in tr.find_all('td')]
        if cells:
            data.append(cells)

    df = pd.DataFrame(data, columns=headers)

    def _strip_common_name(row):
        # The scraped species cell repeats the common name; drop its first
        # occurrence only (the original author noted the cause is unknown).
        species, common = row['Espécie'], row['Nome Comum']
        if pd.notnull(species) and pd.notnull(common):
            return species.replace(common, '', 1)
        return species

    df['Espécie'] = df.apply(_strip_common_name, axis=1)
    df = df.dropna(subset=['Espécie'])
    df = df.reset_index(drop=True)
    outdf = df[["Espécie","Nome Comum"]]

    if savecsv:
        df.to_csv(filename, index=False)

    return outdf
	import requests
	from bs4 import BeautifulSoup
	from selenium import webdriver
	import pandas as pd

	def getWikiAvesDF(filename=None,savecsv=False):
	    """
	    Scrape the WikiAves species table and return its species and common names.

	    The page is loaded in a Selenium-driven Chrome browser and the first
	    <table> element found in the resulting HTML is parsed.

	    Parameters:
	        - filename (str): Path of the CSV file to write; required when savecsv is True.
	        - savecsv (bool): When True, write the cleaned table to filename. The CSV
	          holds every scraped column, not only the two that are returned.

	    Returns:
	        - DataFrame: DataFrame with 'Espécie' and 'Nome Comum' columns.

	    Raises:
	        - ValueError: If savecsv is True but no filename was given.
	        - RuntimeError: If the loaded page contains no <table> element.
	    """
	    # DataFrame.to_csv(None) returns the CSV text instead of writing a file, so
	    # a missing filename would otherwise be ignored silently. Fail before the
	    # browser is launched.
	    if savecsv and filename is None:
	        raise ValueError("filename is required when savecsv=True")

	    url = "https://www.wikiaves.com.br/especies.php?t=t"

	    # Requires a Chrome driver that Selenium can locate.
	    driver = webdriver.Chrome()
	    try:
	        driver.get(url)
	        content = driver.page_source    # HTML as rendered by the browser
	    finally:
	        # Always close the browser, even if loading the page failed.
	        driver.quit()

	    soup = BeautifulSoup(content, "html.parser")
	    table = soup.find('table')
	    if table is None:
	        raise RuntimeError("No <table> element found at " + url)

	    # Column names come from the header cells.
	    headers = [th.text for th in table.find_all('th')]

	    # Keep only rows that contain data cells; rows without <td> are skipped.
	    data = []
	    for tr in table.find_all('tr'):
	        cells = [td.text for td in tr.find_all('td')]
	        if cells:
	            data.append(cells)

	    df = pd.DataFrame(data, columns=headers)

	    def _strip_common_name(row):
	        # The scraped species cell repeats the common name; drop its first
	        # occurrence only (the original author noted the cause is unknown).
	        species, common = row['Espécie'], row['Nome Comum']
	        if pd.notnull(species) and pd.notnull(common):
	            return species.replace(common, '', 1)
	        return species

	    df['Espécie'] = df.apply(_strip_common_name, axis=1)
	    df = df.dropna(subset=['Espécie'])
	    df = df.reset_index(drop=True)
	    outdf = df[["Espécie","Nome Comum"]]

	    if savecsv:
	        df.to_csv(filename, index=False)

	    return outdf