Last active
January 7, 2021 21:35
-
-
Save brenoimatos/7a3448854a3d59416cf17e81094a724a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Show the rows whose 'lyrics' value is missing (NaN).
print(df_raw.loc[df_raw['lyrics'].isna()])

# Drop every row that contains at least one NaN (in any column),
# then print the per-column NaN count of the result as a sanity check.
df_valid = df_raw.dropna()
print(df_valid.isna().sum())
# Text-cleaning helper applied to the lyrics column of the dataframe.
# The patterns are compiled once at import time because the function is
# applied to every row.

# Every ASCII punctuation character except the straight apostrophe, which
# is kept so that contractions such as "don't" stay in one piece.
_PUNCTUATION_RE = re.compile(
    '[%s]' % re.escape(string.punctuation.replace("'", ''))
)
# Non-empty text enclosed in [] or () on a single line ('.' does not match
# a newline). Empty pairs are not matched here; their characters are
# removed later by the punctuation pass.
_BRACKETED_RE = re.compile(r'\[(.+?)\]')
_PARENTHESIZED_RE = re.compile(r'\((.+?)\)')
# Any run of whitespace.
_WHITESPACE_RE = re.compile(r'\s+')


def cleaning_text(text):
    """Return a normalized copy of *text*.

    Steps, in order:

    1. Lowercase the text and delete the typographic apostrophe (U+2019).
    2. Pass the text through ``unidecode.unidecode``.
    3. Remove spans enclosed in ``[...]`` and ``(...)``.
    4. Replace punctuation (except the straight apostrophe) with a space.
    5. Collapse runs of whitespace to a single space and strip the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower().replace('’', '')
    text = unidecode.unidecode(text)
    text = _BRACKETED_RE.sub('', text)       # drop words between []
    text = _PARENTHESIZED_RE.sub('', text)   # drop words between ()
    text = _PUNCTUATION_RE.sub(' ', text)    # drop punctuation
    return _WHITESPACE_RE.sub(' ', text).strip()
# Group the lyrics by artist and apply the cleaning function.
# Songs are joined with a newline rather than an empty string so that the
# last word of one song is not fused with the first word of the next, and
# so the single-line bracket patterns in cleaning_text cannot match across
# two different songs.
df = df_valid.groupby('artist')['lyrics'].agg('\n'.join).reset_index()
df['lyrics_clean'] = df['lyrics'].apply(cleaning_text)
df.head(3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment