brenoimatos · January 7, 2021 21:35
diff --git a/lyrics_nlp_cleaning.py b/lyrics_nlp_cleaning.py
 # Show the rows whose 'lyrics' value is missing (NaN) before dropping anything.
 # The mask returned by isna() is already boolean, so it is used directly.
 print(df_raw[df_raw['lyrics'].isna()])

 # Drop the rows that contain NaN (dropna() with its default arguments), then
 # print the per-column count of remaining missing values as a sanity check.
 df_valid = df_raw.dropna()
 print(df_valid.isna().sum())

 # Text-cleaning helper applied to the lyrics column of the dataframe.
 def cleaning_text(text):
     """Normalize a lyrics string for later text processing.

     Steps, in order: lowercase the text, delete curly apostrophes, pass it
     through ``unidecode.unidecode``, delete non-empty ``[...]`` and ``(...)``
     spans that do not cross a newline, replace every punctuation character
     except the straight apostrophe with a space, and collapse whitespace.

     Args:
         text: The raw lyrics string.

     Returns:
         The cleaned string, with no leading or trailing whitespace.
     """
     # Character class of all ASCII punctuation except the straight
     # apostrophe, which is kept in the cleaned text.
     punctuation = re.compile(
         '[' + re.escape(string.punctuation.replace("'", '')) + ']'
     )

     # NOTE(review): the curly apostrophe is deleted here while the straight
     # one is preserved, so "don’t" and "don't" clean to different tokens --
     # confirm this is intended.
     text = text.lower().replace('’', '')
     text = unidecode.unidecode(text)
     # Raw strings keep the regex escapes (\[, \(, \s) from being parsed as
     # invalid string escape sequences.
     text = re.sub(r'\[(.+?)\]', '', text)  # drop text between []
     text = re.sub(r'\((.+?)\)', '', text)  # drop text between ()
     text = punctuation.sub(' ', text)  # punctuation -> single space
     text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace runs

     return text

 # Concatenate each artist's lyrics into one string and clean the result.
 # Songs are joined with a space so the last word of one song does not fuse
 # with the first word of the next one.
 df = df_valid.groupby('artist')['lyrics'].agg(' '.join).reset_index()
 df['lyrics_clean'] = df['lyrics'].apply(cleaning_text)
 df.head(3)
	# Show the rows whose 'lyrics' value is missing (NaN) before dropping anything.
	# The mask returned by isna() is already boolean, so it is used directly.
	print(df_raw[df_raw['lyrics'].isna()])

	# Drop the rows that contain NaN (dropna() with its default arguments), then
	# print the per-column count of remaining missing values as a sanity check.
	df_valid = df_raw.dropna()
	print(df_valid.isna().sum())

	# Text-cleaning helper applied to the lyrics column of the dataframe.
	def cleaning_text(text):
		"""Normalize a lyrics string for later text processing.

		Steps, in order: lowercase the text, delete curly apostrophes, pass it
		through ``unidecode.unidecode``, delete non-empty ``[...]`` and ``(...)``
		spans that do not cross a newline, replace every punctuation character
		except the straight apostrophe with a space, and collapse whitespace.

		Args:
			text: The raw lyrics string.

		Returns:
			The cleaned string, with no leading or trailing whitespace.
		"""
		# Character class of all ASCII punctuation except the straight
		# apostrophe, which is kept in the cleaned text.
		punctuation = re.compile(
			'[' + re.escape(string.punctuation.replace("'", '')) + ']'
		)

		# NOTE(review): the curly apostrophe is deleted here while the straight
		# one is preserved, so "don’t" and "don't" clean to different tokens --
		# confirm this is intended.
		text = text.lower().replace('’', '')
		text = unidecode.unidecode(text)
		# Raw strings keep the regex escapes (\[, \(, \s) from being parsed as
		# invalid string escape sequences.
		text = re.sub(r'\[(.+?)\]', '', text)  # drop text between []
		text = re.sub(r'\((.+?)\)', '', text)  # drop text between ()
		text = punctuation.sub(' ', text)  # punctuation -> single space
		text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace runs

		return text

	# Concatenate each artist's lyrics into one string and clean the result.
	# Songs are joined with a space so the last word of one song does not fuse
	# with the first word of the next one.
	df = df_valid.groupby('artist')['lyrics'].agg(' '.join).reset_index()
	df['lyrics_clean'] = df['lyrics'].apply(cleaning_text)
	df.head(3)