This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# adjust_text function imported from https://github.com/Phlya/adjustText/blob/master/adjustText/__init__.py
# Scatter plot of the 'Unique' column (x axis) against the 'Relative' column (y axis).
x = df['Unique']
y = df['Relative']
# Index labels of the frame, kept for annotating the points.
annot = df.index
fig, ax = plt.subplots(figsize=(12, 15))
ax.scatter(x, y)
# Starts empty; nothing is appended to it in this fragment.
texts = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Path of the current working directory.
path = os.getcwd()
# Name of the folder where the PNG files will be saved.
wordcloud_folder = 'wordcloud'
for artist in df.index:
    # Single .loc[row, column] lookup instead of chained indexing
    # (df.loc[artist]['wordcloud']), which builds an intermediate row first.
    wordcloud = WordCloud(
        background_color="black",
        max_words=500,
        width=1600,
        height=800,
    ).generate(df.loc[artist, 'wordcloud'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Number of distinct tokens per row.
df['Unique'] = df['tokens'].apply(lambda tokens: len(set(tokens)))
# Total number of tokens per row.
df['Total'] = df['tokens'].apply(len)
# Share of distinct tokens. Derived from the two columns above so the set is
# not rebuilt, and so a row with an empty token list yields NaN instead of
# raising ZeroDivisionError.
df['Relative'] = df['Unique'] / df['Total']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch the NLTK stop-word corpus and load the Portuguese list.
nltk.download('stopwords')
stopwords_nltk = nltk.corpus.stopwords.words('portuguese')
# Extra words to treat as stop words in addition to the NLTK list.
update = [
    'tá', 'pra', 'tô', 'cê', 'pro', 'então', 'meu', 'em',
    'você', 'de', 'ao', 'os', 'vou', 'vai', 'vem', 'mim',
    'uns', 'sei', 'quero', 'ser', 'ver', 'aqui', 'faz',
]
# Concatenate the two lists (NLTK entries first, then the extras).
stopwords_raw = [*stopwords_nltk, *update]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split each cleaned lyric string on whitespace into a list of tokens.
df['tokens'] = df['lyrics_clean'].str.split()
# Use the 'artist' column as the index, modifying the frame in place.
df.set_index('artist', inplace=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Show the rows whose 'lyrics' value is NaN (isna() already returns a boolean mask).
print(df_raw[df_raw['lyrics'].isna()])
# Drop the NaN rows. NOTE: dropna() with default arguments removes rows that
# have a NaN in any column, not only in 'lyrics'.
df_valid = df_raw.dropna()
# Remaining NaN count per column.
print(df_valid.isna().sum())
# Criando a função para limpar o dataframe | |
def cleaning_text(text): | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import nltk | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud, ImageColorGenerator | |
import unidecode | |
import re | |
import string | |
import matplotlib.ticker as mtick | |
import os |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class LyricsSpider(scrapy.Spider):
    """Spider named 'rap' that builds one letras.mus.br URL per artist slug."""

    name = 'rap'
    # Whitespace-separated artist slugs. dict.fromkeys drops repeated slugs
    # ('nill' is listed twice) while keeping the original order, so no URL is
    # generated more than once.
    artists = list(dict.fromkeys('3030 1kilo adl-mcs afro-x azzy baco-exu-do-blues bin bk black-alien bnegao c4bal cacife-clandestino cartel-mcs choice chris cone-crew-diretoria coruja-bc1 costa-gold criolo cynthia-luz dalsin delacruz de-leve dfideliz diomedes-chinaski djonga don-l drik-barbosa dudu-mc emicida fabio-brazza faccao-central felp22 flora-matos froid gaab gabriel-pensador gloria-groove haikaiss hungria-hip-hop jaya-luuck je-santiago kamau karol-conka kayua kiaz l7nnon luccas-carlos makalister-renton mano-brown marcelo-d2 matue mc-hariel-sp mc-marechal mc-orochi mr-thug mv-bill nabrisa-tonett negra-li nill nill ogi oriente pele-milflows projota quinto-andar racionais-mcs rael rappin-hood rashid ret rincon-sapiencia sabotage sant shawlin sidoka slim-rimografia speed-freaks tulio-dek ucl xama'.split()))
    # One artist page URL per slug. A comprehension avoids leaving the loop
    # variable behind as a stray class attribute.
    urls = [f'https://www.letras.mus.br/{artist}/' for artist in artists]