Created
May 28, 2019 02:57
-
-
Save antonioFlavio/45a1d32d6d2d85b1f4d86a3e32332c43 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import numpy as np | |
import tensorflow as tf | |
from sklearn.pipeline import Pipeline | |
from sklearn.model_selection import train_test_split | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.linear_model import Perceptron | |
from sklearn.model_selection import GridSearchCV, cross_val_score | |
from sklearn import preprocessing | |
from sklearn.metrics import f1_score | |
def crie_features(row, coluna1, coluna2):
    """Return the zero-safe ratio row[coluna1] / row[coluna2].

    If either value is 0, return 0 instead of dividing (avoids a
    ZeroDivisionError when the denominator is 0).
    """
    numerador = row[coluna1]
    denominador = row[coluna2]
    if numerador == 0 or denominador == 0:
        return 0
    return numerador / denominador
def cri_features_na_base(base):
    """Append engineered features to *base* and return it (mutates in place).

    Each ratio column is crie_features applied row-wise: a zero-safe
    division between two existing numeric columns. The column order
    matches the original hand-written sequence.
    """
    # (new_column, numerator_column, denominator_column)
    razoes = [
        ('balance_by_time', 'Balance', 'Tenure'),
        ('salary_by_age', 'EstimatedSalary', 'Age'),
        ('products_by_month', 'NumOfProducts', 'Tenure'),
        ('balance_by__salary', 'Balance', 'EstimatedSalary'),
        ('balance_by_age', 'Balance', 'Age'),
        ('balance_by_products', 'Balance', 'NumOfProducts'),
    ]
    for nome, num, den in razoes:
        # Bind num/den as lambda defaults so each row-apply keeps its own pair.
        base[nome] = base.apply(lambda x, a=num, b=den: crie_features(x, a, b), axis=1)
    base['score_balance'] = base['CreditScore'] * base['Balance']
    return base
def salve_predicao(df_validacao, previsao,
                   caminho="D:\\Projetos\\RepositoriosGit\\Analise_Churn\\resultado.csv"):
    """Persist predictions as a two-column CSV (RowNumber, Exited).

    Mutates df_validacao by adding/overwriting its 'Exited' column with
    *previsao* (a numpy array, reshaped to one column). *caminho* defaults
    to the original hard-coded output path for backward compatibility.
    """
    df_validacao['Exited'] = previsao.reshape(-1, 1)
    resultado = df_validacao[['RowNumber', 'Exited']].reset_index(drop=True)
    resultado.to_csv(caminho, index=False)
    #resultado.to_csv("D:\\Projetos\\VisualStudioCode\\AnaliseChurn\\resultado.csv",index=False)
def realizeTransformacao_1(df):
    """First transformation pipeline.

    Adds the engineered ratio features, one-hot encodes the categorical
    columns ('Geography', 'Gender', first level dropped) and removes the
    'Surname' column. Returns the transformed frame.
    """
    enriquecido = cri_features_na_base(df)
    codificado = pd.get_dummies(enriquecido, columns=['Geography', 'Gender'], drop_first=True)
    # Column drops tried during experimentation (currently disabled):
    # Geography_Spain, HasCrCard, Gender_Male, Tenure
    return codificado.drop('Surname', axis=1)
def realize_predicao(metodo_transformacao, modelo, features,
                     caminho_validacao='D:\\Projetos\\RepositoriosGit\\Analise_Churn\\valid.csv'):
    """Load the validation set, transform it, predict, and save the result.

    *metodo_transformacao* is a callable DataFrame -> DataFrame, *modelo*
    any fitted estimator with .predict, *features* the column list fed to
    the model. *caminho_validacao* defaults to the original hard-coded
    path for backward compatibility.
    """
    df_validacao = pd.read_csv(caminho_validacao)
    #df_validacao = pd.read_csv('D:\\Projetos\\VisualStudioCode\\AnaliseChurn\\valid.csv')
    df_transformado = metodo_transformacao(df_validacao)
    predicao = modelo.predict(df_transformado[features])
    salve_predicao(df_transformado, predicao)
def f1_score_measure(nome_algoritmo, y_true, y_pred):
    """Print the weighted F1 score for the named algorithm."""
    pontuacao = f1_score(y_true, y_pred, average='weighted')
    print(nome_algoritmo, ": ", pontuacao)
def normalize_base(base, features):
    """Scale each listed column by its maximum value (mutates and returns *base*).

    After scaling, the largest value in each column is 1.0.
    NOTE(review): a column whose max is 0 yields NaN/inf, same as the
    original implementation — callers must avoid all-zero columns.
    """
    for coluna in features:
        base[coluna] = base[coluna] / base[coluna].max()
    return base
# ---- Script body: load the training set, engineer features, normalize,
# ---- and split into train/test partitions.
df = pd.read_csv('D:\\Projetos\\RepositoriosGit\\Analise_Churn\\train.csv')
#df = pd.read_csv('D:\\Projetos\\VisualStudioCode\\AnaliseChurn\\train.csv')
#df = pd.read_csv('train.csv')
# Earlier feature sets kept for reference:
#features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']
#features = ['balance_by__salary','products_by_month','salary_by_age','balance_by_time','CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']
#features = ['balance_by__salary','products_by_month','salary_by_age','balance_by_time','CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'balance_by_age', 'balance_by_products', 'score_balance']
# Current feature set: raw columns plus all engineered ratio features.
features = ['balance_by__salary','products_by_month','salary_by_age','balance_by_time','CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'balance_by_age', 'balance_by_products', 'score_balance', 'Geography_Spain', 'HasCrCard', 'Gender_Male', 'Tenure']
# feature_plot = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'Exited']
# sns.pairplot(df[feature_plot])
# plt.show()
df_2 = realizeTransformacao_1(df)
df_2 = normalize_base(df_2, features)
# NOTE(review): the whole frame (target included) is passed as X here; the
# restriction to `features` below is what keeps Exited out of the inputs.
X_train, X_test, y_train, y_test = train_test_split(df_2, df_2.Exited, test_size = 0.35, random_state= 40)
X_train = X_train[features]
X_test = X_test[features]
def crie_matriz_classificacao_binaria(vetor):
    """One-hot encode binary labels: 1 -> [1, 0], anything else -> [0, 1].

    Returns a list of two-element lists suitable as targets for the
    2-unit softmax output layer.
    """
    return [[1, 0] if valor == 1 else [0, 1] for valor in vetor]
def deserialize_classificacao_binaria(matriz):
    """Collapse two-column scores back to hard labels.

    Each row [p0, p1] maps to 1 when p0 > p1, else 0 (ties -> 0),
    inverting the encoding of crie_matriz_classificacao_binaria.
    """
    return [1 if linha[0] > linha[1] else 0 for linha in matriz]
def rede_keras():
    """Train and evaluate a small Keras feed-forward network on the
    module-level train/test split, then plot the training curves.

    Reads X_train, X_test, y_train, y_test and `features` from module
    scope; prints test metrics and the weighted F1 score.
    """
    X = np.array(X_train)
    # Targets are one-hot encoded ([1,0] for Exited=1, [0,1] otherwise)
    # to match the 2-unit softmax output layer.
    matriz_y = crie_matriz_classificacao_binaria(y_train)
    #y= np.array(y_train)
    y= np.array(matriz_y)
    #y_teste = np.array(y_test)
    matriz_y_test = crie_matriz_classificacao_binaria(y_test)
    y_de_teste = np.array(matriz_y_test)
    X_teste = np.array(X_test)
    # Two-hidden-layer MLP: 56 (elu) -> 28 (relu) -> 2 (softmax).
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(56, input_dim=len(features), activation='elu'))
    model.add(tf.keras.layers.Dense(28, activation='relu'))
    model.add(tf.keras.layers.Dense(2, activation='softmax'))
    # NOTE(review): `sgd` and `RMSprop` are constructed but never used —
    # only Adagrad is compiled into the model. The `lr` keyword is
    # deprecated in newer TF releases in favour of `learning_rate`;
    # confirm against the TF version this project pins.
    sgd = tf.keras.optimizers.SGD(lr=0.001)
    RMSprop = tf.keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
    Adagrad = tf.keras.optimizers.Adagrad(lr=0.01, epsilon=0.00001, decay=0.0)
    # NOTE(review): MSE on one-hot targets trains, but categorical
    # cross-entropy is the conventional loss for a softmax classifier.
    model.compile(loss='mean_squared_error', optimizer=Adagrad, metrics=['mae', 'acc'])
    epochs = 100
    H = model.fit(X, y, batch_size=40, epochs=epochs, verbose=0, validation_data=(X_teste, y_de_teste))
    score = model.evaluate(X_teste, y_de_teste, verbose=1)
    print('Test loss:', score[0])
    print('Test mae:', score[1])
    print('Test accuracy:', score[2])
    model_pred = model.predict(X_teste)
    # F1 is computed on hard labels recovered from the softmax outputs.
    f1_score_measure('Rede Keras: ', deserialize_classificacao_binaria(matriz_y_test), deserialize_classificacao_binaria(model_pred))
    # Plot loss and accuracy over epochs for the train and test datasets.
    plt.figure()
    plt.plot(np.arange(0,epochs), H.history["loss"], label="Perda no treino")
    plt.plot(np.arange(0,epochs), H.history["val_loss"], label="Perda no teste")
    plt.plot(np.arange(0,epochs), H.history["acc"], label="Acurácia de Treino")
    plt.plot(np.arange(0,epochs), H.history["val_acc"], label="Acurácia de teste")
    plt.title("Métricas do modelo")
    plt.xlabel("Épocas #")
    plt.ylabel("Perda/Acurácia")
    plt.legend()
    plt.show()
rede_keras()  # Entry point: train, evaluate, and plot the Keras model.
# Antes de prosseguir com as previsões, vamos realizar uma análise dos dados: | |
# Sobre nossas entradas | |
#CustomerId: Identificador do cliente | |
#Surname: Nome do cliente | |
#CreditScore: Nota de crédito atual | |
#Geography: País | |
#Gender: Gênero | |
#Age: Idade | |
#Tenure: Número de meses que o cliente ficou na empresa | |
#Balance: Ainda não está claro o que é.. | |
#NumOfProducts: Número de produtos | |
#HasCrCard: Indicador de cartão de crédito | |
#IsActiveMember: Se o cliente é ativo no banco | |
#EstimatedSalary: Salário | |
#Exited: Se o cliente saiu ou não. | |
# Nosso alvo é a variável Exited | |
# # Conhecendo nossos dados | |
# print(df.head()) | |
# print(df.describe()) | |
# # Pode-se perceber que é uma base bem tratada, com pouca ou nenhuma variável com problemas. | |
# # Visualizando variáveis categóricas | |
# print(df['Geography'].unique()) | |
# print(df['Gender'].unique()) | |
# # Exploração dos dados | |
# # As classes estão desbalanceadas... | |
# #print(df['Exited'].value_counts()) | |
# # Verificando algumas médias | |
# print("Médias por Exited") | |
# print(df.groupby('Exited').mean()) | |
# # Podemos perceber que a média do saldo dos clientes Alemães é maior, | |
# # mesmo que o salário estimado seja semelhante. Talvez existam outliers. | |
# print("Médias por localização") | |
# print(df.groupby('Geography').mean()) | |
# print("Médias por gênero") | |
# print(df.groupby('Gender').mean()) | |
# print("Médias por idade") | |
# print(df.groupby('Age').mean()) | |
# print("Média por Cartão de Crédito") | |
# print(df.groupby('HasCrCard').mean()) | |
# print("Média por Tenure") | |
# print(df.groupby('Tenure').mean()) | |
# print("Média por IsActiveMember") | |
# print(df.groupby('IsActiveMember').mean()) | |
# # Correlações por localização | |
# # df_germany = df.loc[df['Geography'] == 'France'] | |
# # print(df_germany.corr()) | |
# #'Spain' 'France', Germany | |
# def mostre_crosstab(coluna): | |
# pd.crosstab(df[coluna], df.Exited).plot(kind='bar') | |
# plt.xlabel(coluna) | |
# plt.ylabel('Frequency of Purchase') | |
# plt.savefig('pur_dayofweek_bar') | |
# def mostre_crosstab_formato_diferente(coluna): | |
# table=pd.crosstab(df[coluna],df.Exited) | |
# table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True) | |
# plt.xlabel(coluna) | |
# plt.ylabel('Exited') | |
# plt.savefig('mariral_vs_pur_stack') | |
#mostre_crosstab('Gender') | |
#mostre_crosstab_formato_diferente('Gender') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment