Last active
July 16, 2018 16:35
-
-
Save ah89/2a8d2fc2e6b34e5911627eacb699bd18 to your computer and use it in GitHub Desktop.
greedy and decision tree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import copy | |
from sklearn import tree | |
import graphviz | |
class Greedy:
    """Greedy search for subsets in which the given tuple is most outlying.

    The score of a subset is the distance between the given tuple's metric
    value and the subset's mean, measured in subset standard deviations.
    """

    def __init__(self, set_vlaues, given_tuple, metric):
        """Store the search inputs.

        set_vlaues  -- DataFrame holding the full candidate set.
        given_tuple -- one-row DataFrame; its first index label identifies
                       the row that is never removed during the search.
        metric      -- name of the column the score is computed on.
        """
        self.given_tuple = given_tuple
        # The misspelled attribute name is kept: callers read it directly.
        self.set_vlaues = set_vlaues
        self.metric = metric

    def score_m(self, s):
        """Return |tuple value - mean(s)| / std(s) on the metric column.

        Returns float("inf") when the standard deviation of `s` is zero.
        """
        t_m = self.given_tuple[self.metric].values
        S_m = s[self.metric].values
        sd = np.std(S_m)
        if sd == 0:
            return float("inf")
        mean = float(np.sum(S_m)) / S_m.shape[0]
        # t_m holds the single value of the given tuple, so the quotient is
        # a one-element array; .item() extracts it without relying on the
        # deprecated float(<array>) conversion.
        return float((np.abs(t_m - mean) / sd).item())

    def s_stars(self, S, all_sets):
        """Recursively collect locally optimal subsets of `S` in `all_sets`.

        At each step every row except the given tuple is tentatively
        removed. Removals that do not lower the score are candidates, and
        the search recurses into every candidate that reaches the best
        score. When no removal qualifies, `S` is appended to `all_sets`
        unless an equal DataFrame is already stored there.

        S        -- DataFrame to start from (must contain the given tuple).
        all_sets -- list that is extended in place with the results.
        """
        # The protected row is identified from the given tuple itself
        # rather than from a module-level variable.
        target_label = self.given_tuple.index[0]
        # Ordered, de-duplicated labels: .loc needs a list (not a set) and
        # a stable order keeps the DataFrame.equals() check below reliable.
        labels = S.index.unique().tolist()
        base_score = self.score_m(S)

        candidates = []
        scores = []
        for label in labels:
            if label == target_label:
                continue
            remaining = [other for other in labels if other != label]
            score = self.score_m(S.loc[remaining])
            if score >= base_score:
                candidates.append(remaining)
                scores.append(score)

        if candidates:
            best = max(scores)
            for remaining, score in zip(candidates, scores):
                if score == best:
                    self.s_stars(S.loc[remaining], all_sets)
        elif not any(df.equals(S) for df in all_sets):
            all_sets.append(S)
class DecisionTree:
    """Fit decision trees that separate each star set from the other rows.

    For every DataFrame in `starsets` a 0/1 label vector over the rows of
    `data` is built (1 = row belongs to the star set, the given tuple is
    always 0) and a tree can be fitted on the feature columns of `data`.
    """

    def __init__(self, data, starsets, id_col_name, gtuple, cat=0,
                 target_col='survived'):
        """Prepare the feature matrix and the label vectors.

        data        -- DataFrame with one row per item.
        starsets    -- list of DataFrames, each a subset of the rows of `data`.
        id_col_name -- column whose values identify a row across frames.
        gtuple      -- one-row DataFrame holding the given tuple.
        cat         -- 1 to encode every non-id column to integer codes
                       first; any other value uses the columns as they are.
        target_col  -- column excluded from the features together with
                       the id column.
        """
        self.tuple = gtuple
        self.data = data
        self.starsets = starsets
        self.id_col_name = id_col_name
        self.target_col = target_col
        self.columns_name = list(self.data.columns.values)
        # One list of feature columns is shared by the matrix built here
        # and by the names handed to export_graphviz, so they cannot drift.
        self.feature_names = [
            col for col in self.columns_name
            if col not in (target_col, id_col_name)
        ]
        if cat == 1:
            self.conversion_dicts()
            self.data2num()
        else:
            self.x = self.data[self.feature_names].values
        self.create_labels()

    def conversion_dicts(self):
        """Build per-column value<->code mappings (`cat2num`, `num2cat`).

        Codes follow the order of first appearance in the column, which
        makes the encoding reproducible between runs.
        """
        self.cat2num = {}
        self.num2cat = {}
        for col in self.columns_name:
            values = self.data[col].unique().tolist()
            self.cat2num[col] = dict(
                (value, code) for code, value in enumerate(values))
            self.num2cat[col] = dict(enumerate(values))

    def data2num(self):
        """Encode every non-id column with `cat2num` and build `self.x`."""
        self.num_data = self.data.copy()
        for col in self.columns_name:
            if col != self.id_col_name:
                self.num_data[col] = self.data[col].map(self.cat2num[col])
        self.x = self.num_data[self.feature_names].values

    def create_labels(self):
        """Build one 0/1 label list per star set, aligned with rows of `data`.

        Rows are addressed by position (not by index label) so the labels
        stay aligned with `self.x` even when `data` has a non-default index.
        """
        ids = self.data[self.id_col_name].tolist()
        # First occurrence wins, mirroring a "first matching row" lookup.
        position_of_id = {}
        for pos, value in enumerate(ids):
            position_of_id.setdefault(value, pos)
        position_of_label = {}
        for pos, label in enumerate(self.data.index.tolist()):
            position_of_label.setdefault(label, pos)
        tuple_pos = position_of_label.get(self.tuple.index[0])

        self.labels = []
        for df in self.starsets:
            y = [0] * len(ids)
            for value in df[self.id_col_name].tolist():
                y[position_of_id[value]] = 1
            # The given tuple itself is never a positive example.
            if tuple_pos is not None:
                y[tuple_pos] = 0
            self.labels.append(y)

    def decision_tree(self, index):
        """Fit a tree on the labels of star set `index` and export it.

        The DOT description is written to "decisionTree for <index>.dot"
        and also returned wrapped in a graphviz.Source.
        """
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(self.x, self.labels[index])
        # out_file=None makes export_graphviz return the DOT text; with a
        # file name it returns None and there is nothing to give to Source.
        dot_data = tree.export_graphviz(clf,
                                        out_file=None,
                                        feature_names=self.feature_names)
        with open("decisionTree for " + str(index) + ".dot", "w") as dot_file:
            dot_file.write(dot_data)
        return graphviz.Source(dot_data)
# Test driver: load the cleaned Titanic sample, find the star sets for one
# passenger with the greedy search, then export one decision tree per set.
data = pd.read_csv("titanic_clean_10.csv")

# Replace every string column by its integer category codes.
for col_name in data.columns:
    if data[col_name].dtype == 'object':
        data[col_name] = data[col_name].astype('category')
        data[col_name] = data[col_name].cat.codes
# Function-call form of print works on both Python 2 and Python 3.
print(data)

# data = data.drop(["id"])
# data["age"] = pd.Series(pd.cut(data['age'], bins= 5, retbins=False, labels = ["child", "youth", "senior", "old", "very_old"]))
# data["fare"] = pd.Series(pd.cut(data['fare'], bins= 4, retbins=False, labels = ["very_low", "low", "high", "very_high"]))
data = data.drop(['name', 'body', 'boat', 'home.dest', 'cabin', 'ticket', 'sibsp', 'parch'], axis=1)
# data = data.drop(['name', 'body', 'home.dest', 'boat', 'cabin'], axis=1)
data = data[data["survived"].notnull()]
# print(data)

S = copy.copy(data)
# Index label of the row that is treated as the given tuple.
t_index = 3
# Label slice keeps the result a one-row DataFrame instead of a Series.
t = data.loc[t_index:t_index, :]
print(S.index.values)
print(t)

all_stars_sets = []
gr = Greedy(S, t, "survived")
gr.s_stars(gr.set_vlaues, all_stars_sets)
# print(len(all_stars_sets))
# print(all_stars_sets)

dt = DecisionTree(data, all_stars_sets, 'id', t)
for i in range(len(all_stars_sets)):
    dt.decision_tree(i)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment