|
########################################################################################## |
|
# Purpose: Script to create and characterize patient representations. |
|
# version 1.0.0 |
|
########################################################################################## |
|
|
|
# import and load needed scripts |
|
import patient_data |
|
import matplotlib.pyplot as plt |
|
import matplotlib |
|
# matplotlib.style.use('ggplot') |
|
import numpy as np |
|
import pandas as pd |
|
import pickle

import re
|
from scipy import interp |
|
from sklearn.decomposition import TruncatedSVD |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.manifold import TSNE |
|
from sklearn.metrics import roc_curve, auc, confusion_matrix |
|
from sklearn.metrics.pairwise import linear_kernel |
|
import random |
|
from sklearn.cluster import KMeans |
|
from sklearn import metrics |
|
from scipy.spatial.distance import cdist |
|
import collections |
|
from operator import itemgetter |
|
import operator |
|
from sklearn.preprocessing import scale |
|
|
|
|
|
# https://buhrmann.github.io/tfidf-analysis.html |
|
# http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html |
|
|
|
|
|
def query_saver(query_file, file_name): |
|
""" |
|
    Function takes two strings as input, the first is the path/file storing the query and the second is the

    file path where the results should be written. The function runs the query referenced by the first string

    and pickles the results to the location given by the second string.
|
|
|
:param query_file: The path/file storing the query. |
|
:param file_name: The location to write the results. |
|
|
|
:return: |
|
        query_data: A list of lists holding the results of running the query.
|
""" |
|
|
|
    query_data = patient_data.data_query(query_file)
|
outfile = open(file_name, 'wb') |
|
pickle.dump(query_data, outfile) |
|
outfile.close() |
|
print "Query Ran:", str(file_name.split('/')[-1].split('.')[0]) |
|
|
|
return query_data |
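

# a minimal usage sketch (hypothetical paths; assumes patient_data.data_query accepts a query file):

#   query_data = query_saver('Queries/my_query', 'Queries/QueryData/my_query_results')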
|
|
|
|
|
def add_vectors(cond_labels, data1, data2, data3): |
|
""" |
|
    Function takes a pandas data frame of patient labels and three pandas data frames of patient data, where each

    data frame holds a different type of clinical data (e.g. conditions, medications, and labs). With these data,

    the function creates two new lists, where the first list contains patient identifiers that were in all clinical

    data types and the second list contains strings which contain all clinical concepts.



    :param cond_labels: a pandas df where each row holds a patient id and a string of that patient's labels

    :param data1: a pandas df with 'pat_id' and 'pat_conds' columns for clinical data type 1

    :param data2: a pandas df with 'pat_id' and 'pat_conds' columns for clinical data type 2

    :param data3: a pandas df with 'pat_id' and 'pat_conds' columns for clinical data type 3
|
|
|
:return: |
|
pat_ids: A list of patient identifiers that were in all clinical data types. |
|
add_vecs: A list of strings which contain all clinical concepts. |
|
""" |
|
|
|
pat_ids = []; add_vecs = [] |
|
for pat in cond_labels['pat_id']: |
|
d1 = ""; d2 = ""; d3 = "" |
|
if pat in list(data1['pat_id']): idx = data1.index[data1['pat_id'] == pat][0]; d1 = data1['pat_conds'][idx] |
|
if pat in list(data2['pat_id']): idx = data2.index[data2['pat_id'] == pat][0]; d2 = data2['pat_conds'][idx] |
|
if pat in list(data3['pat_id']): idx = data3.index[data3['pat_id'] == pat][0]; d3 = data3['pat_conds'][idx] |
|
vec_sum = " ".join([d1, d2, d3]); pat_ids.append(pat); add_vecs.append(vec_sum) |
|
|
|
    # CHECK - patient ids and vectors align
|
if len(pat_ids) != len(add_vecs): raise ValueError('The number of patients and vectors is not equal') |
|
else: return pat_ids, add_vecs |
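

# a toy example of add_vectors (hypothetical frames using the 'pat_id'/'pat_conds' layout built in db_maker):

#   labels = pd.DataFrame({'pat_id': [1]}); d = pd.DataFrame({'pat_id': [1], 'pat_conds': ['c1 c2']})

#   add_vectors(labels, d, d, d)  # -> ([1], ['c1 c2 c1 c2 c1 c2'])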
|
|
|
|
|
def db_maker(labels, cond, med, lab, file_path): |
|
""" |
|
    Function takes a pandas data frame of patient labels, three query results whose first elements are dicts

    mapping patient ids to concept strings, and a string that stores file path information. Using this

    information the function generates, saves, and returns 4 pandas datasets.



    :param labels: A pandas df containing patient ids and associated condition labels.

    :param cond: A query result whose first element is a dict mapping patient ids to condition concept strings.

    :param med: A query result whose first element is a dict mapping patient ids to medication concept strings.

    :param lab: A query result whose first element is a dict mapping patient ids to lab concept strings.

    :param file_path: A string prefix for the paths where the generated pandas datasets are written.
|
|
|
:return: |
|
cond_db: A Pandas DataFrame of patient condition information. |
|
med_db: A Pandas DataFrame of patient medication information. |
|
lab_db: A Pandas DataFrame of patient lab information. |
|
combo_db: A Pandas DataFrame of patient condition, medication, and lab information. |
|
""" |
|
|
|
cond_db = labels.merge( |
|
pd.DataFrame(dict(pat_id=cond[0].keys(), pat_conds=cond[0].values())), |
|
left_on='pat_id', right_on='pat_id', how='left').dropna(axis=0, how='any').reset_index(drop=True) |
|
lb = cond_db.iloc[:, 0:3].copy() |
|
# medications |
|
med_db = lb.merge( |
|
pd.DataFrame(dict(pat_id=med[0].keys(), pat_conds=med[0].values())), |
|
left_on='pat_id', right_on='pat_id', how='left').dropna(axis=0, how='any').reset_index(drop=True) |
|
# labs |
|
lab_db = lb.merge( |
|
pd.DataFrame(dict(pat_id=lab[0].keys(), pat_conds=lab[0].values())), |
|
left_on='pat_id', right_on='pat_id', how='left').dropna(axis=0, how='any').reset_index(drop=True) |
|
# combine vectors for conditions, medications, and labs |
|
pat_comb = add_vectors(lb, cond_db, med_db, lab_db) |
|
combo_db = lb.merge( |
|
pd.DataFrame(dict(pat_id=pat_comb[0], pat_vecs=pat_comb[1])), |
|
left_on='pat_id', right_on='pat_id', how='left').dropna(axis=0, how='any').reset_index(drop=True) |
|
# save data - raw datasets |
|
cond_db.to_pickle(str(file_path) + "_cond_db"); med_db.to_pickle(str(file_path) + "_med_db") |
|
lab_db.to_pickle(str(file_path) + "_lab_db"); combo_db.to_pickle(str(file_path) + "_combo_db") |
|
|
|
return cond_db, med_db, lab_db, combo_db |
|
|
|
|
|
def patient_bow(corpus): |
|
""" |
|
    Function takes a list of tuples where the first item in each tuple is a patient identifier and the second item

    is a string that represents all concepts for a given clinical feature for that patient. The function uses this

    information to create a count matrix and a TF-IDF transformed matrix, each with one row per patient and one

    column per concept, along with the feature lists and vocabularies for both transformations.



    :param corpus: A list of tuples where the first item in each tuple is a patient identifier and the second

    item is a string that represents all concepts for a given clinical feature for that patient.



    :return:

        count_matrix: A matrix where each row represents a patient and each column represents a concept.

        tfidf_matrix: A matrix where each row represents a patient and each column represents a concept and

        counts are weighted by TF-IDF.

        concepts: A list of the unique features in the TF-IDF matrix.

        feature_dict: A dict mapping each TF-IDF feature to its column index.

        concepts_cnt: A list of the unique features in the count matrix.

        feature_dict_cnt: A dict mapping each count feature to its column index.
|
""" |
|
|
|
# STEP 1: create vectorized representation of patient concepts |
|
count_vector = CountVectorizer(stop_words=None, lowercase=True) |
|
count_matrix = count_vector.fit_transform([content for _, content in corpus]) |
|
    # count_matrix.shape  # should be number of patients by number of unique dx codes
|
# count_matrix.toarray() |
|
|
|
# STEP 2: TF-IDF transformation + L2 normalization |
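    # with the sklearn defaults used here (smooth_idf=True, sublinear_tf=False), each count is weighted as

    #   tfidf(t, d) = tf(t, d) * (ln((1 + n_docs) / (1 + df(t))) + 1)

    # and each patient row is then rescaled to unit L2 norm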
|
tfidf_transform = TfidfVectorizer(analyzer='word', use_idf=True, norm='l2', stop_words=None, lowercase=True) |
|
    tfidf_matrix = tfidf_transform.fit_transform([content for _, content in corpus])
|
# tfidf_matrix.shape |
|
|
|
# get location of each feature as well as a list of unique features |
|
feature_dict_cnt = count_vector.vocabulary_ |
|
concepts_cnt = count_vector.get_feature_names() |
|
feature_dict = tfidf_transform.vocabulary_ |
|
concepts = tfidf_transform.get_feature_names() |
|
|
|
return count_matrix, tfidf_matrix, concepts, feature_dict, concepts_cnt, feature_dict_cnt |
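

# a toy run of patient_bow (hypothetical two-patient corpus):

#   counts, tfidf, concepts, _, _, _ = patient_bow([(1, 'dx1 dx2'), (2, 'dx2 dx3')])

#   counts.shape  # -> (2, 3): two patients by three unique concepts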
|
|
|
|
|
def similarity_search(tfidf_matrix, index_patient, top_n): |
|
""" |
|
    Function takes as input a tfidf matrix, an integer representing the row index of the index patient, and an

    integer representing the number of similar patients to return. The function calculates the cosine similarity

    between the index patient and all other included patients, sorts the results, and returns the top_n most

    similar patients as a list of tuples of patient row index and cosine similarity score.
|
|
|
    :param tfidf_matrix: A matrix where each row represents a patient and each column represents a concept and

    counts are weighted by TF-IDF.

    :param index_patient: An integer representing the row index of the index patient in the tfidf matrix.

    :param top_n: An integer representing the number of similar patients to return.



    :return:

        similar_patients: A list of tuples, where each tuple contains a patient row index and the cosine

        similarity score between that patient and the index patient, limited to the top_n most similar.
|
""" |
|
|
|
# http://markhneedham.com/blog/2016/07/27/scitkit-learn-tfidf-and-cosine-similarity- |
|
|
|
# calculate similarity |
|
cosine_similarities = linear_kernel(tfidf_matrix[index_patient:index_patient + 1], tfidf_matrix).flatten() |
|
rel_pat_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index_patient] |
|
similar_patients = [(patient, cosine_similarities[patient]) for patient in rel_pat_indices][0:top_n] |
|
|
|
return similar_patients |
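

# because patient_bow L2-normalizes the TF-IDF rows, the linear kernel above equals cosine similarity;

# a hypothetical call against the matrix returned by patient_bow:

#   similarity_search(tfidf_matrix, index_patient=0, top_n=3)  # -> e.g. [(12, 0.83), (7, 0.79), (3, 0.55)]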
|
|
|
|
|
def similarity_test(tfidf_matrix, test_data1, index_id, cond_type): |
|
""" |
|
    Function takes a matrix of counts weighted with tf-idf, a pandas data frame containing patient ids, labels,

    and patient conditions, a number representing the identifier of the index patient, and a string representing

    the disease.



    :param tfidf_matrix: A matrix of counts weighted with tf-idf.

    :param test_data1: A pandas data frame containing patient ids, labels, and patient conditions.

    :param index_id: A number representing the identifier of the index patient.

    :param cond_type: A string representing the disease.
|
|
|
:return: |
|
p_sim: Pandas DataFrame for index patient including condition labels and similarity scores. |
|
""" |
|
|
|
# get index of id - used to pull correct row from tf-idf matrix |
|
index_pat = test_data1.index[test_data1.pat_id == index_id][0] |
|
    ids = []; cond = []; scores = []

    for index, score in similarity_search(tfidf_matrix, index_pat, len(test_data1) - 1):

        ids.append(test_data1['pat_id'][index])

        conds_list = test_data1['pat_label'][test_data1.index[test_data1.pat_id == test_data1['pat_id'][index]]]

        if list(conds_list)[0] == cond_type: cond.append(1.0)

        else: cond.append(0.0)

        scores.append(score)

    p_sim = pd.DataFrame(dict(pat_id=ids, conds=cond, score=scores))
|
|
|
# verify that the number of rows is correct |
|
if len(test_data1) - 1 != len(p_sim): raise ValueError('Output file is the wrong length') |
|
else: return p_sim |
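

# a hypothetical call, assuming test_data carries 'pat_id'/'pat_label' columns aligned to the tfidf matrix rows:

#   p_sim = similarity_test(tfidf_matrix, test_data, index_id=1234, cond_type='CF')

#   p_sim has one row per comparison patient: pat_id, conds (1.0 if labeled cond_type), and score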
|
|
|
|
|
def youden_index(data_frame): |
|
""" |
|
    Function takes a data frame as input, fits a logistic regression, selects the probability cutoff that

    maximizes the Youden index, and derives confusion matrix information at that cutoff.



    :param data_frame: Pandas DataFrame with 'conds' (labels) and 'score' (similarity) columns.



    :return:

        [fpr, tpr]: The false positive and true positive rates across the ROC thresholds.

        tn, fp, fn, tp: The cells of the confusion matrix at the selected cutoff.
|
""" |
|
|
|
# create a clean data frame for the regression |
|
    cols_to_keep = ['conds', 'score']; data = data_frame[cols_to_keep].copy()
|
data['intercept'] = 1.0 # manually add the intercept |
|
train_cols = data.columns[1:] |
|
# run regression |
|
model = LogisticRegression().fit(data[train_cols], data['conds']) |
|
data['pred_probs'] = model.predict_proba(data[train_cols])[:, 1] |
|
# identify thresholds |
|
fpr, tpr, thresholds = roc_curve(data['conds'], data['pred_probs']) |
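    # Youden index: J = sensitivity + specificity - 1 = tpr - fpr; the threshold chosen below is the

    # predicted-probability cutoff that maximizes J along the ROC curve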
|
threshold = pd.Series(tpr - fpr, index=thresholds).idxmax() |
|
# add threshold to data |
|
data['pred'] = data['pred_probs'].map(lambda x: 1.0 if x > threshold else 0.0) |
|
# calculate confusion matrix (*by 1.0 to add decimal point to numbers) |
|
tn, fp, fn, tp = (confusion_matrix(data['conds'], data['pred']).ravel()) * 1.0 |
|
|
|
return [fpr, tpr], tn, fp, fn, tp |
|
|
|
|
|
def top_tfidf_feats(row_id, data, row, features, top_n=25): |
|
"""Get top n tfidf values in row and return them with their corresponding feature names.""" |
|
|
|
topn_ids = np.argsort(row)[::-1][:top_n] |
|
top_feats = [(features[i], row[i]) for i in topn_ids] |
|
df = pd.DataFrame(top_feats) |
|
df.columns = [str(data['pat_id'][row_id]) + "_" + str(data['pat_label'][row_id]), 'tfidf'] |
|
|
|
return df |
|
|
|
|
|
def top_feats_in_doc(data, xtr, features, row_id, top_n=25): |
|
"""Top tfidf features in specific document (matrix row) """ |
|
|
|
row = np.squeeze(xtr[row_id].toarray()) |
|
return top_tfidf_feats(row_id, data, row, features, top_n) |
|
|
|
|
|
def top_mean_feats(xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):

    """Return the top n features that on average are most important amongst documents in rows identified by indices

    in grp_ids."""



    if grp_ids is not None: d = xtr[grp_ids].toarray()

    else: d = xtr.toarray()

    d[d < min_tfidf] = 0

    tfidf_means = np.mean(d, axis=0)

    # build the result frame directly; top_tfidf_feats labels its columns by patient, which does not apply here

    topn_ids = np.argsort(tfidf_means)[::-1][:top_n]

    df = pd.DataFrame([(features[i], tfidf_means[i]) for i in topn_ids], columns=['feature', 'tfidf'])



    return df
|
|
|
|
|
def top_feats_by_class(xtr, y, features, min_tfidf=0.1, top_n=25): |
|
"""Return a list of dfs, where each df holds top_n features and their mean tfidf value calculated across |
|
documents with the same class label.""" |
|
|
|
dfs = [] |
|
labels = np.unique(y) |
|
|
|
for label in labels: |
|
        ids = np.where(y == label)[0]
|
feats_df = top_mean_feats(xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n) |
|
        feats_df['label'] = label
|
dfs.append(feats_df) |
|
|
|
return dfs |
|
|
|
|
|
def plot_tfidf_classfeats_h(dfs): |
|
"""Plot the data frames returned by the function plot_tfidf_classfeats().""" |
|
|
|
fig = plt.figure(figsize=(50, 10)) |
|
x = np.arange(len(dfs[0])) |
|
|
|
for i, df in enumerate(dfs): |
|
ax = fig.add_subplot(1, len(dfs), i + 1) |
|
ax.spines["top"].set_visible(False) |
|
ax.spines["right"].set_visible(False) |
|
ax.set_frame_on(False) |
|
ax.get_xaxis().tick_bottom() |
|
ax.get_yaxis().tick_left() |
|
ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14) |
|
ax.set_title("label = " + str(df.label), fontsize=16) |
|
ax.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2)) |
|
ax.barh(x, df.tfidf, align='center', color='#3F5D7D') |
|
ax.set_yticks(x) |
|
ax.set_ylim([-1, x[-1] + 1]) |
|
ax.set_yticklabels(df.feature, fontsize=10) |
|
plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52) |
|
plt.show() |
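

# a hypothetical end-to-end use of the TF-IDF inspection helpers above, assuming tfidf_matrix and features

# come from patient_bow and y holds one class label per matrix row:

#   dfs = top_feats_by_class(tfidf_matrix, y, features, top_n=10)

#   plot_tfidf_classfeats_h(dfs)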
|
|
|
|
|
def label_remover(test_data, cut_list, col_list, extra_strings): |
|
"""Function removes any codes or labels that should be excluded from the datasets.""" |
|
|
|
cut_list = [a for b in cut_list for a in b] |
|
for row in range(len(test_data)): |
|
for col in col_list: |
|
clean_row = [word for word in test_data[col][row].split(' ') if word not in cut_list] |
|
idx = [[i for i, item in enumerate(clean_row) if re.search(s.lower(), item.lower())] for s in extra_strings] |
|
cleaned_row = [i for i in clean_row if i not in [clean_row[x] for y in idx for x in y]] |
|
test_data.iloc[row, test_data.columns.get_loc(col)] = ' '.join(set(cleaned_row)) |
|
|
|
return test_data |
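

# a toy example of label_remover (hypothetical frame; exact matches to cut_list codes and regex matches

# to extra_strings are both dropped):

#   df = pd.DataFrame({'pat_vecs': ['123 456 cystic_fibrosis_lung']})

#   label_remover(df, [['456']], ['pat_vecs'], ['cystic_fibrosis'])['pat_vecs'][0]  # -> '123'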
|
|
|
def model_metrics(tn, fp, fn, tp): |
|
"""Function calculates performance metrics.""" |
|
|
|
    specificity = tn / (tn + fp)
|
recall = tp / (tp + fn) |
|
precision = tp / (tp + fp) |
|
fpr = 1 - (tn / (tn + fp)) |
|
accuracy = (tp + tn) / (tp + fp + fn + tn) |
|
f1_score = (2 * tp / (2 * tp + fp + fn)) |
|
|
|
    return specificity, recall, precision, fpr, accuracy, f1_score
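

# a worked example with hypothetical counts tn=80, fp=20, fn=10, tp=90:

#   specificity = 80/100 = 0.80, recall = 90/100 = 0.90, precision = 90/110 ~ 0.818

#   fpr = 0.20, accuracy = 170/200 = 0.85, f1_score = 180/210 ~ 0.857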
|
|
|
|
|
|
|
|
|
|
def main(): |
|
|
|
##################################################################################################### |
|
#### Read in Clinical Data from GBQ Query #### |
|
##################################################################################################### |
|
# labels |
|
case_labels = pickle.load(open('Queries/Test_Queries/QueryData/PatSim_RareDisease_Cases2', 'rb')) |
|
    rand_labels = pickle.load(open('Queries/Test_Queries/QueryData/PatSim_RareDisease_Random2', 'rb'))
|
# demographics |
|
    case_demo = pickle.load(open('Queries/Test_Queries/QueryData/PatSim_RareDisease_Cases_Demographics', 'rb'))
|
rand_demo = pickle.load(open('Queries/Test_Queries/QueryData/PatSim_RareDisease_Random_Demographics', 'rb')) |
|
# conditions |
|
    case_cond_c = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Cases2_Condition_Codes', 'rb'))
|
case_cond_txt = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Cases2_Condition_Source', 'rb')) |
|
rand_cond_c = pickle.load(open('Queries/Test_Queries/QueryData/PatSim_RareDisease_Random2_Condition_Codes', 'rb')) |
|
rand_cond_txt = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Random2_Condition_Source', 'rb')) |
|
# medications |
|
case_med_c = pickle.load(open('Queries/Test_Queries/QueryData/PatSim_RareDisease_Cases2_Medication_Codes', 'rb')) |
|
case_med_txt = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Cases2_Medication_Source', 'rb')) |
|
rand_med_c = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Random2_Medication_Codes', 'rb')) |
|
rand_med_txt = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Random2_Medication_Source', 'rb')) |
|
# labs |
|
case_lab_c = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Cases2_Lab_Codes', 'rb')) |
|
case_lab_txt = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Cases2_Lab_Source', 'rb')) |
|
rand_lab_c = pickle.load(open('Queries/Test_Queries/QueryData/PatSim_RareDisease_Random2_Lab_Codes', 'rb')) |
|
rand_lab_txt = pickle.load(open('../Queries/Test_Queries/QueryData/PatSim_RareDisease_Random2_Lab_Source', 'rb')) |
|
|
|
|
|
|
|
##################################################################################################### |
|
#### Process Clinical Data #### |
|
##################################################################################################### |
|
### STEP 1: Identify duplicate patients (this needs to include identifying the min number of patients with each concept) |
|
case_labels_dedup = patient_data.duplicate_identifier(case_labels, 'YES', 'YES') |
|
rand_labels_dedup = patient_data.duplicate_identifier(rand_labels, 'YES') |
|
# demographics |
|
case_demo = patient_data.patient_demo(case_demo, case_labels_dedup) |
|
rand_demo = patient_data.patient_demo(rand_demo, rand_labels_dedup) |
|
|
|
|
|
### STEP 2: Process data for concept labels |
|
# conditions |
|
case_cond_text = patient_data.patient_concepts(case_cond_txt, case_labels_dedup, 'CASE', 'TEXT', 'COND') |
|
rand_cond_text = patient_data.patient_concepts(rand_cond_txt, rand_labels_dedup, 'RAND', 'TEXT', 'COND') |
|
# medications |
|
case_med_txt = patient_data.patient_concepts(case_med_txt, case_labels_dedup, 'CASE', 'TEXT', 'MED') |
|
rand_med_text = patient_data.patient_concepts(rand_med_txt, rand_labels_dedup, 'RAND', 'TEXT', 'MED') |
|
# labs |
|
case_lab_txt = patient_data.patient_concepts(case_lab_txt, case_labels_dedup, 'CASE', 'TEXT', 'LAB') |
|
rand_lab_text = patient_data.patient_concepts(rand_lab_txt, rand_labels_dedup, 'RAND', 'TEXT', 'LAB') |
|
|
|
## tidy, filter, and merge data |
|
# CASES |
|
# conditions |
|
case_cond_text.concept_text = case_cond_text.concept_text.astype(str) |
|
case_dx_visits = list(set(list(case_cond_text['visit_id']))) |
|
cond_agg = case_cond_text.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
cond_agg = pd.DataFrame({'pat_id': cond_agg.index, 'pat_conds': cond_agg.values}) |
|
case_cond_txt = cond_agg.merge(case_cond_text[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_cond_txt.drop_duplicates(keep='first', inplace=True) |
|
case_cond_txt = case_cond_txt.reset_index(drop=True) |
|
case_cond_txt.groupby(by=['pat_label']).size() |
|
# medications |
|
case_med_txt.concept_text = case_med_txt.concept_text.astype(str) |
|
med_filt = case_med_txt[case_med_txt['visit_id'].isin(case_dx_visits)] |
|
med_agg = med_filt.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
med_agg = pd.DataFrame({'pat_id': med_agg.index, 'pat_conds': med_agg.values}) |
|
case_med_txt = med_agg.merge(med_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_med_txt.drop_duplicates(keep='first', inplace=True) |
|
case_med_txt = case_med_txt.reset_index(drop=True) |
|
case_med_txt.groupby(by=['pat_label']).size() |
|
# labs |
|
case_lab_txt.concept_text = case_lab_txt.concept_text.astype(str) |
|
lab_filt = case_lab_txt[case_lab_txt['visit_id'].isin(case_dx_visits)] |
|
lab_agg = lab_filt.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
lab_agg = pd.DataFrame({'pat_id': lab_agg.index, 'pat_conds': lab_agg.values}) |
|
case_lab_txt = lab_agg.merge(lab_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_lab_txt.drop_duplicates(keep='first', inplace=True) |
|
case_lab_txt = case_lab_txt.reset_index(drop=True) |
|
case_lab_txt.groupby(by=['pat_label']).size() |
|
# combine data types |
|
combo_txt = case_cond_txt.merge(case_lab_txt[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_txt.rename(columns={'pat_conds_x': 'cond_vecs', 'pat_conds_y': 'lab_vecs'}, inplace=True) |
|
combo_txt = combo_txt.merge(case_med_txt[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_txt.rename(columns={'pat_conds': 'med_vecs'}, inplace=True) |
|
combo_txt = combo_txt.replace(np.nan, '', regex=True) |
|
# add demo info |
|
combo_txt = combo_txt.merge(case_demo, left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
# combine conds into 1 column |
|
combo_txt["pat_vecs"] = combo_txt["cond_vecs"] + " " + combo_txt["med_vecs"] + " " + combo_txt["lab_vecs"] |
|
combo_txt["pat_vecs3"] = combo_txt['dob'] + " " + combo_txt['gender'] + " " + combo_txt['race'] + " " + combo_txt["cond_vecs"] + " " + combo_txt["med_vecs"] + " " + combo_txt["lab_vecs"] |
|
combo_txt["pat_vecs2"] = combo_txt['gender'] + " " + combo_txt['race'] + " " + combo_txt["cond_vecs"] + " " + combo_txt["med_vecs"] + " " + combo_txt["lab_vecs"] |
|
combo_txt["cond_vecs2"] = combo_txt['gender'] + " " + combo_txt['race'] + " " + combo_txt["cond_vecs"] |
|
combo_txt["med_vecs2"] = combo_txt['gender'] + " " + combo_txt['race'] + " " + combo_txt["med_vecs"] |
|
combo_txt["lab_vecs2"] = combo_txt['gender'] + " " + combo_txt['race'] + " " + combo_txt["lab_vecs"] |
|
combo_txt["cond_vecs3"] = combo_txt['dob'] + " " + combo_txt['gender'] + " " + combo_txt['race'] + " " + combo_txt["cond_vecs"] |
|
combo_txt["med_vecs3"] = combo_txt['dob'] + " " + combo_txt['gender'] + " " + combo_txt['race'] + " " + combo_txt["med_vecs"] |
|
combo_txt["lab_vecs3"] = combo_txt['dob'] + " " + combo_txt['gender'] + " " + combo_txt['race'] + " " + combo_txt["lab_vecs"] |
|
# save data |
|
combo_txt.to_pickle("Data/PatSim_test/Rare_case_all_text_df") |
|
|
|
# CONTROLS |
|
# conditions |
|
rand_cond_text.concept_text = rand_cond_text.concept_text.astype(str) |
|
cond_agg = rand_cond_text.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
cond_agg = pd.DataFrame({'pat_id': cond_agg.index, 'pat_conds': cond_agg.values}) |
|
rand_cond_text = cond_agg.merge(rand_cond_text[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
rand_cond_text.drop_duplicates(keep='first', inplace=True) |
|
rand_cond_text = rand_cond_text.reset_index(drop=True) |
|
rand_cond_text.groupby(by=['pat_label']).size() |
|
# medications |
|
rand_med_text.concept_text = rand_med_text.concept_text.astype(str) |
|
med_agg = rand_med_text.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
med_agg = pd.DataFrame({'pat_id': med_agg.index, 'pat_conds': med_agg.values}) |
|
rand_med_text = med_agg.merge(rand_med_text[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
rand_med_text.drop_duplicates(keep='first', inplace=True) |
|
rand_med_text = rand_med_text.reset_index(drop=True) |
|
rand_med_text.groupby(by=['pat_label']).size() |
|
# labs |
|
rand_lab_text.concept_text = rand_lab_text.concept_text.astype(str) |
|
lab_agg = rand_lab_text.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
lab_agg = pd.DataFrame({'pat_id': lab_agg.index, 'pat_conds': lab_agg.values}) |
|
rand_lab_text = lab_agg.merge(rand_lab_text[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
rand_lab_text.drop_duplicates(keep='first', inplace=True) |
|
# rand_lab_text = rand_lab_text.sample(n=10000, replace=False).reset_index(drop=True) |
|
rand_lab_text.groupby(by=['pat_label']).size() |
|
# add demo info |
|
rand_all = rand_cond_text.merge(rand_med_text[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
rand_all.rename(columns={'pat_conds_x': 'cond_vecs', 'pat_conds_y': 'med_vecs'}, inplace=True) |
|
rand_all = rand_all.merge(rand_lab_text[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
rand_all.rename(columns={'pat_conds': 'lab_vecs'}, inplace=True) |
|
rand_all = rand_all.replace(np.nan, '', regex=True) |
|
rand_all = rand_all.merge(rand_demo, left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
# combine conds into 1 column |
|
rand_all["pat_vecs"] = rand_all["cond_vecs"] + " " + rand_all["med_vecs"] + " " + rand_all["lab_vecs"] |
|
rand_all["pat_vecs2"] = rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["cond_vecs"] + " " + rand_all["med_vecs"] + " " + rand_all["lab_vecs"] |
|
rand_all["cond_vecs2"] = rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["cond_vecs"] |
|
rand_all["med_vecs2"] = rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["med_vecs"] |
|
rand_all["lab_vecs2"] = rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["lab_vecs"] |
|
rand_all["pat_vecs3"] = rand_all['dob'] + " " + rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["cond_vecs"] + " " + rand_all["med_vecs"] + " " + rand_all["lab_vecs"] |
|
rand_all["cond_vecs3"] = rand_all['dob'] + " " + rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["cond_vecs"] |
|
rand_all["med_vecs3"] = rand_all['dob'] + " " + rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["med_vecs"] |
|
rand_all["lab_vecs3"] = rand_all['dob'] + " " + rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["lab_vecs"] |
|
|
|
# combine and save data |
|
combo_txt = pd.concat([combo_txt, rand_all.sample(n=10000, replace=False)]).reset_index(drop=True) |
|
combo_txt.to_pickle("Data/PatSim_test/Rare_combo_all_text_df") |
|
|
|
    ### STEP 2 B: Restrict to concepts recorded BEFORE the labeled dx occurred
|
c_grp = case_cond_txt.groupby(by=['pat_label']) |
|
c_grp = c_grp.apply(lambda g: g[pd.to_datetime(g['cond_date']) <= pd.to_datetime(g['cond_start_date'])]) |
|
cond_before_dx = c_grp.reset_index(drop=True) |
|
cond_before_dx.groupby(by=['pat_label']).size() |
|
before_dx_visits = list(set(list(c_grp['visit_id']))) |
|
# conditions |
|
cond_before_dx.concept_text = cond_before_dx.concept_text.astype(str) |
|
cond_agg = cond_before_dx.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
cond_agg = pd.DataFrame({'pat_id': cond_agg.index, 'pat_conds': cond_agg.values}) |
|
case_cond_txt_B_dx = cond_agg.merge(cond_before_dx[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_cond_txt_B_dx.drop_duplicates(keep='first', inplace=True) |
|
case_cond_txt_B_dx = case_cond_txt_B_dx.reset_index(drop=True) |
|
case_cond_txt_B_dx.groupby(by=['pat_label']).size() |
|
# medications |
|
case_med_txt.concept_text = case_med_txt.concept_text.astype(str) |
|
med_filt = case_med_txt[case_med_txt['visit_id'].isin(before_dx_visits)] |
|
med_agg = med_filt.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
med_agg = pd.DataFrame({'pat_id': med_agg.index, 'pat_conds': med_agg.values}) |
|
case_med_txt_B_dx = med_agg.merge(med_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_med_txt_B_dx.drop_duplicates(keep='first', inplace=True) |
|
case_med_txt_B_dx = case_med_txt_B_dx.reset_index(drop=True) |
|
case_med_txt_B_dx.groupby(by=['pat_label']).size() |
|
# measurements |
|
case_lab_txt.concept_text = case_lab_txt.concept_text.astype(str) |
|
lab_filt = case_lab_txt[case_lab_txt['visit_id'].isin(before_dx_visits)] |
|
lab_agg = lab_filt.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
lab_agg = pd.DataFrame({'pat_id': lab_agg.index, 'pat_conds': lab_agg.values}) |
|
case_lab_txt_B_dx = lab_agg.merge(lab_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_lab_txt_B_dx.drop_duplicates(keep='first', inplace=True) |
|
case_lab_txt_B_dx = case_lab_txt_B_dx.reset_index(drop=True) |
|
case_lab_txt_B_dx.groupby(by=['pat_label']).size() |
|
|
|
    ### STEP 2 C: Restrict to concepts recorded AFTER the labeled dx occurred
|
c_grp = case_cond_txt.groupby(by=['pat_id']) |
|
c_grp = c_grp.apply(lambda g: g[pd.to_datetime(g['cond_date']) >= pd.to_datetime(g['cond_start_date'])]) |
|
cond_after_dx = c_grp.reset_index(drop=True) |
|
after_dx_visits = list(set(list(c_grp['visit_id']))) |
|
# conditions |
|
cond_after_dx.concept_text = cond_after_dx.concept_text.astype(str) |
|
cond_agg = cond_after_dx.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
cond_agg = pd.DataFrame({'pat_id': cond_agg.index, 'pat_conds': cond_agg.values}) |
|
case_cond_txt_A_dx = cond_agg.merge(cond_after_dx[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_cond_txt_A_dx.drop_duplicates(keep='first', inplace=True) |
|
case_cond_txt_A_dx = case_cond_txt_A_dx.reset_index(drop=True) |
|
case_cond_txt_A_dx.groupby(by=['pat_label']).size() |
|
# medications |
|
case_med_txt.concept_text = case_med_txt.concept_text.astype(str) |
|
med_filt = case_med_txt[case_med_txt['visit_id'].isin(after_dx_visits)] |
|
med_agg = med_filt.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
med_agg = pd.DataFrame({'pat_id': med_agg.index, 'pat_conds': med_agg.values}) |
|
case_med_txt_A_dx = med_agg.merge(med_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_med_txt_A_dx.drop_duplicates(keep='first', inplace=True) |
|
case_med_txt_A_dx = case_med_txt_A_dx.reset_index(drop=True) |
|
case_med_txt_A_dx.groupby(by=['pat_label']).size() |
|
# labs |
|
case_lab_txt.concept_text = case_lab_txt.concept_text.astype(str) |
|
lab_filt = case_lab_txt[case_lab_txt['visit_id'].isin(after_dx_visits)] |
|
lab_agg = lab_filt.groupby('pat_id')['concept_text'].agg(lambda col: ' '.join(col)) |
|
lab_agg = pd.DataFrame({'pat_id': lab_agg.index, 'pat_conds': lab_agg.values}) |
|
case_lab_txt_A_dx = lab_agg.merge(lab_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_lab_txt_A_dx.drop_duplicates(keep='first', inplace=True) |
|
case_lab_txt_A_dx = case_lab_txt_A_dx.reset_index(drop=True) |
|
case_lab_txt_A_dx.groupby(by=['pat_label']).size() |
|
|
|
### STEP 2 D: Combine concept sets |
|
## combine data types - before |
|
combo_txt_B_dx = case_cond_txt_B_dx.merge(case_lab_txt_B_dx[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_txt_B_dx.rename(columns={'pat_conds_x': 'cond_vecs', 'pat_conds_y': 'lab_vecs'}, inplace=True) |
|
combo_txt_B_dx = combo_txt_B_dx.merge(case_med_txt_B_dx[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_txt_B_dx.rename(columns={'pat_conds': 'med_vecs'}, inplace=True) |
|
combo_txt_B_dx = combo_txt_B_dx.replace(np.nan, '', regex=True) |
|
# add demo info |
|
combo_txt_B_dx = combo_txt_B_dx.merge(case_demo, left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
# combine conds into 1 column |
|
combo_txt_B_dx["pat_vecs"] = combo_txt_B_dx["cond_vecs"] + " " + combo_txt_B_dx["med_vecs"] + " " + combo_txt_B_dx[ "lab_vecs"] |
|
combo_txt_B_dx["pat_vecs2"] = combo_txt_B_dx['dob'] + " " + combo_txt_B_dx['gender'] + " " + combo_txt_B_dx['race'] + " " + combo_txt_B_dx["cond_vecs"] + " " + combo_txt_B_dx["med_vecs"] + " " + combo_txt_B_dx["lab_vecs"] |
|
combo_txt_B_dx["cond_vecs2"] = combo_txt_B_dx['dob'] + " " + combo_txt_B_dx['gender'] + " " + combo_txt_B_dx['race'] + " " + combo_txt_B_dx["cond_vecs"] |
|
combo_txt_B_dx["med_vecs2"] = combo_txt_B_dx['dob'] + " " + combo_txt_B_dx['gender'] + " " + combo_txt_B_dx['race'] + " " + combo_txt_B_dx["med_vecs"] |
|
combo_txt_B_dx["lab_vecs2"] = combo_txt_B_dx['dob'] + " " + combo_txt_B_dx['gender'] + " " + combo_txt_B_dx['race'] + " " + combo_txt_B_dx["lab_vecs"] |
|
combo_txt_B_dx.to_pickle("Data/PatSim_test/case_all_text_B_filtered_df") |
|
|
|
# merge with controls |
|
combo_txt_B_dx = pd.concat([combo_txt_B_dx, rand_all.sample(n=10000, replace=False)]).reset_index(drop=True) |
|
combo_txt_B_dx.to_pickle("Data/PatSim_test/comb_all_text_B_filtered_df") |
|
|
|
## combine data types - after |
|
combo_txt_A_dx = case_cond_txt_A_dx.merge(case_lab_txt_A_dx[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_txt_A_dx.rename(columns={'pat_conds_x': 'cond_vecs', 'pat_conds_y': 'lab_vecs'}, inplace=True) |
|
combo_txt_A_dx = combo_txt_A_dx.merge(case_med_txt_A_dx[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_txt_A_dx.rename(columns={'pat_conds': 'med_vecs'}, inplace=True) |
|
combo_txt_A_dx = combo_txt_A_dx.replace(np.nan, '', regex=True) |
|
# add demo info |
|
combo_txt_A_dx = combo_txt_A_dx.merge(case_demo, left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
# combine conds into 1 column |
|
combo_txt_A_dx["pat_vecs"] = combo_txt_A_dx["cond_vecs"] + " " + combo_txt_A_dx["med_vecs"] + " " + combo_txt_A_dx["lab_vecs"] |
|
combo_txt_A_dx["pat_vecs2"] = combo_txt_A_dx['dob'] + " " + combo_txt_A_dx['gender'] + " " + combo_txt_A_dx['race'] + " " + combo_txt_A_dx["cond_vecs"] + " " + combo_txt_A_dx["med_vecs"] + " " + combo_txt_A_dx[ "lab_vecs"] |
|
combo_txt_A_dx["cond_vecs2"] = combo_txt_A_dx['dob'] + " " + combo_txt_A_dx['gender'] + " " + combo_txt_A_dx['race'] + " " + combo_txt_A_dx["cond_vecs"] |
|
combo_txt_A_dx["med_vecs2"] = combo_txt_A_dx['dob'] + " " + combo_txt_A_dx['gender'] + " " + combo_txt_A_dx['race'] + " " + combo_txt_A_dx["med_vecs"] |
|
combo_txt_A_dx["lab_vecs2"] = combo_txt_A_dx['dob'] + " " + combo_txt_A_dx['gender'] + " " + combo_txt_A_dx['race'] + " " + combo_txt_A_dx["lab_vecs"] |
|
combo_txt_A_dx.to_pickle("Data/PatSim_test/case_all_text_A_filtered_df") |
|
|
|
# merge with controls |
|
combo_txt_A_dx = pd.concat([combo_txt_A_dx, rand_all.sample(n=10000, replace=False)]).reset_index(drop=True) |
|
combo_txt_A_dx.to_pickle("Data/PatSim_test/comb_all_text_A_filtered_df") |
|
|
|
### STEP 2: Process data for concept codes |
|
## tidy, filter, and merge data |
|
# conditions |
|
case_cond_c = patient_data.patient_concepts(case_cond_c, case_labels_dedup, 'CASE', 'CODE', 'COND') |
|
    rand_cond_c = patient_data.patient_concepts(rand_cond_c, rand_labels_dedup, 'RANDOM', 'CODE', 'COND')
|
# medications |
|
case_med_c = patient_data.patient_concepts(case_med_c, case_labels_dedup, 'CASE', 'CODE', 'MED') |
|
    rand_med_c = patient_data.patient_concepts(rand_med_c, rand_labels_dedup, 'RANDOM', 'CODE', 'MED')
|
# labs |
|
case_lab_c = patient_data.patient_concepts(case_lab_c, case_labels_dedup, 'CASE', 'CODE', 'LAB') |
|
    rand_lab_c = patient_data.patient_concepts(rand_lab_c, rand_labels_dedup, 'RANDOM', 'CODE', 'LAB')
|
|
|
# CASES |
|
# conditions |
|
case_cond_c.concept_code = case_cond_c.concept_code.astype(str) |
|
case_dx_visits = list(set(list(case_cond_c['visit_id']))) |
|
cond_agg = case_cond_c.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
cond_agg = pd.DataFrame({'pat_id': cond_agg.index, 'pat_conds': cond_agg.values}) |
|
case_cond_c = cond_agg.merge(case_cond_c[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_cond_c.drop_duplicates(keep='first', inplace=True) |
|
case_cond_c = case_cond_c.reset_index(drop=True) |
|
case_cond_c.groupby(by=['pat_label']).size() |
|
# medications |
|
case_med_c.concept_code = case_med_c.concept_code.astype(str) |
|
med_filt = case_med_c[case_med_c['visit_id'].isin(case_dx_visits)] |
|
med_agg = med_filt.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
med_agg = pd.DataFrame({'pat_id': med_agg.index, 'pat_conds': med_agg.values}) |
|
case_med_c = med_agg.merge(med_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_med_c.drop_duplicates(keep='first', inplace=True) |
|
case_med_c = case_med_c.reset_index(drop=True) |
|
case_med_c.groupby(by=['pat_label']).size() |
|
# labs |
|
case_lab_c.concept_code = case_lab_c.concept_code.astype(str) |
|
lab_filt = case_lab_c[case_lab_c['visit_id'].isin(case_dx_visits)] |
|
lab_agg = lab_filt.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
lab_agg = pd.DataFrame({'pat_id': lab_agg.index, 'pat_conds': lab_agg.values}) |
|
case_lab_c = lab_agg.merge(lab_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_lab_c.drop_duplicates(keep='first', inplace=True) |
|
case_lab_c = case_lab_c.reset_index(drop=True) |
|
case_lab_c.groupby(by=['pat_label']).size() |
|
# combine data types - before |
|
combo_c = case_cond_c.merge(case_lab_c[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_c.rename(columns={'pat_conds_x': 'cond_vecs', 'pat_conds_y': 'lab_vecs'}, inplace=True) |
|
combo_c = combo_c.merge(case_med_c[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_c.rename(columns={'pat_conds': 'med_vecs'}, inplace=True) |
|
combo_c = combo_c.replace(np.nan, '', regex=True) |
|
# add demo info |
|
combo_c = combo_c.merge(case_demo, left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
# combine conds into 1 column |
|
combo_c["pat_vecs"] = combo_c["cond_vecs"] + " " + combo_c["med_vecs"] + " " + combo_c["lab_vecs"] |
|
combo_c["pat_vecs2"] = combo_c['dob'] + " " + combo_c['gender'] + " " + combo_c['race'] + " " + combo_c["cond_vecs"] + " " + combo_c["med_vecs"] + " " + combo_c["lab_vecs"] |
|
combo_c["cond_vecs2"] = combo_c['dob'] + " " + combo_c['gender'] + " " + combo_c['race'] + " " + combo_c["cond_vecs"] |
|
combo_c["med_vecs2"] = combo_c['dob'] + " " + combo_c['gender'] + " " + combo_c['race'] + " " + combo_c["med_vecs"] |
|
combo_c["lab_vecs2"] = combo_c['dob'] + " " + combo_c['gender'] + " " + combo_c['race'] + " " + combo_c["lab_vecs"] |
|
# save data |
|
combo_c.to_pickle("Data/PatSim_test/case_all_code_df") |
|
|
|
# CONTROLS |
|
# conditions |
|
rand_cond_c.concept_code = rand_cond_c.concept_code.astype(str) |
|
cond_agg = rand_cond_c.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
cond_agg = pd.DataFrame({'pat_id': cond_agg.index, 'pat_conds': cond_agg.values}) |
|
rand_cond_c = cond_agg.merge(rand_cond_c[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
rand_cond_c.drop_duplicates(keep='first', inplace=True) |
|
rand_cond_c = rand_cond_c.reset_index(drop=True) |
|
rand_cond_c.groupby(by=['pat_label']).size() |
|
# medications |
|
rand_med_c.concept_code = rand_med_c.concept_code.astype(str) |
|
med_agg = rand_med_c.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
med_agg = pd.DataFrame({'pat_id': med_agg.index, 'pat_conds': med_agg.values}) |
|
rand_med_c = med_agg.merge(rand_med_c[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
rand_med_c.drop_duplicates(keep='first', inplace=True) |
|
rand_med_c = rand_med_c.reset_index(drop=True) |
|
rand_med_c.groupby(by=['pat_label']).size() |
|
# labs |
|
rand_lab_c.concept_code = rand_lab_c.concept_code.astype(str) |
|
lab_agg = rand_lab_c.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
lab_agg = pd.DataFrame({'pat_id': lab_agg.index, 'pat_conds': lab_agg.values}) |
|
rand_lab_c = lab_agg.merge(rand_lab_c[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
rand_lab_c.drop_duplicates(keep='first', inplace=True) |
|
rand_lab_c = rand_lab_c.reset_index(drop=True) |
|
rand_lab_c.groupby(by=['pat_label']).size() |
|
# add demo info |
|
rand_all = rand_cond_c.merge(rand_med_c[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
rand_all.rename(columns={'pat_conds_x': 'cond_vecs', 'pat_conds_y': 'med_vecs'}, inplace=True) |
|
rand_all = rand_all.merge(rand_lab_c[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
rand_all.rename(columns={'pat_conds': 'lab_vecs'}, inplace=True) |
|
rand_all = rand_all.replace(np.nan, '', regex=True) |
|
rand_all = rand_all.merge(rand_demo, left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
# combine conds into 1 column |
|
rand_all["pat_vecs"] = rand_all["cond_vecs"] + " " + rand_all["med_vecs"] + " " + rand_all["lab_vecs"] |
|
rand_all["pat_vecs2"] = rand_all['dob'] + " " + rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["cond_vecs"] + " " + rand_all["med_vecs"] + " " + rand_all["lab_vecs"] |
|
rand_all["cond_vecs2"] = rand_all['dob'] + " " + rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["cond_vecs"] |
|
rand_all["med_vecs2"] = rand_all['dob'] + " " + rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["med_vecs"] |
|
rand_all["lab_vecs2"] = rand_all['dob'] + " " + rand_all['gender'] + " " + rand_all['race'] + " " + rand_all["lab_vecs"] |
|
|
|
# combine data |
|
combo_c = pd.concat([combo_c, rand_all.sample(n=10000, replace=False)]).reset_index(drop=True) |
|
combo_c.to_pickle('Data/PatSim_test/comb_all_code_df') |
|
|
|
    ### STEP 2 B: Restrict to concepts recorded BEFORE the labeled dx occurred
|
c_grp = case_cond_c.groupby(by=['pat_label']) |
|
c_grp = c_grp.apply(lambda g: g[pd.to_datetime(g['cond_date']) <= pd.to_datetime(g['cond_start_date'])]) |
|
cond_before_dx = c_grp.reset_index(drop=True) |
|
cond_before_dx.groupby(by=['pat_label']).size() |
|
before_dx_visits = list(set(list(c_grp['visit_id']))) |
|
# conditions |
|
cond_before_dx.concept_code = cond_before_dx.concept_code.astype(str) |
|
cond_agg = cond_before_dx.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
cond_agg = pd.DataFrame({'pat_id': cond_agg.index, 'pat_conds': cond_agg.values}) |
|
case_cond_c_B_dx = cond_agg.merge(cond_before_dx[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_cond_c_B_dx.drop_duplicates(keep='first', inplace=True) |
|
case_cond_c_B_dx = case_cond_c_B_dx.reset_index(drop=True) |
|
case_cond_c_B_dx.groupby(by=['pat_label']).size() |
|
# medications |
|
case_med_c.concept_code = case_med_c.concept_code.astype(str) |
|
med_filt = case_med_c[case_med_c['visit_id'].isin(before_dx_visits)] |
|
med_agg = med_filt.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
med_agg = pd.DataFrame({'pat_id': med_agg.index, 'pat_conds': med_agg.values}) |
|
case_med_c_B_dx = med_agg.merge(med_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_med_c_B_dx.drop_duplicates(keep='first', inplace=True) |
|
case_med_c_B_dx = case_med_c_B_dx.reset_index(drop=True) |
|
case_med_c_B_dx.groupby(by=['pat_label']).size() |
|
# labs |
|
case_lab_c.concept_code = case_lab_c.concept_code.astype(str) |
|
lab_filt = case_lab_c[case_lab_c['visit_id'].isin(before_dx_visits)] |
|
lab_agg = lab_filt.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
lab_agg = pd.DataFrame({'pat_id': lab_agg.index, 'pat_conds': lab_agg.values}) |
|
case_lab_c_B_dx = lab_agg.merge(lab_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_lab_c_B_dx.drop_duplicates(keep='first', inplace=True) |
|
case_lab_c_B_dx = case_lab_c_B_dx.reset_index(drop=True) |
|
case_lab_c_B_dx.groupby(by=['pat_label']).size() |
|
|
|
|
|
    ### STEP 2 C: Restrict to concepts recorded AFTER the labeled dx occurred
|
c_grp = case_cond_c.groupby(by=['pat_id']) |
|
c_grp = c_grp.apply(lambda g: g[pd.to_datetime(g['cond_date']) >= pd.to_datetime(g['cond_start_date'])]) |
|
cond_after_dx = c_grp.reset_index(drop=True) |
|
after_dx_visits = list(set(list(c_grp['visit_id']))) |
|
# conditions |
|
cond_after_dx.concept_code = cond_after_dx.concept_code.astype(str) |
|
cond_agg = cond_after_dx.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
cond_agg = pd.DataFrame({'pat_id': cond_agg.index, 'pat_conds': cond_agg.values}) |
|
case_cond_c_A_dx = cond_agg.merge(cond_after_dx[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_cond_c_A_dx.drop_duplicates(keep='first', inplace=True) |
|
case_cond_c_A_dx = case_cond_c_A_dx.reset_index(drop=True) |
|
case_cond_c_A_dx.groupby(by=['pat_label']).size() |
|
# medications |
|
case_med_c.concept_code = case_med_c.concept_code.astype(str) |
|
med_filt = case_med_c[case_med_c['visit_id'].isin(after_dx_visits)] |
|
med_agg = med_filt.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
med_agg = pd.DataFrame({'pat_id': med_agg.index, 'pat_conds': med_agg.values}) |
|
case_med_c_A_dx = med_agg.merge(med_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_med_c_A_dx.drop_duplicates(keep='first', inplace=True) |
|
case_med_c_A_dx = case_med_c_A_dx.reset_index(drop=True) |
|
case_med_c_A_dx.groupby(by=['pat_label']).size() |
|
# labs |
|
case_lab_c.concept_code = case_lab_c.concept_code.astype(str) |
|
lab_filt = case_lab_c[case_lab_c['visit_id'].isin(after_dx_visits)] |
|
lab_agg = lab_filt.groupby('pat_id')['concept_code'].agg(lambda col: ' '.join(col)) |
|
lab_agg = pd.DataFrame({'pat_id': lab_agg.index, 'pat_conds': lab_agg.values}) |
|
case_lab_c_A_dx = lab_agg.merge(lab_filt[['pat_id', 'pat_label', 'pat_lab_num']], left_on='pat_id', right_on='pat_id', how='left') |
|
case_lab_c_A_dx.drop_duplicates(keep='first', inplace=True) |
|
case_lab_c_A_dx = case_lab_c_A_dx.reset_index(drop=True) |
|
case_lab_c_A_dx.groupby(by=['pat_label']).size() |
|
|
|
### STEP 2 D: Combine concept sets |
|
## combine data types - before |
|
# conditions |
|
combo_c_B_dx = case_cond_c_B_dx.merge(case_lab_c_B_dx[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_c_B_dx.rename(columns={'pat_conds_x': 'cond_vecs', 'pat_conds_y': 'lab_vecs'}, inplace=True) |
|
combo_c_B_dx = combo_c_B_dx.merge(case_med_c_B_dx[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_c_B_dx.rename(columns={'pat_conds': 'med_vecs'}, inplace=True) |
|
combo_c_B_dx = combo_c_B_dx.replace(np.nan, '', regex=True) |
|
# add demo info |
|
combo_c_B_dx = combo_c_B_dx.merge(case_demo, left_on='pat_id', right_on='pat_id', how='left').reset_index(drop=True) |
|
# conditions |
|
combo_c_B_dx["pat_vecs"] = combo_c_B_dx["cond_vecs"] + " " + combo_c_B_dx["med_vecs"] + " " + combo_c_B_dx["lab_vecs"] |
|
combo_c_B_dx["pat_vecs2"] = combo_c_B_dx['dob'] + " " + combo_c_B_dx['gender'] + " " + combo_c_B_dx['race'] + " " + combo_c_B_dx["cond_vecs"] + " " + combo_c_B_dx["med_vecs"] + " " + combo_c_B_dx[ "lab_vecs"] |
|
combo_c_B_dx["cond_vecs2"] = combo_c_B_dx['dob'] + " " + combo_c_B_dx['gender'] + " " + combo_c_B_dx['race'] + " " + combo_c_B_dx["cond_vecs"] |
|
combo_c_B_dx["med_vecs2"] = combo_c_B_dx['dob'] + " " + combo_c_B_dx['gender'] + " " + combo_c_B_dx['race'] + " " + combo_c_B_dx["med_vecs"] |
|
combo_c_B_dx["lab_vecs2"] = combo_c_B_dx['dob'] + " " + combo_c_B_dx['gender'] + " " + combo_c_B_dx['race'] + " " + combo_c_B_dx["lab_vecs"] |
|
combo_c_B_dx.to_pickle('Data/PatSim_test/case_all_code_B_filtered_df') |
|
# merge with random and write |
|
combo_c_B_dx = pd.concat([combo_c_B_dx, rand_all.sample(n=10000, replace=False)]).reset_index(drop=True) |
|
combo_c_B_dx.to_pickle('Data/PatSim_test/comb_all_code_B_filtered_df') |
|
|
|
## combine data types - after |
|
combo_c_A_dx = case_cond_c_A_dx.merge(case_lab_c_A_dx[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_c_A_dx.rename(columns={'pat_conds_x': 'cond_vecs', 'pat_conds_y': 'lab_vecs'}, inplace=True) |
|
combo_c_A_dx = combo_c_A_dx.merge(case_med_c_A_dx[['pat_id', 'pat_conds']], left_on='pat_id', right_on='pat_id', how='left') |
|
combo_c_A_dx.rename(columns={'pat_conds': 'med_vecs'}, inplace=True) |
|
combo_c_A_dx = combo_c_A_dx.replace(np.nan, '', regex=True) |
|
# add demo info |
|
combo_c_A_dx = combo_c_A_dx.merge(case_demo, left_on='pat_id', right_on='pat_id', how='left').reset_index( drop=True) |
|
# combine conds into 1 column |
|
combo_c_A_dx["pat_vecs"] = combo_c_A_dx["cond_vecs"] + " " + combo_c_A_dx["med_vecs"] + " " + combo_c_A_dx["lab_vecs"] |
|
combo_c_A_dx["pat_vecs2"] = combo_c_A_dx['dob'] + " " + combo_c_A_dx['gender'] + " " + combo_c_A_dx['race'] + " " + combo_c_A_dx["cond_vecs"] + " " + combo_c_A_dx["med_vecs"] + " " + combo_c_A_dx["lab_vecs"] |
|
combo_c_A_dx["cond_vecs2"] = combo_c_A_dx['dob'] + " " + combo_c_A_dx['gender'] + " " + combo_c_A_dx['race'] + " " + combo_c_A_dx["cond_vecs"] |
|
combo_c_A_dx["med_vecs2"] = combo_c_A_dx['dob'] + " " + combo_c_A_dx['gender'] + " " + combo_c_A_dx['race'] + " " + combo_c_A_dx["med_vecs"] |
|
combo_c_A_dx["lab_vecs2"] = combo_c_A_dx['dob'] + " " + combo_c_A_dx['gender'] + " " + combo_c_A_dx['race'] + " " + combo_c_A_dx["lab_vecs"] |
|
combo_c_A_dx.to_pickle('Data/PatSim_test/case_all_code_A_filtered_df') |
|
# merge with random and save |
|
combo_c_A_dx = pd.concat([combo_c_A_dx, rand_all.sample(n=10000, replace=False)]).reset_index(drop=True) |
|
combo_c_A_dx.to_pickle('Data/PatSim_test/comb_all_code_A_filtered_df') |
|
|
|
|
|
|
|
##################################################################################################### |
|
#### Remove Labels and Code Use to Diagnosis Cases #### |
|
##################################################################################################### |
|
# CF patients |
|
cf_cut_code = [str(x) for x in [254320, 194325, 441267, 434615, 193174, 40479565]] |
|
cf_cut_txt = ['Cystic_fibrosis_with_meconium_ileus', 'Meconium_ileus_in_cystic_fibrosis', 'Cystic_fibrosis_with_other_manifestations', 'Cystic_fibrosis_with_other_intestinal_manifestations', 'Cystic_fibrosis_with_gastrointestinal_manifestations', 'Cystic_fibrosis_with_pulmonary_manifestations', 'Cystic_fibrosis_without_mention_of_meconium_ileus', 'Cystic_fibrosis_unspecified', 'Cystic_fibrosis', 'Cystic_fibrosis_carrier', 'Cystic_fibrosis_gene_carrier'] |
|
|
|
# SCD patients |
|
sc_cut_code = [str(x) for x in [30683, 22281, 443726, 26942, 196943, 443721, 254062, 25518, 321263, 443738, 40485018]] |
|
    sc_cut_txt = ['Sickle_cell_disease_unspecified', 'Hb_SS_disease_with_acute_chest_syndrome', 'Sickle_cell_disease,_unspecified', 'Sickle_cell_disease', 'Sickle_cell_trait', 'Hb_SS_disease_with_crisis', 'Hb_SS_disease_with_crisis_unspecified', 'Other_sickle_cell_disorders_with_crisis_unspecified', 'Hb_SS_disease_with_splenic_sequestration', 'Other_sickle_cell_disease_with_crisis', 'Other_sickle_cell_disorders_with_acute_chest_syndrome', 'Sickle_cell_disease_without_crisis', 'Hb_SS_disease_without_crisis', 'Other_sickle_cell_disorders_without_crisis', 'Other_sickle_cell_disease_without_crisis', 'Sickle_cell_Hb_C_disease_with_splenic_sequestration',
|
'Sickle_cell_thalassemia_with_acute_chest_syndrome', 'Acute_chest_syndrome', 'Sickle_cell_Hb_C_disease_with_acute_chest_syndrome', 'Sickle_cell_thalassemia_without_crisis', |
|
'Sickle_cell_Hb_C_disease_with_crisis_unspecified', 'Sickle_cell_Hb_C_disease_with_crisis', 'Sickle_cell_Hb_C_disease_without_crisis', 'Sickle_cell_thalassemia_with_crisis', |
|
'Sickle_cell_thalassemia_with_crisis_unspecified', 'Other_sickle_cell_disorders_with_splenic_sequestration'] |
|
|
|
# CAH patients |
|
ch_cut_code = [str(x) for x in [4314093, 4130017, 4081998]] |
|
ch_cut_txt = ['Congenital_hypothyroidism_with_diffuse_goiter', 'Congenital_hypothyroidism_without_goiter', 'Congenital_hypothyroidism'] |
|
|
|
# PKU patients |
|
pku_cut_code = [str(x) for x in [432872]] |
|
pku_cut_txt = ['Phenylketonuria_PKU', 'Phenylketonuria', 'Classical_phenylketonuria', 'Screening_for_phenylketonuria_PKU'] |
|
|
|
# put lists together |
|
cut_txt = [cf_cut_txt, sc_cut_txt, ch_cut_txt, pku_cut_txt] |
|
cut_code = [cf_cut_code, sc_cut_code, ch_cut_code, pku_cut_code] |
|
extra_strings = ['cystic_fibrosis', 'congenital_hypothyroidism', 'sickle_cell', 'hb_ss', 'phenylketonuria'] |
|
|
|
    # read in processed data - assumes the data from the prior steps were saved
|
f_loc2s = ['Data/PatSim_test/case_all_code_df', 'Data/PatSim_test/comb_all_code_df', 'Data/PatSim_test/case_all_text_df', 'Data/PatSim_test/combo_all_text_df', 'Data/PatSim_test/case_all_code_B_filtered_df', 'Data/PatSim_test/comb_all_code_B_filtered_df', 'Data/PatSim_test/case_all_text_B_filtered_df', 'Data/PatSim_test/comb_all_text_B_filtered_df', 'Data/PatSim_test/case_all_code_A_filtered_df', 'Data/PatSim_test/comb_all_code_A_filtered_df', 'Data/PatSim_test/case_all_text_A_filtered_df', 'Data/PatSim_test/comb_all_text_A_filtered_df'] |
|
|
|
for x in f_loc2s: |
|
print(x) |
|
df = pd.read_pickle(x); col_list = ['cond_vecs', 'med_vecs', 'lab_vecs', 'pat_vecs'] |
|
if 'text' in x: df = label_remover(df, cut_txt, col_list, extra_strings) |
|
else: df = label_remover(df, cut_code, col_list, extra_strings) |
|
df.to_pickle(x) |
|
|
|
|
|
|
|
##################################################################################################### |
|
#### Perform Classification #### |
|
##################################################################################################### |
|
    ### STEP 1: Perform leave-one-patient-out classification
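    # for each case patient, the loop below ranks all other patients by TF-IDF cosine similarity

    # (similarity_test), picks a Youden-index cutoff (youden_index), and records the resulting

    # confusion-matrix metrics (model_metrics)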
|
domain = ['cond', 'med', 'lab', 'all'] |
|
f_loc2 = ['Data/PatSim_test/case_all_code_df', 'Data/PatSim_test/comb_all_code_df', 'Data/PatSim_test/case_all_text_df', 'Data/PatSim_test/combo_all_text_df', 'Data/PatSim_test/case_all_code_B_filtered_df', 'Data/PatSim_test/comb_all_code_B_filtered_df', 'Data/PatSim_test/case_all_text_B_filtered_df', 'Data/PatSim_test/comb_all_text_B_filtered_df', 'Data/PatSim_test/case_all_code_A_filtered_df', 'Data/PatSim_test/comb_all_code_A_filtered_df', 'Data/PatSim_test/case_all_text_A_filtered_df', 'Data/PatSim_test/comb_all_text_A_filtered_df'] |
|
origin = 'Results/PatSim_Testing/dissertation/' # set location to write data to |
|
|
|
for d_file in range(0, len(f_loc2)): |
|
data_file = f_loc2[d_file]; df = pd.read_pickle(data_file) |
|
print('\n' + 'FILE: {}\n'.format(data_file)) |
|
for dom in domain[1:]: |
|
corpus = [] |
|
print('\n' + 'DOMAIN: {}'.format(dom)) |
|
if dom == 'all': sub_dir = 'Combined'; var = 'pat_vecs' |
|
if dom == 'cond': sub_dir = 'Conditions'; var = dom + '_vecs' |
|
if dom == 'med': sub_dir = 'Medications'; var = dom + '_vecs' |
|
if dom == 'lab': sub_dir = 'Labs'; var = dom + '_vecs' |
|
# files and titles |
|
f_name = data_file.replace('all', dom).split('/')[-1] |
|
f_type = origin + sub_dir + '/' + f_name + '_'; title_grp = '' |
|
|
|
# update databases |
|
test_data = df[['pat_id', 'pat_label', var]].drop_duplicates(keep='first', inplace=False) |
|
test_data = test_data[test_data[var] != '']; test_data = test_data.reset_index(drop=True) |
|
if 'comb' in data_file: |
|
case = test_data[test_data['pat_label'] != 'Rand'].reset_index(drop=True) |
|
ctrl = test_data[test_data['pat_label'] == 'Rand'].sample(n=1000, replace=False).reset_index(drop=True)  # note: no random_state, so the control draw varies across runs
|
test_data = pd.concat([case, ctrl]).reset_index(drop=True) |
|
print(test_data.groupby(by=['pat_label']).size()) |
|
|
|
# create corpus |
|
for i in range(0, len(test_data)): |
|
corpus.append((test_data['pat_id'][i], test_data[str(var)][i])) |
|
matrix = patient_bow(corpus); test_matrix = matrix[1]; features = matrix[2] |
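# `patient_bow` is defined earlier in the script; from the indexing here (and in the t-SNE
# section below) it is assumed to return a tuple whose second element is the document-term
# matrix and whose third element is the list of feature names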
|
print(test_matrix.shape); print('\n') |
|
|
|
# perform classification |
|
cf_counts = []; cf_plot = []; cf = [[] for _ in range(10)] |
|
pku_counts = []; pku_plot = []; pku = [[] for _ in range(10)] |
|
sc_counts = []; sc_plot = []; sc = [[] for _ in range(10)] |
|
ch_counts = []; ch_plot = []; ch = [[] for _ in range(10)] |
|
pat_lab = len(test_data['pat_label'][test_data.pat_label != 'Rand'])  # number of case (non-control) patients; the progress counter below is relative to this
|
for i in range(0, len(test_data)): |
|
index = test_data['pat_id'][i]; cond = test_data['pat_label'][i] |
|
print("*********************" * 5) |
|
print("Running patient: " + str(i + 1) + "/" + str(pat_lab) + "\n") |
|
|
|
# CF patients |
|
if cond == 'CF': |
|
sim_res = similarity_test(test_matrix, test_data, index, cond) |
|
results_cf = youden_index(sim_res) |
|
# count the true positives among the top 1, 10, 20, ..., 100 most-similar patients
cf_counts.append([len([1.0 for x in sim_res['conds'][0:n] if x == 1.0])
                  for n in [1] + list(range(10, 101, 10))])
|
cf_plot.append(results_cf[0]) |
|
cf[0].append(results_cf[1]); cf[1].append(results_cf[2]) |
|
cf[2].append(results_cf[3]); cf[3].append(results_cf[4]) |
|
res = model_metrics(results_cf[1], results_cf[2], results_cf[3], results_cf[4]) |
|
cf[4].append(res[0]); cf[5].append(res[1]); cf[6].append(res[2]); cf[7].append(res[3]) |
|
cf[8].append(res[4]); cf[9].append(res[5]) |
|
|
|
# PKU patients |
|
if cond == 'PKU': |
|
sim_res = similarity_test(test_matrix, test_data, index, cond) |
|
results_pku = youden_index(sim_res) |
|
pku_counts.append([len([1.0 for x in sim_res['conds'][0:n] if x == 1.0])
                   for n in [1] + list(range(10, 101, 10))])
|
pku_plot.append(results_pku[0]); pku[0].append(results_pku[1]); pku[1].append(results_pku[2]) |
|
pku[2].append(results_pku[3]); pku[3].append(results_pku[4]) |
|
res = model_metrics(results_pku[1], results_pku[2], results_pku[3], results_pku[4]) |
|
pku[4].append(res[0]); pku[5].append(res[1]); pku[6].append(res[2]) |
|
pku[7].append(res[3]); pku[8].append(res[4]); pku[9].append(res[5]) |
|
|
|
# SCD patients |
|
if cond == 'SC': |
|
sim_res = similarity_test(test_matrix, test_data, index, cond) |
|
results_sc = youden_index(sim_res) |
|
sc_counts.append([len([1.0 for x in sim_res['conds'][0:n] if x == 1.0])
                  for n in [1] + list(range(10, 101, 10))])
|
sc_plot.append(results_sc[0]); sc[0].append(results_sc[1]); sc[1].append(results_sc[2]) |
|
sc[2].append(results_sc[3]); sc[3].append(results_sc[4]) |
|
res = model_metrics(results_sc[1], results_sc[2], results_sc[3], results_sc[4]) |
|
sc[4].append(res[0]); sc[5].append(res[1]); sc[6].append(res[2]) |
|
sc[7].append(res[3]); sc[8].append(res[4]); sc[9].append(res[5]) |
|
# CH patients
|
if cond == 'CH': |
|
sim_res = similarity_test(test_matrix, test_data, index, cond) |
|
results_ch = youden_index(sim_res) |
|
ch_counts.append([len([1.0 for x in sim_res['conds'][0:n] if x == 1.0])
                  for n in [1] + list(range(10, 101, 10))])
|
ch_plot.append(results_ch[0]); ch[0].append(results_ch[1]); ch[1].append(results_ch[2]) |
|
ch[2].append(results_ch[3]); ch[3].append(results_ch[4]) |
|
res = model_metrics(results_ch[1], results_ch[2], results_ch[3], results_ch[4]) |
|
ch[4].append(res[0]); ch[5].append(res[1]); ch[6].append(res[2]); ch[7].append(res[3]) |
|
ch[8].append(res[4]); ch[9].append(res[5]) |
|
# write results to disk |
|
pickle.dump(cf_counts, open(str(f_type) + "cf_counts.txt", "wb")) |
|
pickle.dump(cf_plot, open(str(f_type) + "cf_plot.txt", "wb")) |
|
pickle.dump(cf, open(str(f_type) + "cf_res.txt", "wb")) |
|
pickle.dump(pku_counts, open(str(f_type) + "pku_counts.txt", "wb"))
|
pickle.dump(pku_plot, open(str(f_type) + "pku_plot.txt", "wb")) |
|
pickle.dump(pku, open(str(f_type) + "pku_res.txt", "wb")) |
|
pickle.dump(sc_counts, open(str(f_type) + "sc_counts.txt", "wb")) |
|
pickle.dump(sc_plot, open(str(f_type) + "sc_plot.txt", "wb")) |
|
pickle.dump(sc, open(str(f_type) + "sc_res.txt", "wb")) |
|
pickle.dump(ch_counts, open(str(f_type) + "ch_counts.txt", "wb")) |
|
pickle.dump(ch_plot, open(str(f_type) + "ch_plot.txt", "wb")) |
|
pickle.dump(ch, open(str(f_type) + "ch_res.txt", "wb")) |
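# note: despite the .txt extension, these files hold binary pickles; STEP 2 below reads them
# back with pickle.load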
|
|
|
|
|
### STEP 2: Calculate performance metrics |
|
results = {}
import glob  # glob is not among the imports at the top of the script; needed for iglob below
for filename in glob.iglob(str(f_type) + '*.txt'):
|
if '_performance' not in filename: |
|
var = '_'.join(filename.split('_')[-2:]).split('.')[0] |
|
with open(str(filename), "rb") as fp: |
|
results[var] = pickle.load(fp) |
|
print("*" * 25 + '\n' + 'Generating Average Performance Results') |
|
myfile = open(str(f_type) + 'model_performance.txt', 'w') |
|
for dis in results: |
|
if '_res' in dis: |
|
tn = np.mean(results[dis][0]); fp = np.mean(results[dis][1]) |
|
fn = np.mean(results[dis][2]); tp = np.mean(results[dis][3]) |
|
tot = (tp + fp + fn + tn); obs_acc = (tp + tn) / tot |
|
r = ((tp+fp)*(tp+fn))/tot; l = ((fn+tn)*(fp+tn))/tot  # chance-agreement terms for Cohen's kappa
exp_acc = (r+l)/tot; Kappa = (obs_acc - exp_acc) / (1 - exp_acc)
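# worked example of the kappa arithmetic with hypothetical counts tn=50, fp=10, fn=5, tp=35:
# obs_acc = 85/100 = 0.85; exp_acc = (45*40 + 55*60)/100**2 = 0.51; Kappa = 0.34/0.49 ~ 0.694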
|
myfile.write("GROUP: " + str(dis) + "\n") |
|
myfile.write("Mean Specificity [tn/(tn+fp)]: " + str(tn / (tn + fp)) + "\n") |
|
myfile.write("Mean TPR/Recall [tp/(tp+fn)]: " + str(tp / (tp + fn)) + "\n") |
|
myfile.write("Mean Precision [tp/(tp_fp)]: " + str(tp / (tp + fp)) + "\n") |
|
myfile.write("Mean FPR [1-(tn/(tn+fp))]: " + str(1 - (tn / (tn + fp))) + "\n") |
|
myfile.write("Mean Accuracy [(tp+tn)/(tp+fp+fn+tn)]: " + str((tp + tn) / (tp + fp + fn + tn)) + "\n") |
|
myfile.write("Mean F1 Score: " + str((2*tp / (2*tp + fp + fn))) + "\n") |
|
myfile.write("Kappa Score: " + str((obs_acc- exp_acc) / (1 - exp_acc)) + "\n") |
|
myfile.write("\n") |
|
myfile.close() |
|
|
|
### STEP 3: Create AUC plots |
|
print("*" * 25 + '\n' + 'Generating AUC Plots') |
|
fig = plt.figure(figsize=(12, 12)) |
|
grid = plt.GridSpec(2, 2, hspace=0.2, wspace=0.2) |
|
ax_1 = fig.add_subplot(grid[0, 0]) |
|
ax_2 = fig.add_subplot(grid[0, 1])
|
ax_3 = fig.add_subplot(grid[1, 0]) |
|
ax_4 = fig.add_subplot(grid[1, 1]) |
|
plt.suptitle(str(title_grp), fontsize=13) |
|
fig.text(0.075, 0.5, 'True Positive Rate', va="center", rotation="vertical", fontsize=11) |
|
fig.text(0.46, 0.06, 'False Positive Rate', va="center", fontsize=11) |
|
w_file = str(f_type) + 'ROC_plot.png' |
|
plt_ax = [ax_1, ax_2, ax_3, ax_4]; i = 0 |
|
for dis_plot in results.keys(): |
|
if '_plot' in dis_plot: |
|
plots = results[dis_plot]; title = str(dis_plot.split('_')[0].upper()) |
|
tprs, aucs = [], []; base_fpr = np.linspace(0, 1, 101)
|
for res in plots: |
|
fpr = res[0]; tpr = res[1]; roc_auc = auc(fpr, tpr); aucs.append(roc_auc) |
|
tpr = interp(base_fpr, fpr, tpr); tpr[0] = 0.0; tprs.append(tpr) |
|
tprs = np.array(tprs); mean_tprs = tprs.mean(axis=0); mean_auc = auc(base_fpr, mean_tprs) |
|
std_auc = np.std(aucs); std_tpr = np.std(tprs, axis=0) |
|
tprs_upper = np.minimum(mean_tprs + std_tpr, 1); tprs_lower = np.maximum(mean_tprs - std_tpr, 0) |
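# the band plotted below is the pointwise mean TPR +/- one standard deviation across patients,
# after interpolating every patient's ROC onto the common base_fpr grid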
|
plt_ax[i].plot(base_fpr, mean_tprs, 'b', |
|
label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8) |
|
plt_ax[i].fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3, |
|
label=r'$\pm$ 1 std. dev.') |
|
plt_ax[i].plot([0, 1], [0, 1], 'r--', label='Random Chance') |
|
plt_ax[i].set_xlim([-0.01, 1.01]) |
|
plt_ax[i].set_ylim([-0.01, 1.01]) |
|
plt_ax[i].legend(loc="lower right", fontsize=10) |
|
plt_ax[i].set_title(str(title), fontsize=10) |
|
i += 1 |
|
plt.savefig(str(w_file), bbox_inches='tight') |
|
plt.close() |
|
|
|
### STEP 4: Create TPR plots |
|
print("*" * 25 + '\n' + 'Generating TP Count Plots'); print("Generating ") |
|
fig = plt.figure(figsize=(12, 12)) |
|
grid = plt.GridSpec(2, 2, hspace=0.2, wspace=0.2) |
|
ax_1 = fig.add_subplot(grid[0, 0]) |
|
ax_2 = fig.add_subplot(grid[0, 1]) |
|
ax_3 = fig.add_subplot(grid[1, 0]) |
|
ax_4 = fig.add_subplot(grid[1, 1]) |
|
plt.suptitle("True Positives by Top N Similar Patients - " + str(title_grp), fontsize=13) |
|
fig.text(0.075, 0.5, '% True Positive', va="center", rotation="vertical", fontsize=11) |
|
fig.text(0.46, 0.06, 'Top N Similar Patients', va="center", fontsize=11) |
|
w_file = str(f_type) + 'TP_counts.png' |
|
plt_ax = [ax_1, ax_2, ax_3, ax_4]; j = 0 |
|
for dis_count in results.keys(): |
|
if '_counts' in dis_count: |
|
dis = results[dis_count]; title = str(dis_count.split('_')[0].upper())
# fraction of true positives among the top N most-similar patients (counts index 0, the
# top-1 count, is not plotted here)
top_ns = list(range(10, 101, 10))
fracs = [[person[k + 1] / float(n) for person in dis] for k, n in enumerate(top_ns)]
res = [item for sublist in fracs for item in sublist]
labels = [[n for _ in range(len(dis))] for n in top_ns]
df_plot = pd.DataFrame({'data': res, 'group': [item for sublist in labels for item in sublist]})
grouped = df_plot.groupby('group'); names, vals, xs = [], [], []
|
# plot results |
|
for i, (name, subdf) in enumerate(grouped): |
|
names.append(name); vals.append(subdf['data'].tolist()) |
|
xs.append(np.random.normal(i + 1, 0.04, subdf.shape[0])) |
|
ngroup = len(vals); clevels = np.linspace(0., 1., ngroup) |
|
bp = plt_ax[j].boxplot(vals, labels=names, patch_artist=True) |
|
for box in bp['boxes']: |
|
box.set(color='mediumseagreen', linewidth=3) # change outline color |
|
box.set(facecolor='mediumaquamarine', alpha=0.3) # change fill color |
|
plt.setp(bp['whiskers'], color='black') |
|
plt.setp(bp['fliers'], color='black') |
|
plt.setp(bp['means'], color='black') |
|
plt.setp(bp['caps'], color='black') |
|
plt.setp(bp['medians'], color='darkcyan', linewidth=2.5) |
|
# add scatter with jitter to plot |
|
for x, val, clevel in zip(xs, vals, clevels): plt_ax[j].scatter(x, val, c='gray', alpha=0.5) |
|
plt_ax[j].set_ylim([-0.01, 1.01]); plt_ax[j].set_title(str(title), fontsize=10); j += 1 |
|
plt.savefig(str(w_file), bbox_inches='tight') |
|
plt.close() |
|
|
|
|
|
##################################################################################################### |
|
#### Generate t-SNE Plots #### |
|
##################################################################################################### |
|
domain = ['cond', 'med', 'lab', 'all'] |
|
f_loc2 = ['Data/PatSim_test/case_all_code_df', 'Data/PatSim_test/comb_all_code_df', 'Data/PatSim_test/case_all_text_df', 'Data/PatSim_test/combo_all_text_df', 'Data/PatSim_test/case_all_code_B_filtered_df', 'Data/PatSim_test/comb_all_code_B_filtered_df', 'Data/PatSim_test/case_all_text_B_filtered_df', 'Data/PatSim_test/comb_all_text_B_filtered_df', 'Data/PatSim_test/case_all_code_A_filtered_df', 'Data/PatSim_test/comb_all_code_A_filtered_df', 'Data/PatSim_test/case_all_text_A_filtered_df', 'Data/PatSim_test/comb_all_text_A_filtered_df'] |
|
origin = 'Results/PatSim_Testing/dissertation/' |
|
for d_file in range(0, len(f_loc2)): |
|
data_file = f_loc2[d_file] |
|
print('\n' + 'FILE: {}'.format(data_file)) |
|
df = pd.read_pickle(data_file) |
|
for dom in domain: |
|
corpus = [] |
|
print('\n' + 'DOMAIN: {}'.format(dom)) |
|
### STEP 1: Prepare data and file names
|
if dom == 'all': sub_dir = 'Combined'; var = 'pat_vecs' |
|
if dom == 'cond': sub_dir = 'Conditions'; var = dom + '_vecs' |
|
if dom == 'med': sub_dir = 'Medications'; var = dom + '_vecs' |
|
if dom == 'lab': sub_dir = 'Labs'; var = dom + '_vecs' |
|
# files and titles |
|
f_name = data_file.replace('all', dom).split('/')[-1] |
|
f_type = origin + sub_dir + '/' + f_name + '_'; title_grp = '' |
|
# update databases |
|
test_data = df[['pat_id', 'pat_label', 'pat_lab_num', var]].drop_duplicates(keep='first', inplace=False) |
|
test_data = test_data[test_data[var] != '']; test_data = test_data.reset_index(drop=True) |
|
# create corpus |
|
for i in range(0, len(test_data)): corpus.append((test_data['pat_id'][i], test_data[str(var)][i])) |
|
matrix = patient_bow(corpus); test_matrix = matrix[1]; features = matrix[2] |
|
# format file names |
|
tsne_file = origin + sub_dir + '/Data/' + f_name + '_' + 'tsne.npy' |
|
tsne_title = ""; clustering_title = "" |
|
plot_file = origin + sub_dir + '/Figures/' + f_name + '_' + 'tsne.png' |
|
clust_k_file = origin + sub_dir + '/Clustering/' + f_name + '_' + 'kmeans_elbow.png' |
|
clustering_file = origin + sub_dir + '/Figures/' + f_name + '_' + 'CLUSTERS.png' |
|
clustering_metrics = origin + sub_dir + '/Clustering/' + f_name + '_' + 'CLUSTERING_Metrics.txt' |
|
f_type_clust = origin + sub_dir + '/Clustering/' + f_name + '_' |
|
|
|
### STEP 2: Reduce dimensions of matrices |
|
# X_embedded = np.load(tsne_file)  # optional: reload a previously saved embedding instead of recomputing
|
X_reduced = TruncatedSVD(n_components=50, random_state=1).fit_transform(test_matrix) |
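# TruncatedSVD to 50 components densifies and shrinks the sparse bag-of-words matrix first,
# which keeps the t-SNE step below tractable on high-dimensional term counts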
|
X_embedded = TSNE(n_components=2, random_state=1, verbose=True, perplexity=25.0).fit_transform(X_reduced) |
|
np.save(tsne_file, X_embedded)  # cache the embedding so the load above can be used on later runs
|
|
|
### STEP 3: Create t-SNE plots |
|
import matplotlib.patches as mpatches  # used for the legend patches below; not imported at the top of the script
colors = {0: '#00C0A3', 1: '#4B4453', 3: '#009EFA', 4: '#FF8066', 99: 'goldenrod'}  # keyed by raw pat_lab_num
names = {0: 'Phenylketonuria', 1: 'Congenital Hypothyroidism', 3: 'Sickle Cell', 4: 'Cystic Fibrosis', 99: 'Control'}  # keyed by raw pat_lab_num, matching colors
pku = mpatches.Patch(color='#00C0A3', label='Phenylketonuria')
|
ch = mpatches.Patch(color='#4B4453', label='Congenital Hypothyroidism') |
|
scd = mpatches.Patch(color='#009EFA', label='Sickle Cell') |
|
cf = mpatches.Patch(color='#FF8066', label='Cystic Fibrosis') |
|
rnd = mpatches.Patch(color='goldenrod', label='Control') |
|
# create data frame that has the result of the MDS plus the cluster numbers and titles |
|
df = pd.DataFrame(dict(x=X_embedded[:, 0], y=X_embedded[:, 1], group=list(test_data['pat_lab_num']))) |
|
groups = df.groupby('group'); fig, ax = plt.subplots(figsize=(12, 8)) |
|
for name, group in groups: |
|
if name == 99: ax.plot(group.x, group.y, marker='o', linestyle='', ms=5, label=names[name], color=colors[name], mec='whitesmoke', alpha=0.9) |
|
for name, group in groups: |
|
if name != 99: ax.plot(group.x, group.y, marker='o', linestyle='', ms=6, label=names[name], color=colors[name], mec='whitesmoke', alpha=0.7) |
|
if 'comb' not in data_file: plt.legend(handles=[cf, pku, ch, scd], fontsize=12, frameon=False, loc="lower center", ncol=4) |
|
else: plt.legend(handles=[cf, pku, ch, scd, rnd], fontsize=10.5, frameon=False, loc="lower center", ncol=5) |
|
ax.tick_params(labelsize=16) |
|
if 'comb' not in data_file: n = 100; plt.ylim(-n, n); plt.xlim(-n, n) |
|
else: n = 125; plt.ylim(-n, n); plt.xlim(-n, n) |
|
plt.savefig(str(plot_file), bbox_inches='tight') |
|
plt.show(); plt.close() |
|
|
|
##################################################################################################### |
|
#### Perform K-Means Clustering #### |
|
##################################################################################################### |
|
# https://www.tutorialspoint.com/scikit_learn/scikit_learn_clustering_performance_evaluation.htm |
|
### STEP 1: Create list to store labels |
|
labels_real = [int(x) for x in test_data['pat_lab_num']]; labels_true = []  # remap raw labels {0, 1, 3, 4, 99} to contiguous {0..4}
|
for l in labels_real: |
|
if l == 0 or l == 1: labels_true.append(l) |
|
if l == 3: labels_true.append(2) |
|
if l == 4: labels_true.append(3) |
|
if l == 99: labels_true.append(4) |
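# equivalent sketch of the same remapping as a single dict lookup:
#     remap = {0: 0, 1: 1, 3: 2, 4: 3, 99: 4}
#     labels_true = [remap[l] for l in labels_real]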
|
|
|
# elbow method to help pick k |
|
# scaled_data = scale(X_embedded, with_mean=False); cluster_range = range(1, 10); cluster_errors = [] |
|
# for num_clusters in cluster_range: |
|
# clusters = KMeans(num_clusters).fit(scaled_data); cluster_errors.append(clusters.inertia_) |
|
# clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors}) |
|
# plt.figure(figsize=(9, 6)) |
|
# plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o") |
|
# plt.title('Elbow Method - Determining the Optimal K') |
|
# plt.savefig(clust_k_file, bbox_inches='tight'); plt.show(); plt.close() |
|
|
|
### STEP 2: Run K-Means |
|
k = 4 if 'comb' not in data_file else 5
scaled_data = scale(X_embedded, with_mean=False)  # scaled embedding used by both K-Means and the cluster plots below
# kmeans_model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=100).fit(scaled_data)
kmeans_model = KMeans(n_clusters=k, init='k-means++', max_iter=1000, random_state=0).fit(scaled_data)
|
labels = [int(x) for x in kmeans_model.labels_.tolist()] |
|
|
|
### STEP 3: Derive clustering metrics
|
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score  # not imported at the top of the script
ar = round(adjusted_rand_score(labels_true, labels), 3)
anmi = round(adjusted_mutual_info_score(labels_true, labels), 3)
|
pur = round(purity_score(labels_true, labels), 3) |
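# `purity_score` is defined earlier in the script; assuming the conventional definition
# (fraction of patients assigned to the majority true class of their cluster), it can be
# sketched with sklearn's contingency matrix:
#     cm = metrics.cluster.contingency_matrix(labels_true, labels)
#     purity = np.sum(np.amax(cm, axis=0)) / float(np.sum(cm))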
|
with open(clustering_metrics, 'w') as out: |
|
out.write('CLUSTERING: ' + f_name + '\n\n') |
|
out.write('Adjusted Rand Score: ' + str(ar) + '\n') |
|
out.write('Adjusted Normalized Mutual Information: ' + str(anmi) + '\n') |
|
out.write('Purity: ' + str(pur) + '\n') |
|
print('Adjusted Rand Score: ' + str(ar)) |
|
print('Adjusted Normalized Mutual Information: ' + str(anmi)) |
|
print('Purity: ' + str(pur) + '\n') |
|
|
|
|
# store clustering results in df |
|
data = pd.DataFrame(dict(x=scaled_data[:, 0], y=scaled_data[:, 1], cluster=list(labels))) |
|
|
|
### STEP 4: Plot clustering results |
|
label1 = {0: 'red', 10: 'orchid', 3: 'lightcoral', 2: 'darkorange', 4: 'gray', 5: 'saddlebrown', 6: 'blue', |
|
11: 'turquoise', 8: 'dodgerblue', 9: 'forestgreen', 1: 'lightseagreen', 7: 'darkmagenta', |
|
12: 'deeppink', 13: 'rosybrown', 14: "darkviolet", 15: 'lime', 16: 'mediumvioletred'} |
|
plt.figure(figsize=(12, 8))
# draw each cluster exactly once: one scatter layer plus a centroid annotation per cluster
for label in sorted(set(labels)):
    plt.scatter(x=data.loc[data['cluster'] == label, 'x'], y=data.loc[data['cluster'] == label, 'y'],
                alpha=0.6, marker='o', color=label1[label], s=40, edgecolor='whitesmoke')
    plt.annotate(label, data.loc[data['cluster'] == label, ['x', 'y']].mean(),
                 horizontalalignment='center', verticalalignment='center', size=20, color='ghostwhite',
                 weight='bold', bbox=dict(boxstyle="round", fc=label1[label], edgecolor='ghostwhite', alpha=0.4))
|
plt.ylim(-3.5, 3.5); plt.xlim(-3.5, 3.5); plt.tick_params(labelsize=16) |
|
plt.savefig(str(clustering_file), bbox_inches='tight'); plt.show(); plt.close() |
|
|
|
### STEP 5: Obtain patient- and concept-level information for each cluster |
|
same_conds = {} |
|
for pat in range(0, len(data)): |
|
feats = top_feats_in_doc2(test_data, test_matrix, features, pat, top_n=10) |
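# `top_feats_in_doc2` is defined earlier in the script; from the column indexing below it is
# assumed to return a two-column data frame of the patient's top_n features and tf-idf weights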
|
cluster = str(data.iloc[pat]['cluster']) |
|
id = test_data.iloc[pat]['pat_id']; poc = test_data.iloc[pat]['pat_lab_num'] |
|
if poc == 0 or poc == 1: cond = poc |
|
if poc == 3: cond = 2 |
|
if poc == 4: cond = 3 |
|
if poc == 99: cond = 4 |
|
features2 = list(feats[feats.columns[0]]); feat_count = list(feats[feats.columns[1]]) |
|
# filter out features with tf-idf of 0.000 |
|
idx = len([x for x in feat_count if x != 0.000000]); features2 = features2[0:idx] |
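# this cut assumes top_feats_in_doc2 returns features sorted by descending tf-idf, so all
# zero-weight features sit at the tail of the list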
|
if cluster in same_conds.keys(): |
|
same_conds[cluster]['id'].append(id); same_conds[cluster]['feature'] += features2 |
|
same_conds[cluster]['feat_count'] += feat_count; same_conds[cluster]['cond'].append(cond) |
|
same_conds[cluster]['cluster_cond'].append(names[poc])  # names is keyed by raw pat_lab_num
|
else: |
|
same_conds[cluster] = {}; same_conds[cluster]['id'] = [id] |
|
same_conds[cluster]['feature'] = features2; same_conds[cluster]['feat_count'] = feat_count |
|
same_conds[cluster]['cond'] = [cond]; same_conds[cluster]['cluster_cond'] = [names[poc]]
|
|
|
### STEP 6: Output clustering information |
|
np.save(f_type_clust + 'Clustering_results.txt', same_conds)  # np.save pickles the dict and appends '.npy' to the name
|
|
|
## write out statistics for all patient concepts by cluster |
|
myfile = open(f_type_clust + 'Clustering.txt', 'w'); unq = {} |
|
for cluster in same_conds.keys(): |
|
cnt = collections.Counter(same_conds[cluster]['feature']) |
|
sort_cnt = sorted(cnt.items(), key=itemgetter(1), reverse=True) |
|
unq[cluster] = sort_cnt |
|
myfile.write("==" * 20 + "\n") |
|
myfile.write("CLUSTER: " + str(cluster) + "\n") |
|
myfile.write("Number of patients: " + str(len(same_conds[cluster]['id'])) + "\n") |
|
myfile.write("Number of concepts: " + str(len(sort_cnt)) + "\n") |
|
myfile.write("Patient Conditions:\n") |
|
dx_list = same_conds[cluster]['cluster_cond']; clust_keys = set(dx_list); tot = len(dx_list) |
|
for p in sorted(clust_keys): |
|
frac = round((dx_list.count(p)/float(tot))*100, 2) |
|
myfile.write("\t\t - {}: {} ({}%)\n".format(p, str(dx_list.count(p)), str(frac))) |
|
myfile.write("*" * 10 + "\n") |
|
for x in sort_cnt[0:40]: |
|
myfile.write(' - '.join((x[0], str(x[1]))) + "\n") |
|
myfile.write("==" * 20 + "\n") |
|
myfile.close() |
|
|
|
## write out statistics for only those concepts unique to each patient cluster |
|
myfile2 = open(f_type_clust + 'Clustering_unique.txt', 'w') |
|
for i in unq: |
|
start = {x[0]: x[1] for x in unq[i]}
other_feats = set()  # every concept observed in any other cluster (set for fast membership tests)
for x in unq.keys():
    if x != i:
        for res in unq[x]:
            other_feats.add(res[0])
unq_res = sorted({a: start[a] for a in start.keys() if a not in other_feats}.items(),
                 key=itemgetter(1), reverse=True)
|
myfile2.write("==" * 20 + "\n") |
|
myfile2.write("CLUSTER: " + str(i) + "\n") |
|
myfile2.write("Number of concepts: " + str(len(unq[i])) + "\n") |
|
myfile2.write("Number of unique concepts: " + str(len(unq_res)) + "\n") |
|
myfile2.write("*" * 10 + "\n") |
|
for y in unq_res: |
|
myfile2.write(' - '.join((y[0], str(y[1]))) + "\n") |
|
myfile2.write("==" * 20 + "\n") |
|
myfile2.close() |