Created
March 20, 2019 14:01
-
-
Save sebp/9c718d4f031a12378051a8581ba7960e to your computer and use it in GitHub Desktop.
Plot missing values.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
from matplotlib.backends.backend_pdf import PdfPages | |
def index_to_binary_matrix(index):
    """Expand colon-separated pattern labels into an integer indicator matrix.

    :param index: Labels such as ``"1:0:1"`` -- integer flags joined by ``":"``.
    :type index: pandas.Index
    :return: Frame with one row per label (indexed by ``index``) and one
        integer column per flag position. An empty ``index`` yields an
        empty frame.
    :rtype: pandas.DataFrame
    """
    # Parse every label in a single pass. Going through ``Series.apply`` with
    # a Series-returning function builds one Series per label and hands back
    # a Series (not a frame) when ``index`` is empty, which breaks callers
    # that use ``.shape[1]`` or column-wise ``.iloc``.
    rows = [[int(flag) for flag in label.split(":")] for label in index]
    return pd.DataFrame(rows, index=index)
class MissingValuesPlot:
    """Plot missing values"""

    def __init__(self, data, label=None, sort_by_missing=True):
        """
        :param data: Data whose missing values are analysed
        :type data: pandas.DataFrame
        :param label: Label of the data (used as x-axis label of the plots)
        :type label: str
        :param sort_by_missing: Whether to sort columns by
            amount of missingness (most missing first)
        """
        self.data = data
        self.label = label
        self.sort_by_missing = sort_by_missing

    def get_missings(self):
        """Count the missing values of every column.

        :return: Number of null entries per column, indexed by column label
        :rtype: pandas.Series
        """
        # ``isnull().sum()`` is the column-wise count; the ``reduce`` keyword
        # of ``DataFrame.apply`` that was used here before no longer exists
        # in current pandas releases.
        return self.data.isnull().sum()

    def get_combinations(self):
        """Tabulate the distinct per-row missingness patterns.

        Each row of the data is encoded as a string with one flag per column
        joined by ``":"`` (``1`` = missing, ``0`` = present).

        :return: Frame indexed by pattern string with columns ``Count``
            (number of rows showing the pattern) and ``Percent`` (share of
            all rows, 0-100), ordered from most to least frequent
        :rtype: pandas.DataFrame
        """
        flags = self.data.isnull().values.astype(np.int8).tolist()
        # dtype=object keeps the Series well-defined when there are no rows.
        miss_patterns = pd.Series(
            [":".join(map(str, row)) for row in flags],
            index=self.data.index, dtype=object)

        counts = miss_patterns.value_counts()
        # Start from ``counts`` itself so its descending order is preserved;
        # ``plot_combinations`` relies on ``head()`` returning the most
        # frequent patterns.
        df = counts.to_frame("Count")
        df["Percent"] = counts * 100 / self.data.shape[0]

        # Sanity check: every row belongs to exactly one pattern.
        assert df["Count"].sum() == self.data.shape[0]
        return df

    def plot_combinations(self, max_patterns=None, **fig_kw):
        """Plot combinations of missing values

        Draws a heatmap of the missingness patterns (rows) over the columns
        of the data next to a horizontal bar chart of each pattern's
        percentage.

        :param max_patterns: If not None, only the ``max_patterns`` most
            frequent patterns are shown
        :param fig_kw: Forwarded to ``matplotlib.pyplot.subplots``
        :return: The created figure
        """
        miss = self.get_missings()
        combs = self.get_combinations()
        if max_patterns is not None:
            combs = combs.head(max_patterns)
        binary_mat = index_to_binary_matrix(combs.index)

        fig, (ax1, ax2) = plt.subplots(
            1, 2, sharey=True,
            gridspec_kw={"width_ratios": [4, 1], "wspace": 0.01,
                         "bottom": 0.25, "top": 0.96},
            **fig_kw)

        # Rows are reversed so that patterns run from least to most frequent.
        if self.sort_by_missing:
            order = np.argsort(-miss.values)
            data = binary_mat.iloc[::-1, order]
            xticklabels = miss.iloc[order].index.tolist()
        else:
            data = binary_mat.iloc[::-1, :]
            xticklabels = miss.index.tolist()

        sns.heatmap(data, xticklabels=xticklabels,
                    cbar=False, yticklabels=True, linewidths=.1,
                    ax=ax1)
        ax1.set_title("Combinations")
        for t in ax1.get_xticklabels():
            t.set_rotation('vertical')
        if self.label is not None:
            ax1.set_xlabel(self.label)

        pos = ax1.get_yticks()
        percent = combs["Percent"].values
        # Keep each bar next to its own heatmap row. When the heatmap is
        # drawn on an inverted y-axis, the first tick belongs to the first
        # row of ``data``, i.e. the *last* row of ``combs``, so the
        # percentages must be reversed as well. When the axis is not
        # inverted (heatmap implementations that flip the rows instead), the
        # first tick belongs to the last row of ``data`` and ``combs`` order
        # already matches.
        # NOTE(review): verify the alignment visually with the installed
        # seaborn version.
        if ax1.yaxis_inverted():
            percent = percent[::-1]

        ax2.barh(pos, width=percent, height=.9, align="center")
        ax2.yaxis.grid(False)
        ax1.yaxis.set_visible(False)
        ax2.yaxis.set_visible(False)
        ax2.set_xlabel("Percent")

        # A twin axis on the right carries the percentage of each bar as
        # tick labels.
        ax3 = ax2.twinx()
        ax3.set_yticks(pos)
        ax3.set_ylim(ax2.get_ylim())
        ax3.set_yticklabels(["%.1f%%" % v for v in percent])
        ax3.yaxis.grid(False)
        return fig

    def plot_percentage(self, **kwargs):
        """Plot percentage of missingness

        Draws one bar per column showing the proportion (0-1) of its values
        that are missing.

        :param kwargs: Forwarded to ``pandas.Series.plot.bar``
        :return: The axes the bars were drawn on
        """
        miss = self.get_missings()
        df_percentage = miss / self.data.shape[0]
        if self.sort_by_missing:
            data = df_percentage.sort_values(ascending=False)
        else:
            data = df_percentage

        ax = data.plot.bar(**kwargs)
        # Leave room below the axes (applies to the current pyplot figure).
        plt.subplots_adjust(bottom=0.25, top=0.96)
        ax.set_ylabel("Proportion of missings")
        if self.label is not None:
            ax.set_xlabel(self.label)
        return ax
def drop_completely_missing(data):
    """Return a copy of ``data`` without the rows in which every value is missing.

    :type data: pandas.DataFrame
    :return: Independent copy of the rows that have at least one non-null value
    :rtype: pandas.DataFrame
    """
    # Fraction of null entries per row; exactly 1 means the whole row is missing.
    missing_fraction = data.isnull().sum(axis=1).div(data.shape[1])
    has_observation = missing_fraction.ne(1)
    return data.loc[has_observation].copy()
def get_top_missing_pattern(data):
    """Summarise the most frequent missingness pattern of ``data``.

    :type data: pandas.DataFrame
    :return: One-row frame indexed by the most frequent pattern string.
        Besides ``Count`` and ``Percent`` of that pattern it holds
        ``Missing`` (number of missing columns in the pattern),
        ``Complete cases`` / ``Complete cases (percentage)`` (rows without
        any missing value), ``Feature`` (column with the most missing
        values) and ``Feature Missing`` (percentage of rows in which that
        column is missing).
    :rtype: pandas.DataFrame
    """
    mvp = MissingValuesPlot(data)
    all_combs = mvp.get_combinations()

    top_comb = all_combs.head(1).copy()
    binary_mat = index_to_binary_matrix(top_comb.index)
    top_comb["Missing"] = binary_mat.sum(axis=1)

    # The all-zero pattern stands for rows with no missing value at all.
    complete_index = ":".join(["0"] * binary_mat.shape[1])
    if complete_index in all_combs.index:
        complete = all_combs.loc[complete_index, :]
    else:
        # No complete rows: report zero for both statistics. The two labels
        # must be separate index entries so the lookups below succeed.
        complete = pd.Series([0, 0], index=["Count", "Percent"])
    top_comb["Complete cases"] = complete["Count"]
    top_comb["Complete cases (percentage)"] = complete["Percent"]

    top_feature = mvp.get_missings().sort_values(ascending=False).head(1)
    # str() instead of the deprecated Index.format().
    top_comb["Feature"] = str(top_feature.index[0])
    top_comb["Feature Missing"] = top_feature.iloc[0] * 100 / data.shape[0]
    return top_comb
def plot_missing_values_as_pdf(data, name, sort_by_missing=True, max_patterns=None,
                               xlabel=None, figsize=None):
    """Write the missing-value plots of ``data`` to the two-page file ``<name>.pdf``.

    The first page shows the combinations of missing values, the second the
    proportion of missing values per column.

    :type data: pandas.DataFrame
    :param name: Output path without the ``.pdf`` extension
    :type name: str
    :param sort_by_missing: Whether to sort columns by amount of missingness
    :param max_patterns: Maximum number of combinations to plot
    :param xlabel: Label of the data, passed on as the plots' label
    :param figsize: Size of the figure
    :type figsize: tuple
    """
    plotter = MissingValuesPlot(data, label=xlabel, sort_by_missing=sort_by_missing)

    with PdfPages("%s.pdf" % name) as pdf:

        def _write_page():
            # Append the current pyplot figure as a page, then release it.
            pdf.savefig(bbox_inches="tight")
            plt.close()

        # Page 1: missingness patterns (the method creates its own figure).
        plotter.plot_combinations(max_patterns=max_patterns, figsize=figsize)
        _write_page()

        # Page 2: per-column proportion, drawn on a freshly created figure.
        plt.figure(figsize=figsize)
        plotter.plot_percentage()
        _write_page()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment