Last active
April 8, 2018 14:04
-
-
Save AntonOsika/cadee92254cde2e4b0bebbd0dd2012d3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bisect | |
from sklearn.preprocessing import LabelEncoder | |
from sklearn.utils.validation import check_is_fitted | |
from sklearn.utils import column_or_1d | |
import numpy as np | |
class CategoricalTransform(LabelEncoder):
    """Encode labels as integers between 0 and n_classes-1.

    Handles unseen labels and treats everything as strings: every value is
    converted with ``astype('str')`` before fitting or transforming. A
    reserved ``'<unknown>'`` class is always part of ``classes_``; any label
    that is not one of the fitted classes is encoded as that class.

    Parameters
    ----------
    min_category_size : int or None, default=None
        Number of samples necessary to form a separate class when fitting.
        Labels seen fewer times are left out of ``classes_`` and are
        therefore encoded as ``'<unknown>'``. A falsy value (``None`` or
        ``0``) disables the filtering.

    Attributes
    ----------
    classes_ : ndarray of str
        Sorted, unique class labels, including ``'<unknown>'``. The code of
        a label is its position in this array.

    Read more in sklearn :ref:`User Guide <preprocessing_targets>`.
    """

    # Reserved label that stands in for unseen / too-rare categories.
    _UNKNOWN = '<unknown>'

    def __init__(self, min_category_size=None):
        super(CategoricalTransform, self).__init__()
        self.min_category_size = min_category_size

    def fit(self, y):
        """Fit label encoder.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        labels = np.asarray(y).astype('str')
        classes, counts = np.unique(labels, return_counts=True)
        if self.min_category_size:
            classes = classes[counts >= self.min_category_size]
        # np.unique keeps the classes sorted and guarantees '<unknown>'
        # appears exactly once, even if that string occurs in the data.
        # Storing an ndarray (not a list) keeps the inherited
        # ``inverse_transform`` working, since it indexes ``classes_`` with
        # an integer array.
        self.classes_ = np.unique(np.append(classes, self._UNKNOWN))
        return self

    def transform(self, y):
        """Transform labels to normalized encoding.

        Labels that are not in ``classes_`` receive the code of the
        ``'<unknown>'`` class.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : ndarray of int, shape [n_samples]
            Position of each label in ``classes_``.
        """
        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)
        labels = np.asarray(y).astype('str')
        classes = np.asarray(self.classes_)

        # Do NOT write '<unknown>' into ``labels``: it is a fixed-width
        # unicode array, so the sentinel would be silently truncated when
        # the inputs are short strings. Work on the integer codes instead.
        codes = np.searchsorted(classes, labels)
        # searchsorted returns len(classes) for labels sorting after the
        # last class; clip so the look-up below stays in bounds.
        codes = np.minimum(codes, len(classes) - 1)
        known = classes[codes] == labels
        unknown_code = np.searchsorted(classes, self._UNKNOWN)
        return np.where(known, codes, unknown_code)

    def fit_transform(self, y):
        """Fit the encoder on ``y`` and return the encoded ``y``.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : ndarray of int, shape [n_samples]
        """
        return self.fit(y).transform(y)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment