Skip to content

Instantly share code, notes, and snippets.

@achinta
Last active July 18, 2022 19:44
Show Gist options
  • Save achinta/d56aa05a3185248cb9beff314dcbf1f6 to your computer and use it in GitHub Desktop.
Save achinta/d56aa05a3185248cb9beff314dcbf1f6 to your computer and use it in GitHub Desktop.
from sklearn.base import TransformerMixin
class CuCategoryEncoder(TransformerMixin):
    """Incremental label encoder for cudf DataFrames (runs on GPU).

    Once ``fit`` has been called, ``sklearn.preprocessing.LabelEncoder``
    cannot encode new categories. This encoder can be fitted any number of
    times: each call appends the categories it has not seen before, without
    changing the encoding of the existing ones.

    The code assigned to a category is its row position in the category
    table kept in ``self.cats`` (one table per column, or a single table
    under the key ``'shared'`` when ``share_cats`` is true).

    NOTE(review): cudf merges may not preserve the row order of the input
    frame -- confirm against the cuDF documentation if the order of the
    rows returned by ``transform`` / ``inverse_transform`` matters.
    """

    def __init__(self, cols, auto_fit=False, share_cats=False):
        """Create an encoder for the given columns.

        Args:
            cols: list of column names to encode.
            auto_fit: when true, ``transform`` calls ``fit`` on its input
                first, so unseen categories get a code instead of a null.
            share_cats: when true, all columns share one category table
                (and therefore one code space); otherwise each column has
                its own table.

        Raises:
            TypeError: if ``cols`` is not a list.
        """
        # Explicit check instead of ``assert``: asserts vanish under ``-O``.
        if not isinstance(cols, list):
            raise TypeError(
                f'cols must be a list of column names, got {type(cols).__name__}'
            )
        self.cols = cols
        self.auto_fit = auto_fit
        self.share_cats = share_cats
        # Category tables, keyed by column name (or 'shared'). Kept on the
        # instance so that separate encoders never leak categories into
        # each other.
        self.cats = {}

    def _cat_key(self, col):
        """Return the ``self.cats`` key under which ``col``'s table lives."""
        return 'shared' if self.share_cats else col

    def fit(self, df, y=None):
        """Add the not-yet-seen values of every configured column.

        Existing categories keep their codes; new ones are appended to the
        end of the relevant category table.

        Args:
            df: cudf DataFrame containing every column in ``self.cols``.
            y: ignored; accepted so that ``fit_transform(X, y)`` works.

        Returns:
            The encoder itself.
        """
        for col in self.cols:
            key = self._cat_key(col)
            if key not in self.cats:
                # Start with an empty table that has the column's dtype.
                self.cats[key] = cudf.DataFrame({'cats': []}, dtype=df[col].dtype)
            known = self.cats[key]
            # Left-join the values against the known categories; the rows
            # whose 'cats' side is null are the values not seen before.
            # Only the column being fitted takes part in the join.
            joined = df[[col]].merge(known, left_on=col, right_on='cats', how='left')
            unseen = joined[joined['cats'].isnull()][col].unique()
            new_cats = cudf.DataFrame({'cats': unseen})
            # ignore_index keeps the codes contiguous: 0..n-1.
            self.cats[key] = cudf.concat([known, new_cats], ignore_index=True)
        return self

    def transform(self, df):
        """Replace the values of the configured columns with their codes.

        When ``auto_fit`` is true the encoder is fitted on ``df`` first.
        Values without a category end up null (left join without a match).
        Each encoded column is dropped and re-added, so it moves to the end
        of the column list.

        Args:
            df: cudf DataFrame containing every column in ``self.cols``.

        Returns:
            A new cudf DataFrame; the input frame is not modified.
        """
        if self.auto_fit:
            self.fit(df)
        for col in self.cols:
            # reset_index exposes the row position as an 'index' column,
            # which is the category code.
            codes = self.cats[self._cat_key(col)].reset_index()
            df = (
                df.merge(codes, left_on=col, right_on='cats', how='left')
                .drop([col, 'cats'], axis=1)
                .rename(columns={'index': col})
            )
        return df

    def inverse_transform(self, df):
        """Replace the codes of the configured columns with their categories.

        Args:
            df: cudf DataFrame whose ``self.cols`` columns hold codes
                produced by ``transform``.

        Returns:
            A new cudf DataFrame; the input frame is not modified.
        """
        for col in self.cols:
            codes = self.cats[self._cat_key(col)].reset_index()
            df = (
                df.merge(codes, how='left', left_on=col, right_on='index')
                # axis=1: drop the helper *columns*, as transform does.
                .drop(['index', col], axis=1)
                .rename(columns={'cats': col})
            )
        return df
# Demo: encode two small random integer columns that share one code space.
pdf = pd.DataFrame(dict(
    sa=np.random.randint(1, 10, 3),
    da=np.random.randint(10, 20, 3),
))
df = cudf.from_pandas(pdf)
encoder = CuCategoryEncoder(cols=['sa', 'da'], auto_fit=True, share_cats=True)

# Re-run everything below as often as you like: categories already known
# keep their codes, new ones are appended.
encoder.fit(df)
print('df:\n ', df)
print('cats:\n ', encoder.cats)

# Round trip: values -> codes -> values.
transformed = encoder.transform(df)
print('trans:\n ', transformed.head())
print('inv:\n ', encoder.inverse_transform(transformed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment