Skip to content

Instantly share code, notes, and snippets.

@groverpr
Last active February 25, 2020 01:46
Show Gist options
  • Save groverpr/c1b9dcce85cc058e0bc350c58f4b5095 to your computer and use it in GitHub Desktop.
Save groverpr/c1b9dcce85cc058e0bc350c58f4b5095 to your computer and use it in GitHub Desktop.
class Tokenize(BaseEstimator, TransformerMixin):
    """
    Takes in a pandas Series and tokenizes each row based on a split pattern.

    Every match of ``split_pat`` is surrounded with spaces and the string is
    then split on whitespace, so the matched delimiters survive as tokens of
    their own.

    Parameters
    ----------
    split_pat : str
        Regular expression describing the delimiters. It must contain one
        capturing group, because the substitution refers to ``\\1``. The
        default matches any single character of ``string.punctuation``.
    """

    # The punctuation characters are passed through re.escape before being
    # placed inside the character class. Unescaped, the ``\]`` sequence in
    # string.punctuation is read as an escaped ``]`` and the backslash itself
    # is never treated as a delimiter.
    def __init__(self, split_pat=f"([{re.escape(string.punctuation)}])"):
        self.split_pat = split_pat

    def tokenize(self, s):
        """Tokenize a single string and return its tokens as a list."""
        re_tok = re.compile(self.split_pat)
        # Pad every delimiter with spaces, then split over whitespace.
        return re_tok.sub(r' \1 ', s).split()

    def fit(self, X, y=None):
        """Nothing is learned from the training data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Apply ``tokenize`` to every element of ``X``."""
        return X.apply(self.tokenize)
class Tok2Idx(BaseEstimator, TransformerMixin):
    """
    Creates an integer index from tokenized columns.

    ``fit`` builds a dictionary of all unique tokens and their integer index;
    ``transform`` maps any token unseen in the training data to the ``<unk>``
    token. Needs a pandas Series of token sequences as input.

    Attributes set by ``fit``
    -------------------------
    uniq_set : list of the unique tokens, in first-seen order, including
        ``"<unk>"``.
    tok2idx : dict mapping token -> integer index.
    idx2tok : dict mapping integer index -> token.
    """

    def map_tok_idx(self, char):
        """Return the index of ``char``, or the ``<unk>`` index if unknown."""
        return self.tok2idx.get(char, self.tok2idx["<unk>"])

    def fit(self, X, y=None):
        """To be called for training data. Creates the token to integer map."""
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # same training data always yields the same indices. Iterating a set
        # of strings does not guarantee that across interpreter runs.
        vocab = dict.fromkeys(tok for row in X.values for tok in row)
        # Add the unknown marker only once, even if the data already has it.
        vocab.setdefault("<unk>")
        self.uniq_set = list(vocab)
        self.tok2idx = {tok: idx for idx, tok in enumerate(self.uniq_set)}
        self.idx2tok = dict(enumerate(self.uniq_set))
        return self

    def transform(self, X, y=None):
        """Replace every token of every row of ``X`` by its integer index."""
        return X.map(lambda row: [self.map_tok_idx(tok) for tok in row])
class LowerCaser(BaseEstimator, TransformerMixin):
    """
    Lower case all string values.

    Needs a pandas Series as input. Nothing is learned from the data, but a
    no-op ``fit`` is still provided because ``fit_transform`` and pipelines
    call ``fit`` before ``transform``.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with every string value lower-cased."""
        return X.str.lower()
class NullImputer(SimpleImputer):
    """
    SimpleImputer variant for one-dimensional input.

    SimpleImputer works with 2D arrays; for the purpose of this analysis the
    data are pandas Series. The input is wrapped in a one-column DataFrame
    before being handed to the parent class, and the single imputed column is
    returned as a Series.

    Parameters
    ----------
    missing_values, strategy, fill_value
        Forwarded unchanged to ``SimpleImputer.__init__``.
    **kw
        Accepted but not forwarded to the parent class.
    """

    def __init__(self, missing_values=np.nan, strategy='mean', fill_value=None, **kw):
        super().__init__(
            missing_values=missing_values,
            strategy=strategy,
            fill_value=fill_value,
        )

    def fit(self, X, y=None):
        """Fit the imputer on ``X`` viewed as a one-column frame; returns ``self``."""
        super().fit(pd.DataFrame(X), y)
        return self

    def transform(self, X):
        """Impute ``X`` and return the result as a pandas Series."""
        result = super().transform(pd.DataFrame(X))
        # Converting the 2D result back to a Series by taking column 0.
        # NOTE(review): the Series is rebuilt from the imputer's output, so it
        # presumably carries a fresh default index rather than X's index.
        # Confirm callers do not rely on index alignment.
        return pd.DataFrame(result)[0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment