Last active
February 25, 2020 01:46
-
-
Save groverpr/c1b9dcce85cc058e0bc350c58f4b5095 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Tokenize(BaseEstimator, TransformerMixin):
    """Split every string of a pandas Series into a list of tokens.

    Each match of ``split_pat`` is surrounded with spaces and the string is
    then split on whitespace, so matched delimiters are kept as tokens of
    their own.

    Parameters
    ----------
    split_pat : str
        Regular expression used to find delimiters. It must contain one
        capturing group, because the replacement refers to ``\\1``. The
        default matches any single character of ``string.punctuation``.
    """

    # The punctuation characters are escaped before being placed inside the
    # character class. Unescaped, the sequence ``\]`` in string.punctuation is
    # read as an escaped ``]``, so the backslash itself would never be matched,
    # and the bare ``[`` triggers a "possible nested set" FutureWarning.
    def __init__(self, split_pat=f"([{re.escape(string.punctuation)}])"):
        self.split_pat = split_pat

    def tokenize(self, s):
        """Return the list of tokens found in the string ``s``."""
        delimiter_re = re.compile(self.split_pat)
        # Pad every delimiter with spaces, then split on whitespace.
        return delimiter_re.sub(r" \1 ", s).split()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; tokenization learns nothing from data."""
        return self

    def transform(self, X, y=None):
        """Apply :meth:`tokenize` to every element of ``X`` via ``X.apply``."""
        return X.apply(self.tokenize)
class Tok2Idx(BaseEstimator, TransformerMixin):
    """Map tokenized rows to lists of integer indices.

    ``fit`` builds a vocabulary of the unique tokens in the training data and
    assigns each an integer index; ``"<unk>"`` is always the last entry.
    ``transform`` replaces every token by its index and maps tokens that were
    not seen during ``fit`` to the index of ``"<unk>"``.

    Expects a pandas Series whose elements are iterables of tokens.

    Attributes set by ``fit``
    -------------------------
    uniq_set : list
        Unique tokens in first-seen order, followed by ``"<unk>"``.
    tok2idx : dict
        Token -> integer index.
    idx2tok : dict
        Integer index -> token.
    """

    def map_tok_idx(self, char):
        """Return the index of token ``char``, or the ``<unk>`` index if unseen."""
        return self.tok2idx.get(char, self.tok2idx["<unk>"])

    def fit(self, X, y=None):
        """Build the token <-> index maps from the training data ``X``."""
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # index assignment is reproducible for the same input (iteration order
        # of a set of strings can differ between interpreter runs).
        vocab = dict.fromkeys(tok for row in X.values for tok in row)
        # A literal "<unk>" token in the data must not produce a second entry,
        # otherwise tok2idx and idx2tok would disagree with each other.
        vocab.pop("<unk>", None)
        self.uniq_set = [*vocab, "<unk>"]
        self.tok2idx = {tok: idx for idx, tok in enumerate(self.uniq_set)}
        self.idx2tok = dict(enumerate(self.uniq_set))
        return self

    def transform(self, X, y=None):
        """Replace every token in every row of ``X`` by its integer index."""
        return X.map(lambda row: [self.map_tok_idx(tok) for tok in row])
class LowerCaser(BaseEstimator, TransformerMixin):
    """Lower-case all string values of a pandas Series.

    The transformer is stateless, but ``fit`` is still defined because
    ``TransformerMixin.fit_transform`` calls ``self.fit`` before
    ``transform``; without it that call fails with ``AttributeError``.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with every string lower-cased via ``X.str.lower()``."""
        return X.str.lower()
class NullImputer(SimpleImputer):
    """``SimpleImputer`` variant that accepts and returns a pandas Series.

    ``SimpleImputer`` works on 2-D input, so the input is wrapped in a
    single-column DataFrame before delegating to the parent class, and the
    first column of the result is returned as a Series.

    Parameters
    ----------
    missing_values, strategy, fill_value
        Passed through unchanged to ``SimpleImputer.__init__``.
    **kw
        Accepted for call compatibility but not forwarded to the parent.
    """

    def __init__(self, missing_values=np.nan, strategy="mean", fill_value=None, **kw):
        super().__init__(
            missing_values=missing_values,
            strategy=strategy,
            fill_value=fill_value,
        )

    def fit(self, X, y=None):
        """Fit the parent imputer on ``X`` wrapped as a DataFrame; return ``self``."""
        super().fit(pd.DataFrame(X), y)
        return self

    def transform(self, X):
        """Impute ``X`` and return the first result column as a Series.

        The index of ``X`` is carried over when ``X`` has one, so the output
        stays aligned with the input; otherwise a default index is used.
        """
        result = super().transform(pd.DataFrame(X))
        # The parent returns a 2-D result; column 0 is the imputed series.
        return pd.DataFrame(result, index=getattr(X, "index", None))[0]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment