Skip to content

Instantly share code, notes, and snippets.

@groverpr
Created February 25, 2020 01:42
Show Gist options
  • Save groverpr/00e12432e81e51f3d461c815bdddee3b to your computer and use it in GitHub Desktop.
Save groverpr/00e12432e81e51f3d461c815bdddee3b to your computer and use it in GitHub Desktop.
import re

from sklearn.compose import ColumnTransformer

# Transformers in a Pipeline run in series, each feeding the next: 1 -> 2 -> 3 -> 4
transformer_pipe = Pipeline(steps=[
    ("null_impute", NullImputer(strategy="constant", fill_value="null")),
    ("lower_case", LowerCaser()),
    # string.punctuation contains the pair `\]`. Dropped raw into a regex character
    # class, that pair is parsed as an escaped `]`, so the backslash itself is left
    # out of the class. re.escape makes every punctuation character a literal member.
    # NOTE(review): assumes Tokenize treats its argument as a regex - confirm.
    ("tokenize", Tokenize(f"([{re.escape(string.punctuation)}])")),
    ("token2index", Tok2Idx()),
])

# Fit on the training data only, then reuse the fitted pipeline on the other splits.
X_train_transformed = transformer_pipe.fit_transform(X_train)
X_valid_transformed = transformer_pipe.transform(X_valid)
X_test_transformed = transformer_pipe.transform(X_test)

# Save the fitted pipeline to a file for later use: the same transformation must be
# applied whenever models are evaluated on new datasets (batch transform or
# real-time inference).
joblib.dump(transformer_pipe, "filepath")

# Load the saved pipeline.
# NOTE: joblib.load is pickle-based and can execute arbitrary code while loading,
# so only load files that come from a trusted source.
transformer_pipe = joblib.load("filepath")

# You can also build a column transformer that applies different pipelines to
# different sets of features and then combines all of the transformed features
# together, e.g. (transformer_pipe1 / transformer_pipe2 and the column lists below
# are placeholders for your own pipelines and feature names):
column_transformer = ColumnTransformer(transformers=[
    ("pipeline1", transformer_pipe1, ["list-of-features1"]),
    ("pipeline2", transformer_pipe2, ["list-of-features2"]),
])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment