Created
February 25, 2020 01:42
-
-
Save groverpr/00e12432e81e51f3d461c815bdddee3b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Preprocessing pipeline. The steps run in series, each one consuming the
# previous step's output: null_impute -> lower_case -> tokenize -> token2index.
transformer_pipe = Pipeline(steps=[
    ("null_impute", NullImputer(strategy="constant", fill_value="null")),
    ("lower_case", LowerCaser()),
    # NOTE(review): if Tokenize treats this string as a regex, the "\]" inside
    # string.punctuation is parsed as an escaped "]" within the character
    # class, so a literal backslash is not matched. Consider
    # re.escape(string.punctuation) if backslashes should be split out too.
    ("tokenize", Tokenize(f"([{string.punctuation}])")),
    ("token2index", Tok2Idx()),
])

# Fit on the training split only, then reuse the already-fitted pipeline for
# the validation and test splits so all three get the same transformation.
X_train_transformed = transformer_pipe.fit_transform(X_train)
X_valid_transformed = transformer_pipe.transform(X_valid)
X_test_transformed = transformer_pipe.transform(X_test)
# Persist the fitted pipeline so exactly the same transformation can be
# applied later, when models are evaluated on new datasets (batch transform
# or real-time inference). "filepath" is a placeholder for the real location.
joblib.dump(transformer_pipe, "filepath")

# Reload the saved pipeline.
# NOTE: joblib persistence is pickle-based, so only load files that come from
# a trusted source; unpickling can execute arbitrary code.
transformer_pipe = joblib.load("filepath")
# You can also build a column transformer that applies a different pipeline
# to each set of features and then combines all of the transformed features.
# Each entry below is a (name, transformer, columns) triple.
# NOTE(review): transformer_pipe1 / transformer_pipe2 and the feature lists
# are not defined in this snippet; they appear to be placeholders for
# pipelines and column names supplied by the reader.
from sklearn.compose import ColumnTransformer

column_transformer = ColumnTransformer(transformers=[
    ("pipeline1", transformer_pipe1, ["list-of-features1"]),
    ("pipeline2", transformer_pipe2, ["list-of-features2"]),
])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment