.pipe()
Niwako Sugimura, Ph.D., Shad Sharma, MASc., Houda Aynaou, MA
DataFrame.pipe(func, *args, **kwargs)
def func(df, args):
    """Template for a step usable with ``df.pipe(func, args)``.

    Hands ``df`` and ``args`` to ``transform`` and returns whatever it
    produces, so the result can be fed straight into the next ``.pipe()``.
    """
    return transform(df, args)
# Chain of three .pipe() steps. Assuming df is a pandas DataFrame, each
# .pipe(f, extra) call evaluates f(<current frame>, extra), so the steps run
# top to bottom and the last result is bound to transformed_df.
transformed_df = (
df.pipe(func, args)
.pipe(some_other_func, other_args)
.pipe(yet_another_func, yet_more_args)
)
# Column labels that drop_non_features selects from the frame.
# NOTE(review): the trailing `...` is a literal Ellipsis element and looks
# like a placeholder for the remaining column names — replace before running.
FEATURES = ["column 1", "column 2", ...]
def drop_non_features(df):
    """Return only the columns of ``df`` that are listed in ``FEATURES``."""
    features_only = df[FEATURES]
    return features_only
.pipe()
class NormalizeNumerical:
    """Callable pipeline step that transforms ``NUMERICAL_COLUMNS`` with a ``Normalizer``.

    The scaler is created and fitted when the step is called with
    ``is_train=True`` and is stored on the instance as ``self.scaler``, so
    later calls with ``is_train=False`` reuse that fitted scaler.
    """

    def __call__(self, df, is_train):
        """Return a copy of ``df`` with ``NUMERICAL_COLUMNS`` transformed.

        Args:
            df: Frame containing every column named in ``NUMERICAL_COLUMNS``.
            is_train: When true, fit a fresh ``Normalizer`` on ``df`` before
                transforming; otherwise reuse the previously fitted one.

        Returns:
            The result of ``df.copy()`` with ``NUMERICAL_COLUMNS`` overwritten
            by the scaler's output.

        Raises:
            AttributeError: If called with ``is_train=False`` before any call
                with ``is_train=True`` (there is no fitted scaler to reuse).
        """
        if is_train:
            self.scaler = Normalizer()
            self.scaler.fit(df[NUMERICAL_COLUMNS])
        elif not hasattr(self, "scaler"):
            # Same exception type as the bare attribute lookup would raise,
            # but with a message that says how to fix the call order.
            raise AttributeError(
                "NormalizeNumerical has no fitted scaler; call it with "
                "is_train=True before using is_train=False"
            )
        # Write into a copy so the caller's frame is not assigned to.
        df = df.copy()
        df[NUMERICAL_COLUMNS] = self.scaler.transform(df[NUMERICAL_COLUMNS])
        return df


# Single shared instance: it holds the fitted scaler between pipeline calls.
normalize_numerical = NormalizeNumerical()
class StandardizeNumerical:
    """Callable pipeline step that transforms ``NUMERICAL_COLUMNS`` with a ``StandardScaler``.

    The scaler is created and fitted when the step is called with
    ``is_train=True`` and is stored on the instance as ``self.scaler``, so
    later calls with ``is_train=False`` reuse that fitted scaler.
    """

    def __call__(self, df, is_train):
        """Return a copy of ``df`` with ``NUMERICAL_COLUMNS`` transformed.

        Args:
            df: Frame containing every column named in ``NUMERICAL_COLUMNS``.
            is_train: When true, fit a fresh ``StandardScaler`` on ``df``
                before transforming; otherwise reuse the previously fitted one.

        Returns:
            The result of ``df.copy()`` with ``NUMERICAL_COLUMNS`` overwritten
            by the scaler's output.

        Raises:
            AttributeError: If called with ``is_train=False`` before any call
                with ``is_train=True`` (there is no fitted scaler to reuse).
        """
        if is_train:
            self.scaler = StandardScaler()
            self.scaler.fit(df[NUMERICAL_COLUMNS])
        elif not hasattr(self, "scaler"):
            # Same exception type as the bare attribute lookup would raise,
            # but with a message that says how to fix the call order.
            raise AttributeError(
                "StandardizeNumerical has no fitted scaler; call it with "
                "is_train=True before using is_train=False"
            )
        # Write into a copy so the caller's frame is not assigned to.
        df = df.copy()
        df[NUMERICAL_COLUMNS] = self.scaler.transform(df[NUMERICAL_COLUMNS])
        return df


# Single shared instance: it holds the fitted scaler between pipeline calls.
standardize_numerical = StandardizeNumerical()
def preprocess(df, is_train):
    """Run ``df`` through the preprocessing pipeline and return the result.

    Steps, in order: ``drop_non_features``, then ``standardize_numerical``.
    ``is_train`` is forwarded unchanged to ``standardize_numerical``.
    """
    return (
        df.pipe(drop_non_features)
        # Alternative step, left disabled:
        # .pipe(normalize_numerical, is_train=is_train)
        .pipe(standardize_numerical, is_train=is_train)
    )
model = Model() # Using whatever model we want
# Only the training frame is preprocessed with is_train=True; the test and
# unlabeled frames go through the same preprocess pipeline with is_train=False.
model.fit(X_train.pipe(preprocess, is_train=True), y_train)
model.score(X_test.pipe(preprocess, is_train=False), y_test)
y_unlabeled = model.predict(X_unlabeled.pipe(preprocess, is_train=False))
def preprocess(df, is_train):
    """Run ``df`` through the extended preprocessing pipeline.

    Active steps, in order: ``drop_non_features``, ``standardize_numerical``,
    ``impute_missing_numerical``, ``onehot_encode_categorical``. ``is_train``
    is forwarded unchanged to every step that accepts it.
    """
    return (
        df.pipe(drop_non_features)
        # Alternative step, left disabled:
        # .pipe(normalize_numerical, is_train=is_train)
        .pipe(standardize_numerical, is_train=is_train)
        # Alternative step, left disabled:
        # .pipe(replace_missing_numerical, fill=-999)
        .pipe(impute_missing_numerical, is_train=is_train)
        .pipe(onehot_encode_categorical, is_train=is_train)
    )
.pipe()
The main difference is that you need to either use a temporary variable:
def preprocess(df, is_train):
    """Preprocess ``df`` step by step, rebinding one temporary variable.

    Works on ``df.copy()`` and applies ``drop_non_features``,
    ``standardize_numerical``, ``impute_missing_numerical`` and
    ``onehot_encode_categorical`` in that order.
    """
    frame = df.copy()
    frame = drop_non_features(frame)
    # Alternative step, left disabled:
    # frame = normalize_numerical(frame, is_train=is_train)
    frame = standardize_numerical(frame, is_train=is_train)
    # Alternative step, left disabled:
    # frame = replace_missing_numerical(frame, fill=-999)
    frame = impute_missing_numerical(frame, is_train=is_train)
    frame = onehot_encode_categorical(frame, is_train=is_train)
    return frame
Or you end up with some ugly nested function calls:
def preprocess(df, is_train):
    """Preprocess ``df`` with directly nested function calls.

    ``is_train`` is forwarded unchanged to every step that accepts it.
    """
    # Evaluation runs inside-out: drop_non_features -> standardize_numerical
    # -> impute_missing_numerical -> onehot_encode_categorical.
    return onehot_encode_categorical(
        impute_missing_numerical(
            standardize_numerical(
                drop_non_features(df),
                is_train=is_train,
            ),
            is_train=is_train,
        ),
        is_train=is_train,
    )