Creating a Pipeline model
In this section, we provide a guide to creating a Pipeline model. This type of model is highly recommended when working with EXPAI.
import os
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics as ms
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
# Define the path to the sample file
original_sample_path = os.path.abspath("./car_ad.csv")
# Read the file
df = pd.read_csv(original_sample_path, encoding='iso-8859-1', sep=";", index_col=0)
df.head()
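Before cleaning, it can help to get an overview of the column types and non-null counts. This optional check is not part of the EXPAI flow, just a convenient pandas call:
# Optional: inspect column dtypes and non-null counts
df.info()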
Transform and clean your data according to your needs.
# Drop rows with non-positive price (corrupted records)
df = df.drop(df[df.price <= 0].index)
# Drop rows with missing engine volume (engV) and engine-volume outliers
df = df.dropna(how="any", subset=["engV"])
df = df.drop(df[df.engV > 40].index)
# Drop any remaining null values
df = df.dropna()
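To confirm the cleaning worked, you can count the null values left in each column. This is a small optional sanity check:
# Optional: verify that no null values remain
df.isnull().sum()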
Split your dataset into training and test sets.
# Select target column
y = df["price"]
# Drop target from input features
X = df.drop(["price"], axis=1)
# Split with 20% of the data held out for testing
data_train, data_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Turn the index back into a regular column (it holds the 'car' feature used below)
data_train.reset_index(level=0, inplace=True)
data_test.reset_index(level=0, inplace=True)
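A quick shape check confirms the 80/20 split. Again, this is optional:
# Optional: check partition sizes after the split
print(data_train.shape, data_test.shape)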
It is very important to store the exact data used for the train-test split, as this will be the input dataset for EXPAI.
df.to_csv('./expai_input_data.csv')
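If you want to be sure the stored file matches the DataFrame in memory, you can reload it and compare shapes. The name check_df below is purely illustrative:
# Optional: reload the stored file and compare shapes
check_df = pd.read_csv('./expai_input_data.csv', index_col=0)
print(check_df.shape == df.shape)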
Pipelines are implemented by Scikit-Learn and allow users to build a single object for the whole analytical process. See the docs.
In this case, we will build a Pipeline that:
- Encodes categorical variables
- Scales numerical variables
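Before wiring the transformer, you can double-check which columns fall into each group. This is a minimal optional sketch using select_dtypes; its output should match the column lists passed to the transformer below:
# Optional: list numerical and categorical columns in the training data
print(data_train.select_dtypes(include='number').columns.tolist())
print(data_train.select_dtypes(exclude='number').columns.tolist())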
# Define a transformer for numerical and categorical variables
transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ["mileage", "engV", "year"]),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),
         ["car", "body", "engType", "registration", "model", "drive"])
    ]
)
# Define XGBoost Regressor parameters
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'verbosity': 0
}
# Init model
model = xgb.XGBRegressor(**xgb_params)
# Create a Pipeline whose steps are the transformation and the model
clf = Pipeline(steps=[
    ('preprocessor', transformer),
    ('model', model)
])
# Fit the pipeline
clf.fit(X=data_train, y=label_train)
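Once fitted, the pipeline's steps can be inspected through named_steps. For instance, the XGBoost step exposes feature importances (an optional peek; the indices follow the transformer's column order):
# Optional: inspect the fitted XGBoost step inside the pipeline
print(clf.named_steps['model'].feature_importances_)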
Since we are building a regressor, we will use Mean Squared Error (MSE) as our metric to check performance.
# Predict price for test data
y_hat = clf.predict(data_test)
# Compute MSE on the test set
mse = ms.mean_squared_error(label_test, y_hat)
print(mse)
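Because MSE is expressed in squared price units, you may also want its root (RMSE), which is on the same scale as the target. This is a small optional addition:
# Optional: RMSE is on the same scale as the target (price)
rmse = np.sqrt(mse)
print(rmse)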
# Store the fitted pipeline to disk with pickle
model_path = os.path.abspath("./model_pipeline.pkl")
with open(model_path, 'wb') as f:
    pickle.dump(clf, f)
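As a final optional check, you can reload the pickled pipeline and confirm it reproduces the same predictions. The name clf_loaded is purely illustrative:
# Optional: reload the pickle and verify predictions match
with open(model_path, 'rb') as f:
    clf_loaded = pickle.load(f)
print(np.allclose(clf_loaded.predict(data_test), y_hat))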