Building Models
Corridor users can build new models using the integrated Notebook. The following sections illustrate this process step by step:
- Get the data
- Prepare the Train and Test Datasets
- Train a Simple XGBoost Model
- Convert the Model to a Pickle File
Get the data
In [1]:
# import spark libraries (uncomment if a Spark session is not already available)
# import findspark; findspark.init(); import pyspark
# spark = pyspark.sql.SparkSession.builder.getOrCreate()
from corridor import create_data

# create_data recreates the registered DEs/Features on the supplied data
df = create_data('debt_capacity', 'age_of_credit_file', 'default_flag').limit(1000)
# Note: users can bring in new data instead of using registered DEs/Features from the platform
df = df.toPandas()
df.head(2)
Out[1]:
|   | debt_capacity | age_of_credit_file | default_flag |
|---|---|---|---|
| 0 | 0.195333 | 231.0 | 0.0 |
| 1 | 0.088000 | 225.0 | 0.0 |
Prepare the Train and Test Datasets
In [2]:
import numpy as np

# dependent variable (the target) y
y = df[['default_flag']]
# independent variables X: everything except the target
X = df.drop(['default_flag'], axis=1)
In [3]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# split the data into training and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# convert the data to DMatrix, the input format required by xgboost's training API
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=X.columns.tolist())
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=X.columns.tolist())
Train a Simple XGBoost Model
In [4]:
# set up xgboost params
param = {
    'max_depth': 4,  # the maximum depth of each tree
    'eta': 0.03,     # the learning rate for each boosting iteration
    'objective': 'binary:logistic',
    'subsample': 0.8,
    'verbosity': 0,
}
# train the model
model = xgb.train(param, dtrain, num_boost_round=100)
In [5]:
# plot feature importance
import matplotlib
xgb.plot_importance(model, max_num_features=10)
Out[5]:
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
Convert the Model to a Pickle File
In [6]:
# Import required package
import pickle

# Serialize the model object to pickle bytes. The resulting .pkl file can be
# registered in Model Studio; once registered, the model can be used in a policy.
model_pkl = pickle.dumps(model)
# To save the pickle file locally instead:
# pickle.dump(model, open("simple_default_model.pkl", "wb"))