Running Comparisons¶
In the corridor context, a comparison evaluates two different entities on the same input data. Model outputs for registered models can be compared in a notebook by calling the create_data function of the corridor library.
This notebook illustrates comparisons in two sections:
1. Model Comparison
2. Policy Comparison
Model Comparison¶
In [1]:
# import spark library
import findspark; findspark.init(); import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# read in the Application Level Dataset
application_table = spark.read.parquet('s3a://corridor.dev/master/sampleAppData.parquet')
# get the first 1000 records
application_table = application_table.limit(1000)
# take a look at the first 5 rows of the application_table dataframe by running the line below:
application_table.limit(5).toPandas()
Out[1]:
| corridor_application_id | acc_now_delinq | open_acc_6m | acc_open_past_24mths | addr_state | zip_code | annual_inc | corridor_application_date | application_type | simulated_age | ... | total_bal_ex_mort | total_bc_limit | tot_coll_amt | tot_cur_bal | tot_hi_cred_lim | total_bal_il | total_il_high_credit_limit | total_rev_hi_lim | all_util | __index_level_0__ | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20000018440 | 0 | 0.0 | 2.0 | AZ | 852xx | 75000.0 | 2015-11-24 07:00:00 | Individual | 41 | ... | 100088.0 | 32000.0 | 73.0 | 444844.0 | 476098.0 | 67536.0 | 91314.0 | 36400.0 | 89.0 | 2122946 |
| 1 | 225770004638 | 0 | 2.0 | 8.0 | OH | 452xx | 125000.0 | 2018-07-18 07:00:00 | Individual | 36 | ... | 28524.0 | 20300.0 | 1216.0 | 166257.0 | 199832.0 | 13993.0 | 21625.0 | 26600.0 | 59.0 | 1810773 |
| 2 | 20000077834 | 0 | NaN | 3.0 | MA | 018xx | 113536.0 | 2015-10-23 07:00:00 | Individual | 45 | ... | 275761.0 | 36600.0 | 0.0 | 496816.0 | 559526.0 | NaN | 253426.0 | 51600.0 | NaN | 2182340 |
| 3 | 20000267750 | 0 | NaN | 7.0 | MD | 216xx | 140000.0 | 2015-05-22 07:00:00 | Individual | 37 | ... | 53771.0 | 100700.0 | 0.0 | 323232.0 | 439972.0 | NaN | 8831.0 | 136300.0 | NaN | 2372256 |
| 4 | 225769878100 | 0 | 1.0 | 8.0 | NC | 282xx | 130000.0 | 2018-04-25 07:00:00 | Individual | 47 | ... | 65131.0 | 56900.0 | 0.0 | 168988.0 | 247844.0 | 17643.0 | 26658.0 | 104300.0 | 50.0 | 1684235 |
5 rows × 89 columns
In [2]:
# Import Corridor Package Objects
from corridor import create_data
'''
Run comparison between:
PD Model Strict: pd_model_ver1 is output feature alias
PD Model Lenient: pd_model_ver2 is output feature alias
'''
df = create_data('pd_model_ver1','pd_model_ver2',
data={'application': application_table})
df.limit(10).toPandas()
Out[2]:
| pd_model_ver1 | pd_model_ver2 | |
|---|---|---|
| 0 | 0.25 | 0.15 |
| 1 | 0.10 | 0.15 |
| 2 | 0.15 | 0.15 |
| 3 | 0.10 | 0.05 |
| 4 | 0.10 | 0.15 |
| 5 | 0.10 | 0.15 |
| 6 | 0.10 | 0.05 |
| 7 | 0.10 | 0.15 |
| 8 | 0.25 | 0.15 |
| 9 | 0.25 | 0.15 |
In [3]:
# Summarising score predictions from the two models
df.describe().show()
+-------+-------------------+--------------------+
|summary|      pd_model_ver1|       pd_model_ver2|
+-------+-------------------+--------------------+
|  count|                 83|                  83|
|   mean|0.16686746987951806| 0.14277108433734956|
| stddev|0.06308970985716994|0.029355402253977043|
|    min|               0.05|                0.05|
|    max|               0.25|                 0.2|
+-------+-------------------+--------------------+
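Beyond the aggregate describe() summary, a row-level comparison shows where the two model versions disagree. A minimal pandas sketch, using the ten sample predictions shown in Out[2] as a stand-in (in practice you would call df.toPandas() on the comparison DataFrame returned by create_data):

```python
import pandas as pd

# Sample values copied from the Out[2] preview above;
# in a real run, use scores = df.toPandas()
scores = pd.DataFrame({
    "pd_model_ver1": [0.25, 0.10, 0.15, 0.10, 0.10, 0.10, 0.10, 0.10, 0.25, 0.25],
    "pd_model_ver2": [0.15, 0.15, 0.15, 0.05, 0.15, 0.15, 0.05, 0.15, 0.15, 0.15],
})

# Row-level delta between the two model versions
scores["diff"] = scores["pd_model_ver1"] - scores["pd_model_ver2"]

print("Mean difference:          ", round(scores["diff"].mean(), 4))
print("Mean absolute difference: ", round(scores["diff"].abs().mean(), 4))
print("Rows where ver1 > ver2:   ", int((scores["diff"] > 0).sum()))
```

The mean difference shows the overall bias between versions, while the mean absolute difference captures how far individual predictions move even when the bias nets out.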
Policy Comparison¶
In [4]:
# Import necessary packages
import findspark; findspark.init(); import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
from corridor import Policy
Initialize Policy Objects
In [5]:
policy_1 = Policy('UW Policy with PD Model and Framework')
policy_2 = Policy('UW Policy with PD Model and Framework - latest')
Read Application dataset
In [6]:
# Import application data - for this illustration, the application data is stored in S3 as a parquet file
df = spark.read.parquet('s3a://corridor.dev/master/kishan/notebook_examples/policy_inputs.parquet')
df.toPandas().head(2)
Out[6]:
| fico_range_high | annual_inc | corridor_application_date | corridor_requested_loan_amount | earliest_cr_line | corridor_application_id | |
|---|---|---|---|---|---|---|
| 0 | 750.0 | 100000.0 | 2018-01-01 | 15000.0 | 2016-01-01 | 1.0 |
| 1 | 760.0 | 120000.0 | 2018-02-01 | 12500.0 | 2016-02-01 | 2.0 |
In [7]:
# Running policy simulation using policy_1 on application data "df"
policy_entity_data_1, policy_offer_data_1 = policy_1.run(data={'application': df})
# Running policy simulation using policy_2 on application data "df"
policy_entity_data_2, policy_offer_data_2 = policy_2.run(data={'application': df})
In [8]:
# Converting policy_offer_data table from policy execution to Pandas dataframe for assessment
policy_offer_data_1 = policy_offer_data_1.toPandas()
policy_offer_data_2 = policy_offer_data_2.toPandas()
# Subsetting offers for a particular application to compare at the offer level
offer_data_app_1_policy_1 = policy_offer_data_1[policy_offer_data_1['application_id'] == 1]
offer_data_app_1_policy_2 = policy_offer_data_2[policy_offer_data_2['application_id'] == 1]
Comparing Policy Output
Offers from policy_1
In [9]:
# Number of offers tested for a particular application (id = 1) based on the offer configuration of policy_1
print('Total offers evaluated:', offer_data_app_1_policy_1['offer_id'].nunique())
offer_ids = []  # collects the offer_ids that PASS each strategy
for strategy in policy_1.strategies:
    passed_offers = offer_data_app_1_policy_1[(offer_data_app_1_policy_1['strategy_name'] == strategy.name) & (offer_data_app_1_policy_1['strategy_output'] == "PASS")]
    offer_ids.extend(set(passed_offers.offer_id))
    print(f'Number of Offers passing strategy - {strategy.name}: {passed_offers["offer_id"].nunique()}')
# Offers that passed every strategy
offer_ids_passed = {x for x in offer_ids if offer_ids.count(x) == len(policy_1.strategies)}
print(f'Total offers passed: {len(offer_ids_passed)}' + "\n")
# Print passed offers -- visible in the "All Available Offers" tab on the platform
cols = ['potential_loan_amount', 'potential_int_rate', 'potential_term', 'offer_id']
print("*"*20 + " All Available Offers " + "*"*20)
print(offer_data_app_1_policy_1[offer_data_app_1_policy_1.offer_id.isin(offer_ids_passed)][cols].drop_duplicates().reset_index(drop=True))
Total offers evaluated: 10
Number of Offers passing strategy - Min. Eligibility Requirement: 10
Number of Offers passing strategy - Loan Approval Strategy: 3
Number of Offers passing strategy - Set Customer Value: 10
Total offers passed: 3

******************** All Available Offers ********************
   potential_loan_amount  potential_int_rate  potential_term     offer_id
0                15000.0               10.99            36.0  68719476738
1                10000.0               10.99            36.0  68719476744
2                12500.0               10.99            36.0  68719476742
Offers from policy_2
In [10]:
# Number of offers tested for a particular application (id = 1) based on the offer configuration of policy_2
print('Total offers evaluated:', offer_data_app_1_policy_2['offer_id'].nunique())
offer_ids = []  # collects the offer_ids that PASS each strategy
for strategy in policy_2.strategies:
    passed_offers = offer_data_app_1_policy_2[(offer_data_app_1_policy_2['strategy_name'] == strategy.name) & (offer_data_app_1_policy_2['strategy_output'] == "PASS")]
    offer_ids.extend(set(passed_offers.offer_id))
    print(f'Number of Offers passing strategy - {strategy.name}: {passed_offers["offer_id"].nunique()}')
# Offers that passed every strategy
offer_ids_passed = {x for x in offer_ids if offer_ids.count(x) == len(policy_2.strategies)}
print(f'Total offers passed: {len(offer_ids_passed)}' + "\n")
# Print passed offers -- visible in the "All Available Offers" tab on the platform
cols = ['potential_loan_amount', 'potential_int_rate', 'potential_term', 'offer_id']
print("*"*20 + " All Available Offers " + "*"*20)
print(offer_data_app_1_policy_2[offer_data_app_1_policy_2.offer_id.isin(offer_ids_passed)][cols].drop_duplicates().reset_index(drop=True))
Total offers evaluated: 6
Number of Offers passing strategy - Min. Eligibility Requirement: 6
Number of Offers passing strategy - Loan Approval Strategy: 2
Number of Offers passing strategy - Set Customer Value: 6
Number of Offers passing strategy - Interest Optimization: 3
Total offers passed: 2

******************** All Available Offers ********************
   potential_loan_amount  potential_int_rate  potential_term     offer_id
0                12500.0               10.99            36.0  17179869184
1                15000.0               10.99            36.0  17179869186
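To contrast the two offer sets directly, the passed offers can be joined on their terms. A minimal pandas sketch, using the offer values printed above as stand-in data (offer_id is excluded from the join because ids are not comparable across policies; in practice you would merge the two drop_duplicates() results):

```python
import pandas as pd

# Passed-offer terms copied from the two outputs above
offers_p1 = pd.DataFrame({
    "potential_loan_amount": [15000.0, 10000.0, 12500.0],
    "potential_int_rate": [10.99, 10.99, 10.99],
    "potential_term": [36.0, 36.0, 36.0],
})
offers_p2 = pd.DataFrame({
    "potential_loan_amount": [12500.0, 15000.0],
    "potential_int_rate": [10.99, 10.99],
    "potential_term": [36.0, 36.0],
})

# Outer merge on the offer terms; the indicator column flags
# which policy (or both) produced each offer
terms = ["potential_loan_amount", "potential_int_rate", "potential_term"]
diff = offers_p1.merge(offers_p2, on=terms, how="outer", indicator="source")
diff["source"] = diff["source"].map(
    {"both": "both policies", "left_only": "policy_1 only", "right_only": "policy_2 only"}
)
print(diff)
```

For the sample values above, the 12500 and 15000 offers appear under both policies, while the 10000 offer survives only policy_1's strategies.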