Running Validations¶
A validation consists of evaluating the same entity (e.g., a model or a feature) on two different input datasets.
We can validate model output for registered models in the Notebook. This is done by calling the create_data function of the corridor library.
In [1]:
Copied!
# import spark libraries and start (or reuse) a SparkSession
import findspark; findspark.init(); import pyspark
from pyspark.sql.functions import *
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# read in the Application Level Dataset
application_table = spark.read.parquet('s3a://corridor.dev/master/sampleAppData.parquet')
# create 'application_table_1' as applications before 2015 (capped at 1000 rows)
application_table_1 = application_table.filter(year(application_table.corridor_application_date) < 2015).limit(1000)
# create 'application_table_2' as applications in or after 2015 (capped at 1000 rows)
application_table_2 = application_table.filter(year(application_table.corridor_application_date) >= 2015).limit(1000)
# take a look at the first 5 rows of application_table_1 by running the line below:
# application_table_1.limit(5).toPandas()
# verify the two splits really have an identical schema before comparing them
assert application_table_1.schema == application_table_2.schema, \
    'application_table_1 and application_table_2 must have identical schemas'
print(f'Rows in application_table_1: {application_table_1.count()}')
print(f'Rows in application_table_2: {application_table_2.count()}')
# import spark libraries and start (or reuse) a SparkSession
import findspark; findspark.init(); import pyspark
from pyspark.sql.functions import *
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# read in the Application Level Dataset
application_table = spark.read.parquet('s3a://corridor.dev/master/sampleAppData.parquet')
# create 'application_table_1' as applications before 2015 (capped at 1000 rows)
application_table_1 = application_table.filter(year(application_table.corridor_application_date) < 2015).limit(1000)
# create 'application_table_2' as applications in or after 2015 (capped at 1000 rows)
application_table_2 = application_table.filter(year(application_table.corridor_application_date) >= 2015).limit(1000)
# take a look at the first 5 rows of application_table_1 by running the line below:
# application_table_1.limit(5).toPandas()
# verify the two splits really have an identical schema before comparing them
assert application_table_1.schema == application_table_2.schema, \
    'application_table_1 and application_table_2 must have identical schemas'
print(f'Rows in application_table_1: {application_table_1.count()}')
print(f'Rows in application_table_2: {application_table_2.count()}')
Rows in application_table_1: 22 Rows in application_table_2: 61
In [2]:
Copied!
# Bring in the corridor simulation entry point
from corridor import create_data

# Run the simulation on the first split (applications before 2015).
# 'pd_model_ver1' is the output feature alias registered for the PD Model Strict.
df_1 = create_data('pd_model_ver1', data={'application': application_table_1})

# Show summary statistics (count / mean / stddev / min / max) of the model output
df_1.describe().show()
# Bring in the corridor simulation entry point
from corridor import create_data

# Run the simulation on the first split (applications before 2015).
# 'pd_model_ver1' is the output feature alias registered for the PD Model Strict.
df_1 = create_data('pd_model_ver1', data={'application': application_table_1})

# Show summary statistics (count / mean / stddev / min / max) of the model output
df_1.describe().show()
+-------+-------------------+ |summary| pd_model_ver1| +-------+-------------------+ | count| 22| | mean|0.16363636363636364| | stddev|0.06931951244198711| | min| 0.1| | max| 0.25| +-------+-------------------+
In [3]:
Copied!
# Run the simulation on the second split (applications in or after 2015).
# 'pd_model_ver1' is the output feature alias registered for the PD Model Strict.
df_2 = create_data('pd_model_ver1', data={'application': application_table_2})

# Show summary statistics of the model output for the second run
df_2.describe().show()
# Run the simulation on the second split (applications in or after 2015).
# 'pd_model_ver1' is the output feature alias registered for the PD Model Strict.
df_2 = create_data('pd_model_ver1', data={'application': application_table_2})

# Show summary statistics of the model output for the second run
df_2.describe().show()
+-------+--------------------+ |summary| pd_model_ver1| +-------+--------------------+ | count| 61| | mean| 0.16803278688524592| | stddev|0.061259548157304895| | min| 0.05| | max| 0.25| +-------+--------------------+