Imports¶
In [1]:
Copied!
import numpy as np
import pandas as pd
import pyspark
import random
from pyspark.sql import functions as F, types as T
import numpy as np
import pandas as pd
import pyspark
import random
from pyspark.sql import functions as F, types as T
Create spark session¶
In [2]:
Copied!
spark = pyspark.sql.SparkSession.builder \
.config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
.config("hive.metastore.uris", "thrift://127.0.0.1:9083") \
.enableHiveSupport() \
.getOrCreate()
spark = pyspark.sql.SparkSession.builder \
.config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
.config("hive.metastore.uris", "thrift://127.0.0.1:9083") \
.enableHiveSupport() \
.getOrCreate()
Data Prep¶
In [14]:
Copied!
N = 100
size = 1000
loan_perf_dict = {
'id': [1]*size,
'day_of_month': [random.randrange(1, 30) for _ in range(size)],
'principal_paid': [random.randrange(10000, 20000, 1000) for _ in range(size)],
'int_paid': [random.randrange(1000, 1500, 100) for _ in range(size)],
'co': [random.randint(0, 1) for _ in range(size)],
'co_amt': [random.randrange(10000, 15000, 1000) for _ in range(size)],
}
loan_perf_df = pd.DataFrame(loan_perf_dict)
N = 100
size = 1000
loan_perf_dict = {
'id': [1]*size,
'day_of_month': [random.randrange(1, 30) for _ in range(size)],
'principal_paid': [random.randrange(10000, 20000, 1000) for _ in range(size)],
'int_paid': [random.randrange(1000, 1500, 100) for _ in range(size)],
'co': [random.randint(0, 1) for _ in range(size)],
'co_amt': [random.randrange(10000, 15000, 1000) for _ in range(size)],
}
loan_perf_df = pd.DataFrame(loan_perf_dict)
Case 1¶
Problem statement: Given loan_performance_table:
- Consider the entries for which 10 <= day_of_month <= 20
- Compute the max principal paid for the above entries
In [4]:
Copied!
%%timeit
for _ in range(N):
filtered_df = loan_perf_df[loan_perf_df['day_of_month'].between(10, 20, inclusive=True)]
val = filtered_df['principal_paid'].max()
%%timeit
for _ in range(N):
filtered_df = loan_perf_df[loan_perf_df['day_of_month'].between(10, 20, inclusive=True)]
val = filtered_df['principal_paid'].max()
64.9 ms ± 2.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [18]:
Copied!
%%timeit
for _ in range(N):
loan_perf_df = pd.DataFrame(loan_perf_dict)
filtered_df = loan_perf_df[loan_perf_df['day_of_month'].between(10, 20, inclusive=True)]
val = filtered_df['principal_paid'].max()
%%timeit
for _ in range(N):
loan_perf_df = pd.DataFrame(loan_perf_dict)
filtered_df = loan_perf_df[loan_perf_df['day_of_month'].between(10, 20, inclusive=True)]
val = filtered_df['principal_paid'].max()
300 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [15]:
Copied!
%%timeit
for _ in range(N):
val = loan_perf_df['principal_paid'][(loan_perf_df['day_of_month'] >= 10) & (loan_perf_df['day_of_month'] <= 20)].max()
%%timeit
for _ in range(N):
val = loan_perf_df['principal_paid'][(loan_perf_df['day_of_month'] >= 10) & (loan_perf_df['day_of_month'] <= 20)].max()
51.9 ms ± 1.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [17]:
Copied!
%%timeit
for _ in range(N):
val = loan_perf_df['principal_paid'][loan_perf_df['day_of_month'].between(10, 20, inclusive=True)].max()
%%timeit
for _ in range(N):
val = loan_perf_df['principal_paid'][loan_perf_df['day_of_month'].between(10, 20, inclusive=True)].max()
51 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Def-2¶
Python
- Create a list of the required day_of_months inside the for loop
- Find running max of the principal paid
In [5]:
Copied!
%%timeit
for _ in range(N):
res = -1
for pp, dom in zip(loan_perf_dict['principal_paid'], loan_perf_dict['day_of_month']):
required_dom_list = [i for i in (range(10, 21))] # datetime.datetime.strp()
if dom in required_dom_list:
res = max(res, pp)
val = res
%%timeit
for _ in range(N):
res = -1
for pp, dom in zip(loan_perf_dict['principal_paid'], loan_perf_dict['day_of_month']):
required_dom_list = [i for i in (range(10, 21))] # datetime.datetime.strp()
if dom in required_dom_list:
res = max(res, pp)
val = res
94.9 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Def-3¶
Python
- Create a list of the required day_of_months outside the for loop
- Find running max of the principal paid
In [6]:
Copied!
%%timeit
for _ in range(N):
res = -1
required_dom_list = list(range(10, 21))
for pp, dom in zip(loan_perf_dict['principal_paid'], loan_perf_dict['day_of_month']):
if dom in required_dom_list:
res = max(res, pp)
val = res
%%timeit
for _ in range(N):
res = -1
required_dom_list = list(range(10, 21))
for pp, dom in zip(loan_perf_dict['principal_paid'], loan_perf_dict['day_of_month']):
if dom in required_dom_list:
res = max(res, pp)
val = res
22.3 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Def-4¶
Python
- Do not create list but do range comparison in if condition
- Find running max of the principal paid
In [7]:
Copied!
%%timeit
for _ in range(N):
res = -1
for pp, dom in zip(loan_perf_dict['principal_paid'], loan_perf_dict['day_of_month']):
if 10 <= dom <= 20:
res = max(res, pp)
val = res
%%timeit
for _ in range(N):
res = -1
for pp, dom in zip(loan_perf_dict['principal_paid'], loan_perf_dict['day_of_month']):
if 10 <= dom <= 20:
res = max(res, pp)
val = res
13.5 ms ± 322 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [ ]:
Copied!
Case 2¶
Problem statement: Given loan_performance_table:
- Consider the entries for which day of week is not in (6, 7)
- day_of_week(dow) is derived from day_of_month(dom)
- dow = (dom % 7) if (dom % 7) != 0 else 7
- dow can take values from 1-7 (both inclusive)
- Compute the max principal paid for the above entries
Def-1¶
Pandas
- create dow column, apply on the entire dataframe
- filter based on dow column
- find max of principal paid
In [8]:
Copied!
%%timeit
for _ in range(N):
pd_df = loan_perf_df
pd_df['dow'] = pd_df.apply(lambda row: row['day_of_month'] % 7 if row['day_of_month'] % 7 != 0 else 7, axis=1)
pd_df = pd_df[~pd_df['dow'].isin((6, 7))]
val = pd_df['principal_paid'].max()
%%timeit
for _ in range(N):
pd_df = loan_perf_df
pd_df['dow'] = pd_df.apply(lambda row: row['day_of_month'] % 7 if row['day_of_month'] % 7 != 0 else 7, axis=1)
pd_df = pd_df[~pd_df['dow'].isin((6, 7))]
val = pd_df['principal_paid'].max()
1.15 s ± 66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [ ]:
Copied!
%%timeit
for _ in range(N):
pd_df = loan_perf_df
pd_df['dow'] = pd_df.apply(lambda row: row['day_of_month'] % 7 if row['day_of_month'] % 7 != 0 else 7, axis=1)
pd_df = pd_df[~pd_df['dow'].isin((6, 7))]
val = pd_df['principal_paid'].max()
%%timeit
for _ in range(N):
pd_df = loan_perf_df
pd_df['dow'] = pd_df.apply(lambda row: row['day_of_month'] % 7 if row['day_of_month'] % 7 != 0 else 7, axis=1)
pd_df = pd_df[~pd_df['dow'].isin((6, 7))]
val = pd_df['principal_paid'].max()
Def-2¶
Pandas
- create dow column
- filter based on dow column, apply only on the required column
- find max of principal paid
In [9]:
Copied!
%%timeit
for _ in range(N):
pd_df = loan_perf_df
pd_df['dow'] = pd_df['day_of_month'].apply(lambda row: row % 7 if row % 7 != 0 else 7)
pd_df = pd_df[~pd_df['dow'].isin((6, 7))]
val = pd_df['principal_paid'].max()
%%timeit
for _ in range(N):
pd_df = loan_perf_df
pd_df['dow'] = pd_df['day_of_month'].apply(lambda row: row % 7 if row % 7 != 0 else 7)
pd_df = pd_df[~pd_df['dow'].isin((6, 7))]
val = pd_df['principal_paid'].max()
121 ms ± 5.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Def-3¶
Python
- create filtered lists for each of the column
- define the function inside the for loop
- find running max from the filtered lists using another loop
In [10]:
Copied!
%%timeit
for _ in range(N):
pp_list, ip_list, co_list, coamt_list, dom_list = [], [], [], [], []
for pp, ip, co, coamt, dom in zip(
loan_perf_dict['principal_paid'], loan_perf_dict['int_paid'], loan_perf_dict['co'],
loan_perf_dict['co_amt'], loan_perf_dict['day_of_month']
):
def dow(dom):
modval = dom % 7
if modval == 0:
return 7
return modval
if dow(dom) not in (6, 7):
pp_list.append(pp)
ip_list.append(ip)
co_list.append(co)
coamt_list.append(coamt)
dom_list.append(dom)
pp_required = -1
for pp, ip, co, coamt, dom in zip(pp_list, ip_list, co_list, coamt_list, dom_list):
pp_required = max(pp_required, pp)
val = pp_required
%%timeit
for _ in range(N):
pp_list, ip_list, co_list, coamt_list, dom_list = [], [], [], [], []
for pp, ip, co, coamt, dom in zip(
loan_perf_dict['principal_paid'], loan_perf_dict['int_paid'], loan_perf_dict['co'],
loan_perf_dict['co_amt'], loan_perf_dict['day_of_month']
):
def dow(dom):
modval = dom % 7
if modval == 0:
return 7
return modval
if dow(dom) not in (6, 7):
pp_list.append(pp)
ip_list.append(ip)
co_list.append(co)
coamt_list.append(coamt)
dom_list.append(dom)
pp_required = -1
for pp, ip, co, coamt, dom in zip(pp_list, ip_list, co_list, coamt_list, dom_list):
pp_required = max(pp_required, pp)
val = pp_required
60.9 ms ± 753 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Def-4¶
Python
- create filtered lists for each of the column
- define the function outside the for loop
- find running max from the filtered lists using another loop
In [11]:
Copied!
%%timeit
def dow(dom):
modval = dom % 7
if modval == 0:
return 7
return modval
for _ in range(N):
pp_list, ip_list, co_list, coamt_list, dom_list = [], [], [], [], []
for pp, ip, co, coamt, dom in zip(
loan_perf_dict['principal_paid'], loan_perf_dict['int_paid'], loan_perf_dict['co'],
loan_perf_dict['co_amt'], loan_perf_dict['day_of_month']
):
if dow(dom) not in (6, 7):
pp_list.append(pp)
ip_list.append(ip)
co_list.append(co)
coamt_list.append(coamt)
dom_list.append(dom)
pp_required = -1
for pp, ip, co, coamt, dom in zip(pp_list, ip_list, co_list, coamt_list, dom_list):
pp_required = max(pp_required, pp)
val = pp_required
%%timeit
def dow(dom):
modval = dom % 7
if modval == 0:
return 7
return modval
for _ in range(N):
pp_list, ip_list, co_list, coamt_list, dom_list = [], [], [], [], []
for pp, ip, co, coamt, dom in zip(
loan_perf_dict['principal_paid'], loan_perf_dict['int_paid'], loan_perf_dict['co'],
loan_perf_dict['co_amt'], loan_perf_dict['day_of_month']
):
if dow(dom) not in (6, 7):
pp_list.append(pp)
ip_list.append(ip)
co_list.append(co)
coamt_list.append(coamt)
dom_list.append(dom)
pp_required = -1
for pp, ip, co, coamt, dom in zip(pp_list, ip_list, co_list, coamt_list, dom_list):
pp_required = max(pp_required, pp)
val = pp_required
55.4 ms ± 1.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [12]:
Copied!
%%timeit
def dow(dom):
modval = dom % 7
if modval == 0:
return 7
return modval
for _ in range(N):
pp_list, ip_list, co_list, coamt_list, dom_list = [], [], [], [], []
for pp, ip, co, coamt, dom in zip(
loan_perf_dict['principal_paid'], loan_perf_dict['int_paid'], loan_perf_dict['co'],
loan_perf_dict['co_amt'], loan_perf_dict['day_of_month']
):
if dow(dom) not in (6, 7):
pp_list.append(pp)
ip_list.append(ip)
co_list.append(co)
coamt_list.append(coamt)
dom_list.append(dom)
val = max(pp_list)
%%timeit
def dow(dom):
modval = dom % 7
if modval == 0:
return 7
return modval
for _ in range(N):
pp_list, ip_list, co_list, coamt_list, dom_list = [], [], [], [], []
for pp, ip, co, coamt, dom in zip(
loan_perf_dict['principal_paid'], loan_perf_dict['int_paid'], loan_perf_dict['co'],
loan_perf_dict['co_amt'], loan_perf_dict['day_of_month']
):
if dow(dom) not in (6, 7):
pp_list.append(pp)
ip_list.append(ip)
co_list.append(co)
coamt_list.append(coamt)
dom_list.append(dom)
val = max(pp_list)
44.5 ms ± 829 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [13]:
Copied!
%%timeit
def dow(dom):
modval = dom % 7
if modval == 0:
return 7
return modval
for _ in range(N):
pp_required = -1
for pp, dom in zip(loan_perf_dict['principal_paid'], loan_perf_dict['day_of_month']):
if dow(dom) not in (6, 7):
pp_required = max(pp_required, pp)
val = pp_required
%%timeit
def dow(dom):
modval = dom % 7
if modval == 0:
return 7
return modval
for _ in range(N):
pp_required = -1
for pp, dom in zip(loan_perf_dict['principal_paid'], loan_perf_dict['day_of_month']):
if dow(dom) not in (6, 7):
pp_required = max(pp_required, pp)
val = pp_required
31.9 ms ± 681 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [ ]:
Copied!
In [ ]:
Copied!