Skip to content

Instantly share code, notes, and snippets.

@JumpingDino
Created March 11, 2025 03:51
Show Gist options
  • Select an option

  • Save JumpingDino/798ef4e25c4fe0afc9a9f706c5e19825 to your computer and use it in GitHub Desktop.

Select an option

Save JumpingDino/798ef4e25c4fe0afc9a9f706c5e19825 to your computer and use it in GitHub Desktop.
Swap-in / swap-out (SISO) analysis: compare a champion model's ratings against a challenger model's ratings
import numpy as np
import pandas as pd
from IPython.display import display
class SISOAnalysis:
    """Swap-in / swap-out (SISO) comparison of a champion and a challenger model.

    The champion model comes with ratings (``champ_rat``). The challenger only
    comes with probabilities (``chall_prob``); its ratings (``chall_rat``) are
    derived here according to ``option``. The class then exposes per-rating
    aggregate tables and champion-vs-challenger cross tables.
    """

    _NECESSARY_COLS = ('primary_key', 'y_true', 'champ_prob', 'champ_rat', 'chall_prob')
    _SUPPORTED_OPTIONS = ('equal_volume',)

    def __init__(self, input_df: pd.DataFrame, option: str = 'equal_volume') -> None:
        """
        Initializes the SISOAnalysis class with the given input dataframe and option.

        Args:
            input_df (pd.DataFrame): The input dataframe containing the necessary columns.
                - primary_key: The primary key of the record.
                - y_true: The true target variable.
                - champ_prob: The probability of the champion model.
                - champ_rat: The rating of the champion model.
                - chall_prob: The probability of the challenger model.
                It's not expected to have the chall_rat column, because it is
                calculated here based on the option selected. If the column is
                already present it is left untouched in ``self.input_df``.
                The dataframe is copied; the caller's object is never modified.
            option (str): The option for building the challenger ratings.
                Only 'equal_volume' is supported. Default is 'equal_volume'.

        Raises:
            AssertionError: If the input dataframe does not contain the necessary columns.
            ValueError: If ``option`` is not a supported option.
        """
        # Raised explicitly (instead of using ``assert``) so the check still
        # runs under ``python -O``; the exception type is kept for callers.
        missing = set(self._NECESSARY_COLS) - set(input_df.columns)
        if missing:
            raise AssertionError(f"Missing columns in input_df: {missing}")
        if option not in self._SUPPORTED_OPTIONS:
            raise ValueError(
                f"Unsupported option {option!r}; expected one of {self._SUPPORTED_OPTIONS}"
            )

        self.option = option
        self.input_df = input_df.copy()

        # Volume per champion rating and the cumulative record count at which
        # each rating ends; the thresholds drive the equal-volume binning.
        self.champion_volume = self.input_df['champ_rat'].value_counts().sort_index()
        self.champion_threshold = self.champion_volume.cumsum()

        self.champion_agg = self._build_agg_champ()
        # Also adds the 'chall_rat' column to self.input_df when it is missing.
        self.challenger_agg = self._build_agg_chall()

        self.challenger_volume = self.input_df['chall_rat'].value_counts().sort_index()
        self.challenger_threshold = self.challenger_volume.cumsum()

    @staticmethod
    def _aggregate_by_rating(frame: pd.DataFrame, rating_col: str, **groupby_kwargs) -> pd.DataFrame:
        """Summarise volume and default statistics per value of ``rating_col``.

        Returns one row per rating (sorted by rating) with the columns
        count_pk, count_default, default_rate, cumsum_count_pk,
        cumsum_count_default and cumsum_default_rate.
        """
        agg = frame.groupby([rating_col], **groupby_kwargs).agg(
            count_pk=('primary_key', 'count'),
            count_default=('y_true', 'sum'),
            default_rate=('y_true', 'mean'),
        ).sort_index()
        agg['cumsum_count_pk'] = agg['count_pk'].cumsum()
        agg['cumsum_count_default'] = agg['count_default'].cumsum()
        agg['cumsum_default_rate'] = agg['cumsum_count_default'] / agg['cumsum_count_pk']
        return agg

    def _build_agg_chall(self) -> pd.DataFrame:
        """
        Builds the aggregate table for the challenger model.

        With the 'equal_volume' option, records are ranked by ascending
        chall_prob and the ranks are cut at the champion's cumulative volumes,
        so every challenger rating holds as many records as the champion
        rating with the same label.

        Returns:
            pd.DataFrame: A dataframe containing the aggregate statistics for the challenger model.
                - count_pk: The count of primary keys in each challenger rating group.
                - count_default: The count of defaults in each challenger rating group.
                - default_rate: The default rate in each challenger rating group.
                - cumsum_count_pk: The cumulative sum of primary keys up to each challenger rating group.
                - cumsum_count_default: The cumulative sum of defaults up to each challenger rating group.
                - cumsum_default_rate: The cumulative default rate up to each challenger rating group.
        """
        # 1-based position of each record once ordered by chall_prob. The rank
        # is computed from the probabilities themselves, so the result does not
        # depend on the dataframe's index or on the incoming row order.
        # Ties keep their input order; missing probabilities rank last.
        ranks = self.input_df['chall_prob'].rank(method='first', na_option='bottom')
        ratings = pd.cut(
            ranks,
            bins=[0] + list(self.champion_threshold.values),
            labels=self.champion_threshold.index,
        )

        if 'chall_rat' not in self.input_df.columns:
            print("Adding challenger rating to input_df")
            # ``ratings`` shares input_df's index, so this is a row-for-row
            # assignment and cannot duplicate rows the way a key merge could.
            self.input_df['chall_rat'] = ratings

        # The aggregate is always based on the ratings computed above, even
        # when input_df already carried its own 'chall_rat' column.
        rated = self.input_df[['primary_key', 'y_true']].assign(chall_rat=ratings)
        return self._aggregate_by_rating(rated, 'chall_rat', observed=False)

    def _build_agg_champ(self) -> pd.DataFrame:
        """
        Builds the aggregate table for the champion model.

        Returns:
            pd.DataFrame: Same columns as the challenger aggregate table,
                grouped by champ_rat.
        """
        return self._aggregate_by_rating(self.input_df, 'champ_rat')

    def build_agg_tables(self) -> dict:
        """Returns the aggregate tables under the keys 'challenger' and 'champion'."""
        return {'challenger': self.challenger_agg, 'champion': self.champion_agg}

    def build_siso_volume(self) -> pd.DataFrame:
        """Returns the record-count cross table (champ_rat rows x chall_rat columns, with margins)."""
        siso_volume = pd.crosstab(self.input_df['champ_rat'],
                                  self.input_df['chall_rat'],
                                  margins=True)
        return siso_volume

    def build_siso_risk(self) -> pd.DataFrame:
        """Returns the mean of y_true for each (champ_rat, chall_rat) cell."""
        siso_risk = pd.pivot_table(self.input_df,
                                   values='y_true',
                                   index='champ_rat',
                                   columns='chall_rat',
                                   aggfunc='mean',
                                   observed=False)
        return siso_risk
# Example usage.
# NOTE(review): `input_df` is not defined anywhere in this file; presumably it
# is created beforehand (e.g. in an earlier notebook cell) -- confirm.
siso_analysis = SISOAnalysis(input_df)

# Per-rating aggregate tables: challenger first, then champion.
agg_tables = siso_analysis.build_agg_tables()
print("Aggregate Tables:")
for _model in ('challenger', 'champion'):
    display(agg_tables[_model])

# Champion-vs-challenger record counts.
siso_volume = siso_analysis.build_siso_volume()
print("\nSISO Volume Table:", siso_volume, sep="\n")

# Champion-vs-challenger mean of y_true.
siso_risk = siso_analysis.build_siso_risk()
print("\nSISO Risk Table:", siso_risk, sep="\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment