data.loader
Data loading utilities for real insurance pricing datasets.
1"""Data loading utilities for real insurance pricing datasets.""" 2 3from __future__ import annotations 4 5import pickle 6from pathlib import Path 7from typing import Any, Literal 8 9import numpy as np 10import pandas as pd 11 12# Directory containing model artifacts and CSV datasets 13_DATA_DIR = Path(__file__).parent 14 15# Base 9 feature cols shared by both models (no premium, no U, no extra) 16_BASE_COLS: list[str] = [ 17 "X_age", 18 "X_bonus_malus_rating", 19 "X_distr_channel", 20 "X_vehicle_type", 21 "X_ttm_claims", 22 "X_policy_count", 23 "X_risk_code", 24 "X_vehicle_age", 25 "X_policy_tenure", 26] 27 28# GLM state features: 12 cols (9 base + premium + prev_renewal_perc + year). 29# The GLM acceptance CSV has all 12; extra cols are for the policy only. 30FEATURE_COLS_GLM: list[str] = _BASE_COLS + [ 31 "X_policy_premium", # index 9; used in revenue term u * premium(x) 32 "X_prev_renewal_perc", 33 "X_year", 34] 35 36# XGB state features: 10 cols (9 base + premium). 37# The XGB acceptance CSV doesn't carry X_prev_renewal_perc / X_year. 
38FEATURE_COLS_XGB: list[str] = _BASE_COLS + [ 39 "X_policy_premium", # index 9; used in revenue term u * premium(x) 40] 41 42# Default alias — use when you don't need model-specific differences 43FEATURE_COLS = FEATURE_COLS_XGB 44 45# Columns consumed by the loss model (no premium, no U) 46LOSS_FEATURE_COLS: list[str] = _BASE_COLS 47 48# Columns consumed by acceptance model state part: base + premium (no U) 49ACCEPTANCE_STATE_COLS: list[str] = _BASE_COLS + ["X_policy_premium"] 50 51_PREMIUM_COL_INDEX: int = 9 # index of X_policy_premium in both FEATURE_COLS variants 52 53_ARTIFACT_PATHS: dict[str, dict[str, Path]] = { 54 "glm": { 55 "acceptance": _DATA_DIR / "model_artifacts" / "glm_logistic_prob_acceptance.pkl", 56 "loss": _DATA_DIR / "model_artifacts" / "linear_regression_expected_fin_loss.pkl", 57 }, 58 "xgb": { 59 "acceptance": _DATA_DIR / "model_artifacts" / "xgb_classifier_prob_acceptance.pkl", 60 "loss": _DATA_DIR / "model_artifacts" / "xgb_regressor_expected_fin_loss.pkl", 61 }, 62} 63 64_ACCEPTANCE_CSV_PATHS: dict[str, Path] = { 65 "glm": _DATA_DIR 66 / "dataset_bbox_optim_linear_models" 67 / "df_acceptance_linear_model_black_box.csv", 68 "xgb": _DATA_DIR 69 / "dataset_bbox_optim_xgb_models" 70 / "df_acceptance_xgb_black_box.csv", 71} 72 73 74def load_model_artifacts(model_type: Literal["glm", "xgb"]) -> tuple[Any, Any]: 75 """Load and return (acceptance_model, loss_model) from pickle files. 76 77 acceptance_model: GLM logistic Pipeline or XGBClassifier. 78 loss_model: Linear Regression or XGBRegressor. 
79 """ 80 if model_type not in _ARTIFACT_PATHS: 81 raise ValueError(f"model_type must be 'glm' or 'xgb', got '{model_type}'.") 82 paths = _ARTIFACT_PATHS[model_type] 83 with open(paths["acceptance"], "rb") as f: 84 acceptance_model = pickle.load(f) 85 with open(paths["loss"], "rb") as f: 86 loss_model = pickle.load(f) 87 return acceptance_model, loss_model 88 89 90def load_x_array(model_type: Literal["glm", "xgb"], n_rows: int = 5000) -> np.ndarray: 91 """Load first n_rows of real X features from the acceptance CSV. 92 93 GLM: returns shape (n_rows, 12) using FEATURE_COLS_GLM (includes X_prev_renewal_perc, X_year). 94 XGB: returns shape (n_rows, 10) using FEATURE_COLS_XGB (base + premium only). 95 Metadata columns (1-Z, Z, Y, Y_hat, U, prob_acceptance, etc.) are excluded. 96 """ 97 if model_type not in _ACCEPTANCE_CSV_PATHS: 98 raise ValueError(f"model_type must be 'glm' or 'xgb', got '{model_type}'.") 99 feature_cols = FEATURE_COLS_GLM if model_type == "glm" else FEATURE_COLS_XGB 100 df = pd.read_csv(_ACCEPTANCE_CSV_PATHS[model_type], sep=";", nrows=n_rows) 101 return df[feature_cols].to_numpy(dtype=float) 102 103 104def extract_glm_u_coef(glm_pipeline: Any) -> float: 105 """Extract effective d_logit/dU = w_U / std_U from a fitted GLM Pipeline. 106 107 Uses the ColumnTransformer's feature names and the StandardScaler scale 108 to compute the unscaled coefficient for U in the logistic regression. 109 """ 110 preprocessor = glm_pipeline.named_steps["preprocessor"] 111 classifier = glm_pipeline.named_steps["classifier"] 112 113 # Get full output feature names from ColumnTransformer (sklearn >= 1.0) 114 transformed_names = list(preprocessor.get_feature_names_out()) 115 # Names are like "pipeline-1__U", "pipeline-2__X_distr_channel_0", etc. 
116 u_indices = [i for i, name in enumerate(transformed_names) if name.endswith("__U") or name == "U"] 117 if len(u_indices) != 1: 118 raise ValueError( 119 f"Expected exactly 1 'U' column in preprocessor output, found at indices: {u_indices}. " 120 f"Available names: {transformed_names}" 121 ) 122 i_U_out = u_indices[0] 123 w_U = float(classifier.coef_[0, i_U_out]) 124 125 # Find std_U from the StandardScaler in the numeric sub-pipeline 126 for _, transformer, cols in preprocessor.transformers_: 127 if not hasattr(transformer, "named_steps"): 128 continue 129 if "scaler" not in transformer.named_steps: 130 continue 131 col_list = list(cols) 132 if "U" not in col_list: 133 continue 134 i_U_numeric = col_list.index("U") 135 std_U = float(transformer.named_steps["scaler"].scale_[i_U_numeric]) 136 return w_U / std_U 137 138 raise ValueError("Could not find a StandardScaler containing 'U' in the GLM pipeline preprocessor.") 139 140 141__all__ = [ 142 "FEATURE_COLS", 143 "FEATURE_COLS_GLM", 144 "FEATURE_COLS_XGB", 145 "LOSS_FEATURE_COLS", 146 "ACCEPTANCE_STATE_COLS", 147 "load_model_artifacts", 148 "load_x_array", 149 "extract_glm_u_coef", 150]
FEATURE_COLS =
['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure', 'X_policy_premium']
FEATURE_COLS_GLM: list[str] =
['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure', 'X_policy_premium', 'X_prev_renewal_perc', 'X_year']
FEATURE_COLS_XGB: list[str] =
['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure', 'X_policy_premium']
LOSS_FEATURE_COLS: list[str] =
['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure']
ACCEPTANCE_STATE_COLS: list[str] =
['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure', 'X_policy_premium']
def
load_model_artifacts(model_type: Literal['glm', 'xgb']) -> tuple[typing.Any, typing.Any]:
def load_model_artifacts(model_type: Literal["glm", "xgb"]) -> tuple[Any, Any]:
    """Unpickle the acceptance model and the loss model for ``model_type``.

    The acceptance model is a GLM logistic Pipeline or an XGBClassifier; the
    loss model is a Linear Regression or an XGBRegressor.

    Returns:
        ``(acceptance_model, loss_model)``.

    Raises:
        ValueError: If ``model_type`` is not a key of the artifact table.
    """
    artifact_paths = _ARTIFACT_PATHS.get(model_type)
    if artifact_paths is None:
        raise ValueError(f"model_type must be 'glm' or 'xgb', got '{model_type}'.")

    # NOTE: pickle.load runs arbitrary code; these paths must stay trusted.
    loaded = []
    for role in ("acceptance", "loss"):
        with open(artifact_paths[role], "rb") as handle:
            loaded.append(pickle.load(handle))

    acceptance_model, loss_model = loaded
    return acceptance_model, loss_model
Load and return (acceptance_model, loss_model) from pickle files.
acceptance_model: GLM logistic Pipeline or XGBClassifier. loss_model: Linear Regression or XGBRegressor.
def
load_x_array(model_type: Literal['glm', 'xgb'], n_rows: int = 5000) -> numpy.ndarray:
def load_x_array(model_type: Literal["glm", "xgb"], n_rows: int = 5000) -> np.ndarray:
    """Read the leading ``n_rows`` rows of real X features from the acceptance CSV.

    For ``"glm"`` the FEATURE_COLS_GLM columns are returned (12 columns,
    including X_prev_renewal_perc and X_year); for ``"xgb"`` the
    FEATURE_COLS_XGB columns (10 columns: base + premium). Every other column
    of the CSV (1-Z, Z, Y, Y_hat, U, prob_acceptance, etc.) is dropped.

    Raises:
        ValueError: If ``model_type`` is not a key of the CSV path table.
    """
    csv_path = _ACCEPTANCE_CSV_PATHS.get(model_type)
    if csv_path is None:
        raise ValueError(f"model_type must be 'glm' or 'xgb', got '{model_type}'.")

    if model_type == "glm":
        selected_columns = FEATURE_COLS_GLM
    else:
        selected_columns = FEATURE_COLS_XGB

    # The acceptance CSVs use ';' as the field separator.
    frame = pd.read_csv(csv_path, sep=";", nrows=n_rows)
    features = frame[selected_columns]
    return features.to_numpy(dtype=float)
Load first n_rows of real X features from the acceptance CSV.
GLM: returns shape (n_rows, 12) using FEATURE_COLS_GLM (includes X_prev_renewal_perc, X_year). XGB: returns shape (n_rows, 10) using FEATURE_COLS_XGB (base + premium only). Metadata columns (1-Z, Z, Y, Y_hat, U, prob_acceptance, etc.) are excluded.
def
extract_glm_u_coef(glm_pipeline: Any) -> float:
def extract_glm_u_coef(glm_pipeline: Any) -> float:
    """Extract effective d_logit/dU = w_U / std_U from a fitted GLM Pipeline.

    Uses the ColumnTransformer's feature names and the StandardScaler scale
    to compute the unscaled coefficient for U in the logistic regression.

    Args:
        glm_pipeline: Fitted pipeline exposing ``named_steps["preprocessor"]``
            and ``named_steps["classifier"]``.

    Returns:
        The coefficient of U divided by the scaler's scale for U. When the
        scaler has no scale (``scale_ is None``) the coefficient is returned
        unchanged.

    Raises:
        ValueError: If U does not appear exactly once in the preprocessor
            output, or no sub-pipeline with a ``"scaler"`` step lists U.
    """
    preprocessor = glm_pipeline.named_steps["preprocessor"]
    classifier = glm_pipeline.named_steps["classifier"]

    # Get full output feature names from ColumnTransformer (sklearn >= 1.0)
    transformed_names = list(preprocessor.get_feature_names_out())
    # Names are like "pipeline-1__U", "pipeline-2__X_distr_channel_0", etc.
    u_indices = [
        i
        for i, name in enumerate(transformed_names)
        if name.endswith("__U") or name == "U"
    ]
    if len(u_indices) != 1:
        raise ValueError(
            f"Expected exactly 1 'U' column in preprocessor output, found at indices: {u_indices}. "
            f"Available names: {transformed_names}"
        )
    i_U_out = u_indices[0]
    w_U = float(classifier.coef_[0, i_U_out])

    # Find std_U from the StandardScaler in the numeric sub-pipeline
    for _, transformer, cols in preprocessor.transformers_:
        if not hasattr(transformer, "named_steps"):
            continue
        if "scaler" not in transformer.named_steps:
            continue
        col_list = list(cols)
        if "U" not in col_list:
            continue
        i_U_numeric = col_list.index("U")
        scale = transformer.named_steps["scaler"].scale_
        if scale is None:
            # A StandardScaler fitted with with_std=False only centers its
            # inputs and leaves scale_ as None, so the divisor is 1.
            return w_U
        std_U = float(scale[i_U_numeric])
        return w_U / std_U

    raise ValueError("Could not find a StandardScaler containing 'U' in the GLM pipeline preprocessor.")
Extract effective d_logit/dU = w_U / std_U from a fitted GLM Pipeline.
Uses the ColumnTransformer's feature names and the StandardScaler scale to compute the unscaled coefficient for U in the logistic regression.