data.loader

Data loading utilities for real insurance pricing datasets.

  1"""Data loading utilities for real insurance pricing datasets."""
  2
  3from __future__ import annotations
  4
  5import pickle
  6from pathlib import Path
  7from typing import Any, Literal
  8
  9import numpy as np
 10import pandas as pd
 11
 12# Directory containing model artifacts and CSV datasets
 13_DATA_DIR = Path(__file__).parent
 14
 15# Base 9 feature cols shared by both models (no premium, no U, no extra)
 16_BASE_COLS: list[str] = [
 17    "X_age",
 18    "X_bonus_malus_rating",
 19    "X_distr_channel",
 20    "X_vehicle_type",
 21    "X_ttm_claims",
 22    "X_policy_count",
 23    "X_risk_code",
 24    "X_vehicle_age",
 25    "X_policy_tenure",
 26]
 27
 28# GLM state features: 12 cols (9 base + premium + prev_renewal_perc + year).
 29# The GLM acceptance CSV has all 12; extra cols are for the policy only.
 30FEATURE_COLS_GLM: list[str] = _BASE_COLS + [
 31    "X_policy_premium",   # index 9; used in revenue term u * premium(x)
 32    "X_prev_renewal_perc",
 33    "X_year",
 34]
 35
 36# XGB state features: 10 cols (9 base + premium).
 37# The XGB acceptance CSV doesn't carry X_prev_renewal_perc / X_year.
 38FEATURE_COLS_XGB: list[str] = _BASE_COLS + [
 39    "X_policy_premium",   # index 9; used in revenue term u * premium(x)
 40]
 41
 42# Default alias — use when you don't need model-specific differences
 43FEATURE_COLS = FEATURE_COLS_XGB
 44
 45# Columns consumed by the loss model (no premium, no U)
 46LOSS_FEATURE_COLS: list[str] = _BASE_COLS
 47
 48# Columns consumed by acceptance model state part: base + premium (no U)
 49ACCEPTANCE_STATE_COLS: list[str] = _BASE_COLS + ["X_policy_premium"]
 50
 51_PREMIUM_COL_INDEX: int = 9  # index of X_policy_premium in both FEATURE_COLS variants
 52
 53_ARTIFACT_PATHS: dict[str, dict[str, Path]] = {
 54    "glm": {
 55        "acceptance": _DATA_DIR / "model_artifacts" / "glm_logistic_prob_acceptance.pkl",
 56        "loss": _DATA_DIR / "model_artifacts" / "linear_regression_expected_fin_loss.pkl",
 57    },
 58    "xgb": {
 59        "acceptance": _DATA_DIR / "model_artifacts" / "xgb_classifier_prob_acceptance.pkl",
 60        "loss": _DATA_DIR / "model_artifacts" / "xgb_regressor_expected_fin_loss.pkl",
 61    },
 62}
 63
 64_ACCEPTANCE_CSV_PATHS: dict[str, Path] = {
 65    "glm": _DATA_DIR
 66    / "dataset_bbox_optim_linear_models"
 67    / "df_acceptance_linear_model_black_box.csv",
 68    "xgb": _DATA_DIR
 69    / "dataset_bbox_optim_xgb_models"
 70    / "df_acceptance_xgb_black_box.csv",
 71}
 72
 73
 74def load_model_artifacts(model_type: Literal["glm", "xgb"]) -> tuple[Any, Any]:
 75    """Load and return (acceptance_model, loss_model) from pickle files.
 76
 77    acceptance_model: GLM logistic Pipeline or XGBClassifier.
 78    loss_model: Linear Regression or XGBRegressor.
 79    """
 80    if model_type not in _ARTIFACT_PATHS:
 81        raise ValueError(f"model_type must be 'glm' or 'xgb', got '{model_type}'.")
 82    paths = _ARTIFACT_PATHS[model_type]
 83    with open(paths["acceptance"], "rb") as f:
 84        acceptance_model = pickle.load(f)
 85    with open(paths["loss"], "rb") as f:
 86        loss_model = pickle.load(f)
 87    return acceptance_model, loss_model
 88
 89
 90def load_x_array(model_type: Literal["glm", "xgb"], n_rows: int = 5000) -> np.ndarray:
 91    """Load first n_rows of real X features from the acceptance CSV.
 92
 93    GLM: returns shape (n_rows, 12) using FEATURE_COLS_GLM (includes X_prev_renewal_perc, X_year).
 94    XGB: returns shape (n_rows, 10) using FEATURE_COLS_XGB (base + premium only).
 95    Metadata columns (1-Z, Z, Y, Y_hat, U, prob_acceptance, etc.) are excluded.
 96    """
 97    if model_type not in _ACCEPTANCE_CSV_PATHS:
 98        raise ValueError(f"model_type must be 'glm' or 'xgb', got '{model_type}'.")
 99    feature_cols = FEATURE_COLS_GLM if model_type == "glm" else FEATURE_COLS_XGB
100    df = pd.read_csv(_ACCEPTANCE_CSV_PATHS[model_type], sep=";", nrows=n_rows)
101    return df[feature_cols].to_numpy(dtype=float)
102
103
def extract_glm_u_coef(glm_pipeline: Any) -> float:
    """Extract effective d_logit/dU = w_U / std_U from a fitted GLM Pipeline.

    Uses the ColumnTransformer's output feature names and the StandardScaler
    scale to recover the unscaled U coefficient of the logistic regression.

    Raises:
        ValueError: if the preprocessor output does not contain exactly one
            'U' column, or no StandardScaler covering 'U' can be found.
    """
    preprocessor = glm_pipeline.named_steps["preprocessor"]
    classifier = glm_pipeline.named_steps["classifier"]

    # Output feature names from the ColumnTransformer (sklearn >= 1.0),
    # e.g. "pipeline-1__U", "pipeline-2__X_distr_channel_0".
    out_names = list(preprocessor.get_feature_names_out())
    u_matches = [idx for idx, nm in enumerate(out_names) if nm == "U" or nm.endswith("__U")]
    if len(u_matches) != 1:
        raise ValueError(
            f"Expected exactly 1 'U' column in preprocessor output, found at indices: {u_matches}. "
            f"Available names: {out_names}"
        )
    scaled_w_u = float(classifier.coef_[0, u_matches[0]])

    # Locate the StandardScaler that handled 'U' inside a numeric sub-pipeline
    # and undo the standardization: d_logit/dU = w_U / std_U.
    for _label, step, columns in preprocessor.transformers_:
        sub_steps = getattr(step, "named_steps", None)
        if not sub_steps or "scaler" not in sub_steps:
            continue
        column_list = list(columns)
        if "U" in column_list:
            std_u = float(sub_steps["scaler"].scale_[column_list.index("U")])
            return scaled_w_u / std_u

    raise ValueError("Could not find a StandardScaler containing 'U' in the GLM pipeline preprocessor.")
139
140
# Public API exported by `from data.loader import *`; keep in sync with the
# public constants and functions defined above.
__all__ = [
    "FEATURE_COLS",
    "FEATURE_COLS_GLM",
    "FEATURE_COLS_XGB",
    "LOSS_FEATURE_COLS",
    "ACCEPTANCE_STATE_COLS",
    "load_model_artifacts",
    "load_x_array",
    "extract_glm_u_coef",
]
FEATURE_COLS = ['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure', 'X_policy_premium']
FEATURE_COLS_GLM: list[str] = ['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure', 'X_policy_premium', 'X_prev_renewal_perc', 'X_year']
FEATURE_COLS_XGB: list[str] = ['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure', 'X_policy_premium']
LOSS_FEATURE_COLS: list[str] = ['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure']
ACCEPTANCE_STATE_COLS: list[str] = ['X_age', 'X_bonus_malus_rating', 'X_distr_channel', 'X_vehicle_type', 'X_ttm_claims', 'X_policy_count', 'X_risk_code', 'X_vehicle_age', 'X_policy_tenure', 'X_policy_premium']
def load_model_artifacts(model_type: Literal['glm', 'xgb']) -> tuple[typing.Any, typing.Any]:
75def load_model_artifacts(model_type: Literal["glm", "xgb"]) -> tuple[Any, Any]:
76    """Load and return (acceptance_model, loss_model) from pickle files.
77
78    acceptance_model: GLM logistic Pipeline or XGBClassifier.
79    loss_model: Linear Regression or XGBRegressor.
80    """
81    if model_type not in _ARTIFACT_PATHS:
82        raise ValueError(f"model_type must be 'glm' or 'xgb', got '{model_type}'.")
83    paths = _ARTIFACT_PATHS[model_type]
84    with open(paths["acceptance"], "rb") as f:
85        acceptance_model = pickle.load(f)
86    with open(paths["loss"], "rb") as f:
87        loss_model = pickle.load(f)
88    return acceptance_model, loss_model

Load and return (acceptance_model, loss_model) from pickle files.

acceptance_model: GLM logistic Pipeline or XGBClassifier. loss_model: Linear Regression or XGBRegressor.

def load_x_array(model_type: Literal['glm', 'xgb'], n_rows: int = 5000) -> numpy.ndarray:
 91def load_x_array(model_type: Literal["glm", "xgb"], n_rows: int = 5000) -> np.ndarray:
 92    """Load first n_rows of real X features from the acceptance CSV.
 93
 94    GLM: returns shape (n_rows, 12) using FEATURE_COLS_GLM (includes X_prev_renewal_perc, X_year).
 95    XGB: returns shape (n_rows, 10) using FEATURE_COLS_XGB (base + premium only).
 96    Metadata columns (1-Z, Z, Y, Y_hat, U, prob_acceptance, etc.) are excluded.
 97    """
 98    if model_type not in _ACCEPTANCE_CSV_PATHS:
 99        raise ValueError(f"model_type must be 'glm' or 'xgb', got '{model_type}'.")
100    feature_cols = FEATURE_COLS_GLM if model_type == "glm" else FEATURE_COLS_XGB
101    df = pd.read_csv(_ACCEPTANCE_CSV_PATHS[model_type], sep=";", nrows=n_rows)
102    return df[feature_cols].to_numpy(dtype=float)

Load first n_rows of real X features from the acceptance CSV.

GLM: returns shape (n_rows, 12) using FEATURE_COLS_GLM (includes X_prev_renewal_perc, X_year). XGB: returns shape (n_rows, 10) using FEATURE_COLS_XGB (base + premium only). Metadata columns (1-Z, Z, Y, Y_hat, U, prob_acceptance, etc.) are excluded.

def extract_glm_u_coef(glm_pipeline: Any) -> float:
105def extract_glm_u_coef(glm_pipeline: Any) -> float:
106    """Extract effective d_logit/dU = w_U / std_U from a fitted GLM Pipeline.
107
108    Uses the ColumnTransformer's feature names and the StandardScaler scale
109    to compute the unscaled coefficient for U in the logistic regression.
110    """
111    preprocessor = glm_pipeline.named_steps["preprocessor"]
112    classifier = glm_pipeline.named_steps["classifier"]
113
114    # Get full output feature names from ColumnTransformer (sklearn >= 1.0)
115    transformed_names = list(preprocessor.get_feature_names_out())
116    # Names are like "pipeline-1__U", "pipeline-2__X_distr_channel_0", etc.
117    u_indices = [i for i, name in enumerate(transformed_names) if name.endswith("__U") or name == "U"]
118    if len(u_indices) != 1:
119        raise ValueError(
120            f"Expected exactly 1 'U' column in preprocessor output, found at indices: {u_indices}. "
121            f"Available names: {transformed_names}"
122        )
123    i_U_out = u_indices[0]
124    w_U = float(classifier.coef_[0, i_U_out])
125
126    # Find std_U from the StandardScaler in the numeric sub-pipeline
127    for _, transformer, cols in preprocessor.transformers_:
128        if not hasattr(transformer, "named_steps"):
129            continue
130        if "scaler" not in transformer.named_steps:
131            continue
132        col_list = list(cols)
133        if "U" not in col_list:
134            continue
135        i_U_numeric = col_list.index("U")
136        std_U = float(transformer.named_steps["scaler"].scale_[i_U_numeric])
137        return w_U / std_U
138
139    raise ValueError("Could not find a StandardScaler containing 'U' in the GLM pipeline preprocessor.")

Extract effective d_logit/dU = w_U / std_U from a fitted GLM Pipeline.

Uses the ColumnTransformer's feature names and the StandardScaler scale to compute the unscaled coefficient for U in the logistic regression.