# Home-Credit-Risk-API/risk_preprocessing.py at main · anmolsharma152/Home-Credit-Risk-API · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

# Configuration
# Path of the raw application table (the Kaggle CSV) that
# prepare_credit_data() loads; a relative path, so it is resolved against
# the current working directory.
INPUT_FILE = 'application_train.csv'


def prepare_credit_data():
    """Turn the raw application CSV into train/test files for LightGBM.

    Steps, in order:
      1. Load ``INPUT_FILE`` and report the default rate of ``TARGET``.
      2. Replace the ``DAYS_EMPLOYED`` sentinel (365243) with NaN.
      3. Derive four ratio features from the credit/income/annuity/age columns.
      4. Label-encode every text column (missing values become "Unknown").
      5. Stratified 80/20 train/test split with a fixed random seed.
      6. Write ``X_train.csv``, ``X_test.csv``, ``y_train.csv``,
         ``y_test.csv``, ``model_features.pkl`` (feature column order) and
         ``label_encoders.pkl`` (fitted encoders keyed by column name) into
         the current working directory.

    Returns:
        None. If the input CSV does not exist an error is printed and the
        function returns early without writing any file.
    """
    print(f"⏳ Loading {INPUT_FILE}...")
    try:
        df = pd.read_csv(INPUT_FILE)
    except FileNotFoundError:
        # Report the configured path rather than a hard-coded file name.
        print(f"❌ Error: '{INPUT_FILE}' not found. Please download it from Kaggle.")
        return

    print(f"   ...Raw Data Shape: {df.shape}")

    # --- 1. Target Imbalance Check ---
    # In banking, defaults (1) are rare. We need to know how rare.
    default_rate = df['TARGET'].mean()
    print(f"   ...Default Rate: {default_rate:.2%}")
    if default_rate < 0.10:
        print("   ⚠️ Note: Highly Imbalanced Dataset. We will need 'Class Weights' in LightGBM.")

    # --- 2. Handling Outliers (The "365243" Bug) ---
    # In this dataset 'DAYS_EMPLOYED' == 365243 is a placeholder meaning
    # "Unemployed/Pensioner" (~1000 years). It must be cleared BEFORE the
    # ratio features are derived, otherwise DAYS_EMPLOYED_PERCENT is computed
    # from the placeholder. Assign the result back instead of calling
    # replace(inplace=True) on the selected column, which is not guaranteed to
    # modify the DataFrame under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # --- 3. Feature Engineering (The "Banker's Logic") ---
    print("   ...Engineering Financial Ratios...")

    # CREDIT_INCOME_PERCENT: How much loan relative to salary?
    df['CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']

    # ANNUITY_INCOME_PERCENT: How much of monthly salary goes to loan repayment?
    df['ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']

    # CREDIT_TERM: Estimated length of loan (Credit / Annuity)
    df['CREDIT_TERM'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']

    # DAYS_EMPLOYED_PERCENT: What % of their life have they worked?
    df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']

    # A zero denominator produces +/-inf; store those as missing values so
    # they are handled like any other undefined ratio.
    ratio_cols = [
        'CREDIT_INCOME_PERCENT',
        'ANNUITY_INCOME_PERCENT',
        'CREDIT_TERM',
        'DAYS_EMPLOYED_PERCENT',
    ]
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

    # --- 4. Handling Categorical Data (LightGBM Prep) ---
    print("   ...Encoding Categorical Variables...")

    # LightGBM handles categories well, but we need to label encode strings first.
    # Select text columns (Gender, Education, etc.): classic object columns
    # plus pandas' dedicated string dtype, which `dtype == 'object'` misses.
    cat_cols = [
        col for col in df.columns
        if pd.api.types.is_object_dtype(df[col])
        or isinstance(df[col].dtype, pd.StringDtype)
    ]

    # Fitted encoders, keyed by column name, persisted below for the API.
    encoders = {}

    for col in cat_cols:
        le = LabelEncoder()
        # Fill NaN with 'Unknown' before encoding
        df[col] = df[col].fillna("Unknown")
        df[col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le

    # --- 5. Split Data ---
    print("   ...Splitting Train/Test...")

    # Drop ID and Target from features
    X = df.drop(columns=['SK_ID_CURR', 'TARGET'])
    y = df['TARGET']

    # Stratify split ensures we keep the same % of defaulters in test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42)

    # --- 6. Save ---
    print("   ...Saving processed chunks...")
    X_train.to_csv('X_train.csv', index=False)
    X_test.to_csv('X_test.csv', index=False)
    y_train.to_csv('y_train.csv', index=False)
    y_test.to_csv('y_test.csv', index=False)

    # Save column names for API consistency later
    joblib.dump(X.columns.tolist(), 'model_features.pkl')

    # Persist the fitted encoders; previously they were built and discarded,
    # so inference code could not reproduce the category -> integer mapping.
    # NOTE(review): the file name is chosen here — align it with whatever the
    # API loads.
    joblib.dump(encoders, 'label_encoders.pkl')

    print("✅ Preprocessing Complete. Ready for LightGBM.")


# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    prepare_credit_data()