-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrisk_preprocessing.py
More file actions
90 lines (67 loc) · 3.22 KB
/
risk_preprocessing.py
File metadata and controls
90 lines (67 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
# Configuration
# Path of the raw applications CSV that prepare_credit_data() loads.
# It is a relative path, so it is resolved against the current working directory.
INPUT_FILE = 'application_train.csv'
def _add_financial_ratios(df):
    """Add the engineered ratio columns to ``df`` in place.

    Expects ``DAYS_EMPLOYED`` to be cleaned of its placeholder value already,
    otherwise ``DAYS_EMPLOYED_PERCENT`` inherits the outlier.
    """
    # CREDIT_INCOME_PERCENT: How much loan relative to salary?
    df['CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    # ANNUITY_INCOME_PERCENT: How much of the income goes to loan repayment?
    df['ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    # CREDIT_TERM: Estimated length of loan (Credit / Annuity)
    df['CREDIT_TERM'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    # DAYS_EMPLOYED_PERCENT: What % of their life have they worked?
    df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']

    # A zero denominator yields +/-inf; store it as missing instead of letting
    # an infinite value flow into the saved feature files.
    ratio_cols = [
        'CREDIT_INCOME_PERCENT',
        'ANNUITY_INCOME_PERCENT',
        'CREDIT_TERM',
        'DAYS_EMPLOYED_PERCENT',
    ]
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)


def _encode_categoricals(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Missing values are replaced by the literal category "Unknown" before
    encoding.

    Returns:
        dict mapping each encoded column name to its fitted ``LabelEncoder``.
    """
    cat_cols = [col for col in df.columns if df[col].dtype == 'object']
    encoders = {}
    for col in cat_cols:
        le = LabelEncoder()
        # Fill NaN with 'Unknown' before encoding
        df[col] = df[col].fillna("Unknown")
        df[col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le
    return encoders


def prepare_credit_data():
    """Load the raw applications CSV, engineer features, and save a train/test split.

    Steps, in order:
      1. Read ``INPUT_FILE`` and report the share of rows with ``TARGET == 1``.
      2. Replace the ``DAYS_EMPLOYED`` placeholder 365243 with NaN, then add
         the ratio features.
      3. Label-encode all object-dtype columns.
      4. Make a stratified 80/20 split (``random_state=42``).
      5. Write ``X_train.csv``, ``X_test.csv``, ``y_train.csv``, ``y_test.csv``,
         ``model_features.pkl`` (feature column names) and
         ``label_encoders.pkl`` (fitted encoders per categorical column) to the
         current working directory.

    Returns:
        None. If ``INPUT_FILE`` does not exist, an error message is printed and
        the function returns early without writing anything.
    """
    print(f"⏳ Loading {INPUT_FILE}...")
    try:
        df = pd.read_csv(INPUT_FILE)
    except FileNotFoundError:
        print(f"❌ Error: '{INPUT_FILE}' not found. Please download it from Kaggle.")
        return
    print(f" ...Raw Data Shape: {df.shape}")

    # --- 1. Target Imbalance Check ---
    # In banking, defaults (1) are rare. We need to know how rare.
    default_rate = df['TARGET'].mean()
    print(f" ...Default Rate: {default_rate:.2%}")
    if default_rate < 0.10:
        print(" ⚠️ Note: Highly Imbalanced Dataset. We will need 'Class Weights' in LightGBM.")

    # --- 2. Handling Outliers (The "365243" Bug) ---
    # In this specific dataset, 'DAYS_EMPLOYED' has a value 365243 which means
    # "Unemployed/Pensioner". It acts as a massive outlier (1000 years), so it
    # is replaced with NaN. This must happen BEFORE the ratios are computed,
    # otherwise DAYS_EMPLOYED_PERCENT is derived from the placeholder.
    # Assigning the result back (rather than a chained inplace=True call on the
    # column selection) guarantees the DataFrame itself is updated.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # --- 3. Feature Engineering (The "Banker's Logic") ---
    print(" ...Engineering Financial Ratios...")
    _add_financial_ratios(df)

    # --- 4. Handling Categorical Data (LightGBM Prep) ---
    print(" ...Encoding Categorical Variables...")
    # LightGBM handles categories well, but strings must be label encoded first.
    encoders = _encode_categoricals(df)

    # --- 5. Split Data ---
    print(" ...Splitting Train/Test...")
    # Drop ID and Target from features
    X = df.drop(columns=['SK_ID_CURR', 'TARGET'])
    y = df['TARGET']
    # Stratify split ensures we keep the same % of defaulters in test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42)

    # --- 6. Save ---
    print(" ...Saving processed chunks...")
    X_train.to_csv('X_train.csv', index=False)
    X_test.to_csv('X_test.csv', index=False)
    y_train.to_csv('y_train.csv', index=False)
    y_test.to_csv('y_test.csv', index=False)
    # Save column names for API consistency later
    joblib.dump(X.columns.tolist(), 'model_features.pkl')
    # Persist the fitted encoders so inference code can apply the exact same
    # string -> integer mapping that was used here.
    joblib.dump(encoders, 'label_encoders.pkl')
    print("✅ Preprocessing Complete. Ready for LightGBM.")
# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    prepare_credit_data()