Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion generate_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from src.data_preprocessing import preprocess_data
from src.feature_engineering import engineer_features
from src.clustering import cluster_users
from src.profiling import profile_segments
import os

# Load dataset
Expand All @@ -20,8 +21,10 @@
# Perform clustering and save model
labels = cluster_users(X, k=4)

# Assign segments and build profiles
profile_segments(df, labels)

# Save segmented output
df['segment'] = labels
os.makedirs("outputs", exist_ok=True)
df.to_csv("outputs/segmented_users.csv", index=False)

Expand Down
24 changes: 24 additions & 0 deletions src/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Input columns read by engineer_features().
TIME_WEEKDAY_COL = 'Time Spent Online (hrs/weekday)'
TIME_WEEKEND_COL = 'Time Spent Online (hrs/weekend)'
CTR_COL = 'Click-Through Rates (CTR)'
CONVERSION_COL = 'Conversion Rates'

# Input column used only as a scaled numerical feature; named here so that
# every entry of NUMERICAL_COLS has a single definition to import.
AD_INTERACTION_TIME_COL = 'Ad Interaction Time (sec)'

# Derived columns written by engineer_features().
ENGAGEMENT_SCORE_COL = 'engagement_score'
AD_RESPONSIVENESS_COL = 'ad_responsiveness'

# Columns handed to StandardScaler in preprocess_data().
# Order is preserved deliberately: it fixes the column order of the
# transformed feature matrix.
NUMERICAL_COLS = [
    TIME_WEEKDAY_COL,
    TIME_WEEKEND_COL,
    CTR_COL,
    CONVERSION_COL,
    AD_INTERACTION_TIME_COL,
    ENGAGEMENT_SCORE_COL,
    AD_RESPONSIVENESS_COL,
]

# Columns handed to OneHotEncoder in preprocess_data().
CATEGORICAL_COLS = [
    'Age',
    'Gender',
    'Income Level',
    'Education Level',
    'Device Usage',
]
23 changes: 3 additions & 20 deletions src/data_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,12 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from src.constants import NUMERICAL_COLS, CATEGORICAL_COLS

def preprocess_data(df):
numerical_cols = [
'Time Spent Online (hrs/weekday)',
'Time Spent Online (hrs/weekend)',
'Click-Through Rates (CTR)',
'Conversion Rates',
'Ad Interaction Time (sec)',
'engagement_score',
'ad_responsiveness'
]

categorical_cols = [
'Age', # ← categorical (e.g., '25-34')
'Gender',
'Income Level', # ← also likely a string like 'High'
'Education Level',
'Device Usage'
]

preprocessor = ColumnTransformer([
('num', StandardScaler(), numerical_cols),
('cat', OneHotEncoder(handle_unknown="ignore"), categorical_cols)
('num', StandardScaler(), NUMERICAL_COLS),
('cat', OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_COLS)
])

X = preprocessor.fit_transform(df)
Expand Down
15 changes: 12 additions & 3 deletions src/feature_engineering.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
from src.constants import (
TIME_WEEKDAY_COL,
TIME_WEEKEND_COL,
CTR_COL,
CONVERSION_COL,
ENGAGEMENT_SCORE_COL,
AD_RESPONSIVENESS_COL,
)

def engineer_features(df):
    """Add the two derived feature columns to ``df`` and return it.

    Writes ``ENGAGEMENT_SCORE_COL`` as the mean of the weekday and weekend
    time-online columns, and ``AD_RESPONSIVENESS_COL`` as the product of the
    click-through-rate and conversion-rate columns.

    Args:
        df: Table supporting column lookup/assignment by name and
            element-wise arithmetic (presumably a pandas DataFrame --
            confirm against the caller). It must already contain
            ``TIME_WEEKDAY_COL``, ``TIME_WEEKEND_COL``, ``CTR_COL`` and
            ``CONVERSION_COL``.

    Returns:
        The same ``df`` object that was passed in; the new columns are
        assigned on it directly rather than on a copy.
    """
    # Column names come from src.constants so this module and the
    # preprocessing step cannot drift apart on spelling.
    df[ENGAGEMENT_SCORE_COL] = (
        df[TIME_WEEKDAY_COL] + df[TIME_WEEKEND_COL]
    ) / 2
    df[AD_RESPONSIVENESS_COL] = df[CTR_COL] * df[CONVERSION_COL]
    return df