diff --git a/generate_model.py b/generate_model.py index dcd17b2..db04510 100644 --- a/generate_model.py +++ b/generate_model.py @@ -3,6 +3,7 @@ from src.data_preprocessing import preprocess_data from src.feature_engineering import engineer_features from src.clustering import cluster_users +from src.profiling import profile_segments import os # Load dataset @@ -20,8 +21,10 @@ # Perform clustering and save model labels = cluster_users(X, k=4) +# Assign segments and build profiles +profile_segments(df, labels) + # Save segmented output -df['segment'] = labels os.makedirs("outputs", exist_ok=True) df.to_csv("outputs/segmented_users.csv", index=False) diff --git a/src/constants.py b/src/constants.py new file mode 100644 index 0000000..90aaad8 --- /dev/null +++ b/src/constants.py @@ -0,0 +1,24 @@ +TIME_WEEKDAY_COL = 'Time Spent Online (hrs/weekday)' +TIME_WEEKEND_COL = 'Time Spent Online (hrs/weekend)' +CTR_COL = 'Click-Through Rates (CTR)' +CONVERSION_COL = 'Conversion Rates' +ENGAGEMENT_SCORE_COL = 'engagement_score' +AD_RESPONSIVENESS_COL = 'ad_responsiveness' + +NUMERICAL_COLS = [ + TIME_WEEKDAY_COL, + TIME_WEEKEND_COL, + CTR_COL, + CONVERSION_COL, + 'Ad Interaction Time (sec)', + ENGAGEMENT_SCORE_COL, + AD_RESPONSIVENESS_COL, +] + +CATEGORICAL_COLS = [ + 'Age', + 'Gender', + 'Income Level', + 'Education Level', + 'Device Usage', +] diff --git a/src/data_preprocessing.py b/src/data_preprocessing.py index fe1b07c..01cee4e 100644 --- a/src/data_preprocessing.py +++ b/src/data_preprocessing.py @@ -1,29 +1,12 @@ import pandas as pd from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer +from src.constants import NUMERICAL_COLS, CATEGORICAL_COLS def preprocess_data(df): - numerical_cols = [ - 'Time Spent Online (hrs/weekday)', - 'Time Spent Online (hrs/weekend)', - 'Click-Through Rates (CTR)', - 'Conversion Rates', - 'Ad Interaction Time (sec)', - 'engagement_score', - 'ad_responsiveness' - ] - - categorical_cols = [ - 'Age', # ← categorical (e.g., '25-34') - 'Gender', - 'Income Level', # ← also likely a string like 'High' - 'Education Level', - 'Device Usage' - ] - preprocessor = ColumnTransformer([ - ('num', StandardScaler(), numerical_cols), - ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_cols) + ('num', StandardScaler(), NUMERICAL_COLS), + ('cat', OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_COLS) ]) X = preprocessor.fit_transform(df) diff --git a/src/feature_engineering.py b/src/feature_engineering.py index e5d3727..d9102d9 100644 --- a/src/feature_engineering.py +++ b/src/feature_engineering.py @@ -1,6 +1,15 @@ +from src.constants import ( + TIME_WEEKDAY_COL, + TIME_WEEKEND_COL, + CTR_COL, + CONVERSION_COL, + ENGAGEMENT_SCORE_COL, + AD_RESPONSIVENESS_COL, +) + def engineer_features(df): - df['engagement_score'] = ( - df['Time Spent Online (hrs/weekday)'] + df['Time Spent Online (hrs/weekend)'] + df[ENGAGEMENT_SCORE_COL] = ( + df[TIME_WEEKDAY_COL] + df[TIME_WEEKEND_COL] ) / 2 - df['ad_responsiveness'] = df['Click-Through Rates (CTR)'] * df['Conversion Rates'] + df[AD_RESPONSIVENESS_COL] = df[CTR_COL] * df[CONVERSION_COL] return df