diff --git a/FeatureExtraction.py b/FeatureExtraction.py
deleted file mode 100644
index c27d629..0000000
--- a/FeatureExtraction.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import pandas as pd
-import numpy as np
-from dataCleaning import read_run, column_clean, preprocessing
-import pdb
-
-def overall_cleaning():
-    df_p3_exo = read_run("P3_Exo_1_0.csv") # second run, male
-    df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male
-    df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female
-    df_p4_noexo = read_run("P4_NoExo_1_0.csv") # 2nd female
-
-    df_p3_exo = column_clean(df_p3_exo, run_num = 2, gender = 'male')
-    df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male')
-    df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female')
-    df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female')
-    combined_df = pd.concat([df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo], ignore_index=True)
-    dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. 
-    # # Show the head of the data
-    # df_p3_exo.describe()
-    df_p3_noexo.head()
-    # df_p4_exo.head()
-    # df_p4_noexo.head()
-    # # Choose inputs
-    # features = df_p3_exo[['EMG 1 (mV)', 'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']].dropna()
-    # features.head()
-    feature_sets = []
-    # Run functions to extract features for each dataframe
-    #CP: does this make sure to remove the redundant time series columns?
-    #can keep  ACC X Time Series (s) in each sensor group, and remove any other column with 'Time Series (s)' in its name 
-    for df in dfs:
-        emg_features = compute_emg_features(df['EMG 1 (mV)'])
-        accel_features = compute_accel_features(df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)'])
-        gyro_features = compute_gyro_features(df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)'])
-        features = {
-            'emg': emg_features,
-            'accel': accel_features,
-            'gyro': gyro_features
-        }
-        feature_sets.append(features)
-
-    # feature_sets now contains extracted features for each df
-    p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = feature_sets
-    return p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats
-
-# Calculations for Feature Extraction from Project_Guide
-def compute_emg_features(signal):
-    return {
-        'mean': np.mean(signal),
-        'max': np.max(signal),
-        'min': np.min(signal),
-        'std': np.std(signal),
-        'rms': np.sqrt(np.mean(signal**2))
-    }
-
-def compute_accel_features(a_x, a_y, a_z):
-    a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)
-    
-    features = {
-        'peak_accel': np.max(a_mag),
-        'mean_accel': np.mean(a_mag),
-        'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)),
-        'accel_range': np.max(a_mag) - np.min(a_mag)
-    }
-    return features
-
-def compute_gyro_features(w_x, w_y, w_z):
-    w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)
-    
-    features = {
-        'peak_angular_vel': np.max(w_mag),
-        'mean_angular_vel': np.mean(w_mag),
-        'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)),
-        'angular_vel_range': np.max(w_mag) - np.min(w_mag)
-    }
-    return features    
-
-
-if __name__ == '__main__':
-    p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning()
\ No newline at end of file
diff --git a/dataCleaning.py b/dataCleaning.py
index 087d7e8..4ae484f 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -1,8 +1,17 @@
 
 import pandas as pd
 import numpy as np 
-import pdb 
+import pdb
 
+from sklearn.model_selection import train_test_split 
+from modif_cols import tidy_emg_imu_as_measured 
+from resampling import upsample, downsample
+
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.compose import ColumnTransformer
 # Data Labels:
 # Label for EMG Data shared:
 #     Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata)
@@ -27,13 +36,13 @@ def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fiel
                      usecols = usecols,
                      on_bad_lines='skip') 
     df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)',   'RDelt_ACC Y (G)',  'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)',
-                 'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)',   'LDelt_ACC Y (G)',  'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',
-                 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)',   'RBicep_ACC Y (G)',  'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',
-                 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)',   'LBicep_ACC Y (G)',  'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'
+                 'LDelt_TimeSeries', 'LDelt_EMG_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)',   'LDelt_ACC Y (G)',  'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',
+                 'RBicep_TimeSeries', 'RBicep_EMG_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)',   'RBicep_ACC Y (G)',  'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',
+                 'LBicep_TimeSeries', 'LBicep_EMG_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)',   'LBicep_ACC Y (G)',  'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'
                 ]
-    return df
+    return df #raw data 
 
-def column_clean(df, run_num, gender):
+def column_clean(df):
     #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU 
     extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)',
                          'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)', 
@@ -45,14 +54,65 @@ def column_clean(df, run_num, gender):
                          'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)']
     
     df = df.drop(extr_time_series, axis = 1)
+    df = df.rename(columns={'RDelt_EMG_TimeSeries': 'EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)': 'IMU_TimeSeries'})
     # measurement_cols = [col for col in df.columns if (('ACC' in col or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols
     # df.columns = df.columns.str.strip()           # Remove leading/trailing spaces (Yuxuan)
     # df = df.apply(pd.to_numeric, errors='coerce') # Conver  t everything to numeric (Yuxuan)
     df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data
-    df['gender'] = gender
-    df['run_num'] = run_num
-    df.to_csv("test.csv")
     return df 
 
-def preprocessing(full_df):
-    pass
+
+# Reshape ("melt") one run via tidy_emg_imu_as_measured, then tag every row with gender, run number and exo value.
+def create_sensor_col(df, run_num, gender, exo):  # returns the tagged frame and also writes it to pivoted_df.csv
+    df_pivoted = tidy_emg_imu_as_measured(df)  # helper imported from modif_cols; presumably returns a pivoted frame -- TODO confirm
+    df_pivoted.columns = df_pivoted.columns.str.strip()  # remove leading/trailing whitespace from the column labels
+    df_pivoted = df_pivoted.reset_index()  # move the index levels back into ordinary columns
+    df_pivoted['gender'] = gender  # scalar metadata, broadcast to every row
+    df_pivoted['run_num'] = run_num
+    df_pivoted['exo'] = exo  # preprocessing_actions reads a column named 'exo' as its target
+    df_pivoted.to_csv("pivoted_df.csv")  # NOTE(review): fixed filename, so each call overwrites the previous call's file
+    return df_pivoted
+
+def preprocessing_actions(full_df, neural_net=False):
+    """Split full_df into train/test sets and fit the preprocessing transformer on the training part.
+
+    full_df must contain the '*_filtered' sensor columns listed below, 'BodyPart', 'gender' and the target 'exo'.
+    neural_net=True scales numeric features with MinMaxScaler; otherwise StandardScaler is used.
+    Returns (X_train_prepared, X_test_prepared, y_train, y_test, fitted ColumnTransformer).
+    """
+    num_attribs = [
+        'EMG_MilliVolts_filtered',
+        'ACC X (G)_filtered',
+        'ACC Y (G)_filtered',
+        'ACC Z (G)_filtered',
+        'GYRO X (deg/s)_filtered',
+        'GYRO Y (deg/s)_filtered',
+        'GYRO Z (deg/s)_filtered',
+        # Add any other numerical features here
+    ]
+    cat_attribs = ['BodyPart', 'gender']  # 'exo' is the target variable, so it is not a feature
+    # The two numeric pipelines differed only in the scaler, so choose it once instead of duplicating the pipeline.
+    scaler = MinMaxScaler() if neural_net else StandardScaler()
+    num_pipeline = Pipeline([
+        ("impute", SimpleImputer(strategy="median")),
+        ("standardize", scaler),
+    ])
+    cat_pipeline = Pipeline([
+        ("impute", SimpleImputer(strategy="most_frequent")),
+        # Ignore categories unseen during fit so transform() on test/new data does not raise.
+        ("oneHot", OneHotEncoder(handle_unknown="ignore")),
+    ])
+
+    preprocessing = ColumnTransformer([
+        ("num", num_pipeline, num_attribs),
+        ("cat", cat_pipeline, cat_attribs),
+    ])
+    # Prepare data for modeling
+    X = full_df[num_attribs + cat_attribs]
+    y = full_df["exo"]
+    # Fit on the training split only so test-set statistics never leak into imputation/scaling.
+    # NOTE(review): train_test_split shuffles individual rows; confirm that suits these time-series samples.
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+    X_train_prepared = preprocessing.fit_transform(X_train)
+    X_test_prepared = preprocessing.transform(X_test)
+    return X_train_prepared, X_test_prepared, y_train, y_test, preprocessing
\ No newline at end of file
diff --git a/debug.ipynb b/debug.ipynb
new file mode 100644
index 0000000..c6ad330
--- /dev/null
+++ b/debug.ipynb
@@ -0,0 +1,952 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e0bebc80-fe7f-4d6c-8387-5c512308e48d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np \n",
+    "from dataCleaning import read_run, column_clean, preprocessing, create_sensor_col\n",
+    "import pdb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "3bd13a4f-893e-43c3-bc4a-afabc40bbcde",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fields as well as metadata)\n",
+    "    usecols = list(range(0, 56)) \n",
+    "    df = pd.read_csv(filename, low_memory = False, \n",
+    "                     header = 0,  \n",
+    "                     skiprows=skiprows,\n",
+    "                    #  names=header,\n",
+    "                     usecols = usecols,\n",
+    "                     on_bad_lines='skip') \n",
+    "    df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)',   'RDelt_ACC Y (G)',  'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)',\n",
+    "                 'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)',   'LDelt_ACC Y (G)',  'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',\n",
+    "                 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)',   'RBicep_ACC Y (G)',  'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',\n",
+    "                 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)',   'LBicep_ACC Y (G)',  'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'\n",
+    "                ]\n",
+    "    return df #raw data "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "07d4e282-5741-449e-a2a8-71f3b3a6c66d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def column_clean(df, run_num, gender):\n",
+    "    #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU \n",
+    "    extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)',\n",
+    "                         'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)', \n",
+    "                         'LDelt_Acc Y Time Series(s)', 'LDelt_Acc Z Time Series(s)', 'LDelt_GyroXTime Series(s)',\n",
+    "                         'LDelt_GyroYTime Series(s)', 'LDelt_GyroZTime Series(s)', 'RBicep_TimeSeries', 'RBicep_Acc X Time Series(s)',\n",
+    "                         'RBicep_Acc Y Time Series(s)', 'RBicep_Acc Z Time Series(s)', 'RBicep_GyroXTime Series(s)',\n",
+    "                         'RBicep_GyroYTime Series(s)', 'RBicep_GyroZTime Series(s)', 'LBicep_TimeSeries', 'LBicep_Acc X Time Series(s)',\n",
+    "                         'LBicep_Acc Y Time Series(s)', 'LBicep_Acc Z Time Series(s)', 'LBicep_GyroXTime Series(s)', \n",
+    "                         'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)']\n",
+    "    \n",
+    "    df = df.drop(extr_time_series, axis = 1)\n",
+    "    # measurement_cols = [col for col in df.columns if (('ACC' in col or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols\n",
+    "    # df.columns = df.columns.str.strip()           # Remove leading/trailing spaces (Yuxuan)\n",
+    "    # df = df.apply(pd.to_numeric, errors='coerce') # Conver  t everything to numeric (Yuxuan)\n",
+    "    df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data\n",
+    "    df['gender'] = gender\n",
+    "    df['run_num'] = run_num\n",
+    "    return df "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c74acdb7-01e3-443f-92b0-6d2a6e45696c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_sensor_col(df): \n",
+    "    columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']\n",
+    "    pdb.set_trace()\n",
+    "    # Identify all measurement columns, including EMG millivolts\n",
+    "    measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]\n",
+    "    df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name=\"sensor_measurement\", value_name=\"value\")\n",
+    "    \n",
+    "    # Extract the Sensor Body Position\n",
+    "    df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n",
+    "    # Extract measurement type, including EMG millivolts\n",
+    "    df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)')    # Drop the original sensor_measurement column\n",
+    "    df_melted = df_melted.drop(columns=[\"sensor_measurement\"])    # Pivot the DataFrame so each measurement type becomes a separate column\n",
+    "    df_melted[\"value\"] = df_melted[\"value\"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)\n",
+    "    df_melted[\"value\"] = pd.to_numeric(df_melted[\"value\"], errors=\"coerce\") #make sure all values are cast to numeric\n",
+    "    df_melted.fillna(np.nan, inplace=True)\n",
+    "    # Pivot the DataFrame so each measurement type becomes a separate column\n",
+    "    df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], \n",
+    "                                    columns='measurement_type', values='value')\n",
+    "    df_pivoted.columns = df_pivoted.columns.get_level_values(0)\n",
+    "    df_pivoted.columns = df_pivoted.columns.str.strip()\n",
+    "    df_pivoted = df_pivoted.reset_index()\n",
+    "    pdb.set_trace()\n",
+    "    df_pivoted.to_csv(\"pivoted_df.csv\")\n",
+    "    return df_pivoted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "2a446022-09eb-4f86-a77a-980490d55b0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def standardize_time_series():\n",
+    "    # interpolate()\n",
+    "    pass\n",
+    "\n",
+    "def preprocessing(full_df):\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fc81654-185e-4738-919a-57afa144341e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "dab168eb-f1dc-4d7e-99fe-3ec1042db065",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculations for Feature Extraction from Project_Guide\n",
+    "def compute_emg_features(signal):\n",
+    "    return {\n",
+    "        'mean': np.mean(signal),\n",
+    "        'max': np.max(signal),\n",
+    "        'min': np.min(signal),\n",
+    "        'std': np.std(signal),\n",
+    "        'rms': np.sqrt(np.mean(signal**2))\n",
+    "    }\n",
+    "\n",
+    "def compute_accel_features(a_x, a_y, a_z):\n",
+    "    a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)\n",
+    "    \n",
+    "    features = {\n",
+    "        'peak_accel': np.max(a_mag),\n",
+    "        'mean_accel': np.mean(a_mag),\n",
+    "        'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)),\n",
+    "        'accel_range': np.max(a_mag) - np.min(a_mag)\n",
+    "    }\n",
+    "    return features\n",
+    "\n",
+    "def compute_gyro_features(w_x, w_y, w_z):\n",
+    "    w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)\n",
+    "    \n",
+    "    features = {\n",
+    "        'peak_angular_vel': np.max(w_mag),\n",
+    "        'mean_angular_vel': np.mean(w_mag),\n",
+    "        'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)),\n",
+    "        'angular_vel_range': np.max(w_mag) - np.min(w_mag)\n",
+    "    }\n",
+    "    return features    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72ed77bf-dbc1-406c-a5b1-57dcce504502",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "P3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d8305190-17ac-43fd-b7d9-d9a90e329533",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p3_exo = read_run(\"P3_Exo_1_0.csv\") # 2nd run, male"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ab22fd91-5089-4c1e-91e5-f2fedb609f69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p3_noexo = read_run(\"P3_NoExo_1_0.csv\") # first run, male"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "2b503e14-fc38-46c2-9a10-3d7ea8ade855",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p4_exo = read_run(\"P4_Exo_1_0.csv\") # 1st run female\n",
+    "df_p4_noexo = read_run(\"P4_NoExo_1_0.csv\") # 2nd female"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ad941e26-6be4-4ae5-8d57-f148509e1675",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p3_exo = column_clean(df_p3_exo, run_num = 2, gender = 'male')\n",
+    "df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male')\n",
+    "df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female')\n",
+    "df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "0372fab5-cc33-4ec4-a878-a20207b8b542",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>RDelt_EMG_TimeSeries</th>\n",
+       "      <th>RDelt_IMU_Acc X Time Series(s)</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>run_num</th>\n",
+       "      <th>sensor_measurement</th>\n",
+       "      <th>value</th>\n",
+       "      <th>Sensor_Body_Position</th>\n",
+       "      <th>measurement_type</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.004868</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.000794</td>\n",
+       "      <td>0.00675</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.005875</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.001588</td>\n",
+       "      <td>0.0135</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.005203</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.002382</td>\n",
+       "      <td>0.02025</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.005539</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.003177</td>\n",
+       "      <td>0.027</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.007721</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852739</th>\n",
+       "      <td>109.265029</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852740</th>\n",
+       "      <td>109.265823</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852741</th>\n",
+       "      <td>109.266618</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852742</th>\n",
+       "      <td>109.267412</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852743</th>\n",
+       "      <td>109.268206</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3852744 rows × 8 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         RDelt_EMG_TimeSeries RDelt_IMU_Acc X Time Series(s) gender  run_num  \\\n",
+       "0                    0.000000                              0   male        2   \n",
+       "1                    0.000794                        0.00675   male        2   \n",
+       "2                    0.001588                         0.0135   male        2   \n",
+       "3                    0.002382                        0.02025   male        2   \n",
+       "4                    0.003177                          0.027   male        2   \n",
+       "...                       ...                            ...    ...      ...   \n",
+       "3852739            109.265029                            NaN   male        2   \n",
+       "3852740            109.265823                            NaN   male        2   \n",
+       "3852741            109.266618                            NaN   male        2   \n",
+       "3852742            109.267412                            NaN   male        2   \n",
+       "3852743            109.268206                            NaN   male        2   \n",
+       "\n",
+       "            sensor_measurement     value Sensor_Body_Position measurement_type  \n",
+       "0         RDelt_EMG_MilliVolts  0.004868                RDelt   EMG_MilliVolts  \n",
+       "1         RDelt_EMG_MilliVolts  0.005875                RDelt   EMG_MilliVolts  \n",
+       "2         RDelt_EMG_MilliVolts  0.005203                RDelt   EMG_MilliVolts  \n",
+       "3         RDelt_EMG_MilliVolts  0.005539                RDelt   EMG_MilliVolts  \n",
+       "4         RDelt_EMG_MilliVolts  0.007721                RDelt   EMG_MilliVolts  \n",
+       "...                        ...       ...                  ...              ...  \n",
+       "3852739  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "3852740  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "3852741  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "3852742  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "3852743  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "\n",
+       "[3852744 rows x 8 columns]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = df_p3_exo\n",
+    "columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']\n",
+    "measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]\n",
+    "df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name=\"sensor_measurement\", value_name=\"value\")\n",
+    "    # Extract the Sensor Body Position\n",
+    "df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n",
+    "# Extract measurement type, including EMG millivolts\n",
+    "df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') \n",
+    "df_melted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9aea9bd-e997-4330-9116-e4d9c6db90a5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>RDelt_EMG_TimeSeries</th>\n",
+       "      <th>RDelt_EMG_MilliVolts</th>\n",
+       "      <th>RDelt_IMU_Acc X Time Series(s)</th>\n",
+       "      <th>RDelt_ACC X (G)</th>\n",
+       "      <th>RDelt_ACC Y (G)</th>\n",
+       "      <th>RDelt_ACC Z (G)</th>\n",
+       "      <th>RDelt_GYRO X (deg/s)</th>\n",
+       "      <th>RDelt_GYRO Y (deg/s)</th>\n",
+       "      <th>RDelt_GYRO Z (deg/s)</th>\n",
+       "      <th>LDelt_MilliVolts</th>\n",
+       "      <th>...</th>\n",
+       "      <th>RBicep_GYRO Z (deg/s)</th>\n",
+       "      <th>LBicep_MilliVolts</th>\n",
+       "      <th>LBicep_ACC X (G)</th>\n",
+       "      <th>LBicep_ACC Y (G)</th>\n",
+       "      <th>LBicep_ACC Z (G)</th>\n",
+       "      <th>LBicep_GYRO X (deg/s)</th>\n",
+       "      <th>LBicep_GYRO Y (deg/s)</th>\n",
+       "      <th>LBicep_GYRO Z (deg/s)</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>run_num</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.004868</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0747681</td>\n",
+       "      <td>0.9061279</td>\n",
+       "      <td>0.2548828</td>\n",
+       "      <td>-30.7404575</td>\n",
+       "      <td>-4.2519083</td>\n",
+       "      <td>9.358779</td>\n",
+       "      <td>-0.006546</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-11.9541988</td>\n",
+       "      <td>0.041962</td>\n",
+       "      <td>0.2507324</td>\n",
+       "      <td>0.8808594</td>\n",
+       "      <td>0.1972656</td>\n",
+       "      <td>-30.801527</td>\n",
+       "      <td>8.5572519</td>\n",
+       "      <td>12.6870232</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.000794</td>\n",
+       "      <td>0.005875</td>\n",
+       "      <td>0.00675</td>\n",
+       "      <td>0.0795288</td>\n",
+       "      <td>0.913208</td>\n",
+       "      <td>0.2689209</td>\n",
+       "      <td>-30.7786255</td>\n",
+       "      <td>-5.961832</td>\n",
+       "      <td>8.5419846</td>\n",
+       "      <td>-0.006546</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-12.5343513</td>\n",
+       "      <td>0.041962</td>\n",
+       "      <td>0.2453003</td>\n",
+       "      <td>0.8790283</td>\n",
+       "      <td>0.2055054</td>\n",
+       "      <td>-29.038168</td>\n",
+       "      <td>9.9007635</td>\n",
+       "      <td>13.0305347</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.001588</td>\n",
+       "      <td>0.005203</td>\n",
+       "      <td>0.0135</td>\n",
+       "      <td>0.0804443</td>\n",
+       "      <td>0.9194336</td>\n",
+       "      <td>0.2719116</td>\n",
+       "      <td>-29.9312973</td>\n",
+       "      <td>-6.8015265</td>\n",
+       "      <td>8.4503813</td>\n",
+       "      <td>-0.007217</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-12.801527</td>\n",
+       "      <td>0.041459</td>\n",
+       "      <td>0.2486572</td>\n",
+       "      <td>0.880188</td>\n",
+       "      <td>0.2092896</td>\n",
+       "      <td>-27.6641216</td>\n",
+       "      <td>8.9694653</td>\n",
+       "      <td>12.358779</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.002382</td>\n",
+       "      <td>0.005539</td>\n",
+       "      <td>0.02025</td>\n",
+       "      <td>0.0809326</td>\n",
+       "      <td>0.9316406</td>\n",
+       "      <td>0.2680054</td>\n",
+       "      <td>-29.1068707</td>\n",
+       "      <td>-6.8854961</td>\n",
+       "      <td>7.6793895</td>\n",
+       "      <td>-0.004196</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-12.7480917</td>\n",
+       "      <td>0.039780</td>\n",
+       "      <td>0.2533569</td>\n",
+       "      <td>0.880127</td>\n",
+       "      <td>0.2134399</td>\n",
+       "      <td>-25.442749</td>\n",
+       "      <td>7.6106873</td>\n",
+       "      <td>11.1145039</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.003177</td>\n",
+       "      <td>0.007721</td>\n",
+       "      <td>0.027</td>\n",
+       "      <td>0.0866699</td>\n",
+       "      <td>0.9319458</td>\n",
+       "      <td>0.2663574</td>\n",
+       "      <td>-29.3129768</td>\n",
+       "      <td>-7.4045801</td>\n",
+       "      <td>6.7557254</td>\n",
+       "      <td>-0.005203</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-11.1832066</td>\n",
+       "      <td>0.041459</td>\n",
+       "      <td>0.2590942</td>\n",
+       "      <td>0.8770752</td>\n",
+       "      <td>0.2124634</td>\n",
+       "      <td>-23.557251</td>\n",
+       "      <td>6.0916033</td>\n",
+       "      <td>9.7862597</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137593</th>\n",
+       "      <td>109.265029</td>\n",
+       "      <td>0.017960</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.002182</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.030716</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137594</th>\n",
+       "      <td>109.265823</td>\n",
+       "      <td>0.019974</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.001846</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.035416</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137595</th>\n",
+       "      <td>109.266618</td>\n",
+       "      <td>0.020981</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.004196</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.034745</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137596</th>\n",
+       "      <td>109.267412</td>\n",
+       "      <td>0.018631</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.005707</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.035248</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137597</th>\n",
+       "      <td>109.268206</td>\n",
+       "      <td>0.019974</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.004196</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.036591</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>137598 rows × 32 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        RDelt_EMG_TimeSeries  RDelt_EMG_MilliVolts  \\\n",
+       "0                   0.000000              0.004868   \n",
+       "1                   0.000794              0.005875   \n",
+       "2                   0.001588              0.005203   \n",
+       "3                   0.002382              0.005539   \n",
+       "4                   0.003177              0.007721   \n",
+       "...                      ...                   ...   \n",
+       "137593            109.265029              0.017960   \n",
+       "137594            109.265823              0.019974   \n",
+       "137595            109.266618              0.020981   \n",
+       "137596            109.267412              0.018631   \n",
+       "137597            109.268206              0.019974   \n",
+       "\n",
+       "       RDelt_IMU_Acc X Time Series(s) RDelt_ACC X (G) RDelt_ACC Y (G)  \\\n",
+       "0                                   0       0.0747681       0.9061279   \n",
+       "1                             0.00675       0.0795288        0.913208   \n",
+       "2                              0.0135       0.0804443       0.9194336   \n",
+       "3                             0.02025       0.0809326       0.9316406   \n",
+       "4                               0.027       0.0866699       0.9319458   \n",
+       "...                               ...             ...             ...   \n",
+       "137593                            NaN             NaN             NaN   \n",
+       "137594                            NaN             NaN             NaN   \n",
+       "137595                            NaN             NaN             NaN   \n",
+       "137596                            NaN             NaN             NaN   \n",
+       "137597                            NaN             NaN             NaN   \n",
+       "\n",
+       "       RDelt_ACC Z (G) RDelt_GYRO X (deg/s) RDelt_GYRO Y (deg/s)  \\\n",
+       "0            0.2548828          -30.7404575           -4.2519083   \n",
+       "1            0.2689209          -30.7786255            -5.961832   \n",
+       "2            0.2719116          -29.9312973           -6.8015265   \n",
+       "3            0.2680054          -29.1068707           -6.8854961   \n",
+       "4            0.2663574          -29.3129768           -7.4045801   \n",
+       "...                ...                  ...                  ...   \n",
+       "137593             NaN                  NaN                  NaN   \n",
+       "137594             NaN                  NaN                  NaN   \n",
+       "137595             NaN                  NaN                  NaN   \n",
+       "137596             NaN                  NaN                  NaN   \n",
+       "137597             NaN                  NaN                  NaN   \n",
+       "\n",
+       "       RDelt_GYRO Z (deg/s)  LDelt_MilliVolts  ... RBicep_GYRO Z (deg/s)  \\\n",
+       "0                  9.358779         -0.006546  ...           -11.9541988   \n",
+       "1                 8.5419846         -0.006546  ...           -12.5343513   \n",
+       "2                 8.4503813         -0.007217  ...            -12.801527   \n",
+       "3                 7.6793895         -0.004196  ...           -12.7480917   \n",
+       "4                 6.7557254         -0.005203  ...           -11.1832066   \n",
+       "...                     ...               ...  ...                   ...   \n",
+       "137593                  NaN         -0.002182  ...                   NaN   \n",
+       "137594                  NaN         -0.001846  ...                   NaN   \n",
+       "137595                  NaN         -0.004196  ...                   NaN   \n",
+       "137596                  NaN         -0.005707  ...                   NaN   \n",
+       "137597                  NaN         -0.004196  ...                   NaN   \n",
+       "\n",
+       "       LBicep_MilliVolts LBicep_ACC X (G) LBicep_ACC Y (G) LBicep_ACC Z (G)  \\\n",
+       "0               0.041962        0.2507324        0.8808594        0.1972656   \n",
+       "1               0.041962        0.2453003        0.8790283        0.2055054   \n",
+       "2               0.041459        0.2486572         0.880188        0.2092896   \n",
+       "3               0.039780        0.2533569         0.880127        0.2134399   \n",
+       "4               0.041459        0.2590942        0.8770752        0.2124634   \n",
+       "...                  ...              ...              ...              ...   \n",
+       "137593          0.030716              NaN              NaN              NaN   \n",
+       "137594          0.035416              NaN              NaN              NaN   \n",
+       "137595          0.034745              NaN              NaN              NaN   \n",
+       "137596          0.035248              NaN              NaN              NaN   \n",
+       "137597          0.036591              NaN              NaN              NaN   \n",
+       "\n",
+       "       LBicep_GYRO X (deg/s)  LBicep_GYRO Y (deg/s) LBicep_GYRO Z (deg/s)  \\\n",
+       "0                 -30.801527              8.5572519            12.6870232   \n",
+       "1                 -29.038168              9.9007635            13.0305347   \n",
+       "2                -27.6641216              8.9694653             12.358779   \n",
+       "3                 -25.442749              7.6106873            11.1145039   \n",
+       "4                 -23.557251              6.0916033             9.7862597   \n",
+       "...                      ...                    ...                   ...   \n",
+       "137593                   NaN                    NaN                   NaN   \n",
+       "137594                   NaN                    NaN                   NaN   \n",
+       "137595                   NaN                    NaN                   NaN   \n",
+       "137596                   NaN                    NaN                   NaN   \n",
+       "137597                   NaN                    NaN                   NaN   \n",
+       "\n",
+       "       gender run_num  \n",
+       "0        male       2  \n",
+       "1        male       2  \n",
+       "2        male       2  \n",
+       "3        male       2  \n",
+       "4        male       2  \n",
+       "...       ...     ...  \n",
+       "137593   male       2  \n",
+       "137594   male       2  \n",
+       "137595   male       2  \n",
+       "137596   male       2  \n",
+       "137597   male       2  \n",
+       "\n",
+       "[137598 rows x 32 columns]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "    # Extract the Sensor Body Position\n",
+    "df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n",
+    "# Extract measurement type, including EMG millivolts\n",
+    "df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)')    # Drop the original sensor_measurement column\n",
+    "df_melted = df_melted.drop(columns=[\"sensor_measurement\"])    # Pivot the DataFrame so each measurement type becomes a separate column\n",
+    "df_melted[\"value\"] = df_melted[\"value\"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)\n",
+    "df_melted[\"value\"] = pd.to_numeric(df_melted[\"value\"], errors=\"coerce\") #make sure all values are cast to numeric\n",
+    "df_melted.fillna(np.nan, inplace=True)\n",
+    "# Pivot the DataFrame so each measurement type becomes a separate column\n",
+    "df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], \n",
+    "                                columns='measurement_type', values='value')\n",
+    "df_pivoted.columns = df_pivoted.columns.get_level_values(0)\n",
+    "df_pivoted.columns = df_pivoted.columns.str.strip()\n",
+    "df_pivoted = df_pivoted.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bf43181-6aa6-44ba-8fc1-44dc7bbeaed3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p3_exo = create_sensor_col(df_p3_exo)\n",
+    "df_p3_noexo = create_sensor_col(df_p3_noexo)\n",
+    "df_p4_exo = create_sensor_col(df_p4_exo)\n",
+    "df_p4_noexo = create_sensor_col(df_p4_noexo)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01bf5481-3d22-4143-8990-fec243ae013e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.\n",
+    "combined_df = pd.concat(dfs, ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6263007-da44-4d49-b437-0754e56a2def",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Show the head of the data\n",
+    "# df_p3_exo.describe()\n",
+    "df_p3_noexo.head()\n",
+    "# df_p4_exo.head()\n",
+    "# df_p4_noexo.head()\n",
+    "# # Choose inputs\n",
+    "# features = df_p3_exo[['EMG 1 (mV)', 'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']].dropna()\n",
+    "# features.head()\n",
+    "feature_sets = []\n",
+    "# Run functions to extract features for each dataframe\n",
+    "#CP: does this make sure to remove the redundant time series columns?\n",
+    "#can keep  ACC X Time Series (s) in each sensor group, and remove any other column with 'Time Series (s)' in its name \n",
+    "for df in dfs:\n",
+    "    emg_features = compute_emg_features(df['EMG 1 (mV)'])\n",
+    "    accel_features = compute_accel_features(df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)'])\n",
+    "    gyro_features = compute_gyro_features(df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)'])\n",
+    "    features = {\n",
+    "        'emg': emg_features,\n",
+    "        'accel': accel_features,\n",
+    "        'gyro': gyro_features\n",
+    "    }\n",
+    "    feature_sets.append(features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ccbbdce4-9c51-4b5c-b263-d23cc0c79154",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/dk.txt b/dk.txt
new file mode 100644
index 0000000..dfcadb6
--- /dev/null
+++ b/dk.txt
@@ -0,0 +1,57 @@
+import pandas as pd
+import numpy as np
+
+def emg_to_imu_asof_all_muscles(df, emg_time_col='EMG_TimeSeries', imu_time_col='IMU_TimeSeries'):
+    """
+    Pair every EMG sample of every muscle with the most recent IMU sample.
+
+    Muscles are discovered from columns whose name contains '_EMG_MilliVolts'.
+    For each muscle, the EMG samples are joined to that muscle's IMU columns
+    with pandas.merge_asof(direction='backward'), so each EMG row receives the
+    latest IMU row whose timestamp is <= the EMG timestamp.
+
+    Args:
+        df: wide DataFrame holding the two time columns plus
+            '<muscle>_EMG_MilliVolts' and '<muscle>_ACC ... / _GYRO ...' columns.
+        emg_time_col: name of the EMG timestamp column in df.
+        imu_time_col: name of the IMU timestamp column in df.
+
+    Returns:
+        DataFrame with columns Muscle, EMG_TimeSeries, EMG_MV, IMU_TimeSeries,
+        ACC X/Y/Z, GYRO X/Y/Z. IMU values are NaN where no earlier IMU sample
+        exists or the muscle has no such IMU column. When df has no
+        '*_EMG_MilliVolts' column an empty frame with these columns is returned.
+    """
+    cols = ['Muscle', 'EMG_TimeSeries', 'EMG_MV', 'IMU_TimeSeries',
+            'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z']
+    # Output IMU column name -> suffix of the per-muscle source column.
+    imu_channels = {
+        'ACC X': 'ACC X (G)',
+        'ACC Y': 'ACC Y (G)',
+        'ACC Z': 'ACC Z (G)',
+        'GYRO X': 'GYRO X (deg/s)',
+        'GYRO Y': 'GYRO Y (deg/s)',
+        'GYRO Z': 'GYRO Z (deg/s)',
+    }
+
+    muscle_names = [col.replace('_EMG_MilliVolts', '')
+                    for col in df.columns if '_EMG_MilliVolts' in col]
+    if not muscle_names:
+        # pd.concat([]) would raise; hand back an empty, correctly shaped frame.
+        return pd.DataFrame(columns=cols)
+
+    # The time axes are shared by all muscles, so parse them once. Casting to
+    # float keeps both merge keys on the same dtype, which merge_asof requires.
+    emg_time = pd.to_numeric(df[emg_time_col], errors='coerce').astype(float)
+    imu_time = pd.to_numeric(df[imu_time_col], errors='coerce').astype(float)
+
+    all_muscles = []
+    for muscle in muscle_names:
+        # Build EMG DataFrame for this muscle
+        emg_df = pd.DataFrame({
+            'EMG_TimeSeries': emg_time,
+            'EMG_MV': pd.to_numeric(df[f'{muscle}_EMG_MilliVolts'], errors='coerce')
+        }).dropna(subset=['EMG_TimeSeries', 'EMG_MV'])
+
+        # Build IMU DataFrame for this muscle; a missing column becomes all-NaN.
+        imu_data = {'IMU_TimeSeries': imu_time}
+        for out_name, suffix in imu_channels.items():
+            source = df.get(f'{muscle}_{suffix}')
+            imu_data[out_name] = np.nan if source is None else pd.to_numeric(source, errors='coerce')
+        # merge_asof rejects null merge keys, so rows without an IMU timestamp
+        # (e.g. NaN padding after the shorter IMU recording) must be dropped.
+        imu_df = pd.DataFrame(imu_data).dropna(subset=['IMU_TimeSeries'])
+
+        # Merge IMU onto EMG (backward: most recent IMU)
+        merged = pd.merge_asof(
+            emg_df.sort_values('EMG_TimeSeries'),
+            imu_df.sort_values('IMU_TimeSeries'),
+            left_on='EMG_TimeSeries',
+            right_on='IMU_TimeSeries',
+            direction='backward'
+        )
+
+        merged['Muscle'] = muscle
+        all_muscles.append(merged)
+
+    tidy = pd.concat(all_muscles, ignore_index=True)
+    # Order columns
+    return tidy[cols]
+
+# Usage example:
+# tidy_df = emg_to_imu_asof_all_muscles(df)
+# tidy_df.to_csv('emg_imu_tidy.csv', index=False)
\ No newline at end of file
diff --git a/feature_extraction.py b/feature_extraction.py
new file mode 100644
index 0000000..5eaa17b
--- /dev/null
+++ b/feature_extraction.py
@@ -0,0 +1,111 @@
+import pdb
+import pandas as pd
+import numpy as np
+# Calculations for Feature Extraction from Project_Guide
+
+def _magnitude_features(x, y, z, prefix):
+    """Summarise a 3-axis signal: peak/mean/range of its magnitude plus the total RMS."""
+    mag = np.sqrt(x**2 + y**2 + z**2)
+    peak = np.max(mag)
+    return {
+        f'{prefix}_peak': peak,
+        f'{prefix}_mean': np.mean(mag),
+        # Root of the summed per-axis mean squares (RMS of the vector).
+        f'{prefix}_total': np.sqrt(np.mean(x**2) + np.mean(y**2) + np.mean(z**2)),
+        f'{prefix}_range': peak - np.min(mag),
+    }
+
+
+def extract_features(df, group_cols=None):
+    """
+    Compute one row of summary features per group of the filtered sensor data.
+
+    Args:
+        df: DataFrame holding the grouping columns plus the '*_filtered'
+            accelerometer, gyroscope and EMG columns read below.
+        group_cols: columns to group by; defaults to
+            ['BodyPart', 'run_num', 'gender', 'exo'].
+
+    Returns:
+        DataFrame with the grouping columns followed by accel_*, gyro_* and
+        emg_* features. There is exactly one row per group, so the result is
+        small; windowing within groups would be needed to obtain more rows.
+    """
+    if group_cols is None:
+        group_cols = ['BodyPart', 'run_num', 'gender', 'exo']
+    group_cols = list(group_cols)
+
+    feature_rows = []
+    for group_vals, group in df.groupby(group_cols):
+        # Older pandas yields a bare scalar key when grouping by a single column.
+        if not isinstance(group_vals, tuple):
+            group_vals = (group_vals,)
+        feature_dict = dict(zip(group_cols, group_vals))
+
+        # Accelerometer features
+        feature_dict.update(_magnitude_features(
+            group['ACC X (G)_filtered'],
+            group['ACC Y (G)_filtered'],
+            group['ACC Z (G)_filtered'],
+            'accel',
+        ))
+
+        # Gyroscope features
+        feature_dict.update(_magnitude_features(
+            group['GYRO X (deg/s)_filtered'],
+            group['GYRO Y (deg/s)_filtered'],
+            group['GYRO Z (deg/s)_filtered'],
+            'gyro',
+        ))
+
+        # EMG features (filtered)
+        emg = group['EMG_MilliVolts_filtered']
+        feature_dict.update({
+            'emg_mean': np.mean(emg),
+            'emg_max': np.max(emg),
+            'emg_min': np.min(emg),
+            'emg_std': np.std(emg),
+            'emg_rms': np.sqrt(np.mean(emg**2)),
+        })
+        feature_rows.append(feature_dict)
+
+    # Return as a new DataFrame
+    return pd.DataFrame(feature_rows)
+
+# Older per-signal feature helpers.
+def compute_emg_features(df):
+    """Return mean, max, min, std and RMS of the 'EMG_MilliVolts' column of df as a dict."""
+    emg = df['EMG_MilliVolts']
+    reducers = (('mean', np.mean), ('max', np.max), ('min', np.min), ('std', np.std))
+    summary = {label: reduce_fn(emg) for label, reduce_fn in reducers}
+    # Root-mean-square amplitude of the signal.
+    summary['rms'] = np.sqrt(np.mean(emg**2))
+    return summary
+
+def compute_accel_features(df):
+    """
+    Summarise the accelerometer columns of df.
+
+    Reads 'ACC X (G)', 'ACC Y (G)' and 'ACC Z (G)' and returns a dict with the
+    peak, mean and range of the acceleration magnitude plus 'total_accel',
+    the root of the summed per-axis mean squares. df is not modified.
+    """
+    # Tuple assignment: the previous chained form `a = x, b = y, c = z` tried
+    # to unpack the Z column into (df['ACC X (G)'], a_y) and failed at runtime.
+    a_x, a_y, a_z = df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)']
+    a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)
+
+    features = {
+        'peak_accel': np.max(a_mag),
+        'mean_accel': np.mean(a_mag),
+        'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)),
+        'accel_range': np.max(a_mag) - np.min(a_mag)
+    }
+    return features
+
+def compute_gyro_features(df):
+    """
+    Summarise the gyroscope columns of df.
+
+    Reads 'GYRO X (deg/s)', 'GYRO Y (deg/s)' and 'GYRO Z (deg/s)' and returns a
+    dict with the peak, mean and range of the angular-velocity magnitude plus
+    'total_angular_vel', the root of the summed per-axis mean squares. df is
+    not modified.
+    """
+    # Tuple assignment: the previous chained form `a = x, b = y, c = z` tried
+    # to unpack the Z column into (df['GYRO X (deg/s)'], w_y) and failed at runtime.
+    w_x, w_y, w_z = df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)']
+    w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)
+
+    features = {
+        'peak_angular_vel': np.max(w_mag),
+        'mean_angular_vel': np.mean(w_mag),
+        'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)),
+        'angular_vel_range': np.max(w_mag) - np.min(w_mag)
+    }
+    return features
+    # fft_mean = mean(valid_freqs * valid_fft)
+    # fft_median = median(valid_freqs * valid_fft)
+    # fft_power = np.sum(valid_fft**2)
+
+    # feature_row = {
+    #     'emg_max': emg.max(),
+    #     'emg_min': emg.min(),
+    #     'emg_rms': np.sqrt(np.mean(emg**2)),
+    #     'acc_peak': np.linalg.norm(acc, axis=1).max(),
+    #     'acc_range': np.ptp(np.linalg.norm(acc, axis=1)),
+    #     'gyro_peak': np.linalg.norm(gyro, axis=1).max(),
+    #     'gyro_range': np.ptp(np.linalg.norm(gyro, axis=1)),
+    #     'emg_fft_mean_freq': fft_mean,
+    #     'emg_fft_median_freq': fft_median,
+    #     'emg_fft_power': fft_power,
+    #     'label': label,
+    #     'gender': gender
+    # }
\ No newline at end of file
diff --git a/filtering.py b/filtering.py
new file mode 100644
index 0000000..e75973f
--- /dev/null
+++ b/filtering.py
@@ -0,0 +1,23 @@
+from scipy.signal import filtfilt, butter
+import numpy as np
+import pandas as pd
+
+def bandpass_filter_emg(series_signal, fs=1259, lowcut=20, highcut=450, order=4):
+    """
+    Zero-phase Butterworth band-pass filter for an EMG signal.
+
+    Args:
+        series_signal: pandas Series or array-like of samples.
+        fs: sampling rate of the signal; lowcut/highcut use the same unit.
+        lowcut, highcut: pass-band edges, normalised by the Nyquist rate fs/2.
+        order: Butterworth filter order passed to scipy.signal.butter.
+
+    Returns:
+        numpy array of the same length as the input. An empty or all-NaN input
+        is returned unchanged. NaN samples stay NaN in the output while the
+        remaining samples are filtered as one contiguous run.
+    """
+    arr = series_signal.values if isinstance(series_signal, pd.Series) else np.array(series_signal)
+    if np.isnan(arr).all() or len(arr) == 0: #edge case check
+        return arr
+    nyq = 0.5 * fs
+    low = lowcut / nyq
+    high = highcut / nyq
+    b, a = butter(order, [low, high], btype='band')
+    valid = ~np.isnan(arr)
+    if valid.all() or arr.ndim != 1:
+        return filtfilt(b, a, arr)
+    # A single NaN would propagate through the recursive filter and turn the
+    # whole output into NaN, so filter only the valid samples and put the
+    # result back at their original positions.
+    filtered = np.full(arr.shape, np.nan, dtype=float)
+    filtered[valid] = filtfilt(b, a, arr[valid])
+    return filtered
+
+# IMU Low-pass Filter (<20Hz)
+def lowpass_filter_imu(series_signal, fs=148, cutoff=20, order=4):
+    """
+    Zero-phase Butterworth low-pass filter for an IMU signal.
+
+    Args:
+        series_signal: pandas Series or array-like of samples.
+        fs: sampling rate of the signal; cutoff uses the same unit.
+        cutoff: cut-off frequency, normalised by the Nyquist rate fs/2.
+        order: Butterworth filter order passed to scipy.signal.butter.
+
+    Returns:
+        numpy array of the same length as the input. An empty or all-NaN input
+        is returned unchanged. NaN samples stay NaN in the output while the
+        remaining samples are filtered as one contiguous run.
+    """
+    arr = series_signal.values if isinstance(series_signal, pd.Series) else np.array(series_signal)
+    if np.isnan(arr).all() or len(arr) == 0: #edge case check
+        return arr
+    nyq = 0.5 * fs
+    normal_cutoff = cutoff / nyq
+    b, a = butter(order, normal_cutoff, btype='low')
+    valid = ~np.isnan(arr)
+    if valid.all() or arr.ndim != 1:
+        return filtfilt(b, a, arr)
+    # A single NaN would propagate through the recursive filter and turn the
+    # whole output into NaN, so filter only the valid samples and put the
+    # result back at their original positions.
+    filtered = np.full(arr.shape, np.nan, dtype=float)
+    filtered[valid] = filtfilt(b, a, arr[valid])
+    return filtered
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..df28536
--- /dev/null
+++ b/main.py
@@ -0,0 +1,49 @@
+import pandas as pd
+import numpy as np
+from dataCleaning import read_run, column_clean, preprocessing_actions
+from dataCleaning import create_sensor_col
+from resampling import downsample
+from feature_extraction import extract_features
+from filtering import bandpass_filter_emg, lowpass_filter_imu
+import pdb 
+def overall_cleaning():
+    """
+    Load the four recorded runs, clean and filter them, and preprocess for ML.
+
+    Each CSV is read, column-cleaned, downsampled and melted into one row per
+    body-part sensor; the runs are concatenated, the IMU and EMG columns are
+    filtered per (BodyPart, run_num, gender, exo) group, and the combined
+    frame is handed to preprocessing_actions.
+
+    Returns:
+        The tuple returned by preprocessing_actions:
+        (X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline).
+    """
+    # (file name, run_num, gender, exo)
+    runs = [
+        ("P3_Exo_1_0.csv", 2, 'male', True),       # 2nd run, male
+        ("P3_NoExo_1_0.csv", 1, 'male', False),    # first run, male
+        ("P4_Exo_1_0.csv", 1, 'female', True),     # 1st run female
+        ("P4_NoExo_1_0.csv", 2, 'female', False),  # 2nd female
+    ]
+    dfs = []
+    for filename, run_num, gender, exo in runs:
+        run_df = column_clean(read_run(filename))
+        #downsample EMG to match IMU
+        run_df = downsample(run_df)
+        #melt sensor columns into a body part sensor
+        dfs.append(create_sensor_col(run_df, run_num=run_num, gender=gender, exo=exo))
+    combined_df = pd.concat(dfs, ignore_index=True)
+
+    #filter out IMU and EMG outliers using filters:
+    group_keys = ['BodyPart', 'run_num', 'gender', 'exo']
+    imu_cols = ['ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']
+    for col in imu_cols:
+        combined_df[col + '_filtered'] = combined_df.groupby(group_keys)[col].transform(lowpass_filter_imu)
+    # NOTE(review): EMG was downsampled above; confirm that the default
+    # sampling rate used by bandpass_filter_emg still matches the data here.
+    combined_df['EMG_MilliVolts_filtered'] = combined_df.groupby(group_keys)['EMG_MilliVolts'].transform(bandpass_filter_emg)
+
+    # The leftover pdb.set_trace() breakpoint was removed so the pipeline runs unattended.
+    features_df = extract_features(combined_df) #TO-DO FIX
+    #machine learning on combined_df
+    #change the next line to call on features_df instead of combined_df when extracting features is fixed to return more data
+    X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline = preprocessing_actions(combined_df)
+    #Return preprocessing_pipeline bc want to preprocess (scale, encode, etc.) any new or test data the same way as your training data.
+    return X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline
+
+def main():
+    """Script entry point: run the cleaning/preprocessing pipeline once."""
+    # overall_cleaning() returns the train/test splits and the fitted pipeline.
+    pipeline_outputs = overall_cleaning()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/modif_cols.py b/modif_cols.py
new file mode 100644
index 0000000..2556d16
--- /dev/null
+++ b/modif_cols.py
@@ -0,0 +1,25 @@
+import pandas as pd
+import numpy as np
+
+def tidy_emg_imu_as_measured(df):
+    """
+    Reshape a wide sensor frame into one row per (id columns, BodyPart).
+
+    Columns whose name contains RDelt, LDelt, RBicep or LBicep are melted, the
+    body-part prefix is split from the signal name, and the signals are
+    pivoted back into columns (pivot_table's default mean aggregation applies
+    to duplicate keys). All other columns act as identifiers.
+    """
+    # Partition the columns into sensor measurements and identifier columns.
+    sensor_columns, key_columns = [], []
+    for name in df.columns:
+        is_sensor = False
+        for tag in ['RDelt', 'LDelt', 'RBicep', 'LBicep']:
+            if tag in name:
+                is_sensor = True
+                break
+        (sensor_columns if is_sensor else key_columns).append(name)
+
+    # Long format: one row per original cell of a sensor column.
+    long_df = pd.melt(df, id_vars=key_columns, value_vars=sensor_columns,
+                      var_name='Measurement', value_name='Value')
+
+    # Split "<BodyPart>_<Signal>" into its two parts.
+    labels = long_df['Measurement']
+    long_df['BodyPart'] = labels.str.extract(r'^(RDelt|LDelt|RBicep|LBicep)', expand=False)
+    long_df['Signal'] = labels.str.replace(r'^(RDelt|LDelt|RBicep|LBicep)_', '', regex=True)
+
+    # Back to wide: each signal becomes its own column.
+    pivoted = long_df.pivot_table(index=key_columns + ['BodyPart'],
+                                  columns='Signal',
+                                  values='Value')
+    wide_df = pivoted.reset_index()
+    # Drop the 'Signal' axis label and make every column name a plain string.
+    wide_df.columns.name = None
+    wide_df.columns = [str(label) for label in wide_df.columns]
+    return wide_df
\ No newline at end of file
diff --git a/resampling.py b/resampling.py
new file mode 100644
index 0000000..fd5df69
--- /dev/null
+++ b/resampling.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import numpy as np
+import pdb 
+
+def upsample(df):
+    """Put the 24 IMU channels on a fixed, finer time grid and attach one EMG channel.
+
+    Rows are indexed by ``IMU_TimeSeries`` (interpreted as seconds), the IMU
+    columns are re-gridded at a constant period of 794117 ns and forward
+    filled, and ``RDelt_EMG_MilliVolts`` is left-joined on the resulting index.
+
+    Args:
+        df: DataFrame containing ``IMU_TimeSeries``, ``RDelt_EMG_MilliVolts``
+            and every ``<sensor>_<axis>`` IMU column for the sensors RDelt,
+            LDelt, RBicep and LBicep. The frame is not modified.
+
+    Returns:
+        DataFrame indexed by a timedelta index named ``time`` that holds the
+        24 IMU columns followed by ``RDelt_EMG_MilliVolts``.
+
+    Raises:
+        KeyError: if a required column is missing.
+        ValueError: if ``IMU_TimeSeries`` holds a value that cannot be parsed
+            as a number.
+    """
+    sensors = ('RDelt', 'LDelt', 'RBicep', 'LBicep')
+    axes = ('ACC X (G)', 'ACC Y (G)', 'ACC Z (G)',
+            'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)')
+    imu_cols = [f'{sensor}_{axis}' for sensor in sensors for axis in axes]
+
+    # Work on a copy so the caller's frame does not gain a 'time' column or a
+    # re-typed 'IMU_TimeSeries' column as a side effect.
+    df = df.copy()
+    df['IMU_TimeSeries'] = pd.to_numeric(df['IMU_TimeSeries'])
+    df['time'] = pd.to_timedelta(df['IMU_TimeSeries'], unit='s')
+    # Rows without an IMU timestamp cannot be placed on the time axis.
+    df = df.dropna(subset=['IMU_TimeSeries'])
+    df = df.set_index('time')
+
+    # Target sample period, truncated to whole nanoseconds.
+    # Presumably the EMG sample period (~1259 Hz) -- TODO confirm.
+    period_ns = int(0.0007941176470588235 * 1e9)
+    imu = df[imu_cols].resample(f'{period_ns}ns').asfreq()
+
+    # ``DataFrame.ffill`` replaces the deprecated ``fillna(method='ffill')``.
+    imu = imu.ffill()
+    imu = imu.apply(pd.to_numeric)
+    # NOTE(review): ``asfreq`` is essentially a reindex onto the new grid, and
+    # after the forward fill only leading NaNs can remain, which linear
+    # interpolation does not fill by default. The call below is therefore
+    # effectively a no-op; confirm whether interpolation *instead of* the
+    # forward fill was intended.
+    imu = imu.interpolate(method='linear')
+
+    return imu.join(df['RDelt_EMG_MilliVolts'])
+
+def downsample(df):
+    """Re-grid the EMG channels at 6.75 ms and trim the frame to the IMU extent.
+
+    Steps, in order: coerce every non-time column to numeric (unparseable
+    entries become NaN), index the rows by ``EMG_TimeSeries`` (seconds) as a
+    timedelta named ``time``, overwrite the four EMG columns with their values
+    on a 6.75 ms grid, fill the remaining gaps (linear interpolation, then
+    back/forward fill), and finally drop every row after the last one that
+    still has at least one IMU value.
+
+    Args:
+        df: DataFrame with ``EMG_TimeSeries``, the four
+            ``<sensor>_EMG_MilliVolts`` columns and the 24 ``<sensor>_<axis>``
+            IMU columns (sensors RDelt, LDelt, RBicep, LBicep). Column labels
+            must be strings. The frame is not modified.
+
+    Returns:
+        A new DataFrame with ``time`` as its first column, followed by the
+        input columns.
+
+    Raises:
+        KeyError: if a required column is missing.
+        ValueError: if ``EMG_TimeSeries`` contains a non-numeric value.
+    """
+    sensors = ('RDelt', 'LDelt', 'RBicep', 'LBicep')
+    axes = ('ACC X (G)', 'ACC Y (G)', 'ACC Z (G)',
+            'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)')
+    emg_cols = [f'{sensor}_EMG_MilliVolts' for sensor in sensors]
+    imu_cols = [f'{sensor}_{axis}' for sensor in sensors for axis in axes]
+
+    # Work on a copy so the numeric coercion and the extra 'time' column do
+    # not leak into the caller's frame.
+    df = df.copy()
+    df['EMG_TimeSeries'] = pd.to_numeric(df['EMG_TimeSeries'])
+    for col in df.columns:
+        # Time columns are left as they are; everything else must be numeric.
+        if "TimeSeries" in col:
+            continue
+        df[col] = pd.to_numeric(df[col], errors='coerce')
+
+    df['time'] = pd.to_timedelta(df['EMG_TimeSeries'], unit='s')
+    df = df.set_index('time')
+
+    # 6.75 ms is presumably the IMU sample period (~148 Hz) -- TODO confirm.
+    # NOTE(review): assigning the re-gridded frame back into ``df`` aligns on
+    # the 'time' index, so the row count does not change and only rows whose
+    # timestamp lies exactly on the 6.75 ms grid receive a value before the
+    # gaps are interpolated. Verify this is the intended down-sampling.
+    df[emg_cols] = df[emg_cols].resample('6.75ms').asfreq()
+    df[emg_cols] = df[emg_cols].interpolate(method='linear')
+    # ``bfill``/``ffill`` replace the deprecated ``fillna(method=...)`` calls
+    # and clear any NaNs left at either end.
+    df[emg_cols] = df[emg_cols].bfill().ffill()
+
+    # Last index label where at least one IMU column holds a real value;
+    # when there is none this is None and the slice keeps every row.
+    last_idx = df[imu_cols].last_valid_index()
+    df = df.loc[:last_idx]
+    return df.reset_index()
+
+## If downsample using pandas.resample doesn't work, use this alternative function that uses the rows index
+def alternative(df, high_rate=1259, low_rate=148):
+    """Decimate the four EMG channels by row position instead of by timestamp.
+
+    Every ``high_rate / low_rate``-th row (rounded to the nearest row number)
+    of the EMG columns is kept. The kept samples are packed into the first
+    rows of the output, the remaining EMG rows are set to NaN, and the
+    ``EMG_TimeSeries`` column is dropped. All other columns are untouched.
+
+    Args:
+        df: DataFrame containing ``EMG_TimeSeries`` and the four
+            ``<sensor>_EMG_MilliVolts`` columns (RDelt, LDelt, RBicep, LBicep).
+            Any row index is accepted; the frame is not modified.
+        high_rate: sampling rate of the EMG rows. Defaults to 1259.
+        low_rate: sampling rate to decimate to. Defaults to 148.
+
+    Returns:
+        A new DataFrame with the same number of rows as ``df`` and without
+        ``EMG_TimeSeries``.
+
+    Raises:
+        ValueError: if the rates do not satisfy ``0 < low_rate <= high_rate``.
+        KeyError: if a required column is missing.
+    """
+    if not 0 < low_rate <= high_rate:
+        raise ValueError("rates must satisfy 0 < low_rate <= high_rate")
+
+    emg_cols = ['RDelt_EMG_MilliVolts', 'LDelt_EMG_MilliVolts',
+                'RBicep_EMG_MilliVolts', 'LBicep_EMG_MilliVolts']
+    step = high_rate / low_rate
+    n_rows = len(df)
+
+    # Row positions to keep; rounding can land on n_rows itself, so clip.
+    picks = np.round(np.arange(0, n_rows, step)).astype(int)
+    picks = picks[picks < n_rows]
+
+    # Pack the kept samples at the top and pad with NaN to the full length.
+    packed = df[emg_cols].iloc[picks].reset_index(drop=True)
+    packed = packed.reindex(range(n_rows))
+
+    # Write back by position (plain array): label alignment would only work
+    # for a default 0..n-1 index and would blank the columns otherwise.
+    result = df.copy()
+    result[emg_cols] = packed.to_numpy()
+    return result.drop(columns=['EMG_TimeSeries'])