From 9f3f0544e75a558d6eaf0870549d702658152364 Mon Sep 17 00:00:00 2001
From: Cyrus Parvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Sun, 18 May 2025 15:51:39 -0700
Subject: [PATCH 01/12] Add create_sensor_col and call it on the combined dataframe

---
 FeatureExtraction.py | 7 ++++---
 dataCleaning.py      | 8 ++++++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/FeatureExtraction.py b/FeatureExtraction.py
index c27d629..de1624d 100644
--- a/FeatureExtraction.py
+++ b/FeatureExtraction.py
@@ -1,6 +1,6 @@
 import pandas as pd
 import numpy as np
-from dataCleaning import read_run, column_clean, preprocessing
+from dataCleaning import read_run, column_clean, preprocessing, create_sensor_col
 import pdb
 
 def overall_cleaning():
@@ -13,8 +13,9 @@ def overall_cleaning():
     df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male')
     df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female')
     df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female')
-    combined_df = pd.concat([df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo], ignore_index=True)
-    dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. 
+    dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.
+    combined_df = pd.concat(dfs, ignore_index=True) 
+    create_sensor_col(combined_df)
     # # Show the head of the data
     # df_p3_exo.describe()
     df_p3_noexo.head()
diff --git a/dataCleaning.py b/dataCleaning.py
index 087d7e8..1797251 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -54,5 +54,13 @@ def column_clean(df, run_num, gender):
     df.to_csv("test.csv")
     return df 
 
+def create_sensor_col(full_df): 
+    df_melted = full_df.melt(var_name="measurement_type", value_name="value")
+    # Extract the sensor location from the column names
+    df_melted["sensor_location"] = df_melted["measurement_type"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')
+    # Remove the sensor location prefix from the measurement type column
+    df_melted["measurement_type"] = df_melted["measurement_type"].str.replace(r'^(RDelt|LDelt|RBicep|LBicep)_', '', regex=True)
+    pdb.set_trace()
+    
 def preprocessing(full_df):
     pass

From 77c0266fc2bb9557e42970060cf32f04cddde890 Mon Sep 17 00:00:00 2001
From: Cyrus Parvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Mon, 19 May 2025 22:04:41 -0700
Subject: [PATCH 02/12] Rework create_sensor_col to melt and pivot each run before concatenation

---
 FeatureExtraction.py | 12 +++++++++---
 dataCleaning.py      | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/FeatureExtraction.py b/FeatureExtraction.py
index de1624d..914c85d 100644
--- a/FeatureExtraction.py
+++ b/FeatureExtraction.py
@@ -4,7 +4,7 @@
 import pdb
 
 def overall_cleaning():
-    df_p3_exo = read_run("P3_Exo_1_0.csv") # second run, male
+    df_p3_exo = read_run("P3_Exo_1_0.csv") # 2nd run, male
     df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male
     df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female
     df_p4_noexo = read_run("P4_NoExo_1_0.csv") # 2nd female
@@ -13,9 +13,15 @@ def overall_cleaning():
     df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male')
     df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female')
     df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female')
+
+    df_p3_exo = create_sensor_col(df_p3_exo)
+    df_p3_noexo = create_sensor_col(df_p3_noexo)
+    df_p4_exo = create_sensor_col(df_p4_exo)
+    df_p4_noexo = create_sensor_col(df_p4_noexo)
+
     dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.
-    combined_df = pd.concat(dfs, ignore_index=True) 
-    create_sensor_col(combined_df)
+    combined_df = pd.concat(dfs, ignore_index=True)
+    
     # # Show the head of the data
     # df_p3_exo.describe()
     df_p3_noexo.head()
diff --git a/dataCleaning.py b/dataCleaning.py
index 1797251..8c05090 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -31,7 +31,7 @@ def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fiel
                  'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)',   'RBicep_ACC Y (G)',  'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',
                  'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)',   'LBicep_ACC Y (G)',  'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'
                 ]
-    return df
+    return df #raw data 
 
 def column_clean(df, run_num, gender):
     #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU 
@@ -51,16 +51,36 @@ def column_clean(df, run_num, gender):
     df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data
     df['gender'] = gender
     df['run_num'] = run_num
-    df.to_csv("test.csv")
     return df 
 
-def create_sensor_col(full_df): 
-    df_melted = full_df.melt(var_name="measurement_type", value_name="value")
-    # Extract the sensor location from the column names
-    df_melted["sensor_location"] = df_melted["measurement_type"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')
-    # Remove the sensor location prefix from the measurement type column
-    df_melted["measurement_type"] = df_melted["measurement_type"].str.replace(r'^(RDelt|LDelt|RBicep|LBicep)_', '', regex=True)
+def create_sensor_col(df): 
+    columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']
     pdb.set_trace()
+    # Identify all measurement columns, including EMG millivolts
+    measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]
+    df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name="sensor_measurement", value_name="value")
     
+    # Extract the Sensor Body Position
+    df_melted["Sensor_Body_Position"] = df_melted["sensor_measurement"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')
+    # Extract measurement type, including EMG millivolts
+    df_melted["measurement_type"] = df_melted["sensor_measurement"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)')    # Drop the original sensor_measurement column
+    df_melted = df_melted.drop(columns=["sensor_measurement"])    # Pivot the DataFrame so each measurement type becomes a separate column
+    df_melted["value"] = df_melted["value"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)
+    df_melted["value"] = pd.to_numeric(df_melted["value"], errors="coerce") #make sure all values are cast to numeric
+    df_melted.fillna(np.nan, inplace=True)
+    # Pivot the DataFrame so each measurement type becomes a separate column
+    df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], 
+                                    columns='measurement_type', values='value')
+    df_pivoted.columns = df_pivoted.columns.get_level_values(0)
+    df_pivoted.columns = df_pivoted.columns.str.strip()
+    df_pivoted = df_pivoted.reset_index()
+    pdb.set_trace()
+    df_pivoted.to_csv("pivoted_df.csv")
+    return df_pivoted
+
+def standardize_time_series():
+    # interpolate()
+    pass
+
 def preprocessing(full_df):
     pass

From 945f380d3f06c4e59f7cff01181e0fbbcc40c7bf Mon Sep 17 00:00:00 2001
From: cyrusParvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Mon, 19 May 2025 23:43:32 -0700
Subject: [PATCH 03/12] WIP: add modif_cols.tidy_emg_imu_as_measured; cleaning still incomplete

---
 FeatureExtraction.py | 16 ++++-----
 dataCleaning.py      | 80 +++++++++++++++++++++++++++++---------------
 modif_cols.py        | 39 +++++++++++++++++++++
 3 files changed, 100 insertions(+), 35 deletions(-)
 create mode 100644 modif_cols.py

diff --git a/FeatureExtraction.py b/FeatureExtraction.py
index 914c85d..1fba79e 100644
--- a/FeatureExtraction.py
+++ b/FeatureExtraction.py
@@ -9,15 +9,15 @@ def overall_cleaning():
     df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female
     df_p4_noexo = read_run("P4_NoExo_1_0.csv") # 2nd female
 
-    df_p3_exo = column_clean(df_p3_exo, run_num = 2, gender = 'male')
-    df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male')
-    df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female')
-    df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female')
+    df_p3_exo = column_clean(df_p3_exo)
+    df_p3_noexo = column_clean(df_p3_noexo)
+    df_p4_exo = column_clean(df_p4_exo)
+    df_p4_noexo = column_clean(df_p4_noexo)
 
-    df_p3_exo = create_sensor_col(df_p3_exo)
-    df_p3_noexo = create_sensor_col(df_p3_noexo)
-    df_p4_exo = create_sensor_col(df_p4_exo)
-    df_p4_noexo = create_sensor_col(df_p4_noexo)
+    df_p3_exo = create_sensor_col(df_p3_exo, run_num = 2, gender = 'male', exo=True)
+    df_p3_noexo = create_sensor_col(df_p3_noexo, run_num = 1, gender = 'male', exo=False)
+    df_p4_exo = create_sensor_col(df_p4_exo, run_num = 1, gender = 'female', exo=True)
+    df_p4_noexo = create_sensor_col(df_p4_noexo, run_num = 2, gender = 'female', exo=False)
 
     dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.
     combined_df = pd.concat(dfs, ignore_index=True)
diff --git a/dataCleaning.py b/dataCleaning.py
index 8c05090..74407d6 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np 
 import pdb 
-
+from modif_cols import tidy_emg_imu_as_measured 
 # Data Labels:
 # Label for EMG Data shared:
 #     Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata)
@@ -27,13 +27,13 @@ def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fiel
                      usecols = usecols,
                      on_bad_lines='skip') 
     df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)',   'RDelt_ACC Y (G)',  'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)',
-                 'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)',   'LDelt_ACC Y (G)',  'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',
-                 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)',   'RBicep_ACC Y (G)',  'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',
-                 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)',   'LBicep_ACC Y (G)',  'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'
+                 'LDelt_TimeSeries', 'LDelt_EMG_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)',   'LDelt_ACC Y (G)',  'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',
+                 'RBicep_TimeSeries', 'RBicep_EMG_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)',   'RBicep_ACC Y (G)',  'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',
+                 'LBicep_TimeSeries', 'LBicep_EMG_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)',   'LBicep_ACC Y (G)',  'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'
                 ]
     return df #raw data 
 
-def column_clean(df, run_num, gender):
+def column_clean(df):
     #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU 
     extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)',
                          'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)', 
@@ -45,39 +45,65 @@ def column_clean(df, run_num, gender):
                          'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)']
     
     df = df.drop(extr_time_series, axis = 1)
+    df = df.rename(columns={'RDelt_EMG_TimeSeries': 'EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)': 'IMU_TimeSeries'})
     # measurement_cols = [col for col in df.columns if (('ACC' in col or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols
     # df.columns = df.columns.str.strip()           # Remove leading/trailing spaces (Yuxuan)
     # df = df.apply(pd.to_numeric, errors='coerce') # Conver  t everything to numeric (Yuxuan)
     df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data
-    df['gender'] = gender
-    df['run_num'] = run_num
     return df 
 
-def create_sensor_col(df): 
-    columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']
-    pdb.set_trace()
-    # Identify all measurement columns, including EMG millivolts
-    measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]
-    df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name="sensor_measurement", value_name="value")
-    
-    # Extract the Sensor Body Position
-    df_melted["Sensor_Body_Position"] = df_melted["sensor_measurement"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')
-    # Extract measurement type, including EMG millivolts
-    df_melted["measurement_type"] = df_melted["sensor_measurement"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)')    # Drop the original sensor_measurement column
-    df_melted = df_melted.drop(columns=["sensor_measurement"])    # Pivot the DataFrame so each measurement type becomes a separate column
-    df_melted["value"] = df_melted["value"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)
-    df_melted["value"] = pd.to_numeric(df_melted["value"], errors="coerce") #make sure all values are cast to numeric
-    df_melted.fillna(np.nan, inplace=True)
-    # Pivot the DataFrame so each measurement type becomes a separate column
-    df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], 
-                                    columns='measurement_type', values='value')
-    df_pivoted.columns = df_pivoted.columns.get_level_values(0)
+def create_sensor_col(df, run_num, gender, exo): 
+    # muscles = ['RDelt', 'LDelt', 'RBicep', 'LBicep']
+    # # Identify all measurement columns for melting
+    # measurement_cols = [col for col in df.columns if any(muscle in col for muscle in muscles)]
+    # # Melt all measurement columns (EMG, ACC, GYRO)
+    # df_long = df.melt(
+    #     id_vars=['EMG_TimeSeries', 'IMU_TimeSeries'],
+    #     value_vars=measurement_cols,
+    #     var_name='sensor_measurement',
+    #     value_name='value'
+    # )
+    # # Extract Muscle, Sensor, and Axis from the column name
+    # df_long[['Muscle', 'Sensor', 'Axis']] = df_long['sensor_measurement'].str.extract(
+    #     r'^(RDelt|LDelt|RBicep|LBicep)_(EMG|ACC|GYRO)[ _]?(X|Y|Z)?'
+    # )
+    # # Build measurement type column for pivoting
+    # df_long['Measurement'] = np.where(
+    #     df_long['Sensor'] == 'EMG',
+    #     'EMG_MV',
+    #     df_long['Sensor'] + ' ' + df_long['Axis']
+    # )
+    # pdb.set_trace()
+
+    # # Pivot so each row is a Muscle-Timepoint, columns are measurement types
+    # tidy = df_long.pivot_table(
+    #     index=['EMG_TimeSeries', 'IMU_TimeSeries', 'Muscle'],
+    #     columns='Measurement',
+    #     values='value',
+    #     aggfunc='first'
+    # ).reset_index()
+    # # Flatten columns so pivot table multiindexes don't persist
+    # tidy.columns.name = None
+    # tidy = tidy.rename_axis(None, axis=1)
+    # #  sort columns
+    # columns_order = ['EMG_TimeSeries', 'IMU_TimeSeries', 'EMG_MV',
+    #                  'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z', 'Muscle']
+    # # Only include columns that actually exist in the data
+    # columns_order = [col for col in columns_order if col in tidy.columns]
+
+    # df_pivoted = tidy[columns_order]
+    df_pivoted = tidy_emg_imu_as_measured(df)
     df_pivoted.columns = df_pivoted.columns.str.strip()
-    df_pivoted = df_pivoted.reset_index()
     pdb.set_trace()
+    df_pivoted = df_pivoted.reset_index()
+
+    df_pivoted['gender'] = gender
+    df_pivoted['run_num'] = run_num
+    df_pivoted['exo'] = exo
     df_pivoted.to_csv("pivoted_df.csv")
     return df_pivoted
 
+
 def standardize_time_series():
     # interpolate()
     pass
diff --git a/modif_cols.py b/modif_cols.py
new file mode 100644
index 0000000..7008e02
--- /dev/null
+++ b/modif_cols.py
@@ -0,0 +1,39 @@
+import pandas as pd
+import numpy as np
+
+def tidy_emg_imu_as_measured(df):
+    muscles = ['RDelt', 'LDelt', 'RBicep', 'LBicep']
+    all_muscle_tables = []
+    for muscle in muscles:
+        # EMG rows (as measured)
+        emg_df = pd.DataFrame({
+            'Muscle': muscle,
+            'EMG_TimeSeries': pd.to_numeric(df['EMG_TimeSeries'], errors='coerce'),
+            'IMU_TimeSeries': np.nan,
+            'EMG_MV': pd.to_numeric(df[f'{muscle}_EMG_MilliVolts'], errors='coerce'),
+            'ACC X': np.nan, 'ACC Y': np.nan, 'ACC Z': np.nan,
+            'GYRO X': np.nan, 'GYRO Y': np.nan, 'GYRO Z': np.nan
+        })
+        # IMU rows (as measured)
+        imu_df = pd.DataFrame({
+            'Muscle': muscle,
+            'EMG_TimeSeries': np.nan,
+            'IMU_TimeSeries': pd.to_numeric(df['IMU_TimeSeries'], errors='coerce'),
+            'EMG_MV': np.nan,
+            'ACC X': pd.to_numeric(df[f'{muscle}_ACC X (G)'], errors='coerce'),
+            'ACC Y': pd.to_numeric(df[f'{muscle}_ACC Y (G)'], errors='coerce'),
+            'ACC Z': pd.to_numeric(df[f'{muscle}_ACC Z (G)'], errors='coerce'),
+            'GYRO X': pd.to_numeric(df[f'{muscle}_GYRO X (deg/s)'], errors='coerce'),
+            'GYRO Y': pd.to_numeric(df[f'{muscle}_GYRO Y (deg/s)'], errors='coerce'),
+            'GYRO Z': pd.to_numeric(df[f'{muscle}_GYRO Z (deg/s)'], errors='coerce')
+        })
+        all_muscle_tables.append(pd.concat([emg_df, imu_df], ignore_index=True))
+    tidy = pd.concat(all_muscle_tables, ignore_index=True)
+    # Order and sort (optional)
+    columns_order = ['Muscle', 'EMG_TimeSeries', 'IMU_TimeSeries', 'EMG_MV',
+                     'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z']
+    tidy = tidy[columns_order]
+    tidy['SortTime'] = tidy['EMG_TimeSeries'].combine_first(tidy['IMU_TimeSeries'])
+    tidy = tidy.sort_values(['Muscle', 'SortTime']).drop(columns=['SortTime'])
+    df_pivoted_sorted = tidy.sort_values('EMG_TimeSeries', na_position='last')
+    return df_pivoted_sorted
\ No newline at end of file

From 218cefe2013285fcbe5084cb1aecbc7d2deb3a0f Mon Sep 17 00:00:00 2001
From: Cyrus Parvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Wed, 21 May 2025 21:55:16 -0700
Subject: [PATCH 04/12] made jupyter notebook

---
 debug.ipynb | 952 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 952 insertions(+)
 create mode 100644 debug.ipynb

diff --git a/debug.ipynb b/debug.ipynb
new file mode 100644
index 0000000..c6ad330
--- /dev/null
+++ b/debug.ipynb
@@ -0,0 +1,952 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e0bebc80-fe7f-4d6c-8387-5c512308e48d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np \n",
+    "from dataCleaning import read_run, column_clean, preprocessing, create_sensor_col\n",
+    "import pdb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "3bd13a4f-893e-43c3-bc4a-afabc40bbcde",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fields as well as metadata)\n",
+    "    usecols = list(range(0, 56)) \n",
+    "    df = pd.read_csv(filename, low_memory = False, \n",
+    "                     header = 0,  \n",
+    "                     skiprows=skiprows,\n",
+    "                    #  names=header,\n",
+    "                     usecols = usecols,\n",
+    "                     on_bad_lines='skip') \n",
+    "    df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)',   'RDelt_ACC Y (G)',  'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)',\n",
+    "                 'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)',   'LDelt_ACC Y (G)',  'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',\n",
+    "                 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)',   'RBicep_ACC Y (G)',  'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',\n",
+    "                 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)',   'LBicep_ACC Y (G)',  'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'\n",
+    "                ]\n",
+    "    return df #raw data "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "07d4e282-5741-449e-a2a8-71f3b3a6c66d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def column_clean(df, run_num, gender):\n",
+    "    #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU \n",
+    "    extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)',\n",
+    "                         'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)', \n",
+    "                         'LDelt_Acc Y Time Series(s)', 'LDelt_Acc Z Time Series(s)', 'LDelt_GyroXTime Series(s)',\n",
+    "                         'LDelt_GyroYTime Series(s)', 'LDelt_GyroZTime Series(s)', 'RBicep_TimeSeries', 'RBicep_Acc X Time Series(s)',\n",
+    "                         'RBicep_Acc Y Time Series(s)', 'RBicep_Acc Z Time Series(s)', 'RBicep_GyroXTime Series(s)',\n",
+    "                         'RBicep_GyroYTime Series(s)', 'RBicep_GyroZTime Series(s)', 'LBicep_TimeSeries', 'LBicep_Acc X Time Series(s)',\n",
+    "                         'LBicep_Acc Y Time Series(s)', 'LBicep_Acc Z Time Series(s)', 'LBicep_GyroXTime Series(s)', \n",
+    "                         'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)']\n",
+    "    \n",
+    "    df = df.drop(extr_time_series, axis = 1)\n",
+    "    # measurement_cols = [col for col in df.columns if (('ACC' in col or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols\n",
+    "    # df.columns = df.columns.str.strip()           # Remove leading/trailing spaces (Yuxuan)\n",
+    "    # df = df.apply(pd.to_numeric, errors='coerce') # Conver  t everything to numeric (Yuxuan)\n",
+    "    df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data\n",
+    "    df['gender'] = gender\n",
+    "    df['run_num'] = run_num\n",
+    "    return df "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c74acdb7-01e3-443f-92b0-6d2a6e45696c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_sensor_col(df): \n",
+    "    columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']\n",
+    "    pdb.set_trace()\n",
+    "    # Identify all measurement columns, including EMG millivolts\n",
+    "    measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]\n",
+    "    df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name=\"sensor_measurement\", value_name=\"value\")\n",
+    "    \n",
+    "    # Extract the Sensor Body Position\n",
+    "    df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n",
+    "    # Extract measurement type, including EMG millivolts\n",
+    "    df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)')    # Drop the original sensor_measurement column\n",
+    "    df_melted = df_melted.drop(columns=[\"sensor_measurement\"])    # Pivot the DataFrame so each measurement type becomes a separate column\n",
+    "    df_melted[\"value\"] = df_melted[\"value\"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)\n",
+    "    df_melted[\"value\"] = pd.to_numeric(df_melted[\"value\"], errors=\"coerce\") #make sure all values are cast to numeric\n",
+    "    df_melted.fillna(np.nan, inplace=True)\n",
+    "    # Pivot the DataFrame so each measurement type becomes a separate column\n",
+    "    df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], \n",
+    "                                    columns='measurement_type', values='value')\n",
+    "    df_pivoted.columns = df_pivoted.columns.get_level_values(0)\n",
+    "    df_pivoted.columns = df_pivoted.columns.str.strip()\n",
+    "    df_pivoted = df_pivoted.reset_index()\n",
+    "    pdb.set_trace()\n",
+    "    df_pivoted.to_csv(\"pivoted_df.csv\")\n",
+    "    return df_pivoted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "2a446022-09eb-4f86-a77a-980490d55b0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def standardize_time_series():\n",
+    "    # interpolate()\n",
+    "    pass\n",
+    "\n",
+    "def preprocessing(full_df):\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fc81654-185e-4738-919a-57afa144341e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "dab168eb-f1dc-4d7e-99fe-3ec1042db065",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculations for Feature Extraction from Project_Guide\n",
+    "def compute_emg_features(signal):\n",
+    "    return {\n",
+    "        'mean': np.mean(signal),\n",
+    "        'max': np.max(signal),\n",
+    "        'min': np.min(signal),\n",
+    "        'std': np.std(signal),\n",
+    "        'rms': np.sqrt(np.mean(signal**2))\n",
+    "    }\n",
+    "\n",
+    "def compute_accel_features(a_x, a_y, a_z):\n",
+    "    a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)\n",
+    "    \n",
+    "    features = {\n",
+    "        'peak_accel': np.max(a_mag),\n",
+    "        'mean_accel': np.mean(a_mag),\n",
+    "        'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)),\n",
+    "        'accel_range': np.max(a_mag) - np.min(a_mag)\n",
+    "    }\n",
+    "    return features\n",
+    "\n",
+    "def compute_gyro_features(w_x, w_y, w_z):\n",
+    "    w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)\n",
+    "    \n",
+    "    features = {\n",
+    "        'peak_angular_vel': np.max(w_mag),\n",
+    "        'mean_angular_vel': np.mean(w_mag),\n",
+    "        'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)),\n",
+    "        'angular_vel_range': np.max(w_mag) - np.min(w_mag)\n",
+    "    }\n",
+    "    return features    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72ed77bf-dbc1-406c-a5b1-57dcce504502",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "P3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d8305190-17ac-43fd-b7d9-d9a90e329533",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p3_exo = read_run(\"P3_Exo_1_0.csv\") # 2nd run, male"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ab22fd91-5089-4c1e-91e5-f2fedb609f69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p3_noexo = read_run(\"P3_NoExo_1_0.csv\") # first run, male"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "2b503e14-fc38-46c2-9a10-3d7ea8ade855",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p4_exo = read_run(\"P4_Exo_1_0.csv\") # 1st run female\n",
+    "df_p4_noexo = read_run(\"P4_NoExo_1_0.csv\") # 2nd female"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ad941e26-6be4-4ae5-8d57-f148509e1675",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p3_exo = column_clean(df_p3_exo, run_num = 2, gender = 'male')\n",
+    "df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male')\n",
+    "df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female')\n",
+    "df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "0372fab5-cc33-4ec4-a878-a20207b8b542",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>RDelt_EMG_TimeSeries</th>\n",
+       "      <th>RDelt_IMU_Acc X Time Series(s)</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>run_num</th>\n",
+       "      <th>sensor_measurement</th>\n",
+       "      <th>value</th>\n",
+       "      <th>Sensor_Body_Position</th>\n",
+       "      <th>measurement_type</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.004868</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.000794</td>\n",
+       "      <td>0.00675</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.005875</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.001588</td>\n",
+       "      <td>0.0135</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.005203</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.002382</td>\n",
+       "      <td>0.02025</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.005539</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.003177</td>\n",
+       "      <td>0.027</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>RDelt_EMG_MilliVolts</td>\n",
+       "      <td>0.007721</td>\n",
+       "      <td>RDelt</td>\n",
+       "      <td>EMG_MilliVolts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852739</th>\n",
+       "      <td>109.265029</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852740</th>\n",
+       "      <td>109.265823</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852741</th>\n",
+       "      <td>109.266618</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852742</th>\n",
+       "      <td>109.267412</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3852743</th>\n",
+       "      <td>109.268206</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "      <td>LBicep_GYRO Z (deg/s)</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>LBicep</td>\n",
+       "      <td>GYRO Z</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3852744 rows × 8 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         RDelt_EMG_TimeSeries RDelt_IMU_Acc X Time Series(s) gender  run_num  \\\n",
+       "0                    0.000000                              0   male        2   \n",
+       "1                    0.000794                        0.00675   male        2   \n",
+       "2                    0.001588                         0.0135   male        2   \n",
+       "3                    0.002382                        0.02025   male        2   \n",
+       "4                    0.003177                          0.027   male        2   \n",
+       "...                       ...                            ...    ...      ...   \n",
+       "3852739            109.265029                            NaN   male        2   \n",
+       "3852740            109.265823                            NaN   male        2   \n",
+       "3852741            109.266618                            NaN   male        2   \n",
+       "3852742            109.267412                            NaN   male        2   \n",
+       "3852743            109.268206                            NaN   male        2   \n",
+       "\n",
+       "            sensor_measurement     value Sensor_Body_Position measurement_type  \n",
+       "0         RDelt_EMG_MilliVolts  0.004868                RDelt   EMG_MilliVolts  \n",
+       "1         RDelt_EMG_MilliVolts  0.005875                RDelt   EMG_MilliVolts  \n",
+       "2         RDelt_EMG_MilliVolts  0.005203                RDelt   EMG_MilliVolts  \n",
+       "3         RDelt_EMG_MilliVolts  0.005539                RDelt   EMG_MilliVolts  \n",
+       "4         RDelt_EMG_MilliVolts  0.007721                RDelt   EMG_MilliVolts  \n",
+       "...                        ...       ...                  ...              ...  \n",
+       "3852739  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "3852740  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "3852741  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "3852742  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "3852743  LBicep_GYRO Z (deg/s)       NaN               LBicep           GYRO Z  \n",
+       "\n",
+       "[3852744 rows x 8 columns]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = df_p3_exo\n",
+    "columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']\n",
+    "measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]\n",
+    "df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name=\"sensor_measurement\", value_name=\"value\")\n",
+    "    # Extract the Sensor Body Position\n",
+    "df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n",
+    "# Extract measurement type, including EMG millivolts\n",
+    "df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') \n",
+    "df_melted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9aea9bd-e997-4330-9116-e4d9c6db90a5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>RDelt_EMG_TimeSeries</th>\n",
+       "      <th>RDelt_EMG_MilliVolts</th>\n",
+       "      <th>RDelt_IMU_Acc X Time Series(s)</th>\n",
+       "      <th>RDelt_ACC X (G)</th>\n",
+       "      <th>RDelt_ACC Y (G)</th>\n",
+       "      <th>RDelt_ACC Z (G)</th>\n",
+       "      <th>RDelt_GYRO X (deg/s)</th>\n",
+       "      <th>RDelt_GYRO Y (deg/s)</th>\n",
+       "      <th>RDelt_GYRO Z (deg/s)</th>\n",
+       "      <th>LDelt_MilliVolts</th>\n",
+       "      <th>...</th>\n",
+       "      <th>RBicep_GYRO Z (deg/s)</th>\n",
+       "      <th>LBicep_MilliVolts</th>\n",
+       "      <th>LBicep_ACC X (G)</th>\n",
+       "      <th>LBicep_ACC Y (G)</th>\n",
+       "      <th>LBicep_ACC Z (G)</th>\n",
+       "      <th>LBicep_GYRO X (deg/s)</th>\n",
+       "      <th>LBicep_GYRO Y (deg/s)</th>\n",
+       "      <th>LBicep_GYRO Z (deg/s)</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>run_num</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.004868</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0747681</td>\n",
+       "      <td>0.9061279</td>\n",
+       "      <td>0.2548828</td>\n",
+       "      <td>-30.7404575</td>\n",
+       "      <td>-4.2519083</td>\n",
+       "      <td>9.358779</td>\n",
+       "      <td>-0.006546</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-11.9541988</td>\n",
+       "      <td>0.041962</td>\n",
+       "      <td>0.2507324</td>\n",
+       "      <td>0.8808594</td>\n",
+       "      <td>0.1972656</td>\n",
+       "      <td>-30.801527</td>\n",
+       "      <td>8.5572519</td>\n",
+       "      <td>12.6870232</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.000794</td>\n",
+       "      <td>0.005875</td>\n",
+       "      <td>0.00675</td>\n",
+       "      <td>0.0795288</td>\n",
+       "      <td>0.913208</td>\n",
+       "      <td>0.2689209</td>\n",
+       "      <td>-30.7786255</td>\n",
+       "      <td>-5.961832</td>\n",
+       "      <td>8.5419846</td>\n",
+       "      <td>-0.006546</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-12.5343513</td>\n",
+       "      <td>0.041962</td>\n",
+       "      <td>0.2453003</td>\n",
+       "      <td>0.8790283</td>\n",
+       "      <td>0.2055054</td>\n",
+       "      <td>-29.038168</td>\n",
+       "      <td>9.9007635</td>\n",
+       "      <td>13.0305347</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.001588</td>\n",
+       "      <td>0.005203</td>\n",
+       "      <td>0.0135</td>\n",
+       "      <td>0.0804443</td>\n",
+       "      <td>0.9194336</td>\n",
+       "      <td>0.2719116</td>\n",
+       "      <td>-29.9312973</td>\n",
+       "      <td>-6.8015265</td>\n",
+       "      <td>8.4503813</td>\n",
+       "      <td>-0.007217</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-12.801527</td>\n",
+       "      <td>0.041459</td>\n",
+       "      <td>0.2486572</td>\n",
+       "      <td>0.880188</td>\n",
+       "      <td>0.2092896</td>\n",
+       "      <td>-27.6641216</td>\n",
+       "      <td>8.9694653</td>\n",
+       "      <td>12.358779</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.002382</td>\n",
+       "      <td>0.005539</td>\n",
+       "      <td>0.02025</td>\n",
+       "      <td>0.0809326</td>\n",
+       "      <td>0.9316406</td>\n",
+       "      <td>0.2680054</td>\n",
+       "      <td>-29.1068707</td>\n",
+       "      <td>-6.8854961</td>\n",
+       "      <td>7.6793895</td>\n",
+       "      <td>-0.004196</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-12.7480917</td>\n",
+       "      <td>0.039780</td>\n",
+       "      <td>0.2533569</td>\n",
+       "      <td>0.880127</td>\n",
+       "      <td>0.2134399</td>\n",
+       "      <td>-25.442749</td>\n",
+       "      <td>7.6106873</td>\n",
+       "      <td>11.1145039</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.003177</td>\n",
+       "      <td>0.007721</td>\n",
+       "      <td>0.027</td>\n",
+       "      <td>0.0866699</td>\n",
+       "      <td>0.9319458</td>\n",
+       "      <td>0.2663574</td>\n",
+       "      <td>-29.3129768</td>\n",
+       "      <td>-7.4045801</td>\n",
+       "      <td>6.7557254</td>\n",
+       "      <td>-0.005203</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-11.1832066</td>\n",
+       "      <td>0.041459</td>\n",
+       "      <td>0.2590942</td>\n",
+       "      <td>0.8770752</td>\n",
+       "      <td>0.2124634</td>\n",
+       "      <td>-23.557251</td>\n",
+       "      <td>6.0916033</td>\n",
+       "      <td>9.7862597</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137593</th>\n",
+       "      <td>109.265029</td>\n",
+       "      <td>0.017960</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.002182</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.030716</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137594</th>\n",
+       "      <td>109.265823</td>\n",
+       "      <td>0.019974</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.001846</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.035416</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137595</th>\n",
+       "      <td>109.266618</td>\n",
+       "      <td>0.020981</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.004196</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.034745</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137596</th>\n",
+       "      <td>109.267412</td>\n",
+       "      <td>0.018631</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.005707</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.035248</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137597</th>\n",
+       "      <td>109.268206</td>\n",
+       "      <td>0.019974</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-0.004196</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.036591</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>male</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>137598 rows × 32 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        RDelt_EMG_TimeSeries  RDelt_EMG_MilliVolts  \\\n",
+       "0                   0.000000              0.004868   \n",
+       "1                   0.000794              0.005875   \n",
+       "2                   0.001588              0.005203   \n",
+       "3                   0.002382              0.005539   \n",
+       "4                   0.003177              0.007721   \n",
+       "...                      ...                   ...   \n",
+       "137593            109.265029              0.017960   \n",
+       "137594            109.265823              0.019974   \n",
+       "137595            109.266618              0.020981   \n",
+       "137596            109.267412              0.018631   \n",
+       "137597            109.268206              0.019974   \n",
+       "\n",
+       "       RDelt_IMU_Acc X Time Series(s) RDelt_ACC X (G) RDelt_ACC Y (G)  \\\n",
+       "0                                   0       0.0747681       0.9061279   \n",
+       "1                             0.00675       0.0795288        0.913208   \n",
+       "2                              0.0135       0.0804443       0.9194336   \n",
+       "3                             0.02025       0.0809326       0.9316406   \n",
+       "4                               0.027       0.0866699       0.9319458   \n",
+       "...                               ...             ...             ...   \n",
+       "137593                            NaN             NaN             NaN   \n",
+       "137594                            NaN             NaN             NaN   \n",
+       "137595                            NaN             NaN             NaN   \n",
+       "137596                            NaN             NaN             NaN   \n",
+       "137597                            NaN             NaN             NaN   \n",
+       "\n",
+       "       RDelt_ACC Z (G) RDelt_GYRO X (deg/s) RDelt_GYRO Y (deg/s)  \\\n",
+       "0            0.2548828          -30.7404575           -4.2519083   \n",
+       "1            0.2689209          -30.7786255            -5.961832   \n",
+       "2            0.2719116          -29.9312973           -6.8015265   \n",
+       "3            0.2680054          -29.1068707           -6.8854961   \n",
+       "4            0.2663574          -29.3129768           -7.4045801   \n",
+       "...                ...                  ...                  ...   \n",
+       "137593             NaN                  NaN                  NaN   \n",
+       "137594             NaN                  NaN                  NaN   \n",
+       "137595             NaN                  NaN                  NaN   \n",
+       "137596             NaN                  NaN                  NaN   \n",
+       "137597             NaN                  NaN                  NaN   \n",
+       "\n",
+       "       RDelt_GYRO Z (deg/s)  LDelt_MilliVolts  ... RBicep_GYRO Z (deg/s)  \\\n",
+       "0                  9.358779         -0.006546  ...           -11.9541988   \n",
+       "1                 8.5419846         -0.006546  ...           -12.5343513   \n",
+       "2                 8.4503813         -0.007217  ...            -12.801527   \n",
+       "3                 7.6793895         -0.004196  ...           -12.7480917   \n",
+       "4                 6.7557254         -0.005203  ...           -11.1832066   \n",
+       "...                     ...               ...  ...                   ...   \n",
+       "137593                  NaN         -0.002182  ...                   NaN   \n",
+       "137594                  NaN         -0.001846  ...                   NaN   \n",
+       "137595                  NaN         -0.004196  ...                   NaN   \n",
+       "137596                  NaN         -0.005707  ...                   NaN   \n",
+       "137597                  NaN         -0.004196  ...                   NaN   \n",
+       "\n",
+       "       LBicep_MilliVolts LBicep_ACC X (G) LBicep_ACC Y (G) LBicep_ACC Z (G)  \\\n",
+       "0               0.041962        0.2507324        0.8808594        0.1972656   \n",
+       "1               0.041962        0.2453003        0.8790283        0.2055054   \n",
+       "2               0.041459        0.2486572         0.880188        0.2092896   \n",
+       "3               0.039780        0.2533569         0.880127        0.2134399   \n",
+       "4               0.041459        0.2590942        0.8770752        0.2124634   \n",
+       "...                  ...              ...              ...              ...   \n",
+       "137593          0.030716              NaN              NaN              NaN   \n",
+       "137594          0.035416              NaN              NaN              NaN   \n",
+       "137595          0.034745              NaN              NaN              NaN   \n",
+       "137596          0.035248              NaN              NaN              NaN   \n",
+       "137597          0.036591              NaN              NaN              NaN   \n",
+       "\n",
+       "       LBicep_GYRO X (deg/s)  LBicep_GYRO Y (deg/s) LBicep_GYRO Z (deg/s)  \\\n",
+       "0                 -30.801527              8.5572519            12.6870232   \n",
+       "1                 -29.038168              9.9007635            13.0305347   \n",
+       "2                -27.6641216              8.9694653             12.358779   \n",
+       "3                 -25.442749              7.6106873            11.1145039   \n",
+       "4                 -23.557251              6.0916033             9.7862597   \n",
+       "...                      ...                    ...                   ...   \n",
+       "137593                   NaN                    NaN                   NaN   \n",
+       "137594                   NaN                    NaN                   NaN   \n",
+       "137595                   NaN                    NaN                   NaN   \n",
+       "137596                   NaN                    NaN                   NaN   \n",
+       "137597                   NaN                    NaN                   NaN   \n",
+       "\n",
+       "       gender run_num  \n",
+       "0        male       2  \n",
+       "1        male       2  \n",
+       "2        male       2  \n",
+       "3        male       2  \n",
+       "4        male       2  \n",
+       "...       ...     ...  \n",
+       "137593   male       2  \n",
+       "137594   male       2  \n",
+       "137595   male       2  \n",
+       "137596   male       2  \n",
+       "137597   male       2  \n",
+       "\n",
+       "[137598 rows x 32 columns]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "    # Extract the Sensor Body Position\n",
+    "df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n",
+    "# Extract measurement type, including EMG millivolts\n",
+    "df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)')    # Drop the original sensor_measurement column\n",
+    "df_melted = df_melted.drop(columns=[\"sensor_measurement\"])    # Pivot the DataFrame so each measurement type becomes a separate column\n",
+    "df_melted[\"value\"] = df_melted[\"value\"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)\n",
+    "df_melted[\"value\"] = pd.to_numeric(df_melted[\"value\"], errors=\"coerce\") #make sure all values are cast to numeric\n",
+    "df_melted.fillna(np.nan, inplace=True)\n",
+    "# Pivot the DataFrame so each measurement type becomes a separate column\n",
+    "df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], \n",
+    "                                columns='measurement_type', values='value')\n",
+    "df_pivoted.columns = df_pivoted.columns.get_level_values(0)\n",
+    "df_pivoted.columns = df_pivoted.columns.str.strip()\n",
+    "df_pivoted = df_pivoted.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bf43181-6aa6-44ba-8fc1-44dc7bbeaed3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_p3_exo = create_sensor_col(df_p3_exo)\n",
+    "df_p3_noexo = create_sensor_col(df_p3_noexo)\n",
+    "df_p4_exo = create_sensor_col(df_p4_exo)\n",
+    "df_p4_noexo = create_sensor_col(df_p4_noexo)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01bf5481-3d22-4143-8990-fec243ae013e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.\n",
+    "combined_df = pd.concat(dfs, ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6263007-da44-4d49-b437-0754e56a2def",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Show the head of the data\n",
+    "# df_p3_exo.describe()\n",
+    "df_p3_noexo.head()\n",
+    "# df_p4_exo.head()\n",
+    "# df_p4_noexo.head()\n",
+    "# # Choose inputs\n",
+    "# features = df_p3_exo[['EMG 1 (mV)', 'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']].dropna()\n",
+    "# features.head()\n",
+    "feature_sets = []\n",
+    "# Run functions to extract features for each dataframe\n",
+    "#CP: does this make sure to remove the redundant time series columns?\n",
+    "#can keep  ACC X Time Series (s) in each sensor group, and remove any other column with 'Time Series (s)' in its name \n",
+    "for df in dfs:\n",
+    "    emg_features = compute_emg_features(df['EMG 1 (mV)'])\n",
+    "    accel_features = compute_accel_features(df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)'])\n",
+    "    gyro_features = compute_gyro_features(df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)'])\n",
+    "    features = {\n",
+    "        'emg': emg_features,\n",
+    "        'accel': accel_features,\n",
+    "        'gyro': gyro_features\n",
+    "    }\n",
+    "    feature_sets.append(features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ccbbdce4-9c51-4b5c-b263-d23cc0c79154",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 93b616b640e81c7e53b2b76441845c04fb296350 Mon Sep 17 00:00:00 2001
From: Cyrus Parvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Wed, 21 May 2025 22:35:15 -0700
Subject: [PATCH 05/12] worked on upsampling BEFORE melt

---
 FeatureExtraction.py | 10 +++++++--
 UpsamplingIMU.py     | 27 +++++++++++++++++++++++++
 dataCleaning.py      | 48 +++++---------------------------------------
 3 files changed, 40 insertions(+), 45 deletions(-)
 create mode 100644 UpsamplingIMU.py

diff --git a/FeatureExtraction.py b/FeatureExtraction.py
index 1fba79e..519c6ce 100644
--- a/FeatureExtraction.py
+++ b/FeatureExtraction.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
-from dataCleaning import read_run, column_clean, preprocessing, create_sensor_col
+from dataCleaning import read_run, column_clean, preprocessing
+from dataCleaning import create_sensor_col, standardize_time_series
 import pdb
 
 def overall_cleaning():
@@ -13,6 +14,11 @@ def overall_cleaning():
     df_p3_noexo = column_clean(df_p3_noexo)
     df_p4_exo = column_clean(df_p4_exo)
     df_p4_noexo = column_clean(df_p4_noexo)
+    #upsample IMU to match EMG
+    standardize_time_series(df_p3_exo)
+    standardize_time_series(df_p3_noexo)
+    standardize_time_series(df_p4_exo)
+    standardize_time_series(df_p4_noexo)
 
     df_p3_exo = create_sensor_col(df_p3_exo, run_num = 2, gender = 'male', exo=True)
     df_p3_noexo = create_sensor_col(df_p3_noexo, run_num = 1, gender = 'male', exo=False)
@@ -21,7 +27,7 @@ def overall_cleaning():
 
     dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.
     combined_df = pd.concat(dfs, ignore_index=True)
-    
+
     # # Show the head of the data
     # df_p3_exo.describe()
     df_p3_noexo.head()
diff --git a/UpsamplingIMU.py b/UpsamplingIMU.py
new file mode 100644
index 0000000..39162b7
--- /dev/null
+++ b/UpsamplingIMU.py
@@ -0,0 +1,27 @@
+import pandas as pd
+import pdb
+
+def upsample(df):
+  IMU_cols = [
+       'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)',
+       'RDelt_GYRO Y (deg/s)', 'RDelt_GYRO Z (deg/s)',  'LDelt_ACC X (G)', 'LDelt_ACC Y (G)',
+       'LDelt_ACC Z (G)', 'LDelt_GYRO X (deg/s)', 'LDelt_GYRO Y (deg/s)',
+       'LDelt_GYRO Z (deg/s)',  'RBicep_ACC X (G)',
+       'RBicep_ACC Y (G)', 'RBicep_ACC Z (G)', 'RBicep_GYRO X (deg/s)',
+       'RBicep_GYRO Y (deg/s)', 'RBicep_GYRO Z (deg/s)',
+       'LBicep_ACC X (G)', 'LBicep_ACC Y (G)',
+       'LBicep_ACC Z (G)', 'LBicep_GYRO X (deg/s)', 'LBicep_GYRO Y (deg/s)',
+       'LBicep_GYRO Z (deg/s)'
+       ]
+  df['IMU_TimeSeries'] = pd.to_numeric(df['IMU_TimeSeries'])
+  df['time'] = pd.to_timedelta(df['IMU_TimeSeries'], unit='s')
+  df = df.dropna(subset=['IMU_TimeSeries']) # Drop rows where IMU timestamps are NaN
+  df = df.set_index('time')
+  freq_nanseconds = int(0.0007941176470588235 * 1e9)  # Convert to integer microseconds
+  IMU_upsampled = df[IMU_cols].resample(f'{freq_nanseconds}ns').asfreq()
+  IMU_upsampled = IMU_upsampled.fillna(method='ffill') #forward fill the values 
+  IMU_upsampled = IMU_upsampled.apply(lambda x: pd.to_numeric(x))
+  IMU_upsampled = IMU_upsampled.interpolate(method='linear')  # Interpolates the data using the linear method to match EMG data
+  #sine interpolation is best 
+  df_new = IMU_upsampled.join(df['RDelt_EMG_MilliVolts'])
+  return df_new
\ No newline at end of file
diff --git a/dataCleaning.py b/dataCleaning.py
index 74407d6..5f84da0 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -3,6 +3,7 @@
 import numpy as np 
 import pdb 
 from modif_cols import tidy_emg_imu_as_measured 
+from UpsamplingIMU import upsample
 # Data Labels:
 # Label for EMG Data shared:
 #     Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata)
@@ -52,46 +53,11 @@ def column_clean(df):
     df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data
     return df 
 
-def create_sensor_col(df, run_num, gender, exo): 
-    # muscles = ['RDelt', 'LDelt', 'RBicep', 'LBicep']
-    # # Identify all measurement columns for melting
-    # measurement_cols = [col for col in df.columns if any(muscle in col for muscle in muscles)]
-    # # Melt all measurement columns (EMG, ACC, GYRO)
-    # df_long = df.melt(
-    #     id_vars=['EMG_TimeSeries', 'IMU_TimeSeries'],
-    #     value_vars=measurement_cols,
-    #     var_name='sensor_measurement',
-    #     value_name='value'
-    # )
-    # # Extract Muscle, Sensor, and Axis from the column name
-    # df_long[['Muscle', 'Sensor', 'Axis']] = df_long['sensor_measurement'].str.extract(
-    #     r'^(RDelt|LDelt|RBicep|LBicep)_(EMG|ACC|GYRO)[ _]?(X|Y|Z)?'
-    # )
-    # # Build measurement type column for pivoting
-    # df_long['Measurement'] = np.where(
-    #     df_long['Sensor'] == 'EMG',
-    #     'EMG_MV',
-    #     df_long['Sensor'] + ' ' + df_long['Axis']
-    # )
-    # pdb.set_trace()
-
-    # # Pivot so each row is a Muscle-Timepoint, columns are measurement types
-    # tidy = df_long.pivot_table(
-    #     index=['EMG_TimeSeries', 'IMU_TimeSeries', 'Muscle'],
-    #     columns='Measurement',
-    #     values='value',
-    #     aggfunc='first'
-    # ).reset_index()
-    # # Flatten columns so pivot table multiindexes don't persist
-    # tidy.columns.name = None
-    # tidy = tidy.rename_axis(None, axis=1)
-    # #  sort columns
-    # columns_order = ['EMG_TimeSeries', 'IMU_TimeSeries', 'EMG_MV',
-    #                  'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z', 'Muscle']
-    # # Only include columns that actually exist in the data
-    # columns_order = [col for col in columns_order if col in tidy.columns]
+def standardize_time_series(df):
+    upsample(df)
 
-    # df_pivoted = tidy[columns_order]
+#melting and stuff
+def create_sensor_col(df, run_num, gender, exo): 
     df_pivoted = tidy_emg_imu_as_measured(df)
     df_pivoted.columns = df_pivoted.columns.str.strip()
     pdb.set_trace()
@@ -104,9 +70,5 @@ def create_sensor_col(df, run_num, gender, exo):
     return df_pivoted
 
 
-def standardize_time_series():
-    # interpolate()
-    pass
-
 def preprocessing(full_df):
     pass

From ac4c667041e35c63089651c57aed0cbdf270f021 Mon Sep 17 00:00:00 2001
From: cyrusParvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Fri, 23 May 2025 23:38:27 -0700
Subject: [PATCH 06/12] something: move upsample to resampling.py, add downsample

---
 FeatureExtraction.py |  5 +++--
 UpsamplingIMU.py     | 27 ----------------------
 dataCleaning.py      |  5 +++--
 resampling.py        | 53 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 31 deletions(-)
 delete mode 100644 UpsamplingIMU.py
 create mode 100644 resampling.py

diff --git a/FeatureExtraction.py b/FeatureExtraction.py
index 519c6ce..daa5082 100644
--- a/FeatureExtraction.py
+++ b/FeatureExtraction.py
@@ -27,7 +27,7 @@ def overall_cleaning():
 
     dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.
     combined_df = pd.concat(dfs, ignore_index=True)
-
+    combined_df = standardize_time_series(combined_df)
     # # Show the head of the data
     # df_p3_exo.describe()
     df_p3_noexo.head()
@@ -50,7 +50,8 @@ def overall_cleaning():
             'gyro': gyro_features
         }
         feature_sets.append(features)
-
+    #TO-DO make Exo or No Exo variable?
+    #imputation/preprocessing
     # feature_sets now contains extracted features for each df
     p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = feature_sets
     return p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats
diff --git a/UpsamplingIMU.py b/UpsamplingIMU.py
deleted file mode 100644
index 39162b7..0000000
--- a/UpsamplingIMU.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import pandas as pd
-import pdb
-
-def upsample(df):
-  IMU_cols = [
-       'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)',
-       'RDelt_GYRO Y (deg/s)', 'RDelt_GYRO Z (deg/s)',  'LDelt_ACC X (G)', 'LDelt_ACC Y (G)',
-       'LDelt_ACC Z (G)', 'LDelt_GYRO X (deg/s)', 'LDelt_GYRO Y (deg/s)',
-       'LDelt_GYRO Z (deg/s)',  'RBicep_ACC X (G)',
-       'RBicep_ACC Y (G)', 'RBicep_ACC Z (G)', 'RBicep_GYRO X (deg/s)',
-       'RBicep_GYRO Y (deg/s)', 'RBicep_GYRO Z (deg/s)',
-       'LBicep_ACC X (G)', 'LBicep_ACC Y (G)',
-       'LBicep_ACC Z (G)', 'LBicep_GYRO X (deg/s)', 'LBicep_GYRO Y (deg/s)',
-       'LBicep_GYRO Z (deg/s)'
-       ]
-  df['IMU_TimeSeries'] = pd.to_numeric(df['IMU_TimeSeries'])
-  df['time'] = pd.to_timedelta(df['IMU_TimeSeries'], unit='s')
-  df = df.dropna(subset=['IMU_TimeSeries']) # Drop rows where IMU timestamps are NaN
-  df = df.set_index('time')
-  freq_nanseconds = int(0.0007941176470588235 * 1e9)  # Convert to integer microseconds
-  IMU_upsampled = df[IMU_cols].resample(f'{freq_nanseconds}ns').asfreq()
-  IMU_upsampled = IMU_upsampled.fillna(method='ffill') #forward fill the values 
-  IMU_upsampled = IMU_upsampled.apply(lambda x: pd.to_numeric(x))
-  IMU_upsampled = IMU_upsampled.interpolate(method='linear')  # Interpolates the data using the linear method to match EMG data
-  #sine interpolation is best 
-  df_new = IMU_upsampled.join(df['RDelt_EMG_MilliVolts'])
-  return df_new
\ No newline at end of file
diff --git a/dataCleaning.py b/dataCleaning.py
index 5f84da0..83fd67d 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -3,7 +3,7 @@
 import numpy as np 
 import pdb 
 from modif_cols import tidy_emg_imu_as_measured 
-from UpsamplingIMU import upsample
+from resampling import upsample, downsample
 # Data Labels:
 # Label for EMG Data shared:
 #     Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata)
@@ -54,7 +54,8 @@ def column_clean(df):
     return df 
 
 def standardize_time_series(df):
-    upsample(df)
+    df = downsample(df)
+    return df
 
 #melting and stuff
 def create_sensor_col(df, run_num, gender, exo): 
diff --git a/resampling.py b/resampling.py
new file mode 100644
index 0000000..22da474
--- /dev/null
+++ b/resampling.py
@@ -0,0 +1,53 @@
+import pandas as pd
+import numpy as np
+def upsample(df):
+    IMU_cols = [
+       'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)',
+       'RDelt_GYRO Y (deg/s)', 'RDelt_GYRO Z (deg/s)',  'LDelt_ACC X (G)', 'LDelt_ACC Y (G)',
+       'LDelt_ACC Z (G)', 'LDelt_GYRO X (deg/s)', 'LDelt_GYRO Y (deg/s)',
+       'LDelt_GYRO Z (deg/s)',  'RBicep_ACC X (G)',
+       'RBicep_ACC Y (G)', 'RBicep_ACC Z (G)', 'RBicep_GYRO X (deg/s)',
+       'RBicep_GYRO Y (deg/s)', 'RBicep_GYRO Z (deg/s)',
+       'LBicep_ACC X (G)', 'LBicep_ACC Y (G)',
+       'LBicep_ACC Z (G)', 'LBicep_GYRO X (deg/s)', 'LBicep_GYRO Y (deg/s)',
+       'LBicep_GYRO Z (deg/s)'
+       ]
+    df['IMU_TimeSeries'] = pd.to_numeric(df['IMU_TimeSeries'])
+    df['time'] = pd.to_timedelta(df['IMU_TimeSeries'], unit='s')
+    df = df.dropna(subset=['IMU_TimeSeries']) # Drop rows where IMU timestamps are NaN
+    df = df.set_index('time')
+    freq_nanseconds = int(0.0007941176470588235 * 1e9)  # Convert to integer microseconds
+    IMU_upsampled = df[IMU_cols].resample(f'{freq_nanseconds}ns').asfreq()
+    IMU_upsampled = IMU_upsampled.fillna(method='ffill') #forward fill the values 
+    IMU_upsampled = IMU_upsampled.apply(lambda x: pd.to_numeric(x))
+    IMU_upsampled = IMU_upsampled.interpolate(method='linear')  # Interpolates the data using the linear method to match EMG data
+    #sine interpolation is best 
+    df_new = IMU_upsampled.join(df['RDelt_EMG_MilliVolts'])
+    return df_new
+
+def downsample(df):
+  EMG_cols = ['RDelt_EMG_MilliVolts', 'LDelt_EMG_MilliVolts', 'RBicep_EMG_MilliVolts', 'LBicep_EMG_MilliVolts']
+  
+  df['EMG_TimeSeries'] = pd.to_numeric(df['EMG_TimeSeries'])
+  df['time'] = pd.to_timedelta(df['EMG_TimeSeries'], unit='s')
+  df = df.set_index('time')
+  
+  df[EMG_cols] = df[EMG_cols].resample('6.75ms').asfreq()  # Scales these columns to be the same length as IMU data
+  return df
+
+## If downsample using pandas.resample doesn't work, use this alternative function that uses the rows index
+def alternative(df):
+  high_rate = 1259
+  low_rate = 148
+  step = high_rate / low_rate
+  
+  EMG_cols = ['RDelt_EMG_MilliVolts', 'LDelt_EMG_MilliVolts', 'RBicep_EMG_MilliVolts', 'LBicep_EMG_MilliVolts']
+  
+  # Indexes to sample
+  indices = np.round(np.arange(0, len(df), step)).astype(int)
+  indices = indices[indices < len(df)]  # Ensure we stay within bounds
+
+  # Downsample using nearest index
+  df[EMG_cols] = df[EMG_cols].iloc[indices].reset_index(drop=True)
+  df = df.drop(columns=['EMG_TimeSeries'])
+  return df

From db5326756b41e805b78daa87ef00ba5c9764331d Mon Sep 17 00:00:00 2001
From: cyrusParvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Fri, 23 May 2025 23:42:16 -0700
Subject: [PATCH 07/12] added in filters

---
 FeatureExtraction.py |  2 +-
 dataCleaning.py      | 14 +++++++++++
 dk.txt               | 57 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 dk.txt

diff --git a/FeatureExtraction.py b/FeatureExtraction.py
index daa5082..6408401 100644
--- a/FeatureExtraction.py
+++ b/FeatureExtraction.py
@@ -30,7 +30,7 @@ def overall_cleaning():
     combined_df = standardize_time_series(combined_df)
     # # Show the head of the data
     # df_p3_exo.describe()
-    df_p3_noexo.head()
+    # df_p3_noexo.head()
     # df_p4_exo.head()
     # df_p4_noexo.head()
     # # Choose inputs
diff --git a/dataCleaning.py b/dataCleaning.py
index 83fd67d..07c0acc 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -53,6 +53,20 @@ def column_clean(df):
     df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data
     return df 
 
+def bandpass_filter_emg(signal, fs=1259, lowcut=20, highcut=450, order=4):
+    nyq = 0.5 * fs
+    low = lowcut / nyq
+    high = highcut / nyq
+    b, a = butter(order, [low, high], btype='band')
+    return filtfilt(b, a, signal)
+
+# IMU Low-pass Filter (<20Hz)
+def lowpass_filter_imu(signal, fs=148, cutoff=20, order=4):
+    nyq = 0.5 * fs
+    normal_cutoff = cutoff / nyq
+    b, a = butter(order, normal_cutoff, btype='low')
+    return filtfilt(b, a, signal)
+
 def standardize_time_series(df):
     df = downsample(df)
     return df
diff --git a/dk.txt b/dk.txt
new file mode 100644
index 0000000..dfcadb6
--- /dev/null
+++ b/dk.txt
@@ -0,0 +1,57 @@
+import pandas as pd
+import numpy as np
+
+def emg_to_imu_asof_all_muscles(df, emg_time_col='EMG_TimeSeries', imu_time_col='IMU_TimeSeries'):
+    """
+    For each muscle group, create a tidy DataFrame where each row is an EMG measurement,
+    with corresponding IMU data (if available) joined via pandas.merge_asof (backward).
+    IMU columns may have NaNs if not available at that EMG time.
+    Returns a concatenated DataFrame for all muscles with a 'Muscle' column.
+    """
+    muscle_names = []
+    for col in df.columns:
+        if '_EMG_MilliVolts' in col:
+            muscle = col.replace('_EMG_MilliVolts', '')
+            muscle_names.append(muscle)
+
+    all_muscles = []
+    for muscle in muscle_names:
+        # Build EMG DataFrame for this muscle
+        emg_df = pd.DataFrame({
+            'EMG_TimeSeries': pd.to_numeric(df[emg_time_col], errors='coerce'),
+            'EMG_MV': pd.to_numeric(df[f'{muscle}_EMG_MilliVolts'], errors='coerce')
+        }).dropna(subset=['EMG_TimeSeries', 'EMG_MV'])
+
+        # Build IMU DataFrame for this muscle
+        imu_df = pd.DataFrame({
+            'IMU_TimeSeries': pd.to_numeric(df[imu_time_col], errors='coerce'),
+            'ACC X': pd.to_numeric(df.get(f'{muscle}_ACC X (G)'), errors='coerce'),
+            'ACC Y': pd.to_numeric(df.get(f'{muscle}_ACC Y (G)'), errors='coerce'),
+            'ACC Z': pd.to_numeric(df.get(f'{muscle}_ACC Z (G)'), errors='coerce'),
+            'GYRO X': pd.to_numeric(df.get(f'{muscle}_GYRO X (deg/s)'), errors='coerce'),
+            'GYRO Y': pd.to_numeric(df.get(f'{muscle}_GYRO Y (deg/s)'), errors='coerce'),
+            'GYRO Z': pd.to_numeric(df.get(f'{muscle}_GYRO Z (deg/s)'), errors='coerce')
+        })
+
+        # Merge IMU onto EMG (backward: most recent IMU)
+        merged = pd.merge_asof(
+            emg_df.sort_values('EMG_TimeSeries'),
+            imu_df.sort_values('IMU_TimeSeries'),
+            left_on='EMG_TimeSeries',
+            right_on='IMU_TimeSeries',
+            direction='backward'
+        )
+
+        merged['Muscle'] = muscle
+        all_muscles.append(merged)
+
+    tidy = pd.concat(all_muscles, ignore_index=True)
+    # Order columns
+    cols = ['Muscle', 'EMG_TimeSeries', 'EMG_MV', 'IMU_TimeSeries',
+            'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z']
+    tidy = tidy[cols]
+    return tidy
+
+# Usage example:
+# tidy_df = emg_to_imu_asof_all_muscles(df)
+# tidy_df.to_csv('emg_imu_tidy.csv', index=False)
\ No newline at end of file

From 8aeecc817b365e1915a453991e3c69ccf427d416 Mon Sep 17 00:00:00 2001
From: cyrusParvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Fri, 23 May 2025 23:44:41 -0700
Subject: [PATCH 08/12] separated files out

---
 feature_extraction.py           | 51 +++++++++++++++++++++++++++++++++
 FeatureExtraction.py => main.py | 36 +++--------------------
 2 files changed, 55 insertions(+), 32 deletions(-)
 create mode 100644 feature_extraction.py
 rename FeatureExtraction.py => main.py (72%)

diff --git a/feature_extraction.py b/feature_extraction.py
new file mode 100644
index 0000000..8f9c4cb
--- /dev/null
+++ b/feature_extraction.py
@@ -0,0 +1,51 @@
+# Calculations for Feature Extraction from Project_Guide
+def compute_emg_features(signal):
+    return {
+        'mean': np.mean(signal),
+        'max': np.max(signal),
+        'min': np.min(signal),
+        'std': np.std(signal),
+        'rms': np.sqrt(np.mean(signal**2))
+    }
+
+def compute_accel_features(a_x, a_y, a_z):
+    a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)
+    
+    features = {
+        'peak_accel': np.max(a_mag),
+        'mean_accel': np.mean(a_mag),
+        'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)),
+        'accel_range': np.max(a_mag) - np.min(a_mag)
+    }
+    return features
+
+def compute_gyro_features(w_x, w_y, w_z):
+    w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)
+    
+    features = {
+        'peak_angular_vel': np.max(w_mag),
+        'mean_angular_vel': np.mean(w_mag),
+        'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)),
+        'angular_vel_range': np.max(w_mag) - np.min(w_mag)
+    }
+    return features 
+
+
+        # fft_mean = mean(valid_freqs * valid_fft)
+        # fft_median = median(valid_freqs * valid_fft)
+        # fft_power = np.sum(valid_fft**2)
+
+        # feature_row = {
+        #     'emg_max': emg.max(),
+        #     'emg_min': emg.min(),
+        #     'emg_rms': np.sqrt(np.mean(emg**2)),
+        #     'acc_peak': np.linalg.norm(acc, axis=1).max(),
+        #     'acc_range': np.ptp(np.linalg.norm(acc, axis=1)),
+        #     'gyro_peak': np.linalg.norm(gyro, axis=1).max(),
+        #     'gyro_range': np.ptp(np.linalg.norm(gyro, axis=1)),
+        #     'emg_fft_mean_freq': fft_mean,
+        #     'emg_fft_median_freq': fft_median,
+        #     'emg_fft_power': fft_power,
+        #     'label': label,
+        #     'gender': gender
+        # }
\ No newline at end of file
diff --git a/FeatureExtraction.py b/main.py
similarity index 72%
rename from FeatureExtraction.py
rename to main.py
index 6408401..9142f53 100644
--- a/FeatureExtraction.py
+++ b/main.py
@@ -4,7 +4,7 @@
 from dataCleaning import create_sensor_col, standardize_time_series
 import pdb
 
-def overall_cleaning():
+def main_cleaning():
     df_p3_exo = read_run("P3_Exo_1_0.csv") # 2nd run, male
     df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male
     df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female
@@ -56,38 +56,10 @@ def overall_cleaning():
     p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = feature_sets
     return p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats
 
-# Calculations for Feature Extraction from Project_Guide
-def compute_emg_features(signal):
-    return {
-        'mean': np.mean(signal),
-        'max': np.max(signal),
-        'min': np.min(signal),
-        'std': np.std(signal),
-        'rms': np.sqrt(np.mean(signal**2))
-    }
 
-def compute_accel_features(a_x, a_y, a_z):
-    a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)
-    
-    features = {
-        'peak_accel': np.max(a_mag),
-        'mean_accel': np.mean(a_mag),
-        'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)),
-        'accel_range': np.max(a_mag) - np.min(a_mag)
-    }
-    return features
-
-def compute_gyro_features(w_x, w_y, w_z):
-    w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)
-    
-    features = {
-        'peak_angular_vel': np.max(w_mag),
-        'mean_angular_vel': np.mean(w_mag),
-        'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)),
-        'angular_vel_range': np.max(w_mag) - np.min(w_mag)
-    }
-    return features    
+def main():
+    p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning()
 
 
 if __name__ == '__main__':
-    p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning()
\ No newline at end of file
+    main()
\ No newline at end of file

From 4536564f684233ad75b726a8fcb3ee6491658c1d Mon Sep 17 00:00:00 2001
From: cyrusParvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Sat, 24 May 2025 22:08:43 -0700
Subject: [PATCH 09/12] saving: keep standardize_time_series results, restore name

---
 feature_extraction.py |  2 ++
 main.py               | 11 +++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/feature_extraction.py b/feature_extraction.py
index 8f9c4cb..a7edb07 100644
--- a/feature_extraction.py
+++ b/feature_extraction.py
@@ -1,3 +1,5 @@
+import pandas as pd
+import numpy as np
 # Calculations for Feature Extraction from Project_Guide
 def compute_emg_features(signal):
     return {
diff --git a/main.py b/main.py
index 9142f53..0cd1b56 100644
--- a/main.py
+++ b/main.py
@@ -4,7 +4,7 @@
 from dataCleaning import create_sensor_col, standardize_time_series
 import pdb
 
-def main_cleaning():
+def overall_cleaning():
     df_p3_exo = read_run("P3_Exo_1_0.csv") # 2nd run, male
     df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male
     df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female
@@ -15,11 +15,10 @@ def main_cleaning():
     df_p4_exo = column_clean(df_p4_exo)
     df_p4_noexo = column_clean(df_p4_noexo)
     #upsample IMU to match EMG
-    standardize_time_series(df_p3_exo)
-    standardize_time_series(df_p3_noexo)
-    standardize_time_series(df_p4_exo)
-    standardize_time_series(df_p4_noexo)
-
+    df_p3_exo = standardize_time_series(df_p3_exo)
+    df_p3_noexo = standardize_time_series(df_p3_noexo)
+    df_p4_exo = standardize_time_series(df_p4_exo)
+    df_p4_noexo = standardize_time_series(df_p4_noexo)
     df_p3_exo = create_sensor_col(df_p3_exo, run_num = 2, gender = 'male', exo=True)
     df_p3_noexo = create_sensor_col(df_p3_noexo, run_num = 1, gender = 'male', exo=False)
     df_p4_exo = create_sensor_col(df_p4_exo, run_num = 1, gender = 'female', exo=True)

From 0d8b0b0d4bffe130bebd7ceb4e29672ee262652e Mon Sep 17 00:00:00 2001
From: cyrusParvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Sun, 25 May 2025 00:26:05 -0700
Subject: [PATCH 10/12] did pretty much all data cleaning; still have to fix
 preprocessing and filters

---
 dataCleaning.py       | 35 ++++++++++++++++++++------
 feature_extraction.py | 46 ++++++++++++++++++----------------
 main.py               | 58 ++++++++++++++++---------------------------
 modif_cols.py         | 56 ++++++++++++++++-------------------------
 resampling.py         | 22 ++++++++++++++--
 5 files changed, 113 insertions(+), 104 deletions(-)

diff --git a/dataCleaning.py b/dataCleaning.py
index 07c0acc..0bb0d31 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -4,6 +4,13 @@
 import pdb 
 from modif_cols import tidy_emg_imu_as_measured 
 from resampling import upsample, downsample
+from scipy.signal import filtfilt, butter
+
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.compose import ColumnTransformer
 # Data Labels:
 # Label for EMG Data shared:
 #     Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata)
@@ -67,23 +74,35 @@ def lowpass_filter_imu(signal, fs=148, cutoff=20, order=4):
     b, a = butter(order, normal_cutoff, btype='low')
     return filtfilt(b, a, signal)
 
-def standardize_time_series(df):
-    df = downsample(df)
-    return df
-
 #melting and stuff
 def create_sensor_col(df, run_num, gender, exo): 
     df_pivoted = tidy_emg_imu_as_measured(df)
     df_pivoted.columns = df_pivoted.columns.str.strip()
-    pdb.set_trace()
     df_pivoted = df_pivoted.reset_index()
-
     df_pivoted['gender'] = gender
     df_pivoted['run_num'] = run_num
     df_pivoted['exo'] = exo
     df_pivoted.to_csv("pivoted_df.csv")
     return df_pivoted
 
-
 def preprocessing(full_df):
-    pass
+    # num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
+    #             "total_bedrooms", "population", "households", "median_income"]
+    # cat_attribs = ["ocean_proximity"]
+    num_attribs = full_df.select_dtypes(include=['number']).columns.tolist()
+    cat_attribs = full_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
+    num_pipeline = Pipeline([
+    ("impute", SimpleImputer(strategy="median")),
+    ("standardize", StandardScaler()),
+    ])
+
+    cat_pipeline = Pipeline([
+    ("impute", SimpleImputer(strategy="most_frequent")),
+    ("oneHot", OneHotEncoder()),
+    ])
+
+    preprocessing = ColumnTransformer([
+    ("num", num_pipeline, num_attribs),
+    ("cat", cat_pipeline, cat_attribs),
+    ])
+    return full_df
\ No newline at end of file
diff --git a/feature_extraction.py b/feature_extraction.py
index a7edb07..29c4871 100644
--- a/feature_extraction.py
+++ b/feature_extraction.py
@@ -1,7 +1,9 @@
 import pandas as pd
 import numpy as np
 # Calculations for Feature Extraction from Project_Guide
-def compute_emg_features(signal):
+
+def compute_emg_features(df):
+    signal = df['EMG_MilliVolts']
     return {
         'mean': np.mean(signal),
         'max': np.max(signal),
@@ -10,7 +12,8 @@ def compute_emg_features(signal):
         'rms': np.sqrt(np.mean(signal**2))
     }
 
-def compute_accel_features(a_x, a_y, a_z):
+def compute_accel_features(df):
+    a_x = df['ACC X (G)'], a_y = df['ACC Y (G)'], a_z = df['ACC Z (G)']
     a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)
     
     features = {
@@ -21,7 +24,8 @@ def compute_accel_features(a_x, a_y, a_z):
     }
     return features
 
-def compute_gyro_features(w_x, w_y, w_z):
+def compute_gyro_features(df):
+    w_x = df['GYRO X (deg/s)'], w_y = df['GYRO Y (deg/s)'], w_z = df['GYRO Z (deg/s)']
     w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)
     
     features = {
@@ -31,23 +35,21 @@ def compute_gyro_features(w_x, w_y, w_z):
         'angular_vel_range': np.max(w_mag) - np.min(w_mag)
     }
     return features 
+    # fft_mean = mean(valid_freqs * valid_fft)
+    # fft_median = median(valid_freqs * valid_fft)
+    # fft_power = np.sum(valid_fft**2)
 
-
-        # fft_mean = mean(valid_freqs * valid_fft)
-        # fft_median = median(valid_freqs * valid_fft)
-        # fft_power = np.sum(valid_fft**2)
-
-        # feature_row = {
-        #     'emg_max': emg.max(),
-        #     'emg_min': emg.min(),
-        #     'emg_rms': np.sqrt(np.mean(emg**2)),
-        #     'acc_peak': np.linalg.norm(acc, axis=1).max(),
-        #     'acc_range': np.ptp(np.linalg.norm(acc, axis=1)),
-        #     'gyro_peak': np.linalg.norm(gyro, axis=1).max(),
-        #     'gyro_range': np.ptp(np.linalg.norm(gyro, axis=1)),
-        #     'emg_fft_mean_freq': fft_mean,
-        #     'emg_fft_median_freq': fft_median,
-        #     'emg_fft_power': fft_power,
-        #     'label': label,
-        #     'gender': gender
-        # }
\ No newline at end of file
+    # feature_row = {
+    #     'emg_max': emg.max(),
+    #     'emg_min': emg.min(),
+    #     'emg_rms': np.sqrt(np.mean(emg**2)),
+    #     'acc_peak': np.linalg.norm(acc, axis=1).max(),
+    #     'acc_range': np.ptp(np.linalg.norm(acc, axis=1)),
+    #     'gyro_peak': np.linalg.norm(gyro, axis=1).max(),
+    #     'gyro_range': np.ptp(np.linalg.norm(gyro, axis=1)),
+    #     'emg_fft_mean_freq': fft_mean,
+    #     'emg_fft_median_freq': fft_median,
+    #     'emg_fft_power': fft_power,
+    #     'label': label,
+    #     'gender': gender
+    # }
\ No newline at end of file
diff --git a/main.py b/main.py
index 0cd1b56..cad286f 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,10 @@
 import pandas as pd
 import numpy as np
 from dataCleaning import read_run, column_clean, preprocessing
-from dataCleaning import create_sensor_col, standardize_time_series
+from dataCleaning import create_sensor_col
+from resampling import downsample
+from feature_extraction import compute_emg_features, compute_accel_features, compute_gyro_features
+
 import pdb
 
 def overall_cleaning():
@@ -14,51 +17,32 @@ def overall_cleaning():
     df_p3_noexo = column_clean(df_p3_noexo)
     df_p4_exo = column_clean(df_p4_exo)
     df_p4_noexo = column_clean(df_p4_noexo)
-    #upsample IMU to match EMG
-    df_p3_exo = standardize_time_series(df_p3_exo)
-    df_p3_noexo = standardize_time_series(df_p3_noexo)
-    df_p4_exo = standardize_time_series(df_p4_exo)
-    df_p4_noexo = standardize_time_series(df_p4_noexo)
+    #downsample EMG to match IMU
+    df_p3_exo = downsample(df_p3_exo)
+    df_p3_noexo = downsample(df_p3_noexo)
+    df_p4_exo = downsample(df_p4_exo)
+    df_p4_noexo = downsample(df_p4_noexo)
+    #melt sensor columns into a body part sensor
     df_p3_exo = create_sensor_col(df_p3_exo, run_num = 2, gender = 'male', exo=True)
     df_p3_noexo = create_sensor_col(df_p3_noexo, run_num = 1, gender = 'male', exo=False)
     df_p4_exo = create_sensor_col(df_p4_exo, run_num = 1, gender = 'female', exo=True)
     df_p4_noexo = create_sensor_col(df_p4_noexo, run_num = 2, gender = 'female', exo=False)
-
     dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.
     combined_df = pd.concat(dfs, ignore_index=True)
-    combined_df = standardize_time_series(combined_df)
-    # # Show the head of the data
-    # df_p3_exo.describe()
-    # df_p3_noexo.head()
-    # df_p4_exo.head()
-    # df_p4_noexo.head()
-    # # Choose inputs
-    # features = df_p3_exo[['EMG 1 (mV)', 'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']].dropna()
-    # features.head()
-    feature_sets = []
     # Run functions to extract features for each dataframe
-    #CP: does this make sure to remove the redundant time series columns?
-    #can keep  ACC X Time Series (s) in each sensor group, and remove any other column with 'Time Series (s)' in its name 
-    for df in dfs:
-        emg_features = compute_emg_features(df['EMG 1 (mV)'])
-        accel_features = compute_accel_features(df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)'])
-        gyro_features = compute_gyro_features(df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)'])
-        features = {
-            'emg': emg_features,
-            'accel': accel_features,
-            'gyro': gyro_features
-        }
-        feature_sets.append(features)
-    #TO-DO make Exo or No Exo variable?
-    #imputation/preprocessing
-    # feature_sets now contains extracted features for each df
-    p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = feature_sets
-    return p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats
-
+    pdb.set_trace() 
+    emg_features = compute_emg_features(combined_df)
+    accel_features = compute_accel_features(combined_df)
+    gyro_features = compute_gyro_features(combined_df)  # feature_sets = []
+    #do this on EMG cols:
+    # bandpass_filter_emg(df)
+    #on IMU cols: 
+    # lowpass_filter_imu(df)
+    
+    return combined_df
 
 def main():
-    p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning()
-
+    final_df =  overall_cleaning()
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/modif_cols.py b/modif_cols.py
index 7008e02..2556d16 100644
--- a/modif_cols.py
+++ b/modif_cols.py
@@ -2,38 +2,24 @@
 import numpy as np
 
 def tidy_emg_imu_as_measured(df):
-    muscles = ['RDelt', 'LDelt', 'RBicep', 'LBicep']
-    all_muscle_tables = []
-    for muscle in muscles:
-        # EMG rows (as measured)
-        emg_df = pd.DataFrame({
-            'Muscle': muscle,
-            'EMG_TimeSeries': pd.to_numeric(df['EMG_TimeSeries'], errors='coerce'),
-            'IMU_TimeSeries': np.nan,
-            'EMG_MV': pd.to_numeric(df[f'{muscle}_EMG_MilliVolts'], errors='coerce'),
-            'ACC X': np.nan, 'ACC Y': np.nan, 'ACC Z': np.nan,
-            'GYRO X': np.nan, 'GYRO Y': np.nan, 'GYRO Z': np.nan
-        })
-        # IMU rows (as measured)
-        imu_df = pd.DataFrame({
-            'Muscle': muscle,
-            'EMG_TimeSeries': np.nan,
-            'IMU_TimeSeries': pd.to_numeric(df['IMU_TimeSeries'], errors='coerce'),
-            'EMG_MV': np.nan,
-            'ACC X': pd.to_numeric(df[f'{muscle}_ACC X (G)'], errors='coerce'),
-            'ACC Y': pd.to_numeric(df[f'{muscle}_ACC Y (G)'], errors='coerce'),
-            'ACC Z': pd.to_numeric(df[f'{muscle}_ACC Z (G)'], errors='coerce'),
-            'GYRO X': pd.to_numeric(df[f'{muscle}_GYRO X (deg/s)'], errors='coerce'),
-            'GYRO Y': pd.to_numeric(df[f'{muscle}_GYRO Y (deg/s)'], errors='coerce'),
-            'GYRO Z': pd.to_numeric(df[f'{muscle}_GYRO Z (deg/s)'], errors='coerce')
-        })
-        all_muscle_tables.append(pd.concat([emg_df, imu_df], ignore_index=True))
-    tidy = pd.concat(all_muscle_tables, ignore_index=True)
-    # Order and sort (optional)
-    columns_order = ['Muscle', 'EMG_TimeSeries', 'IMU_TimeSeries', 'EMG_MV',
-                     'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z']
-    tidy = tidy[columns_order]
-    tidy['SortTime'] = tidy['EMG_TimeSeries'].combine_first(tidy['IMU_TimeSeries'])
-    tidy = tidy.sort_values(['Muscle', 'SortTime']).drop(columns=['SortTime'])
-    df_pivoted_sorted = tidy.sort_values('EMG_TimeSeries', na_position='last')
-    return df_pivoted_sorted
\ No newline at end of file
+    # Identify columns to melt (all sensor columns)
+    measurement_cols = [c for c in df.columns if any(
+        sensor in c for sensor in ['RDelt', 'LDelt', 'RBicep', 'LBicep'])]
+    id_vars = [c for c in df.columns if c not in measurement_cols]
+    # Melt
+    df_long = df.melt(id_vars=id_vars, value_vars=measurement_cols,
+                      var_name='Measurement', value_name='Value')
+    # Extract BodyPart and Signal
+    df_long['BodyPart'] = df_long['Measurement'].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')
+    df_long['Signal'] = df_long['Measurement'].str.replace(r'^(RDelt|LDelt|RBicep|LBicep)_', '', regex=True)
+    
+    # Pivot so each signal is a separate column
+    df_wide = df_long.pivot_table(
+        index=id_vars + ['BodyPart'],
+        columns='Signal',
+        values='Value'
+    ).reset_index()
+    # flatten columns if needed
+    df_wide.columns.name = None
+    df_wide.columns = [str(col) for col in df_wide.columns]  
+    return df_wide
\ No newline at end of file
diff --git a/resampling.py b/resampling.py
index 22da474..fd5df69 100644
--- a/resampling.py
+++ b/resampling.py
@@ -1,5 +1,7 @@
 import pandas as pd
 import numpy as np
+import pdb 
+
 def upsample(df):
     IMU_cols = [
        'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)',
@@ -27,12 +29,28 @@ def upsample(df):
 
 def downsample(df):
   EMG_cols = ['RDelt_EMG_MilliVolts', 'LDelt_EMG_MilliVolts', 'RBicep_EMG_MilliVolts', 'LBicep_EMG_MilliVolts']
-  
+  IMU_cols = [
+    'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GYRO Z (deg/s)',
+    'LDelt_ACC X (G)', 'LDelt_ACC Y (G)', 'LDelt_ACC Z (G)', 'LDelt_GYRO X (deg/s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GYRO Z (deg/s)',
+    'RBicep_ACC X (G)', 'RBicep_ACC Y (G)', 'RBicep_ACC Z (G)', 'RBicep_GYRO X (deg/s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GYRO Z (deg/s)',
+    'LBicep_ACC X (G)', 'LBicep_ACC Y (G)', 'LBicep_ACC Z (G)', 'LBicep_GYRO X (deg/s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GYRO Z (deg/s)'
+  ]
   df['EMG_TimeSeries'] = pd.to_numeric(df['EMG_TimeSeries'])
+  for col in df.columns:
+    # Optionally skip time columns if you want to preserve them as objects/strings
+    if "TimeSeries" in col:
+        continue
+    df[col] = pd.to_numeric(df[col], errors='coerce')
   df['time'] = pd.to_timedelta(df['EMG_TimeSeries'], unit='s')
   df = df.set_index('time')
-  
   df[EMG_cols] = df[EMG_cols].resample('6.75ms').asfreq()  # Scales these columns to be the same length as IMU data
+  df[EMG_cols] = df[EMG_cols].interpolate(method='linear')
+  df[EMG_cols] = df[EMG_cols].fillna(method='bfill').fillna(method='ffill') #back fill and forward fill all Nans.
+  # Find the last index where at least one IMU value is real
+  last_idx = df[IMU_cols].last_valid_index()
+  # Trim DataFrame to that index
+  df = df.loc[:last_idx]
+  df = df.reset_index() 
   return df
 
 ## If downsample using pandas.resample doesn't work, use this alternative function that uses the rows index

From 7fc2e9d8dc40bc97dcd259f06a03a3ac8f388176 Mon Sep 17 00:00:00 2001
From: cyrusParvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Sun, 25 May 2025 10:44:44 -0700
Subject: [PATCH 11/12] Did the best I could with data cleaning and preprocessing;
 not sure how to extract the features she mentioned

---
 dataCleaning.py       | 62 ++++++++++++++++++++++++-------------------
 feature_extraction.py | 56 ++++++++++++++++++++++++++++++++++++++
 filtering.py          | 23 ++++++++++++++++
 main.py               | 23 +++++++++-------
 4 files changed, 126 insertions(+), 38 deletions(-)
 create mode 100644 filtering.py

diff --git a/dataCleaning.py b/dataCleaning.py
index 0bb0d31..255fa10 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -1,10 +1,11 @@
 
 import pandas as pd
 import numpy as np 
-import pdb 
+import pdb
+
+from sklearn.model_selection import train_test_split 
 from modif_cols import tidy_emg_imu_as_measured 
 from resampling import upsample, downsample
-from scipy.signal import filtfilt, butter
 
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
@@ -60,19 +61,6 @@ def column_clean(df):
     df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data
     return df 
 
-def bandpass_filter_emg(signal, fs=1259, lowcut=20, highcut=450, order=4):
-    nyq = 0.5 * fs
-    low = lowcut / nyq
-    high = highcut / nyq
-    b, a = butter(order, [low, high], btype='band')
-    return filtfilt(b, a, signal)
-
-# IMU Low-pass Filter (<20Hz)
-def lowpass_filter_imu(signal, fs=148, cutoff=20, order=4):
-    nyq = 0.5 * fs
-    normal_cutoff = cutoff / nyq
-    b, a = butter(order, normal_cutoff, btype='low')
-    return filtfilt(b, a, signal)
 
 #melting and stuff
 def create_sensor_col(df, run_num, gender, exo): 
@@ -85,24 +73,42 @@ def create_sensor_col(df, run_num, gender, exo):
     df_pivoted.to_csv("pivoted_df.csv")
     return df_pivoted
 
-def preprocessing(full_df):
-    # num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
-    #             "total_bedrooms", "population", "households", "median_income"]
-    # cat_attribs = ["ocean_proximity"]
-    num_attribs = full_df.select_dtypes(include=['number']).columns.tolist()
-    cat_attribs = full_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
+def preprocessing_actions(full_df):
+    num_attribs = [
+        'EMG_MilliVolts_filtered',
+        'ACC X (G)_filtered',
+        'ACC Y (G)_filtered',
+        'ACC Z (G)_filtered',
+        'GYRO X (deg/s)_filtered',
+        'GYRO Y (deg/s)_filtered',
+        'GYRO Z (deg/s)_filtered',
+        # Add any other numerical features here
+    ]
+    cat_attribs = [
+        'BodyPart',
+        'gender'
+        #exo is the target variable
+    ]
+
     num_pipeline = Pipeline([
-    ("impute", SimpleImputer(strategy="median")),
-    ("standardize", StandardScaler()),
+        ("impute", SimpleImputer(strategy="median")),
+        ("standardize", StandardScaler()),
     ])
 
     cat_pipeline = Pipeline([
-    ("impute", SimpleImputer(strategy="most_frequent")),
-    ("oneHot", OneHotEncoder()),
+        ("impute", SimpleImputer(strategy="most_frequent")),
+        ("oneHot", OneHotEncoder()),
     ])
 
     preprocessing = ColumnTransformer([
-    ("num", num_pipeline, num_attribs),
-    ("cat", cat_pipeline, cat_attribs),
+        ("num", num_pipeline, num_attribs),
+        ("cat", cat_pipeline, cat_attribs),
     ])
-    return full_df
\ No newline at end of file
+    # Prepare data for modeling
+    X = full_df[num_attribs + cat_attribs]
+    y = full_df["exo"]
+    
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+    X_train_prepared = preprocessing.fit_transform(X_train)
+    X_test_prepared = preprocessing.transform(X_test)
+    return X_train_prepared, X_test_prepared, y_train, y_test, preprocessing
\ No newline at end of file
diff --git a/feature_extraction.py b/feature_extraction.py
index 29c4871..5eaa17b 100644
--- a/feature_extraction.py
+++ b/feature_extraction.py
@@ -1,7 +1,63 @@
+import pdb
 import pandas as pd
 import numpy as np
 # Calculations for Feature Extraction from Project_Guide
 
+def extract_features(df):
+    # Group by relevant columns
+    group_cols = ['BodyPart', 'run_num', 'gender', 'exo']  # adapt as needed
+    feature_rows = []
+    for group_vals, group in df.groupby(group_cols):
+        # Accelerometer features
+        a_x, a_y, a_z = group['ACC X (G)_filtered'], group['ACC Y (G)_filtered'], group['ACC Z (G)_filtered']
+        a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)
+        accel_peak = np.max(a_mag)
+        accel_mean = np.mean(a_mag)
+        accel_total = np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2))
+        accel_range = np.max(a_mag) - np.min(a_mag)
+
+        # Gyroscope features
+        w_x, w_y, w_z = group['GYRO X (deg/s)_filtered'], group['GYRO Y (deg/s)_filtered'], group['GYRO Z (deg/s)_filtered']
+        w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)
+        gyro_peak = np.max(w_mag)
+        gyro_mean = np.mean(w_mag)
+        gyro_total = np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2))
+        gyro_range = np.max(w_mag) - np.min(w_mag)
+
+        # EMG features (filtered)
+        emg = group['EMG_MilliVolts_filtered']
+        emg_mean = np.mean(emg)
+        emg_max = np.max(emg)
+        emg_min = np.min(emg)
+        emg_std = np.std(emg)
+        emg_rms = np.sqrt(np.mean(emg**2))
+
+        # Build feature dict
+        feature_dict = {
+            'BodyPart': group_vals[0],
+            'run_num': group_vals[1],
+            'gender': group_vals[2],
+            'exo': group_vals[3],
+            'accel_peak': accel_peak,
+            'accel_mean': accel_mean,
+            'accel_total': accel_total,
+            'accel_range': accel_range,
+            'gyro_peak': gyro_peak,
+            'gyro_mean': gyro_mean,
+            'gyro_total': gyro_total,
+            'gyro_range': gyro_range,
+            'emg_mean': emg_mean,
+            'emg_max': emg_max,
+            'emg_min': emg_min,
+            'emg_std': emg_std,
+            'emg_rms': emg_rms,
+        }
+        feature_rows.append(feature_dict)
+    # NOTE: this yields only one row per (BodyPart, run_num, gender, exo) group (only 17 rows), which is too few to train on.
+    # Return as a new DataFrame
+    return pd.DataFrame(feature_rows)
+
+# Older per-signal feature functions
 def compute_emg_features(df):
     signal = df['EMG_MilliVolts']
     return {
diff --git a/filtering.py b/filtering.py
new file mode 100644
index 0000000..e75973f
--- /dev/null
+++ b/filtering.py
@@ -0,0 +1,23 @@
+from scipy.signal import filtfilt, butter
+import numpy as np
+import pandas as pd
+
+def bandpass_filter_emg(series_signal, fs=1259, lowcut=20, highcut=450, order=4):
+    arr = series_signal.values if isinstance(series_signal, pd.Series) else np.array(series_signal)    
+    if np.isnan(arr).all() or len(arr) == 0: #edge case check
+        return arr
+    nyq = 0.5 * fs
+    low = lowcut / nyq
+    high = highcut / nyq
+    b, a = butter(order, [low, high], btype='band')
+    return filtfilt(b, a, series_signal)
+
+# IMU Low-pass Filter (<20Hz)
+def lowpass_filter_imu(series_signal, fs=148, cutoff=20, order=4):
+    arr = series_signal.values if isinstance(series_signal, pd.Series) else np.array(series_signal)
+    if np.isnan(arr).all() or len(arr) == 0: #edge case check
+        return arr
+    nyq = 0.5 * fs
+    normal_cutoff = cutoff / nyq
+    b, a = butter(order, normal_cutoff, btype='low')
+    return filtfilt(b, a, series_signal)
\ No newline at end of file
diff --git a/main.py b/main.py
index cad286f..a73e4ff 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,10 @@
 import pandas as pd
 import numpy as np
-from dataCleaning import read_run, column_clean, preprocessing
+from dataCleaning import read_run, column_clean, preprocessing_actions
 from dataCleaning import create_sensor_col
 from resampling import downsample
-from feature_extraction import compute_emg_features, compute_accel_features, compute_gyro_features
+from feature_extraction import extract_features
+from filtering import bandpass_filter_emg, lowpass_filter_imu
 
 import pdb
 
@@ -30,15 +31,17 @@ def overall_cleaning():
     dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.
     combined_df = pd.concat(dfs, ignore_index=True)
     # Run functions to extract features for each dataframe
-    pdb.set_trace() 
-    emg_features = compute_emg_features(combined_df)
-    accel_features = compute_accel_features(combined_df)
-    gyro_features = compute_gyro_features(combined_df)  # feature_sets = []
-    #do this on EMG cols:
-    # bandpass_filter_emg(df)
-    #on IMU cols: 
-    # lowpass_filter_imu(df)
+    # Filter each signal per (BodyPart, run_num, gender, exo) group: low-pass for the IMU columns, band-pass for EMG.
+    imu_cols = ['ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']
+    for col in imu_cols:
+        combined_df[col + '_filtered'] = combined_df.groupby(['BodyPart', 'run_num', 'gender', 'exo'])[col].transform(lowpass_filter_imu)
+    combined_df['EMG_MilliVolts_filtered'] = combined_df.groupby(['BodyPart', 'run_num', 'gender', 'exo'])['EMG_MilliVolts'].transform(bandpass_filter_emg)
     
+    features_df = extract_features(combined_df)
+    #machine learning on combined_df
+    #change the next line to call on features_df instead of combined_df when extracting features is fixed to return more data
+    X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline = preprocessing_actions(combined_df)
+    #Return preprocessing_pipeline bc want to preprocess (scale, encode, etc.) any new or test data the same way as your training data.
     return combined_df
 
 def main():

From 75a207aa0e9b6af97558cd80cb1c20feb43da321 Mon Sep 17 00:00:00 2001
From: cyrusParvereshi <cyrus.parvereshi@ucdavis.edu>
Date: Mon, 26 May 2025 19:27:06 -0700
Subject: [PATCH 12/12] added neural_net option for preprocessing_actions

---
 dataCleaning.py | 20 ++++++++++++--------
 main.py         | 10 ++++------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/dataCleaning.py b/dataCleaning.py
index 255fa10..4ae484f 100644
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -8,7 +8,7 @@
 from resampling import upsample, downsample
 
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.compose import ColumnTransformer
@@ -73,7 +73,7 @@ def create_sensor_col(df, run_num, gender, exo):
     df_pivoted.to_csv("pivoted_df.csv")
     return df_pivoted
 
-def preprocessing_actions(full_df):
+def preprocessing_actions(full_df, neural_net=False):
     num_attribs = [
         'EMG_MilliVolts_filtered',
         'ACC X (G)_filtered',
@@ -89,12 +89,16 @@ def preprocessing_actions(full_df):
         'gender'
         #exo is the target variable
     ]
-
-    num_pipeline = Pipeline([
-        ("impute", SimpleImputer(strategy="median")),
-        ("standardize", StandardScaler()),
-    ])
-
+    if neural_net:
+        num_pipeline = Pipeline([
+            ("impute", SimpleImputer(strategy="median")),
+            ("standardize", MinMaxScaler()),
+        ])  
+    else:
+        num_pipeline = Pipeline([
+            ("impute", SimpleImputer(strategy="median")),
+            ("standardize", StandardScaler()),
+        ])
     cat_pipeline = Pipeline([
         ("impute", SimpleImputer(strategy="most_frequent")),
         ("oneHot", OneHotEncoder()),
diff --git a/main.py b/main.py
index a73e4ff..df28536 100644
--- a/main.py
+++ b/main.py
@@ -5,9 +5,7 @@
 from resampling import downsample
 from feature_extraction import extract_features
 from filtering import bandpass_filter_emg, lowpass_filter_imu
-
-import pdb
-
+import pdb 
 def overall_cleaning():
     df_p3_exo = read_run("P3_Exo_1_0.csv") # 2nd run, male
     df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male
@@ -36,13 +34,13 @@ def overall_cleaning():
     for col in imu_cols:
         combined_df[col + '_filtered'] = combined_df.groupby(['BodyPart', 'run_num', 'gender', 'exo'])[col].transform(lowpass_filter_imu)
     combined_df['EMG_MilliVolts_filtered'] = combined_df.groupby(['BodyPart', 'run_num', 'gender', 'exo'])['EMG_MilliVolts'].transform(bandpass_filter_emg)
-    
-    features_df = extract_features(combined_df)
+    pdb.set_trace()
+    features_df = extract_features(combined_df) #TO-DO FIX
     #machine learning on combined_df
     #change the next line to call on features_df instead of combined_df when extracting features is fixed to return more data
     X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline = preprocessing_actions(combined_df)
     #Return preprocessing_pipeline bc want to preprocess (scale, encode, etc.) any new or test data the same way as your training data.
-    return combined_df
+    return X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline
 
 def main():
     final_df =  overall_cleaning()