From 9f3f0544e75a558d6eaf0870549d702658152364 Mon Sep 17 00:00:00 2001 From: Cyrus Parvereshi Date: Sun, 18 May 2025 15:51:39 -0700 Subject: [PATCH 01/12] more --- FeatureExtraction.py | 7 ++++--- dataCleaning.py | 8 ++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/FeatureExtraction.py b/FeatureExtraction.py index c27d629..de1624d 100644 --- a/FeatureExtraction.py +++ b/FeatureExtraction.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from dataCleaning import read_run, column_clean, preprocessing +from dataCleaning import read_run, column_clean, preprocessing, create_sensor_col import pdb def overall_cleaning(): @@ -13,8 +13,9 @@ def overall_cleaning(): df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male') df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female') df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female') - combined_df = pd.concat([df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo], ignore_index=True) - dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. + dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. + combined_df = pd.concat(dfs, ignore_index=True) + create_sensor_col(combined_df) # # Show the head of the data # df_p3_exo.describe() df_p3_noexo.head() diff --git a/dataCleaning.py b/dataCleaning.py index 087d7e8..1797251 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -54,5 +54,13 @@ def column_clean(df, run_num, gender): df.to_csv("test.csv") return df +def create_sensor_col(full_df): + df_melted = full_df.melt(var_name="measurement_type", value_name="value") + # Extract the sensor location from the column names + df_melted["sensor_location"] = df_melted["measurement_type"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)') + # Remove the sensor location prefix from the measurement type column + df_melted["measurement_type"] = df_melted["measurement_type"].str.replace(r'^(RDelt|LDelt|RBicep|LBicep)_', '', regex=True) + pdb.set_trace() + def preprocessing(full_df): pass From 77c0266fc2bb9557e42970060cf32f04cddde890 Mon Sep 17 00:00:00 2001 From: Cyrus Parvereshi Date: Mon, 19 May 2025 22:04:41 -0700 Subject: [PATCH 02/12] saving --- FeatureExtraction.py | 12 +++++++++--- dataCleaning.py | 36 ++++++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/FeatureExtraction.py b/FeatureExtraction.py index de1624d..914c85d 100644 --- a/FeatureExtraction.py +++ b/FeatureExtraction.py @@ -4,7 +4,7 @@ import pdb def overall_cleaning(): - df_p3_exo = read_run("P3_Exo_1_0.csv") # second run, male + df_p3_exo = read_run("P3_Exo_1_0.csv") # 2nd run, male df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female df_p4_noexo = read_run("P4_NoExo_1_0.csv") # 2nd female @@ -13,9 +13,15 @@ def overall_cleaning(): df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male') df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female') df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female') + + df_p3_exo = create_sensor_col(df_p3_exo) + df_p3_noexo = create_sensor_col(df_p3_noexo) + df_p4_exo = create_sensor_col(df_p4_exo) + df_p4_noexo = create_sensor_col(df_p4_noexo) + dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. - combined_df = pd.concat(dfs, ignore_index=True) - create_sensor_col(combined_df) + combined_df = pd.concat(dfs, ignore_index=True) + # # Show the head of the data # df_p3_exo.describe() df_p3_noexo.head() diff --git a/dataCleaning.py b/dataCleaning.py index 1797251..8c05090 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -31,7 +31,7 @@ def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fiel 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)', 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)' ] - return df + return df #raw data def column_clean(df, run_num, gender): #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU @@ -51,16 +51,36 @@ def column_clean(df, run_num, gender): df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data df['gender'] = gender df['run_num'] = run_num - df.to_csv("test.csv") return df -def create_sensor_col(full_df): - df_melted = full_df.melt(var_name="measurement_type", value_name="value") - # Extract the sensor location from the column names - df_melted["sensor_location"] = df_melted["measurement_type"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)') - # Remove the sensor location prefix from the measurement type column - df_melted["measurement_type"] = df_melted["measurement_type"].str.replace(r'^(RDelt|LDelt|RBicep|LBicep)_', '', regex=True) +def create_sensor_col(df): + columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'] pdb.set_trace() + # Identify all measurement columns, including EMG millivolts + measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep] + df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name="sensor_measurement", value_name="value") + # Extract the Sensor Body Position + df_melted["Sensor_Body_Position"] = df_melted["sensor_measurement"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)') + # Extract measurement type, including EMG millivolts + df_melted["measurement_type"] = df_melted["sensor_measurement"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') # Drop the original sensor_measurement column + df_melted = df_melted.drop(columns=["sensor_measurement"]) # Pivot the DataFrame so each measurement type becomes a separate column + df_melted["value"] = df_melted["value"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space) + df_melted["value"] = pd.to_numeric(df_melted["value"], errors="coerce") #make sure all values are cast to numeric + df_melted.fillna(np.nan, inplace=True) + # Pivot the DataFrame so each measurement type becomes a separate column + df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], + columns='measurement_type', values='value') + df_pivoted.columns = df_pivoted.columns.get_level_values(0) + df_pivoted.columns = df_pivoted.columns.str.strip() + df_pivoted = df_pivoted.reset_index() + pdb.set_trace() + df_pivoted.to_csv("pivoted_df.csv") + return df_pivoted + +def standardize_time_series(): + # interpolate() + pass + def preprocessing(full_df): pass From 945f380d3f06c4e59f7cff01181e0fbbcc40c7bf Mon Sep 17 00:00:00 2001 From: cyrusParvereshi Date: Mon, 19 May 2025 23:43:32 -0700 Subject: [PATCH 03/12] tried to finish cleaning but failed --- FeatureExtraction.py | 16 ++++----- dataCleaning.py | 80 +++++++++++++++++++++++++++++--------------- modif_cols.py | 39 +++++++++++++++++++++ 3 files changed, 100 insertions(+), 35 deletions(-) create mode 100644 modif_cols.py diff --git a/FeatureExtraction.py b/FeatureExtraction.py index 914c85d..1fba79e 100644 --- a/FeatureExtraction.py +++ b/FeatureExtraction.py @@ -9,15 +9,15 @@ def overall_cleaning(): df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female df_p4_noexo = read_run("P4_NoExo_1_0.csv") # 2nd female - df_p3_exo = column_clean(df_p3_exo, run_num = 2, gender = 'male') - df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male') - df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female') - df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female') + df_p3_exo = column_clean(df_p3_exo) + df_p3_noexo = column_clean(df_p3_noexo) + df_p4_exo = column_clean(df_p4_exo) + df_p4_noexo = column_clean(df_p4_noexo) - df_p3_exo = create_sensor_col(df_p3_exo) - df_p3_noexo = create_sensor_col(df_p3_noexo) - df_p4_exo = create_sensor_col(df_p4_exo) - df_p4_noexo = create_sensor_col(df_p4_noexo) + df_p3_exo = create_sensor_col(df_p3_exo, run_num = 2, gender = 'male', exo=True) + df_p3_noexo = create_sensor_col(df_p3_noexo, run_num = 1, gender = 'male', exo=False) + df_p4_exo = create_sensor_col(df_p4_exo, run_num = 1, gender = 'female', exo=True) + df_p4_noexo = create_sensor_col(df_p4_noexo, run_num = 2, gender = 'female', exo=False) dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. combined_df = pd.concat(dfs, ignore_index=True) diff --git a/dataCleaning.py b/dataCleaning.py index 8c05090..74407d6 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -2,7 +2,7 @@ import pandas as pd import numpy as np import pdb - +from modif_cols import tidy_emg_imu_as_measured # Data Labels: # Label for EMG Data shared: # Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata) @@ -27,13 +27,13 @@ def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fiel usecols = usecols, on_bad_lines='skip') df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)', 'RDelt_ACC Y (G)', 'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)', - 'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)', 'LDelt_ACC Y (G)', 'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)', - 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)', - 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)' + 'LDelt_TimeSeries', 'LDelt_EMG_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)', 'LDelt_ACC Y (G)', 'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)', + 'RBicep_TimeSeries', 'RBicep_EMG_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)', + 'LBicep_TimeSeries', 'LBicep_EMG_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)' ] return df #raw data -def column_clean(df, run_num, gender): +def column_clean(df): #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)', 'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)', @@ -45,39 +45,65 @@ def column_clean(df, run_num, gender): 'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)'] df = df.drop(extr_time_series, axis = 1) + df = df.rename(columns={'RDelt_EMG_TimeSeries': 'EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)': 'IMU_TimeSeries'}) # measurement_cols = [col for col in df.columns if (('ACC' in col or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols # df.columns = df.columns.str.strip() # Remove leading/trailing spaces (Yuxuan) # df = df.apply(pd.to_numeric, errors='coerce') # Conver t everything to numeric (Yuxuan) df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data - df['gender'] = gender - df['run_num'] = run_num return df -def create_sensor_col(df): - columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'] - pdb.set_trace() - # Identify all measurement columns, including EMG millivolts - measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep] - df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name="sensor_measurement", value_name="value") - - # Extract the Sensor Body Position - df_melted["Sensor_Body_Position"] = df_melted["sensor_measurement"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)') - # Extract measurement type, including EMG millivolts - df_melted["measurement_type"] = df_melted["sensor_measurement"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') # Drop the original sensor_measurement column - df_melted = df_melted.drop(columns=["sensor_measurement"]) # Pivot the DataFrame so each measurement type becomes a separate column - df_melted["value"] = df_melted["value"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space) - df_melted["value"] = pd.to_numeric(df_melted["value"], errors="coerce") #make sure all values are cast to numeric - df_melted.fillna(np.nan, inplace=True) - # Pivot the DataFrame so each measurement type becomes a separate column - df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], - columns='measurement_type', values='value') - df_pivoted.columns = df_pivoted.columns.get_level_values(0) +def create_sensor_col(df, run_num, gender, exo): + # muscles = ['RDelt', 'LDelt', 'RBicep', 'LBicep'] + # # Identify all measurement columns for melting + # measurement_cols = [col for col in df.columns if any(muscle in col for muscle in muscles)] + # # Melt all measurement columns (EMG, ACC, GYRO) + # df_long = df.melt( + # id_vars=['EMG_TimeSeries', 'IMU_TimeSeries'], + # value_vars=measurement_cols, + # var_name='sensor_measurement', + # value_name='value' + # ) + # # Extract Muscle, Sensor, and Axis from the column name + # df_long[['Muscle', 'Sensor', 'Axis']] = df_long['sensor_measurement'].str.extract( + # r'^(RDelt|LDelt|RBicep|LBicep)_(EMG|ACC|GYRO)[ _]?(X|Y|Z)?' + # ) + # # Build measurement type column for pivoting + # df_long['Measurement'] = np.where( + # df_long['Sensor'] == 'EMG', + # 'EMG_MV', + # df_long['Sensor'] + ' ' + df_long['Axis'] + # ) + # pdb.set_trace() + + # # Pivot so each row is a Muscle-Timepoint, columns are measurement types + # tidy = df_long.pivot_table( + # index=['EMG_TimeSeries', 'IMU_TimeSeries', 'Muscle'], + # columns='Measurement', + # values='value', + # aggfunc='first' + # ).reset_index() + # # Flatten columns so pivot table multiindexes don't persist + # tidy.columns.name = None + # tidy = tidy.rename_axis(None, axis=1) + # # sort columns + # columns_order = ['EMG_TimeSeries', 'IMU_TimeSeries', 'EMG_MV', + # 'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z', 'Muscle'] + # # Only include columns that actually exist in the data + # columns_order = [col for col in columns_order if col in tidy.columns] + + # df_pivoted = tidy[columns_order] + df_pivoted = tidy_emg_imu_as_measured(df) df_pivoted.columns = df_pivoted.columns.str.strip() - df_pivoted = df_pivoted.reset_index() pdb.set_trace() + df_pivoted = df_pivoted.reset_index() + + df_pivoted['gender'] = gender + df_pivoted['run_num'] = run_num + df_pivoted['exo'] = exo df_pivoted.to_csv("pivoted_df.csv") return df_pivoted + def standardize_time_series(): # interpolate() pass diff --git a/modif_cols.py b/modif_cols.py new file mode 100644 index 0000000..7008e02 --- /dev/null +++ b/modif_cols.py @@ -0,0 +1,39 @@ +import pandas as pd +import numpy as np + +def tidy_emg_imu_as_measured(df): + muscles = ['RDelt', 'LDelt', 'RBicep', 'LBicep'] + all_muscle_tables = [] + for muscle in muscles: + # EMG rows (as measured) + emg_df = pd.DataFrame({ + 'Muscle': muscle, + 'EMG_TimeSeries': pd.to_numeric(df['EMG_TimeSeries'], errors='coerce'), + 'IMU_TimeSeries': np.nan, + 'EMG_MV': pd.to_numeric(df[f'{muscle}_EMG_MilliVolts'], errors='coerce'), + 'ACC X': np.nan, 'ACC Y': np.nan, 'ACC Z': np.nan, + 'GYRO X': np.nan, 'GYRO Y': np.nan, 'GYRO Z': np.nan + }) + # IMU rows (as measured) + imu_df = pd.DataFrame({ + 'Muscle': muscle, + 'EMG_TimeSeries': np.nan, + 'IMU_TimeSeries': pd.to_numeric(df['IMU_TimeSeries'], errors='coerce'), + 'EMG_MV': np.nan, + 'ACC X': pd.to_numeric(df[f'{muscle}_ACC X (G)'], errors='coerce'), + 'ACC Y': pd.to_numeric(df[f'{muscle}_ACC Y (G)'], errors='coerce'), + 'ACC Z': pd.to_numeric(df[f'{muscle}_ACC Z (G)'], errors='coerce'), + 'GYRO X': pd.to_numeric(df[f'{muscle}_GYRO X (deg/s)'], errors='coerce'), + 'GYRO Y': pd.to_numeric(df[f'{muscle}_GYRO Y (deg/s)'], errors='coerce'), + 'GYRO Z': pd.to_numeric(df[f'{muscle}_GYRO Z (deg/s)'], errors='coerce') + }) + all_muscle_tables.append(pd.concat([emg_df, imu_df], ignore_index=True)) + tidy = pd.concat(all_muscle_tables, ignore_index=True) + # Order and sort (optional) + columns_order = ['Muscle', 'EMG_TimeSeries', 'IMU_TimeSeries', 'EMG_MV', + 'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z'] + tidy = tidy[columns_order] + tidy['SortTime'] = tidy['EMG_TimeSeries'].combine_first(tidy['IMU_TimeSeries']) + tidy = tidy.sort_values(['Muscle', 'SortTime']).drop(columns=['SortTime']) + df_pivoted_sorted = tidy.sort_values('EMG_TimeSeries', na_position='last') + return df_pivoted_sorted \ No newline at end of file From 218cefe2013285fcbe5084cb1aecbc7d2deb3a0f Mon Sep 17 00:00:00 2001 From: Cyrus Parvereshi Date: Wed, 21 May 2025 21:55:16 -0700 Subject: [PATCH 04/12] made jupyter notebook --- debug.ipynb | 952 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 952 insertions(+) create mode 100644 debug.ipynb diff --git a/debug.ipynb b/debug.ipynb new file mode 100644 index 0000000..c6ad330 --- /dev/null +++ b/debug.ipynb @@ -0,0 +1,952 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "e0bebc80-fe7f-4d6c-8387-5c512308e48d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np \n", + "from dataCleaning import read_run, column_clean, preprocessing, create_sensor_col\n", + "import pdb" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "3bd13a4f-893e-43c3-bc4a-afabc40bbcde", + "metadata": {}, + "outputs": [], + "source": [ + "def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fields as well as metadata)\n", + " usecols = list(range(0, 56)) \n", + " df = pd.read_csv(filename, low_memory = False, \n", + " header = 0, \n", + " skiprows=skiprows,\n", + " # names=header,\n", + " usecols = usecols,\n", + " on_bad_lines='skip') \n", + " df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)', 'RDelt_ACC Y (G)', 'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)',\n", + " 'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)', 'LDelt_ACC Y (G)', 'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',\n", + " 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',\n", + " 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'\n", + " ]\n", + " return df #raw data " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "07d4e282-5741-449e-a2a8-71f3b3a6c66d", + "metadata": {}, + "outputs": [], + "source": [ + "def column_clean(df, run_num, gender):\n", + " #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU \n", + " extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)',\n", + " 'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)', \n", + " 'LDelt_Acc Y Time Series(s)', 'LDelt_Acc Z Time Series(s)', 'LDelt_GyroXTime Series(s)',\n", + " 'LDelt_GyroYTime Series(s)', 'LDelt_GyroZTime Series(s)', 'RBicep_TimeSeries', 'RBicep_Acc X Time Series(s)',\n", + " 'RBicep_Acc Y Time Series(s)', 'RBicep_Acc Z Time Series(s)', 'RBicep_GyroXTime Series(s)',\n", + " 'RBicep_GyroYTime Series(s)', 'RBicep_GyroZTime Series(s)', 'LBicep_TimeSeries', 'LBicep_Acc X Time Series(s)',\n", + " 'LBicep_Acc Y Time Series(s)', 'LBicep_Acc Z Time Series(s)', 'LBicep_GyroXTime Series(s)', \n", + " 'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)']\n", + " \n", + " df = df.drop(extr_time_series, axis = 1)\n", + " # measurement_cols = [col for col in df.columns if (('ACC' in col or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols\n", + " # df.columns = df.columns.str.strip() # Remove leading/trailing spaces (Yuxuan)\n", + " # df = df.apply(pd.to_numeric, errors='coerce') # Conver t everything to numeric (Yuxuan)\n", + " df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data\n", + " df['gender'] = gender\n", + " df['run_num'] = run_num\n", + " return df " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c74acdb7-01e3-443f-92b0-6d2a6e45696c", + "metadata": {}, + "outputs": [], + "source": [ + "def create_sensor_col(df): \n", + " columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']\n", + " pdb.set_trace()\n", + " # Identify all measurement columns, including EMG millivolts\n", + " measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]\n", + " df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name=\"sensor_measurement\", value_name=\"value\")\n", + " \n", + " # Extract the Sensor Body Position\n", + " df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n", + " # Extract measurement type, including EMG millivolts\n", + " df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') # Drop the original sensor_measurement column\n", + " df_melted = df_melted.drop(columns=[\"sensor_measurement\"]) # Pivot the DataFrame so each measurement type becomes a separate column\n", + " df_melted[\"value\"] = df_melted[\"value\"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)\n", + " df_melted[\"value\"] = pd.to_numeric(df_melted[\"value\"], errors=\"coerce\") #make sure all values are cast to numeric\n", + " df_melted.fillna(np.nan, inplace=True)\n", + " # Pivot the DataFrame so each measurement type becomes a separate column\n", + " df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], \n", + " columns='measurement_type', values='value')\n", + " df_pivoted.columns = df_pivoted.columns.get_level_values(0)\n", + " df_pivoted.columns = df_pivoted.columns.str.strip()\n", + " df_pivoted = df_pivoted.reset_index()\n", + " pdb.set_trace()\n", + " df_pivoted.to_csv(\"pivoted_df.csv\")\n", + " return df_pivoted" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2a446022-09eb-4f86-a77a-980490d55b0e", + "metadata": {}, + "outputs": [], + "source": [ + "def standardize_time_series():\n", + " # interpolate()\n", + " pass\n", + "\n", + "def preprocessing(full_df):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fc81654-185e-4738-919a-57afa144341e", + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "dab168eb-f1dc-4d7e-99fe-3ec1042db065", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculations for Feature Extraction from Project_Guide\n", + "def compute_emg_features(signal):\n", + " return {\n", + " 'mean': np.mean(signal),\n", + " 'max': np.max(signal),\n", + " 'min': np.min(signal),\n", + " 'std': np.std(signal),\n", + " 'rms': np.sqrt(np.mean(signal**2))\n", + " }\n", + "\n", + "def compute_accel_features(a_x, a_y, a_z):\n", + " a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)\n", + " \n", + " features = {\n", + " 'peak_accel': np.max(a_mag),\n", + " 'mean_accel': np.mean(a_mag),\n", + " 'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)),\n", + " 'accel_range': np.max(a_mag) - np.min(a_mag)\n", + " }\n", + " return features\n", + "\n", + "def compute_gyro_features(w_x, w_y, w_z):\n", + " w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)\n", + " \n", + " features = {\n", + " 'peak_angular_vel': np.max(w_mag),\n", + " 'mean_angular_vel': np.mean(w_mag),\n", + " 'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)),\n", + " 'angular_vel_range': np.max(w_mag) - np.min(w_mag)\n", + " }\n", + " return features " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72ed77bf-dbc1-406c-a5b1-57dcce504502", + "metadata": {}, + "outputs": [], + "source": [ + "P3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d8305190-17ac-43fd-b7d9-d9a90e329533", + "metadata": {}, + "outputs": [], + "source": [ + "df_p3_exo = read_run(\"P3_Exo_1_0.csv\") # 2nd run, male" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ab22fd91-5089-4c1e-91e5-f2fedb609f69", + "metadata": {}, + "outputs": [], + "source": [ + "df_p3_noexo = read_run(\"P3_NoExo_1_0.csv\") # first run, male" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2b503e14-fc38-46c2-9a10-3d7ea8ade855", + "metadata": {}, + "outputs": [], + "source": [ + "df_p4_exo = read_run(\"P4_Exo_1_0.csv\") # 1st run female\n", + "df_p4_noexo = read_run(\"P4_NoExo_1_0.csv\") # 2nd female" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ad941e26-6be4-4ae5-8d57-f148509e1675", + "metadata": {}, + "outputs": [], + "source": [ + "df_p3_exo = column_clean(df_p3_exo, run_num = 2, gender = 'male')\n", + "df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male')\n", + "df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female')\n", + "df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0372fab5-cc33-4ec4-a878-a20207b8b542", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RDelt_EMG_TimeSeriesRDelt_IMU_Acc X Time Series(s)genderrun_numsensor_measurementvalueSensor_Body_Positionmeasurement_type
00.0000000male2RDelt_EMG_MilliVolts0.004868RDeltEMG_MilliVolts
10.0007940.00675male2RDelt_EMG_MilliVolts0.005875RDeltEMG_MilliVolts
20.0015880.0135male2RDelt_EMG_MilliVolts0.005203RDeltEMG_MilliVolts
30.0023820.02025male2RDelt_EMG_MilliVolts0.005539RDeltEMG_MilliVolts
40.0031770.027male2RDelt_EMG_MilliVolts0.007721RDeltEMG_MilliVolts
...........................
3852739109.265029NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
3852740109.265823NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
3852741109.266618NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
3852742109.267412NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
3852743109.268206NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
\n", + "

3852744 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " RDelt_EMG_TimeSeries RDelt_IMU_Acc X Time Series(s) gender run_num \\\n", + "0 0.000000 0 male 2 \n", + "1 0.000794 0.00675 male 2 \n", + "2 0.001588 0.0135 male 2 \n", + "3 0.002382 0.02025 male 2 \n", + "4 0.003177 0.027 male 2 \n", + "... ... ... ... ... \n", + "3852739 109.265029 NaN male 2 \n", + "3852740 109.265823 NaN male 2 \n", + "3852741 109.266618 NaN male 2 \n", + "3852742 109.267412 NaN male 2 \n", + "3852743 109.268206 NaN male 2 \n", + "\n", + " sensor_measurement value Sensor_Body_Position measurement_type \n", + "0 RDelt_EMG_MilliVolts 0.004868 RDelt EMG_MilliVolts \n", + "1 RDelt_EMG_MilliVolts 0.005875 RDelt EMG_MilliVolts \n", + "2 RDelt_EMG_MilliVolts 0.005203 RDelt EMG_MilliVolts \n", + "3 RDelt_EMG_MilliVolts 0.005539 RDelt EMG_MilliVolts \n", + "4 RDelt_EMG_MilliVolts 0.007721 RDelt EMG_MilliVolts \n", + "... ... ... ... ... \n", + "3852739 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "3852740 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "3852741 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "3852742 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "3852743 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "\n", + "[3852744 rows x 8 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df_p3_exo\n", + "columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']\n", + "measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]\n", + "df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name=\"sensor_measurement\", value_name=\"value\")\n", + " # Extract the Sensor Body Position\n", + "df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n", + "# Extract measurement type, including EMG millivolts\n", + "df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') \n", + "df_melted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9aea9bd-e997-4330-9116-e4d9c6db90a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RDelt_EMG_TimeSeriesRDelt_EMG_MilliVoltsRDelt_IMU_Acc X Time Series(s)RDelt_ACC X (G)RDelt_ACC Y (G)RDelt_ACC Z (G)RDelt_GYRO X (deg/s)RDelt_GYRO Y (deg/s)RDelt_GYRO Z (deg/s)LDelt_MilliVolts...RBicep_GYRO Z (deg/s)LBicep_MilliVoltsLBicep_ACC X (G)LBicep_ACC Y (G)LBicep_ACC Z (G)LBicep_GYRO X (deg/s)LBicep_GYRO Y (deg/s)LBicep_GYRO Z (deg/s)genderrun_num
00.0000000.00486800.07476810.90612790.2548828-30.7404575-4.25190839.358779-0.006546...-11.95419880.0419620.25073240.88085940.1972656-30.8015278.557251912.6870232male2
10.0007940.0058750.006750.07952880.9132080.2689209-30.7786255-5.9618328.5419846-0.006546...-12.53435130.0419620.24530030.87902830.2055054-29.0381689.900763513.0305347male2
20.0015880.0052030.01350.08044430.91943360.2719116-29.9312973-6.80152658.4503813-0.007217...-12.8015270.0414590.24865720.8801880.2092896-27.66412168.969465312.358779male2
30.0023820.0055390.020250.08093260.93164060.2680054-29.1068707-6.88549617.6793895-0.004196...-12.74809170.0397800.25335690.8801270.2134399-25.4427497.610687311.1145039male2
40.0031770.0077210.0270.08666990.93194580.2663574-29.3129768-7.40458016.7557254-0.005203...-11.18320660.0414590.25909420.87707520.2124634-23.5572516.09160339.7862597male2
..................................................................
137593109.2650290.017960NaNNaNNaNNaNNaNNaNNaN-0.002182...NaN0.030716NaNNaNNaNNaNNaNNaNmale2
137594109.2658230.019974NaNNaNNaNNaNNaNNaNNaN-0.001846...NaN0.035416NaNNaNNaNNaNNaNNaNmale2
137595109.2666180.020981NaNNaNNaNNaNNaNNaNNaN-0.004196...NaN0.034745NaNNaNNaNNaNNaNNaNmale2
137596109.2674120.018631NaNNaNNaNNaNNaNNaNNaN-0.005707...NaN0.035248NaNNaNNaNNaNNaNNaNmale2
137597109.2682060.019974NaNNaNNaNNaNNaNNaNNaN-0.004196...NaN0.036591NaNNaNNaNNaNNaNNaNmale2
\n", + "

137598 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " RDelt_EMG_TimeSeries RDelt_EMG_MilliVolts \\\n", + "0 0.000000 0.004868 \n", + "1 0.000794 0.005875 \n", + "2 0.001588 0.005203 \n", + "3 0.002382 0.005539 \n", + "4 0.003177 0.007721 \n", + "... ... ... \n", + "137593 109.265029 0.017960 \n", + "137594 109.265823 0.019974 \n", + "137595 109.266618 0.020981 \n", + "137596 109.267412 0.018631 \n", + "137597 109.268206 0.019974 \n", + "\n", + " RDelt_IMU_Acc X Time Series(s) RDelt_ACC X (G) RDelt_ACC Y (G) \\\n", + "0 0 0.0747681 0.9061279 \n", + "1 0.00675 0.0795288 0.913208 \n", + "2 0.0135 0.0804443 0.9194336 \n", + "3 0.02025 0.0809326 0.9316406 \n", + "4 0.027 0.0866699 0.9319458 \n", + "... ... ... ... \n", + "137593 NaN NaN NaN \n", + "137594 NaN NaN NaN \n", + "137595 NaN NaN NaN \n", + "137596 NaN NaN NaN \n", + "137597 NaN NaN NaN \n", + "\n", + " RDelt_ACC Z (G) RDelt_GYRO X (deg/s) RDelt_GYRO Y (deg/s) \\\n", + "0 0.2548828 -30.7404575 -4.2519083 \n", + "1 0.2689209 -30.7786255 -5.961832 \n", + "2 0.2719116 -29.9312973 -6.8015265 \n", + "3 0.2680054 -29.1068707 -6.8854961 \n", + "4 0.2663574 -29.3129768 -7.4045801 \n", + "... ... ... ... \n", + "137593 NaN NaN NaN \n", + "137594 NaN NaN NaN \n", + "137595 NaN NaN NaN \n", + "137596 NaN NaN NaN \n", + "137597 NaN NaN NaN \n", + "\n", + " RDelt_GYRO Z (deg/s) LDelt_MilliVolts ... RBicep_GYRO Z (deg/s) \\\n", + "0 9.358779 -0.006546 ... -11.9541988 \n", + "1 8.5419846 -0.006546 ... -12.5343513 \n", + "2 8.4503813 -0.007217 ... -12.801527 \n", + "3 7.6793895 -0.004196 ... -12.7480917 \n", + "4 6.7557254 -0.005203 ... -11.1832066 \n", + "... ... ... ... ... \n", + "137593 NaN -0.002182 ... NaN \n", + "137594 NaN -0.001846 ... NaN \n", + "137595 NaN -0.004196 ... NaN \n", + "137596 NaN -0.005707 ... NaN \n", + "137597 NaN -0.004196 ... NaN \n", + "\n", + " LBicep_MilliVolts LBicep_ACC X (G) LBicep_ACC Y (G) LBicep_ACC Z (G) \\\n", + "0 0.041962 0.2507324 0.8808594 0.1972656 \n", + "1 0.041962 0.2453003 0.8790283 0.2055054 \n", + "2 0.041459 0.2486572 0.880188 0.2092896 \n", + "3 0.039780 0.2533569 0.880127 0.2134399 \n", + "4 0.041459 0.2590942 0.8770752 0.2124634 \n", + "... ... ... ... ... \n", + "137593 0.030716 NaN NaN NaN \n", + "137594 0.035416 NaN NaN NaN \n", + "137595 0.034745 NaN NaN NaN \n", + "137596 0.035248 NaN NaN NaN \n", + "137597 0.036591 NaN NaN NaN \n", + "\n", + " LBicep_GYRO X (deg/s) LBicep_GYRO Y (deg/s) LBicep_GYRO Z (deg/s) \\\n", + "0 -30.801527 8.5572519 12.6870232 \n", + "1 -29.038168 9.9007635 13.0305347 \n", + "2 -27.6641216 8.9694653 12.358779 \n", + "3 -25.442749 7.6106873 11.1145039 \n", + "4 -23.557251 6.0916033 9.7862597 \n", + "... ... ... ... \n", + "137593 NaN NaN NaN \n", + "137594 NaN NaN NaN \n", + "137595 NaN NaN NaN \n", + "137596 NaN NaN NaN \n", + "137597 NaN NaN NaN \n", + "\n", + " gender run_num \n", + "0 male 2 \n", + "1 male 2 \n", + "2 male 2 \n", + "3 male 2 \n", + "4 male 2 \n", + "... ... ... \n", + "137593 male 2 \n", + "137594 male 2 \n", + "137595 male 2 \n", + "137596 male 2 \n", + "137597 male 2 \n", + "\n", + "[137598 rows x 32 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + " # Extract the Sensor Body Position\n", + "df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n", + "# Extract measurement type, including EMG millivolts\n", + "df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') # Drop the original sensor_measurement column\n", + "df_melted = df_melted.drop(columns=[\"sensor_measurement\"]) # Pivot the DataFrame so each measurement type becomes a separate column\n", + "df_melted[\"value\"] = df_melted[\"value\"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)\n", + "df_melted[\"value\"] = pd.to_numeric(df_melted[\"value\"], errors=\"coerce\") #make sure all values are cast to numeric\n", + "df_melted.fillna(np.nan, inplace=True)\n", + "# Pivot the DataFrame so each measurement type becomes a separate column\n", + "df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], \n", + " columns='measurement_type', values='value')\n", + "df_pivoted.columns = df_pivoted.columns.get_level_values(0)\n", + "df_pivoted.columns = df_pivoted.columns.str.strip()\n", + "df_pivoted = df_pivoted.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bf43181-6aa6-44ba-8fc1-44dc7bbeaed3", + "metadata": {}, + "outputs": [], + "source": [ + "df_p3_exo = create_sensor_col(df_p3_exo)\n", + "df_p3_noexo = create_sensor_col(df_p3_noexo)\n", + "df_p4_exo = create_sensor_col(df_p4_exo)\n", + "df_p4_noexo = create_sensor_col(df_p4_noexo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01bf5481-3d22-4143-8990-fec243ae013e", + "metadata": {}, + "outputs": [], + "source": [ + "dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.\n", + "combined_df = pd.concat(dfs, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6263007-da44-4d49-b437-0754e56a2def", + "metadata": {}, + "outputs": [], + "source": [ + "# # Show the head of the data\n", + "# df_p3_exo.describe()\n", + "df_p3_noexo.head()\n", + "# df_p4_exo.head()\n", + "# df_p4_noexo.head()\n", + "# # Choose inputs\n", + "# features = df_p3_exo[['EMG 1 (mV)', 'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']].dropna()\n", + "# features.head()\n", + "feature_sets = []\n", + "# Run functions to extract features for each dataframe\n", + "#CP: does this make sure to remove the redundant time series columns?\n", + "#can keep ACC X Time Series (s) in each sensor group, and remove any other column with 'Time Series (s)' in its name \n", + "for df in dfs:\n", + " emg_features = compute_emg_features(df['EMG 1 (mV)'])\n", + " accel_features = compute_accel_features(df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)'])\n", + " gyro_features = compute_gyro_features(df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)'])\n", + " features = {\n", + " 'emg': emg_features,\n", + " 'accel': accel_features,\n", + " 'gyro': gyro_features\n", + " }\n", + " feature_sets.append(features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccbbdce4-9c51-4b5c-b263-d23cc0c79154", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 93b616b640e81c7e53b2b76441845c04fb296350 Mon Sep 17 00:00:00 2001 From: Cyrus Parvereshi Date: Wed, 21 May 2025 22:35:15 -0700 Subject: [PATCH 05/12] worked on upsampling BEFORE melt --- FeatureExtraction.py | 10 +++++++-- UpsamplingIMU.py | 27 +++++++++++++++++++++++++ dataCleaning.py | 48 +++++--------------------------------------- 3 files changed, 40 insertions(+), 45 deletions(-) create mode 100644 UpsamplingIMU.py diff --git a/FeatureExtraction.py b/FeatureExtraction.py index 1fba79e..519c6ce 100644 --- a/FeatureExtraction.py +++ b/FeatureExtraction.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np -from dataCleaning import read_run, column_clean, preprocessing, create_sensor_col +from dataCleaning import read_run, column_clean, preprocessing +from dataCleaning import create_sensor_col, standardize_time_series import pdb def overall_cleaning(): @@ -13,6 +14,11 @@ def overall_cleaning(): df_p3_noexo = column_clean(df_p3_noexo) df_p4_exo = column_clean(df_p4_exo) df_p4_noexo = column_clean(df_p4_noexo) + #upsample IMU to match EMG + standardize_time_series(df_p3_exo) + standardize_time_series(df_p3_noexo) + standardize_time_series(df_p4_exo) + standardize_time_series(df_p4_noexo) df_p3_exo = create_sensor_col(df_p3_exo, run_num = 2, gender = 'male', exo=True) df_p3_noexo = create_sensor_col(df_p3_noexo, run_num = 1, gender = 'male', exo=False) @@ -21,7 +27,7 @@ def overall_cleaning(): dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. combined_df = pd.concat(dfs, ignore_index=True) - + # # Show the head of the data # df_p3_exo.describe() df_p3_noexo.head() diff --git a/UpsamplingIMU.py b/UpsamplingIMU.py new file mode 100644 index 0000000..39162b7 --- /dev/null +++ b/UpsamplingIMU.py @@ -0,0 +1,27 @@ +import pandas as pd +import pdb + +def upsample(df): + IMU_cols = [ + 'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)', + 'RDelt_GYRO Y (deg/s)', 'RDelt_GYRO Z (deg/s)', 'LDelt_ACC X (G)', 'LDelt_ACC Y (G)', + 'LDelt_ACC Z (G)', 'LDelt_GYRO X (deg/s)', 'LDelt_GYRO Y (deg/s)', + 'LDelt_GYRO Z (deg/s)', 'RBicep_ACC X (G)', + 'RBicep_ACC Y (G)', 'RBicep_ACC Z (G)', 'RBicep_GYRO X (deg/s)', + 'RBicep_GYRO Y (deg/s)', 'RBicep_GYRO Z (deg/s)', + 'LBicep_ACC X (G)', 'LBicep_ACC Y (G)', + 'LBicep_ACC Z (G)', 'LBicep_GYRO X (deg/s)', 'LBicep_GYRO Y (deg/s)', + 'LBicep_GYRO Z (deg/s)' + ] + df['IMU_TimeSeries'] = pd.to_numeric(df['IMU_TimeSeries']) + df['time'] = pd.to_timedelta(df['IMU_TimeSeries'], unit='s') + df = df.dropna(subset=['IMU_TimeSeries']) # Drop rows where IMU timestamps are NaN + df = df.set_index('time') + freq_nanseconds = int(0.0007941176470588235 * 1e9) # Convert to integer microseconds + IMU_upsampled = df[IMU_cols].resample(f'{freq_nanseconds}ns').asfreq() + IMU_upsampled = IMU_upsampled.fillna(method='ffill') #forward fill the values + IMU_upsampled = IMU_upsampled.apply(lambda x: pd.to_numeric(x)) + IMU_upsampled = IMU_upsampled.interpolate(method='linear') # Interpolates the data using the linear method to match EMG data + #sine interpolation is best + df_new = IMU_upsampled.join(df['RDelt_EMG_MilliVolts']) + return df_new \ No newline at end of file diff --git a/dataCleaning.py b/dataCleaning.py index 74407d6..5f84da0 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -3,6 +3,7 @@ import numpy as np import pdb from modif_cols import tidy_emg_imu_as_measured +from UpsamplingIMU import upsample # Data Labels: # Label for EMG Data shared: # Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata) @@ -52,46 +53,11 @@ def column_clean(df): df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data return df -def create_sensor_col(df, run_num, gender, exo): - # muscles = ['RDelt', 'LDelt', 'RBicep', 'LBicep'] - # # Identify all measurement columns for melting - # measurement_cols = [col for col in df.columns if any(muscle in col for muscle in muscles)] - # # Melt all measurement columns (EMG, ACC, GYRO) - # df_long = df.melt( - # id_vars=['EMG_TimeSeries', 'IMU_TimeSeries'], - # value_vars=measurement_cols, - # var_name='sensor_measurement', - # value_name='value' - # ) - # # Extract Muscle, Sensor, and Axis from the column name - # df_long[['Muscle', 'Sensor', 'Axis']] = df_long['sensor_measurement'].str.extract( - # r'^(RDelt|LDelt|RBicep|LBicep)_(EMG|ACC|GYRO)[ _]?(X|Y|Z)?' - # ) - # # Build measurement type column for pivoting - # df_long['Measurement'] = np.where( - # df_long['Sensor'] == 'EMG', - # 'EMG_MV', - # df_long['Sensor'] + ' ' + df_long['Axis'] - # ) - # pdb.set_trace() - - # # Pivot so each row is a Muscle-Timepoint, columns are measurement types - # tidy = df_long.pivot_table( - # index=['EMG_TimeSeries', 'IMU_TimeSeries', 'Muscle'], - # columns='Measurement', - # values='value', - # aggfunc='first' - # ).reset_index() - # # Flatten columns so pivot table multiindexes don't persist - # tidy.columns.name = None - # tidy = tidy.rename_axis(None, axis=1) - # # sort columns - # columns_order = ['EMG_TimeSeries', 'IMU_TimeSeries', 'EMG_MV', - # 'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z', 'Muscle'] - # # Only include columns that actually exist in the data - # columns_order = [col for col in columns_order if col in tidy.columns] +def standardize_time_series(df): + upsample(df) - # df_pivoted = tidy[columns_order] +#melting and stuff +def create_sensor_col(df, run_num, gender, exo): df_pivoted = tidy_emg_imu_as_measured(df) df_pivoted.columns = df_pivoted.columns.str.strip() pdb.set_trace() @@ -104,9 +70,5 @@ def create_sensor_col(df, run_num, gender, exo): return df_pivoted -def standardize_time_series(): - # interpolate() - pass - def preprocessing(full_df): pass From ac4c667041e35c63089651c57aed0cbdf270f021 Mon Sep 17 00:00:00 2001 From: cyrusParvereshi Date: Fri, 23 May 2025 23:38:27 -0700 Subject: [PATCH 06/12] something --- FeatureExtraction.py | 5 +++-- UpsamplingIMU.py | 27 ---------------------- dataCleaning.py | 5 +++-- resampling.py | 53 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 31 deletions(-) delete mode 100644 UpsamplingIMU.py create mode 100644 resampling.py diff --git a/FeatureExtraction.py b/FeatureExtraction.py index 519c6ce..daa5082 100644 --- a/FeatureExtraction.py +++ b/FeatureExtraction.py @@ -27,7 +27,7 @@ def overall_cleaning(): dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. combined_df = pd.concat(dfs, ignore_index=True) - + combined_df = standardize_time_series(combined_df) # # Show the head of the data # df_p3_exo.describe() df_p3_noexo.head() @@ -50,7 +50,8 @@ def overall_cleaning(): 'gyro': gyro_features } feature_sets.append(features) - + #TO-DO make Exo or No Exo variable? + #imputation/preprocessing # feature_sets now contains extracted features for each df p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = feature_sets return p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats diff --git a/UpsamplingIMU.py b/UpsamplingIMU.py deleted file mode 100644 index 39162b7..0000000 --- a/UpsamplingIMU.py +++ /dev/null @@ -1,27 +0,0 @@ -import pandas as pd -import pdb - -def upsample(df): - IMU_cols = [ - 'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)', - 'RDelt_GYRO Y (deg/s)', 'RDelt_GYRO Z (deg/s)', 'LDelt_ACC X (G)', 'LDelt_ACC Y (G)', - 'LDelt_ACC Z (G)', 'LDelt_GYRO X (deg/s)', 'LDelt_GYRO Y (deg/s)', - 'LDelt_GYRO Z (deg/s)', 'RBicep_ACC X (G)', - 'RBicep_ACC Y (G)', 'RBicep_ACC Z (G)', 'RBicep_GYRO X (deg/s)', - 'RBicep_GYRO Y (deg/s)', 'RBicep_GYRO Z (deg/s)', - 'LBicep_ACC X (G)', 'LBicep_ACC Y (G)', - 'LBicep_ACC Z (G)', 'LBicep_GYRO X (deg/s)', 'LBicep_GYRO Y (deg/s)', - 'LBicep_GYRO Z (deg/s)' - ] - df['IMU_TimeSeries'] = pd.to_numeric(df['IMU_TimeSeries']) - df['time'] = pd.to_timedelta(df['IMU_TimeSeries'], unit='s') - df = df.dropna(subset=['IMU_TimeSeries']) # Drop rows where IMU timestamps are NaN - df = df.set_index('time') - freq_nanseconds = int(0.0007941176470588235 * 1e9) # Convert to integer microseconds - IMU_upsampled = df[IMU_cols].resample(f'{freq_nanseconds}ns').asfreq() - IMU_upsampled = IMU_upsampled.fillna(method='ffill') #forward fill the values - IMU_upsampled = IMU_upsampled.apply(lambda x: pd.to_numeric(x)) - IMU_upsampled = IMU_upsampled.interpolate(method='linear') # Interpolates the data using the linear method to match EMG data - #sine interpolation is best - df_new = IMU_upsampled.join(df['RDelt_EMG_MilliVolts']) - return df_new \ No newline at end of file diff --git a/dataCleaning.py b/dataCleaning.py index 5f84da0..83fd67d 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -3,7 +3,7 @@ import numpy as np import pdb from modif_cols import tidy_emg_imu_as_measured -from UpsamplingIMU import upsample +from resampling import upsample, downsample # Data Labels: # Label for EMG Data shared: # Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata) @@ -54,7 +54,8 @@ def column_clean(df): return df def standardize_time_series(df): - upsample(df) + df = downsample(df) + return df #melting and stuff def create_sensor_col(df, run_num, gender, exo): diff --git a/resampling.py b/resampling.py new file mode 100644 index 0000000..22da474 --- /dev/null +++ b/resampling.py @@ -0,0 +1,53 @@ +import pandas as pd +import numpy as np +def upsample(df): + IMU_cols = [ + 'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)', + 'RDelt_GYRO Y (deg/s)', 'RDelt_GYRO Z (deg/s)', 'LDelt_ACC X (G)', 'LDelt_ACC Y (G)', + 'LDelt_ACC Z (G)', 'LDelt_GYRO X (deg/s)', 'LDelt_GYRO Y (deg/s)', + 'LDelt_GYRO Z (deg/s)', 'RBicep_ACC X (G)', + 'RBicep_ACC Y (G)', 'RBicep_ACC Z (G)', 'RBicep_GYRO X (deg/s)', + 'RBicep_GYRO Y (deg/s)', 'RBicep_GYRO Z (deg/s)', + 'LBicep_ACC X (G)', 'LBicep_ACC Y (G)', + 'LBicep_ACC Z (G)', 'LBicep_GYRO X (deg/s)', 'LBicep_GYRO Y (deg/s)', + 'LBicep_GYRO Z (deg/s)' + ] + df['IMU_TimeSeries'] = pd.to_numeric(df['IMU_TimeSeries']) + df['time'] = pd.to_timedelta(df['IMU_TimeSeries'], unit='s') + df = df.dropna(subset=['IMU_TimeSeries']) # Drop rows where IMU timestamps are NaN + df = df.set_index('time') + freq_nanseconds = int(0.0007941176470588235 * 1e9) # Convert to integer microseconds + IMU_upsampled = df[IMU_cols].resample(f'{freq_nanseconds}ns').asfreq() + IMU_upsampled = IMU_upsampled.fillna(method='ffill') #forward fill the values + IMU_upsampled = IMU_upsampled.apply(lambda x: pd.to_numeric(x)) + IMU_upsampled = IMU_upsampled.interpolate(method='linear') # Interpolates the data using the linear method to match EMG data + #sine interpolation is best + df_new = IMU_upsampled.join(df['RDelt_EMG_MilliVolts']) + return df_new + +def downsample(df): + EMG_cols = ['RDelt_EMG_MilliVolts', 'LDelt_EMG_MilliVolts', 'RBicep_EMG_MilliVolts', 'LBicep_EMG_MilliVolts'] + + df['EMG_TimeSeries'] = pd.to_numeric(df['EMG_TimeSeries']) + df['time'] = pd.to_timedelta(df['EMG_TimeSeries'], unit='s') + df = df.set_index('time') + + df[EMG_cols] = df[EMG_cols].resample('6.75ms').asfreq() # Scales these columns to be the same length as IMU data + return df + +## If downsample using pandas.resample doesn't work, use this alternative function that uses the rows index +def alternative(df): + high_rate = 1259 + low_rate = 148 + step = high_rate / low_rate + + EMG_cols = ['RDelt_EMG_MilliVolts', 'LDelt_EMG_MilliVolts', 'RBicep_EMG_MilliVolts', 'LBicep_EMG_MilliVolts'] + + # Indexes to sample + indices = np.round(np.arange(0, len(df), step)).astype(int) + indices = indices[indices < len(df)] # Ensure we stay within bounds + + # Downsample using nearest index + df[EMG_cols] = df[EMG_cols].iloc[indices].reset_index(drop=True) + df = df.drop(columns=['EMG_TimeSeries']) + return df From db5326756b41e805b78daa87ef00ba5c9764331d Mon Sep 17 00:00:00 2001 From: cyrusParvereshi Date: Fri, 23 May 2025 23:42:16 -0700 Subject: [PATCH 07/12] adde din filters --- FeatureExtraction.py | 2 +- dataCleaning.py | 14 +++++++++++ dk.txt | 57 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 dk.txt diff --git a/FeatureExtraction.py b/FeatureExtraction.py index daa5082..6408401 100644 --- a/FeatureExtraction.py +++ b/FeatureExtraction.py @@ -30,7 +30,7 @@ def overall_cleaning(): combined_df = standardize_time_series(combined_df) # # Show the head of the data # df_p3_exo.describe() - df_p3_noexo.head() + # df_p3_noexo.head() # df_p4_exo.head() # df_p4_noexo.head() # # Choose inputs diff --git a/dataCleaning.py b/dataCleaning.py index 83fd67d..07c0acc 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -53,6 +53,20 @@ def column_clean(df): df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data return df +def bandpass_filter_emg(signal, fs=1259, lowcut=20, highcut=450, order=4): + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + b, a = butter(order, [low, high], btype='band') + return filtfilt(b, a, signal) + +# IMU Low-pass Filter (<20Hz) +def lowpass_filter_imu(signal, fs=148, cutoff=20, order=4): + nyq = 0.5 * fs + normal_cutoff = cutoff / nyq + b, a = butter(order, normal_cutoff, btype='low') + return filtfilt(b, a, signal) + def standardize_time_series(df): df = downsample(df) return df diff --git a/dk.txt b/dk.txt new file mode 100644 index 0000000..dfcadb6 --- /dev/null +++ b/dk.txt @@ -0,0 +1,57 @@ +import pandas as pd +import numpy as np + +def emg_to_imu_asof_all_muscles(df, emg_time_col='EMG_TimeSeries', imu_time_col='IMU_TimeSeries'): + """ + For each muscle group, create a tidy DataFrame where each row is an EMG measurement, + with corresponding IMU data (if available) joined via pandas.merge_asof (backward). + IMU columns may have NaNs if not available at that EMG time. + Returns a concatenated DataFrame for all muscles with a 'Muscle' column. + """ + muscle_names = [] + for col in df.columns: + if '_EMG_MilliVolts' in col: + muscle = col.replace('_EMG_MilliVolts', '') + muscle_names.append(muscle) + + all_muscles = [] + for muscle in muscle_names: + # Build EMG DataFrame for this muscle + emg_df = pd.DataFrame({ + 'EMG_TimeSeries': pd.to_numeric(df[emg_time_col], errors='coerce'), + 'EMG_MV': pd.to_numeric(df[f'{muscle}_EMG_MilliVolts'], errors='coerce') + }).dropna(subset=['EMG_TimeSeries', 'EMG_MV']) + + # Build IMU DataFrame for this muscle + imu_df = pd.DataFrame({ + 'IMU_TimeSeries': pd.to_numeric(df[imu_time_col], errors='coerce'), + 'ACC X': pd.to_numeric(df.get(f'{muscle}_ACC X (G)'), errors='coerce'), + 'ACC Y': pd.to_numeric(df.get(f'{muscle}_ACC Y (G)'), errors='coerce'), + 'ACC Z': pd.to_numeric(df.get(f'{muscle}_ACC Z (G)'), errors='coerce'), + 'GYRO X': pd.to_numeric(df.get(f'{muscle}_GYRO X (deg/s)'), errors='coerce'), + 'GYRO Y': pd.to_numeric(df.get(f'{muscle}_GYRO Y (deg/s)'), errors='coerce'), + 'GYRO Z': pd.to_numeric(df.get(f'{muscle}_GYRO Z (deg/s)'), errors='coerce') + }) + + # Merge IMU onto EMG (backward: most recent IMU) + merged = pd.merge_asof( + emg_df.sort_values('EMG_TimeSeries'), + imu_df.sort_values('IMU_TimeSeries'), + left_on='EMG_TimeSeries', + right_on='IMU_TimeSeries', + direction='backward' + ) + + merged['Muscle'] = muscle + all_muscles.append(merged) + + tidy = pd.concat(all_muscles, ignore_index=True) + # Order columns + cols = ['Muscle', 'EMG_TimeSeries', 'EMG_MV', 'IMU_TimeSeries', + 'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z'] + tidy = tidy[cols] + return tidy + +# Usage example: +# tidy_df = emg_to_imu_asof_all_muscles(df) +# tidy_df.to_csv('emg_imu_tidy.csv', index=False) \ No newline at end of file From 8aeecc817b365e1915a453991e3c69ccf427d416 Mon Sep 17 00:00:00 2001 From: cyrusParvereshi Date: Fri, 23 May 2025 23:44:41 -0700 Subject: [PATCH 08/12] separated files out --- feature_extraction.py | 51 +++++++++++++++++++++++++++++++++ FeatureExtraction.py => main.py | 36 +++-------------------- 2 files changed, 55 insertions(+), 32 deletions(-) create mode 100644 feature_extraction.py rename FeatureExtraction.py => main.py (72%) diff --git a/feature_extraction.py b/feature_extraction.py new file mode 100644 index 0000000..8f9c4cb --- /dev/null +++ b/feature_extraction.py @@ -0,0 +1,51 @@ +# Calculations for Feature Extraction from Project_Guide +def compute_emg_features(signal): + return { + 'mean': np.mean(signal), + 'max': np.max(signal), + 'min': np.min(signal), + 'std': np.std(signal), + 'rms': np.sqrt(np.mean(signal**2)) + } + +def compute_accel_features(a_x, a_y, a_z): + a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2) + + features = { + 'peak_accel': np.max(a_mag), + 'mean_accel': np.mean(a_mag), + 'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)), + 'accel_range': np.max(a_mag) - np.min(a_mag) + } + return features + +def compute_gyro_features(w_x, w_y, w_z): + w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2) + + features = { + 'peak_angular_vel': np.max(w_mag), + 'mean_angular_vel': np.mean(w_mag), + 'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)), + 'angular_vel_range': np.max(w_mag) - np.min(w_mag) + } + return features + + + # fft_mean = mean(valid_freqs * valid_fft) + # fft_median = median(valid_freqs * valid_fft) + # fft_power = np.sum(valid_fft**2) + + # feature_row = { + # 'emg_max': emg.max(), + # 'emg_min': emg.min(), + # 'emg_rms': np.sqrt(np.mean(emg**2)), + # 'acc_peak': np.linalg.norm(acc, axis=1).max(), + # 'acc_range': np.ptp(np.linalg.norm(acc, axis=1)), + # 'gyro_peak': np.linalg.norm(gyro, axis=1).max(), + # 'gyro_range': np.ptp(np.linalg.norm(gyro, axis=1)), + # 'emg_fft_mean_freq': fft_mean, + # 'emg_fft_median_freq': fft_median, + # 'emg_fft_power': fft_power, + # 'label': label, + # 'gender': gender + # } \ No newline at end of file diff --git a/FeatureExtraction.py b/main.py similarity index 72% rename from FeatureExtraction.py rename to main.py index 6408401..9142f53 100644 --- a/FeatureExtraction.py +++ b/main.py @@ -4,7 +4,7 @@ from dataCleaning import create_sensor_col, standardize_time_series import pdb -def overall_cleaning(): +def main_cleaning(): df_p3_exo = read_run("P3_Exo_1_0.csv") # 2nd run, male df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female @@ -56,38 +56,10 @@ def overall_cleaning(): p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = feature_sets return p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats -# Calculations for Feature Extraction from Project_Guide -def compute_emg_features(signal): - return { - 'mean': np.mean(signal), - 'max': np.max(signal), - 'min': np.min(signal), - 'std': np.std(signal), - 'rms': np.sqrt(np.mean(signal**2)) - } -def compute_accel_features(a_x, a_y, a_z): - a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2) - - features = { - 'peak_accel': np.max(a_mag), - 'mean_accel': np.mean(a_mag), - 'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)), - 'accel_range': np.max(a_mag) - np.min(a_mag) - } - return features - -def compute_gyro_features(w_x, w_y, w_z): - w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2) - - features = { - 'peak_angular_vel': np.max(w_mag), - 'mean_angular_vel': np.mean(w_mag), - 'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)), - 'angular_vel_range': np.max(w_mag) - np.min(w_mag) - } - return features +def main(): + p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning() if __name__ == '__main__': - p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning() \ No newline at end of file + main() \ No newline at end of file From 4536564f684233ad75b726a8fcb3ee6491658c1d Mon Sep 17 00:00:00 2001 From: cyrusParvereshi Date: Sat, 24 May 2025 22:08:43 -0700 Subject: [PATCH 09/12] saving --- feature_extraction.py | 2 ++ main.py | 11 +++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/feature_extraction.py b/feature_extraction.py index 8f9c4cb..a7edb07 100644 --- a/feature_extraction.py +++ b/feature_extraction.py @@ -1,3 +1,5 @@ +import pandas as pd +import numpy as np # Calculations for Feature Extraction from Project_Guide def compute_emg_features(signal): return { diff --git a/main.py b/main.py index 9142f53..0cd1b56 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,7 @@ from dataCleaning import create_sensor_col, standardize_time_series import pdb -def main_cleaning(): +def overall_cleaning(): df_p3_exo = read_run("P3_Exo_1_0.csv") # 2nd run, male df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female @@ -15,11 +15,10 @@ def main_cleaning(): df_p4_exo = column_clean(df_p4_exo) df_p4_noexo = column_clean(df_p4_noexo) #upsample IMU to match EMG - standardize_time_series(df_p3_exo) - standardize_time_series(df_p3_noexo) - standardize_time_series(df_p4_exo) - standardize_time_series(df_p4_noexo) - + df_p3_exo = standardize_time_series(df_p3_exo) + df_p3_noexo = standardize_time_series(df_p3_noexo) + df_p4_exo = standardize_time_series(df_p4_exo) + df_p4_noexo = standardize_time_series(df_p4_noexo) df_p3_exo = create_sensor_col(df_p3_exo, run_num = 2, gender = 'male', exo=True) df_p3_noexo = create_sensor_col(df_p3_noexo, run_num = 1, gender = 'male', exo=False) df_p4_exo = create_sensor_col(df_p4_exo, run_num = 1, gender = 'female', exo=True) From 0d8b0b0d4bffe130bebd7ceb4e29672ee262652e Mon Sep 17 00:00:00 2001 From: cyrusParvereshi Date: Sun, 25 May 2025 00:26:05 -0700 Subject: [PATCH 10/12] did pretty much all data cleaning have to fix preprocessing and filters --- dataCleaning.py | 35 ++++++++++++++++++++------ feature_extraction.py | 46 ++++++++++++++++++---------------- main.py | 58 ++++++++++++++++--------------------------- modif_cols.py | 56 ++++++++++++++++------------------------- resampling.py | 22 ++++++++++++++-- 5 files changed, 113 insertions(+), 104 deletions(-) diff --git a/dataCleaning.py b/dataCleaning.py index 07c0acc..0bb0d31 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -4,6 +4,13 @@ import pdb from modif_cols import tidy_emg_imu_as_measured from resampling import upsample, downsample +from scipy.signal import filtfilt, butter + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import OneHotEncoder +from sklearn.compose import ColumnTransformer # Data Labels: # Label for EMG Data shared: # Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata) @@ -67,23 +74,35 @@ def lowpass_filter_imu(signal, fs=148, cutoff=20, order=4): b, a = butter(order, normal_cutoff, btype='low') return filtfilt(b, a, signal) -def standardize_time_series(df): - df = downsample(df) - return df - #melting and stuff def create_sensor_col(df, run_num, gender, exo): df_pivoted = tidy_emg_imu_as_measured(df) df_pivoted.columns = df_pivoted.columns.str.strip() - pdb.set_trace() df_pivoted = df_pivoted.reset_index() - df_pivoted['gender'] = gender df_pivoted['run_num'] = run_num df_pivoted['exo'] = exo df_pivoted.to_csv("pivoted_df.csv") return df_pivoted - def preprocessing(full_df): - pass + # num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms", + # "total_bedrooms", "population", "households", "median_income"] + # cat_attribs = ["ocean_proximity"] + num_attribs = full_df.select_dtypes(include=['number']).columns.tolist() + cat_attribs = full_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist() + num_pipeline = Pipeline([ + ("impute", SimpleImputer(strategy="median")), + ("standardize", StandardScaler()), + ]) + + cat_pipeline = Pipeline([ + ("impute", SimpleImputer(strategy="most_frequent")), + ("oneHot", OneHotEncoder()), + ]) + + preprocessing = ColumnTransformer([ + ("num", num_pipeline, num_attribs), + ("cat", cat_pipeline, cat_attribs), + ]) + return full_df \ No newline at end of file diff --git a/feature_extraction.py b/feature_extraction.py index a7edb07..29c4871 100644 --- a/feature_extraction.py +++ b/feature_extraction.py @@ -1,7 +1,9 @@ import pandas as pd import numpy as np # Calculations for Feature Extraction from Project_Guide -def compute_emg_features(signal): + +def compute_emg_features(df): + signal = df['EMG_MilliVolts'] return { 'mean': np.mean(signal), 'max': np.max(signal), @@ -10,7 +12,8 @@ def compute_emg_features(signal): 'rms': np.sqrt(np.mean(signal**2)) } -def compute_accel_features(a_x, a_y, a_z): +def compute_accel_features(df): + a_x = df['ACC X (G)'], a_y = df['ACC Y (G)'], a_z = df['ACC Z (G)'] a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2) features = { @@ -21,7 +24,8 @@ def compute_accel_features(a_x, a_y, a_z): } return features -def compute_gyro_features(w_x, w_y, w_z): +def compute_gyro_features(df): + w_x = df['GYRO X (deg/s)'], w_y = df['GYRO Y (deg/s)'], w_z = df['GYRO Z (deg/s)'] w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2) features = { @@ -31,23 +35,21 @@ def compute_gyro_features(w_x, w_y, w_z): 'angular_vel_range': np.max(w_mag) - np.min(w_mag) } return features + # fft_mean = mean(valid_freqs * valid_fft) + # fft_median = median(valid_freqs * valid_fft) + # fft_power = np.sum(valid_fft**2) - - # fft_mean = mean(valid_freqs * valid_fft) - # fft_median = median(valid_freqs * valid_fft) - # fft_power = np.sum(valid_fft**2) - - # feature_row = { - # 'emg_max': emg.max(), - # 'emg_min': emg.min(), - # 'emg_rms': np.sqrt(np.mean(emg**2)), - # 'acc_peak': np.linalg.norm(acc, axis=1).max(), - # 'acc_range': np.ptp(np.linalg.norm(acc, axis=1)), - # 'gyro_peak': np.linalg.norm(gyro, axis=1).max(), - # 'gyro_range': np.ptp(np.linalg.norm(gyro, axis=1)), - # 'emg_fft_mean_freq': fft_mean, - # 'emg_fft_median_freq': fft_median, - # 'emg_fft_power': fft_power, - # 'label': label, - # 'gender': gender - # } \ No newline at end of file + # feature_row = { + # 'emg_max': emg.max(), + # 'emg_min': emg.min(), + # 'emg_rms': np.sqrt(np.mean(emg**2)), + # 'acc_peak': np.linalg.norm(acc, axis=1).max(), + # 'acc_range': np.ptp(np.linalg.norm(acc, axis=1)), + # 'gyro_peak': np.linalg.norm(gyro, axis=1).max(), + # 'gyro_range': np.ptp(np.linalg.norm(gyro, axis=1)), + # 'emg_fft_mean_freq': fft_mean, + # 'emg_fft_median_freq': fft_median, + # 'emg_fft_power': fft_power, + # 'label': label, + # 'gender': gender + # } \ No newline at end of file diff --git a/main.py b/main.py index 0cd1b56..cad286f 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,10 @@ import pandas as pd import numpy as np from dataCleaning import read_run, column_clean, preprocessing -from dataCleaning import create_sensor_col, standardize_time_series +from dataCleaning import create_sensor_col +from resampling import downsample +from feature_extraction import compute_emg_features, compute_accel_features, compute_gyro_features + import pdb def overall_cleaning(): @@ -14,51 +17,32 @@ def overall_cleaning(): df_p3_noexo = column_clean(df_p3_noexo) df_p4_exo = column_clean(df_p4_exo) df_p4_noexo = column_clean(df_p4_noexo) - #upsample IMU to match EMG - df_p3_exo = standardize_time_series(df_p3_exo) - df_p3_noexo = standardize_time_series(df_p3_noexo) - df_p4_exo = standardize_time_series(df_p4_exo) - df_p4_noexo = standardize_time_series(df_p4_noexo) + #downsample EMG to match IMU + df_p3_exo = downsample(df_p3_exo) + df_p3_noexo = downsample(df_p3_noexo) + df_p4_exo = downsample(df_p4_exo) + df_p4_noexo = downsample(df_p4_noexo) + #melt sensor columns into a body part sensor df_p3_exo = create_sensor_col(df_p3_exo, run_num = 2, gender = 'male', exo=True) df_p3_noexo = create_sensor_col(df_p3_noexo, run_num = 1, gender = 'male', exo=False) df_p4_exo = create_sensor_col(df_p4_exo, run_num = 1, gender = 'female', exo=True) df_p4_noexo = create_sensor_col(df_p4_noexo, run_num = 2, gender = 'female', exo=False) - dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. combined_df = pd.concat(dfs, ignore_index=True) - combined_df = standardize_time_series(combined_df) - # # Show the head of the data - # df_p3_exo.describe() - # df_p3_noexo.head() - # df_p4_exo.head() - # df_p4_noexo.head() - # # Choose inputs - # features = df_p3_exo[['EMG 1 (mV)', 'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']].dropna() - # features.head() - feature_sets = [] # Run functions to extract features for each dataframe - #CP: does this make sure to remove the redundant time series columns? - #can keep ACC X Time Series (s) in each sensor group, and remove any other column with 'Time Series (s)' in its name - for df in dfs: - emg_features = compute_emg_features(df['EMG 1 (mV)']) - accel_features = compute_accel_features(df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)']) - gyro_features = compute_gyro_features(df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)']) - features = { - 'emg': emg_features, - 'accel': accel_features, - 'gyro': gyro_features - } - feature_sets.append(features) - #TO-DO make Exo or No Exo variable? - #imputation/preprocessing - # feature_sets now contains extracted features for each df - p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = feature_sets - return p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats - + pdb.set_trace() + emg_features = compute_emg_features(combined_df) + accel_features = compute_accel_features(combined_df) + gyro_features = compute_gyro_features(combined_df) # feature_sets = [] + #do this on EMG cols: + # bandpass_filter_emg(df) + #on IMU cols: + # lowpass_filter_imu(df) + + return combined_df def main(): - p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning() - + final_df = overall_cleaning() if __name__ == '__main__': main() \ No newline at end of file diff --git a/modif_cols.py b/modif_cols.py index 7008e02..2556d16 100644 --- a/modif_cols.py +++ b/modif_cols.py @@ -2,38 +2,24 @@ import numpy as np def tidy_emg_imu_as_measured(df): - muscles = ['RDelt', 'LDelt', 'RBicep', 'LBicep'] - all_muscle_tables = [] - for muscle in muscles: - # EMG rows (as measured) - emg_df = pd.DataFrame({ - 'Muscle': muscle, - 'EMG_TimeSeries': pd.to_numeric(df['EMG_TimeSeries'], errors='coerce'), - 'IMU_TimeSeries': np.nan, - 'EMG_MV': pd.to_numeric(df[f'{muscle}_EMG_MilliVolts'], errors='coerce'), - 'ACC X': np.nan, 'ACC Y': np.nan, 'ACC Z': np.nan, - 'GYRO X': np.nan, 'GYRO Y': np.nan, 'GYRO Z': np.nan - }) - # IMU rows (as measured) - imu_df = pd.DataFrame({ - 'Muscle': muscle, - 'EMG_TimeSeries': np.nan, - 'IMU_TimeSeries': pd.to_numeric(df['IMU_TimeSeries'], errors='coerce'), - 'EMG_MV': np.nan, - 'ACC X': pd.to_numeric(df[f'{muscle}_ACC X (G)'], errors='coerce'), - 'ACC Y': pd.to_numeric(df[f'{muscle}_ACC Y (G)'], errors='coerce'), - 'ACC Z': pd.to_numeric(df[f'{muscle}_ACC Z (G)'], errors='coerce'), - 'GYRO X': pd.to_numeric(df[f'{muscle}_GYRO X (deg/s)'], errors='coerce'), - 'GYRO Y': pd.to_numeric(df[f'{muscle}_GYRO Y (deg/s)'], errors='coerce'), - 'GYRO Z': pd.to_numeric(df[f'{muscle}_GYRO Z (deg/s)'], errors='coerce') - }) - all_muscle_tables.append(pd.concat([emg_df, imu_df], ignore_index=True)) - tidy = pd.concat(all_muscle_tables, ignore_index=True) - # Order and sort (optional) - columns_order = ['Muscle', 'EMG_TimeSeries', 'IMU_TimeSeries', 'EMG_MV', - 'ACC X', 'ACC Y', 'ACC Z', 'GYRO X', 'GYRO Y', 'GYRO Z'] - tidy = tidy[columns_order] - tidy['SortTime'] = tidy['EMG_TimeSeries'].combine_first(tidy['IMU_TimeSeries']) - tidy = tidy.sort_values(['Muscle', 'SortTime']).drop(columns=['SortTime']) - df_pivoted_sorted = tidy.sort_values('EMG_TimeSeries', na_position='last') - return df_pivoted_sorted \ No newline at end of file + # Identify columns to melt (all sensor columns) + measurement_cols = [c for c in df.columns if any( + sensor in c for sensor in ['RDelt', 'LDelt', 'RBicep', 'LBicep'])] + id_vars = [c for c in df.columns if c not in measurement_cols] + # Melt + df_long = df.melt(id_vars=id_vars, value_vars=measurement_cols, + var_name='Measurement', value_name='Value') + # Extract BodyPart and Signal + df_long['BodyPart'] = df_long['Measurement'].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)') + df_long['Signal'] = df_long['Measurement'].str.replace(r'^(RDelt|LDelt|RBicep|LBicep)_', '', regex=True) + + # Pivot so each signal is a separate column + df_wide = df_long.pivot_table( + index=id_vars + ['BodyPart'], + columns='Signal', + values='Value' + ).reset_index() + # flatten columns if needed + df_wide.columns.name = None + df_wide.columns = [str(col) for col in df_wide.columns] + return df_wide \ No newline at end of file diff --git a/resampling.py b/resampling.py index 22da474..fd5df69 100644 --- a/resampling.py +++ b/resampling.py @@ -1,5 +1,7 @@ import pandas as pd import numpy as np +import pdb + def upsample(df): IMU_cols = [ 'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)', @@ -27,12 +29,28 @@ def upsample(df): def downsample(df): EMG_cols = ['RDelt_EMG_MilliVolts', 'LDelt_EMG_MilliVolts', 'RBicep_EMG_MilliVolts', 'LBicep_EMG_MilliVolts'] - + IMU_cols = [ + 'RDelt_ACC X (G)', 'RDelt_ACC Y (G)', 'RDelt_ACC Z (G)', 'RDelt_GYRO X (deg/s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GYRO Z (deg/s)', + 'LDelt_ACC X (G)', 'LDelt_ACC Y (G)', 'LDelt_ACC Z (G)', 'LDelt_GYRO X (deg/s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GYRO Z (deg/s)', + 'RBicep_ACC X (G)', 'RBicep_ACC Y (G)', 'RBicep_ACC Z (G)', 'RBicep_GYRO X (deg/s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GYRO Z (deg/s)', + 'LBicep_ACC X (G)', 'LBicep_ACC Y (G)', 'LBicep_ACC Z (G)', 'LBicep_GYRO X (deg/s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GYRO Z (deg/s)' + ] df['EMG_TimeSeries'] = pd.to_numeric(df['EMG_TimeSeries']) + for col in df.columns: + # Optionally skip time columns if you want to preserve them as objects/strings + if "TimeSeries" in col: + continue + df[col] = pd.to_numeric(df[col], errors='coerce') df['time'] = pd.to_timedelta(df['EMG_TimeSeries'], unit='s') df = df.set_index('time') - df[EMG_cols] = df[EMG_cols].resample('6.75ms').asfreq() # Scales these columns to be the same length as IMU data + df[EMG_cols] = df[EMG_cols].interpolate(method='linear') + df[EMG_cols] = df[EMG_cols].fillna(method='bfill').fillna(method='ffill') #back fill and forward fill all Nans. + # Find the last index where at least one IMU value is real + last_idx = df[IMU_cols].last_valid_index() + # Trim DataFrame to that index + df = df.loc[:last_idx] + df = df.reset_index() return df ## If downsample using pandas.resample doesn't work, use this alternative function that uses the rows index From 7fc2e9d8dc40bc97dcd259f06a03a3ac8f388176 Mon Sep 17 00:00:00 2001 From: cyrusParvereshi Date: Sun, 25 May 2025 10:44:44 -0700 Subject: [PATCH 11/12] did best could with data cleaning and preprocessing, not sure how to extract shit for the features she mentioned --- dataCleaning.py | 62 ++++++++++++++++++++++++------------------- feature_extraction.py | 56 ++++++++++++++++++++++++++++++++++++++ filtering.py | 23 ++++++++++++++++ main.py | 23 +++++++++------- 4 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 filtering.py diff --git a/dataCleaning.py b/dataCleaning.py index 0bb0d31..255fa10 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -1,10 +1,11 @@ import pandas as pd import numpy as np -import pdb +import pdb + +from sklearn.model_selection import train_test_split from modif_cols import tidy_emg_imu_as_measured from resampling import upsample, downsample -from scipy.signal import filtfilt, butter from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler @@ -60,19 +61,6 @@ def column_clean(df): df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data return df -def bandpass_filter_emg(signal, fs=1259, lowcut=20, highcut=450, order=4): - nyq = 0.5 * fs - low = lowcut / nyq - high = highcut / nyq - b, a = butter(order, [low, high], btype='band') - return filtfilt(b, a, signal) - -# IMU Low-pass Filter (<20Hz) -def lowpass_filter_imu(signal, fs=148, cutoff=20, order=4): - nyq = 0.5 * fs - normal_cutoff = cutoff / nyq - b, a = butter(order, normal_cutoff, btype='low') - return filtfilt(b, a, signal) #melting and stuff def create_sensor_col(df, run_num, gender, exo): @@ -85,24 +73,42 @@ def create_sensor_col(df, run_num, gender, exo): df_pivoted.to_csv("pivoted_df.csv") return df_pivoted -def preprocessing(full_df): - # num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms", - # "total_bedrooms", "population", "households", "median_income"] - # cat_attribs = ["ocean_proximity"] - num_attribs = full_df.select_dtypes(include=['number']).columns.tolist() - cat_attribs = full_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist() +def preprocessing_actions(full_df): + num_attribs = [ + 'EMG_MilliVolts_filtered', + 'ACC X (G)_filtered', + 'ACC Y (G)_filtered', + 'ACC Z (G)_filtered', + 'GYRO X (deg/s)_filtered', + 'GYRO Y (deg/s)_filtered', + 'GYRO Z (deg/s)_filtered', + # Add any other numerical features here + ] + cat_attribs = [ + 'BodyPart', + 'gender' + #exo is the target variable + ] + num_pipeline = Pipeline([ - ("impute", SimpleImputer(strategy="median")), - ("standardize", StandardScaler()), + ("impute", SimpleImputer(strategy="median")), + ("standardize", StandardScaler()), ]) cat_pipeline = Pipeline([ - ("impute", SimpleImputer(strategy="most_frequent")), - ("oneHot", OneHotEncoder()), + ("impute", SimpleImputer(strategy="most_frequent")), + ("oneHot", OneHotEncoder()), ]) preprocessing = ColumnTransformer([ - ("num", num_pipeline, num_attribs), - ("cat", cat_pipeline, cat_attribs), + ("num", num_pipeline, num_attribs), + ("cat", cat_pipeline, cat_attribs), ]) - return full_df \ No newline at end of file + # Prepare data for modeling + X = full_df[num_attribs + cat_attribs] + y = full_df["exo"] + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) + X_train_prepared = preprocessing.fit_transform(X_train) + X_test_prepared = preprocessing.transform(X_test) + return X_train_prepared, X_test_prepared, y_train, y_test, preprocessing \ No newline at end of file diff --git a/feature_extraction.py b/feature_extraction.py index 29c4871..5eaa17b 100644 --- a/feature_extraction.py +++ b/feature_extraction.py @@ -1,7 +1,63 @@ +import pdb import pandas as pd import numpy as np # Calculations for Feature Extraction from Project_Guide +def extract_features(df): + # Group by relevant columns + group_cols = ['BodyPart', 'run_num', 'gender', 'exo'] # adapt as needed + feature_rows = [] + for group_vals, group in df.groupby(group_cols): + # Accelerometer features + a_x, a_y, a_z = group['ACC X (G)_filtered'], group['ACC Y (G)_filtered'], group['ACC Z (G)_filtered'] + a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2) + accel_peak = np.max(a_mag) + accel_mean = np.mean(a_mag) + accel_total = np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)) + accel_range = np.max(a_mag) - np.min(a_mag) + + # Gyroscope features + w_x, w_y, w_z = group['GYRO X (deg/s)_filtered'], group['GYRO Y (deg/s)_filtered'], group['GYRO Z (deg/s)_filtered'] + w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2) + gyro_peak = np.max(w_mag) + gyro_mean = np.mean(w_mag) + gyro_total = np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)) + gyro_range = np.max(w_mag) - np.min(w_mag) + + # EMG features (filtered) + emg = group['EMG_MilliVolts_filtered'] + emg_mean = np.mean(emg) + emg_max = np.max(emg) + emg_min = np.min(emg) + emg_std = np.std(emg) + emg_rms = np.sqrt(np.mean(emg**2)) + + # Build feature dict + feature_dict = { + 'BodyPart': group_vals[0], + 'run_num': group_vals[1], + 'gender': group_vals[2], + 'exo': group_vals[3], + 'accel_peak': accel_peak, + 'accel_mean': accel_mean, + 'accel_total': accel_total, + 'accel_range': accel_range, + 'gyro_peak': gyro_peak, + 'gyro_mean': gyro_mean, + 'gyro_total': gyro_total, + 'gyro_range': gyro_range, + 'emg_mean': emg_mean, + 'emg_max': emg_max, + 'emg_min': emg_min, + 'emg_std': emg_std, + 'emg_rms': emg_rms, + } + feature_rows.append(feature_dict) + #THIS IS LAME (only 17 rows) BRUH + # Return as a new DataFrame + return pd.DataFrame(feature_rows) + +#old funcs def compute_emg_features(df): signal = df['EMG_MilliVolts'] return { diff --git a/filtering.py b/filtering.py new file mode 100644 index 0000000..e75973f --- /dev/null +++ b/filtering.py @@ -0,0 +1,23 @@ +from scipy.signal import filtfilt, butter +import numpy as np +import pandas as pd + +def bandpass_filter_emg(series_signal, fs=1259, lowcut=20, highcut=450, order=4): + arr = series_signal.values if isinstance(series_signal, pd.Series) else np.array(series_signal) + if np.isnan(arr).all() or len(arr) == 0: #edge case check + return arr + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + b, a = butter(order, [low, high], btype='band') + return filtfilt(b, a, series_signal) + +# IMU Low-pass Filter (<20Hz) +def lowpass_filter_imu(series_signal, fs=148, cutoff=20, order=4): + arr = series_signal.values if isinstance(series_signal, pd.Series) else np.array(series_signal) + if np.isnan(arr).all() or len(arr) == 0: #edge case check + return arr + nyq = 0.5 * fs + normal_cutoff = cutoff / nyq + b, a = butter(order, normal_cutoff, btype='low') + return filtfilt(b, a, series_signal) \ No newline at end of file diff --git a/main.py b/main.py index cad286f..a73e4ff 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,10 @@ import pandas as pd import numpy as np -from dataCleaning import read_run, column_clean, preprocessing +from dataCleaning import read_run, column_clean, preprocessing_actions from dataCleaning import create_sensor_col from resampling import downsample -from feature_extraction import compute_emg_features, compute_accel_features, compute_gyro_features +from feature_extraction import extract_features +from filtering import bandpass_filter_emg, lowpass_filter_imu import pdb @@ -30,15 +31,17 @@ def overall_cleaning(): dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. combined_df = pd.concat(dfs, ignore_index=True) # Run functions to extract features for each dataframe - pdb.set_trace() - emg_features = compute_emg_features(combined_df) - accel_features = compute_accel_features(combined_df) - gyro_features = compute_gyro_features(combined_df) # feature_sets = [] - #do this on EMG cols: - # bandpass_filter_emg(df) - #on IMU cols: - # lowpass_filter_imu(df) + #filter out IMU and EMG outliers using filters: + imu_cols = ['ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)'] + for col in imu_cols: + combined_df[col + '_filtered'] = combined_df.groupby(['BodyPart', 'run_num', 'gender', 'exo'])[col].transform(lowpass_filter_imu) + combined_df['EMG_MilliVolts_filtered'] = combined_df.groupby(['BodyPart', 'run_num', 'gender', 'exo'])['EMG_MilliVolts'].transform(bandpass_filter_emg) + features_df = extract_features(combined_df) + #machine learning on combined_df + #change the next line to call on features_df instead of combined_df when extracting features is fixed to return more data + X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline = preprocessing_actions(combined_df) + #Return preprocessing_pipeline bc want to preprocess (scale, encode, etc.) any new or test data the same way as your training data. return combined_df def main(): From 75a207aa0e9b6af97558cd80cb1c20feb43da321 Mon Sep 17 00:00:00 2001 From: cyrusParvereshi Date: Mon, 26 May 2025 19:27:06 -0700 Subject: [PATCH 12/12] added neural_net option for preprocessing_actions --- dataCleaning.py | 20 ++++++++++++-------- main.py | 10 ++++------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/dataCleaning.py b/dataCleaning.py index 255fa10..4ae484f 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -8,7 +8,7 @@ from resampling import upsample, downsample from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import MinMaxScaler, StandardScaler from sklearn.impute import SimpleImputer from sklearn.preprocessing import OneHotEncoder from sklearn.compose import ColumnTransformer @@ -73,7 +73,7 @@ def create_sensor_col(df, run_num, gender, exo): df_pivoted.to_csv("pivoted_df.csv") return df_pivoted -def preprocessing_actions(full_df): +def preprocessing_actions(full_df, neural_net=False): num_attribs = [ 'EMG_MilliVolts_filtered', 'ACC X (G)_filtered', @@ -89,12 +89,16 @@ def preprocessing_actions(full_df): 'gender' #exo is the target variable ] - - num_pipeline = Pipeline([ - ("impute", SimpleImputer(strategy="median")), - ("standardize", StandardScaler()), - ]) - + if neural_net: + num_pipeline = Pipeline([ + ("impute", SimpleImputer(strategy="median")), + ("standardize", MinMaxScaler()), + ]) + else: + num_pipeline = Pipeline([ + ("impute", SimpleImputer(strategy="median")), + ("standardize", StandardScaler()), + ]) cat_pipeline = Pipeline([ ("impute", SimpleImputer(strategy="most_frequent")), ("oneHot", OneHotEncoder()), diff --git a/main.py b/main.py index a73e4ff..df28536 100644 --- a/main.py +++ b/main.py @@ -5,9 +5,7 @@ from resampling import downsample from feature_extraction import extract_features from filtering import bandpass_filter_emg, lowpass_filter_imu - -import pdb - +import pdb def overall_cleaning(): df_p3_exo = read_run("P3_Exo_1_0.csv") # 2nd run, male df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male @@ -36,13 +34,13 @@ def overall_cleaning(): for col in imu_cols: combined_df[col + '_filtered'] = combined_df.groupby(['BodyPart', 'run_num', 'gender', 'exo'])[col].transform(lowpass_filter_imu) combined_df['EMG_MilliVolts_filtered'] = combined_df.groupby(['BodyPart', 'run_num', 'gender', 'exo'])['EMG_MilliVolts'].transform(bandpass_filter_emg) - - features_df = extract_features(combined_df) + pdb.set_trace() + features_df = extract_features(combined_df) #TO-DO FIX #machine learning on combined_df #change the next line to call on features_df instead of combined_df when extracting features is fixed to return more data X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline = preprocessing_actions(combined_df) #Return preprocessing_pipeline bc want to preprocess (scale, encode, etc.) any new or test data the same way as your training data. - return combined_df + return X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline def main(): final_df = overall_cleaning()