diff --git a/FeatureExtraction.py b/FeatureExtraction.py deleted file mode 100644 index c27d629..0000000 --- a/FeatureExtraction.py +++ /dev/null @@ -1,79 +0,0 @@ -import pandas as pd -import numpy as np -from dataCleaning import read_run, column_clean, preprocessing -import pdb - -def overall_cleaning(): - df_p3_exo = read_run("P3_Exo_1_0.csv") # second run, male - df_p3_noexo = read_run("P3_NoExo_1_0.csv") # first run, male - df_p4_exo = read_run("P4_Exo_1_0.csv") # 1st run female - df_p4_noexo = read_run("P4_NoExo_1_0.csv") # 2nd female - - df_p3_exo = column_clean(df_p3_exo, run_num = 2, gender = 'male') - df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male') - df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female') - df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female') - combined_df = pd.concat([df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo], ignore_index=True) - dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later. - # # Show the head of the data - # df_p3_exo.describe() - df_p3_noexo.head() - # df_p4_exo.head() - # df_p4_noexo.head() - # # Choose inputs - # features = df_p3_exo[['EMG 1 (mV)', 'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']].dropna() - # features.head() - feature_sets = [] - # Run functions to extract features for each dataframe - #CP: does this make sure to remove the redundant time series columns? 
- #can keep ACC X Time Series (s) in each sensor group, and remove any other column with 'Time Series (s)' in its name - for df in dfs: - emg_features = compute_emg_features(df['EMG 1 (mV)']) - accel_features = compute_accel_features(df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)']) - gyro_features = compute_gyro_features(df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)']) - features = { - 'emg': emg_features, - 'accel': accel_features, - 'gyro': gyro_features - } - feature_sets.append(features) - - # feature_sets now contains extracted features for each df - p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = feature_sets - return p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats - -# Calculations for Feature Extraction from Project_Guide -def compute_emg_features(signal): - return { - 'mean': np.mean(signal), - 'max': np.max(signal), - 'min': np.min(signal), - 'std': np.std(signal), - 'rms': np.sqrt(np.mean(signal**2)) - } - -def compute_accel_features(a_x, a_y, a_z): - a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2) - - features = { - 'peak_accel': np.max(a_mag), - 'mean_accel': np.mean(a_mag), - 'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)), - 'accel_range': np.max(a_mag) - np.min(a_mag) - } - return features - -def compute_gyro_features(w_x, w_y, w_z): - w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2) - - features = { - 'peak_angular_vel': np.max(w_mag), - 'mean_angular_vel': np.mean(w_mag), - 'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)), - 'angular_vel_range': np.max(w_mag) - np.min(w_mag) - } - return features - - -if __name__ == '__main__': - p3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning() \ No newline at end of file diff --git a/dataCleaning.py b/dataCleaning.py index 087d7e8..4ae484f 100644 --- a/dataCleaning.py +++ b/dataCleaning.py @@ -1,8 +1,17 @@ import pandas as pd import numpy as np -import pdb +import pdb +from sklearn.model_selection import 
train_test_split +from modif_cols import tidy_emg_imu_as_measured +from resampling import upsample, downsample + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import MinMaxScaler, StandardScaler +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import OneHotEncoder +from sklearn.compose import ColumnTransformer # Data Labels: # Label for EMG Data shared: # Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata) @@ -27,13 +36,13 @@ def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fiel usecols = usecols, on_bad_lines='skip') df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)', 'RDelt_ACC Y (G)', 'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)', - 'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)', 'LDelt_ACC Y (G)', 'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)', - 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)', - 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X 
(deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)' + 'LDelt_TimeSeries', 'LDelt_EMG_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)', 'LDelt_ACC Y (G)', 'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)', + 'RBicep_TimeSeries', 'RBicep_EMG_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)', + 'LBicep_TimeSeries', 'LBicep_EMG_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)' ] - return df + return df #raw data -def column_clean(df, run_num, gender): +def column_clean(df): #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)', 'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)', @@ -45,14 +54,65 @@ def column_clean(df, run_num, gender): 'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)'] df = df.drop(extr_time_series, axis = 1) + df = df.rename(columns={'RDelt_EMG_TimeSeries': 'EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)': 'IMU_TimeSeries'}) # measurement_cols = [col for col in df.columns if (('ACC' in col 
or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols # df.columns = df.columns.str.strip() # Remove leading/trailing spaces (Yuxuan) # df = df.apply(pd.to_numeric, errors='coerce') # Conver t everything to numeric (Yuxuan) df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data - df['gender'] = gender - df['run_num'] = run_num - df.to_csv("test.csv") return df -def preprocessing(full_df): - pass + +#melting and stuff +def create_sensor_col(df, run_num, gender, exo): + df_pivoted = tidy_emg_imu_as_measured(df) + df_pivoted.columns = df_pivoted.columns.str.strip() + df_pivoted = df_pivoted.reset_index() + df_pivoted['gender'] = gender + df_pivoted['run_num'] = run_num + df_pivoted['exo'] = exo + df_pivoted.to_csv("pivoted_df.csv") + return df_pivoted + +def preprocessing_actions(full_df, neural_net=False): + num_attribs = [ + 'EMG_MilliVolts_filtered', + 'ACC X (G)_filtered', + 'ACC Y (G)_filtered', + 'ACC Z (G)_filtered', + 'GYRO X (deg/s)_filtered', + 'GYRO Y (deg/s)_filtered', + 'GYRO Z (deg/s)_filtered', + # Add any other numerical features here + ] + cat_attribs = [ + 'BodyPart', + 'gender' + #exo is the target variable + ] + if neural_net: + num_pipeline = Pipeline([ + ("impute", SimpleImputer(strategy="median")), + ("standardize", MinMaxScaler()), + ]) + else: + num_pipeline = Pipeline([ + ("impute", SimpleImputer(strategy="median")), + ("standardize", StandardScaler()), + ]) + cat_pipeline = Pipeline([ + ("impute", SimpleImputer(strategy="most_frequent")), + ("oneHot", OneHotEncoder()), + ]) + + preprocessing = ColumnTransformer([ + ("num", num_pipeline, num_attribs), + ("cat", cat_pipeline, cat_attribs), + ]) + # Prepare data for modeling + X = full_df[num_attribs + cat_attribs] + y = full_df["exo"] + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) + X_train_prepared = preprocessing.fit_transform(X_train) + X_test_prepared = preprocessing.transform(X_test) + return 
X_train_prepared, X_test_prepared, y_train, y_test, preprocessing \ No newline at end of file diff --git a/debug.ipynb b/debug.ipynb new file mode 100644 index 0000000..c6ad330 --- /dev/null +++ b/debug.ipynb @@ -0,0 +1,952 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "e0bebc80-fe7f-4d6c-8387-5c512308e48d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np \n", + "from dataCleaning import read_run, column_clean, preprocessing, create_sensor_col\n", + "import pdb" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "3bd13a4f-893e-43c3-bc4a-afabc40bbcde", + "metadata": {}, + "outputs": [], + "source": [ + "def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fields as well as metadata)\n", + " usecols = list(range(0, 56)) \n", + " df = pd.read_csv(filename, low_memory = False, \n", + " header = 0, \n", + " skiprows=skiprows,\n", + " # names=header,\n", + " usecols = usecols,\n", + " on_bad_lines='skip') \n", + " df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)', 'RDelt_ACC Y (G)', 'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)',\n", + " 'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)', 'LDelt_ACC Y (G)', 'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',\n", + " 'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 
'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',\n", + " 'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'\n", + " ]\n", + " return df #raw data " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "07d4e282-5741-449e-a2a8-71f3b3a6c66d", + "metadata": {}, + "outputs": [], + "source": [ + "def column_clean(df, run_num, gender):\n", + " #remove all time series columns except RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so keep time scale for both EMG and IMU \n", + " extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)',\n", + " 'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)', \n", + " 'LDelt_Acc Y Time Series(s)', 'LDelt_Acc Z Time Series(s)', 'LDelt_GyroXTime Series(s)',\n", + " 'LDelt_GyroYTime Series(s)', 'LDelt_GyroZTime Series(s)', 'RBicep_TimeSeries', 'RBicep_Acc X Time Series(s)',\n", + " 'RBicep_Acc Y Time Series(s)', 'RBicep_Acc Z Time Series(s)', 'RBicep_GyroXTime Series(s)',\n", + " 'RBicep_GyroYTime Series(s)', 'RBicep_GyroZTime Series(s)', 'LBicep_TimeSeries', 'LBicep_Acc X Time Series(s)',\n", + " 'LBicep_Acc Y Time Series(s)', 'LBicep_Acc Z Time Series(s)', 'LBicep_GyroXTime Series(s)', \n", + " 'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)']\n", + " \n", + " df = df.drop(extr_time_series, axis = 1)\n", + " # measurement_cols = [col for col in df.columns if (('ACC' in col or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols\n", + " # df.columns = df.columns.str.strip() # Remove 
leading/trailing spaces (Yuxuan)\n", + " # df = df.apply(pd.to_numeric, errors='coerce') # Conver t everything to numeric (Yuxuan)\n", + " df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data\n", + " df['gender'] = gender\n", + " df['run_num'] = run_num\n", + " return df " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c74acdb7-01e3-443f-92b0-6d2a6e45696c", + "metadata": {}, + "outputs": [], + "source": [ + "def create_sensor_col(df): \n", + " columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']\n", + " pdb.set_trace()\n", + " # Identify all measurement columns, including EMG millivolts\n", + " measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]\n", + " df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name=\"sensor_measurement\", value_name=\"value\")\n", + " \n", + " # Extract the Sensor Body Position\n", + " df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n", + " # Extract measurement type, including EMG millivolts\n", + " df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') # Drop the original sensor_measurement column\n", + " df_melted = df_melted.drop(columns=[\"sensor_measurement\"]) # Pivot the DataFrame so each measurement type becomes a separate column\n", + " df_melted[\"value\"] = df_melted[\"value\"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)\n", + " df_melted[\"value\"] = pd.to_numeric(df_melted[\"value\"], errors=\"coerce\") #make sure all values are cast to numeric\n", + " df_melted.fillna(np.nan, inplace=True)\n", + " # Pivot the DataFrame so each measurement type becomes a separate column\n", + " df_pivoted = 
df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], \n", + " columns='measurement_type', values='value')\n", + " df_pivoted.columns = df_pivoted.columns.get_level_values(0)\n", + " df_pivoted.columns = df_pivoted.columns.str.strip()\n", + " df_pivoted = df_pivoted.reset_index()\n", + " pdb.set_trace()\n", + " df_pivoted.to_csv(\"pivoted_df.csv\")\n", + " return df_pivoted" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2a446022-09eb-4f86-a77a-980490d55b0e", + "metadata": {}, + "outputs": [], + "source": [ + "def standardize_time_series():\n", + " # interpolate()\n", + " pass\n", + "\n", + "def preprocessing(full_df):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fc81654-185e-4738-919a-57afa144341e", + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "dab168eb-f1dc-4d7e-99fe-3ec1042db065", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculations for Feature Extraction from Project_Guide\n", + "def compute_emg_features(signal):\n", + " return {\n", + " 'mean': np.mean(signal),\n", + " 'max': np.max(signal),\n", + " 'min': np.min(signal),\n", + " 'std': np.std(signal),\n", + " 'rms': np.sqrt(np.mean(signal**2))\n", + " }\n", + "\n", + "def compute_accel_features(a_x, a_y, a_z):\n", + " a_mag = np.sqrt(a_x**2 + a_y**2 + a_z**2)\n", + " \n", + " features = {\n", + " 'peak_accel': np.max(a_mag),\n", + " 'mean_accel': np.mean(a_mag),\n", + " 'total_accel': np.sqrt(np.mean(a_x**2) + np.mean(a_y**2) + np.mean(a_z**2)),\n", + " 'accel_range': np.max(a_mag) - np.min(a_mag)\n", + " }\n", + " return features\n", + "\n", + "def compute_gyro_features(w_x, w_y, w_z):\n", + " w_mag = np.sqrt(w_x**2 + w_y**2 + w_z**2)\n", + " \n", + " features = {\n", + " 'peak_angular_vel': np.max(w_mag),\n", + " 'mean_angular_vel': np.mean(w_mag),\n", + " 
'total_angular_vel': np.sqrt(np.mean(w_x**2) + np.mean(w_y**2) + np.mean(w_z**2)),\n", + " 'angular_vel_range': np.max(w_mag) - np.min(w_mag)\n", + " }\n", + " return features " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72ed77bf-dbc1-406c-a5b1-57dcce504502", + "metadata": {}, + "outputs": [], + "source": [ + "P3exo_feats, p3noexo_feats, p4exo_feats, p4noexo_feats = overall_cleaning()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d8305190-17ac-43fd-b7d9-d9a90e329533", + "metadata": {}, + "outputs": [], + "source": [ + "df_p3_exo = read_run(\"P3_Exo_1_0.csv\") # 2nd run, male" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ab22fd91-5089-4c1e-91e5-f2fedb609f69", + "metadata": {}, + "outputs": [], + "source": [ + "df_p3_noexo = read_run(\"P3_NoExo_1_0.csv\") # first run, male" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2b503e14-fc38-46c2-9a10-3d7ea8ade855", + "metadata": {}, + "outputs": [], + "source": [ + "df_p4_exo = read_run(\"P4_Exo_1_0.csv\") # 1st run female\n", + "df_p4_noexo = read_run(\"P4_NoExo_1_0.csv\") # 2nd female" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ad941e26-6be4-4ae5-8d57-f148509e1675", + "metadata": {}, + "outputs": [], + "source": [ + "df_p3_exo = column_clean(df_p3_exo, run_num = 2, gender = 'male')\n", + "df_p3_noexo = column_clean(df_p3_noexo, run_num = 1, gender = 'male')\n", + "df_p4_exo = column_clean(df_p4_exo, run_num = 2, gender = 'female')\n", + "df_p4_noexo = column_clean(df_p4_noexo, run_num = 1, gender = 'female')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0372fab5-cc33-4ec4-a878-a20207b8b542", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RDelt_EMG_TimeSeriesRDelt_IMU_Acc X Time Series(s)genderrun_numsensor_measurementvalueSensor_Body_Positionmeasurement_type
00.0000000male2RDelt_EMG_MilliVolts0.004868RDeltEMG_MilliVolts
10.0007940.00675male2RDelt_EMG_MilliVolts0.005875RDeltEMG_MilliVolts
20.0015880.0135male2RDelt_EMG_MilliVolts0.005203RDeltEMG_MilliVolts
30.0023820.02025male2RDelt_EMG_MilliVolts0.005539RDeltEMG_MilliVolts
40.0031770.027male2RDelt_EMG_MilliVolts0.007721RDeltEMG_MilliVolts
...........................
3852739109.265029NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
3852740109.265823NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
3852741109.266618NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
3852742109.267412NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
3852743109.268206NaNmale2LBicep_GYRO Z (deg/s)NaNLBicepGYRO Z
\n", + "

3852744 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " RDelt_EMG_TimeSeries RDelt_IMU_Acc X Time Series(s) gender run_num \\\n", + "0 0.000000 0 male 2 \n", + "1 0.000794 0.00675 male 2 \n", + "2 0.001588 0.0135 male 2 \n", + "3 0.002382 0.02025 male 2 \n", + "4 0.003177 0.027 male 2 \n", + "... ... ... ... ... \n", + "3852739 109.265029 NaN male 2 \n", + "3852740 109.265823 NaN male 2 \n", + "3852741 109.266618 NaN male 2 \n", + "3852742 109.267412 NaN male 2 \n", + "3852743 109.268206 NaN male 2 \n", + "\n", + " sensor_measurement value Sensor_Body_Position measurement_type \n", + "0 RDelt_EMG_MilliVolts 0.004868 RDelt EMG_MilliVolts \n", + "1 RDelt_EMG_MilliVolts 0.005875 RDelt EMG_MilliVolts \n", + "2 RDelt_EMG_MilliVolts 0.005203 RDelt EMG_MilliVolts \n", + "3 RDelt_EMG_MilliVolts 0.005539 RDelt EMG_MilliVolts \n", + "4 RDelt_EMG_MilliVolts 0.007721 RDelt EMG_MilliVolts \n", + "... ... ... ... ... \n", + "3852739 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "3852740 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "3852741 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "3852742 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "3852743 LBicep_GYRO Z (deg/s) NaN LBicep GYRO Z \n", + "\n", + "[3852744 rows x 8 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df_p3_exo\n", + "columns_to_keep = ['RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num']\n", + "measurement_columns = [col for col in df.columns if any(prefix in col for prefix in ['RDelt', 'LDelt', 'RBicep', 'LBicep']) and col not in columns_to_keep]\n", + "df_melted = df.melt(id_vars=columns_to_keep, value_vars=measurement_columns, var_name=\"sensor_measurement\", value_name=\"value\")\n", + " # Extract the Sensor Body Position\n", + "df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n", + "# Extract measurement type, including EMG millivolts\n", + 
"df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') \n", + "df_melted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9aea9bd-e997-4330-9116-e4d9c6db90a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RDelt_EMG_TimeSeriesRDelt_EMG_MilliVoltsRDelt_IMU_Acc X Time Series(s)RDelt_ACC X (G)RDelt_ACC Y (G)RDelt_ACC Z (G)RDelt_GYRO X (deg/s)RDelt_GYRO Y (deg/s)RDelt_GYRO Z (deg/s)LDelt_MilliVolts...RBicep_GYRO Z (deg/s)LBicep_MilliVoltsLBicep_ACC X (G)LBicep_ACC Y (G)LBicep_ACC Z (G)LBicep_GYRO X (deg/s)LBicep_GYRO Y (deg/s)LBicep_GYRO Z (deg/s)genderrun_num
00.0000000.00486800.07476810.90612790.2548828-30.7404575-4.25190839.358779-0.006546...-11.95419880.0419620.25073240.88085940.1972656-30.8015278.557251912.6870232male2
10.0007940.0058750.006750.07952880.9132080.2689209-30.7786255-5.9618328.5419846-0.006546...-12.53435130.0419620.24530030.87902830.2055054-29.0381689.900763513.0305347male2
20.0015880.0052030.01350.08044430.91943360.2719116-29.9312973-6.80152658.4503813-0.007217...-12.8015270.0414590.24865720.8801880.2092896-27.66412168.969465312.358779male2
30.0023820.0055390.020250.08093260.93164060.2680054-29.1068707-6.88549617.6793895-0.004196...-12.74809170.0397800.25335690.8801270.2134399-25.4427497.610687311.1145039male2
40.0031770.0077210.0270.08666990.93194580.2663574-29.3129768-7.40458016.7557254-0.005203...-11.18320660.0414590.25909420.87707520.2124634-23.5572516.09160339.7862597male2
..................................................................
137593109.2650290.017960NaNNaNNaNNaNNaNNaNNaN-0.002182...NaN0.030716NaNNaNNaNNaNNaNNaNmale2
137594109.2658230.019974NaNNaNNaNNaNNaNNaNNaN-0.001846...NaN0.035416NaNNaNNaNNaNNaNNaNmale2
137595109.2666180.020981NaNNaNNaNNaNNaNNaNNaN-0.004196...NaN0.034745NaNNaNNaNNaNNaNNaNmale2
137596109.2674120.018631NaNNaNNaNNaNNaNNaNNaN-0.005707...NaN0.035248NaNNaNNaNNaNNaNNaNmale2
137597109.2682060.019974NaNNaNNaNNaNNaNNaNNaN-0.004196...NaN0.036591NaNNaNNaNNaNNaNNaNmale2
\n", + "

137598 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " RDelt_EMG_TimeSeries RDelt_EMG_MilliVolts \\\n", + "0 0.000000 0.004868 \n", + "1 0.000794 0.005875 \n", + "2 0.001588 0.005203 \n", + "3 0.002382 0.005539 \n", + "4 0.003177 0.007721 \n", + "... ... ... \n", + "137593 109.265029 0.017960 \n", + "137594 109.265823 0.019974 \n", + "137595 109.266618 0.020981 \n", + "137596 109.267412 0.018631 \n", + "137597 109.268206 0.019974 \n", + "\n", + " RDelt_IMU_Acc X Time Series(s) RDelt_ACC X (G) RDelt_ACC Y (G) \\\n", + "0 0 0.0747681 0.9061279 \n", + "1 0.00675 0.0795288 0.913208 \n", + "2 0.0135 0.0804443 0.9194336 \n", + "3 0.02025 0.0809326 0.9316406 \n", + "4 0.027 0.0866699 0.9319458 \n", + "... ... ... ... \n", + "137593 NaN NaN NaN \n", + "137594 NaN NaN NaN \n", + "137595 NaN NaN NaN \n", + "137596 NaN NaN NaN \n", + "137597 NaN NaN NaN \n", + "\n", + " RDelt_ACC Z (G) RDelt_GYRO X (deg/s) RDelt_GYRO Y (deg/s) \\\n", + "0 0.2548828 -30.7404575 -4.2519083 \n", + "1 0.2689209 -30.7786255 -5.961832 \n", + "2 0.2719116 -29.9312973 -6.8015265 \n", + "3 0.2680054 -29.1068707 -6.8854961 \n", + "4 0.2663574 -29.3129768 -7.4045801 \n", + "... ... ... ... \n", + "137593 NaN NaN NaN \n", + "137594 NaN NaN NaN \n", + "137595 NaN NaN NaN \n", + "137596 NaN NaN NaN \n", + "137597 NaN NaN NaN \n", + "\n", + " RDelt_GYRO Z (deg/s) LDelt_MilliVolts ... RBicep_GYRO Z (deg/s) \\\n", + "0 9.358779 -0.006546 ... -11.9541988 \n", + "1 8.5419846 -0.006546 ... -12.5343513 \n", + "2 8.4503813 -0.007217 ... -12.801527 \n", + "3 7.6793895 -0.004196 ... -12.7480917 \n", + "4 6.7557254 -0.005203 ... -11.1832066 \n", + "... ... ... ... ... \n", + "137593 NaN -0.002182 ... NaN \n", + "137594 NaN -0.001846 ... NaN \n", + "137595 NaN -0.004196 ... NaN \n", + "137596 NaN -0.005707 ... NaN \n", + "137597 NaN -0.004196 ... 
NaN \n", + "\n", + " LBicep_MilliVolts LBicep_ACC X (G) LBicep_ACC Y (G) LBicep_ACC Z (G) \\\n", + "0 0.041962 0.2507324 0.8808594 0.1972656 \n", + "1 0.041962 0.2453003 0.8790283 0.2055054 \n", + "2 0.041459 0.2486572 0.880188 0.2092896 \n", + "3 0.039780 0.2533569 0.880127 0.2134399 \n", + "4 0.041459 0.2590942 0.8770752 0.2124634 \n", + "... ... ... ... ... \n", + "137593 0.030716 NaN NaN NaN \n", + "137594 0.035416 NaN NaN NaN \n", + "137595 0.034745 NaN NaN NaN \n", + "137596 0.035248 NaN NaN NaN \n", + "137597 0.036591 NaN NaN NaN \n", + "\n", + " LBicep_GYRO X (deg/s) LBicep_GYRO Y (deg/s) LBicep_GYRO Z (deg/s) \\\n", + "0 -30.801527 8.5572519 12.6870232 \n", + "1 -29.038168 9.9007635 13.0305347 \n", + "2 -27.6641216 8.9694653 12.358779 \n", + "3 -25.442749 7.6106873 11.1145039 \n", + "4 -23.557251 6.0916033 9.7862597 \n", + "... ... ... ... \n", + "137593 NaN NaN NaN \n", + "137594 NaN NaN NaN \n", + "137595 NaN NaN NaN \n", + "137596 NaN NaN NaN \n", + "137597 NaN NaN NaN \n", + "\n", + " gender run_num \n", + "0 male 2 \n", + "1 male 2 \n", + "2 male 2 \n", + "3 male 2 \n", + "4 male 2 \n", + "... ... ... 
\n", + "137593 male 2 \n", + "137594 male 2 \n", + "137595 male 2 \n", + "137596 male 2 \n", + "137597 male 2 \n", + "\n", + "[137598 rows x 32 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + " # Extract the Sensor Body Position\n", + "df_melted[\"Sensor_Body_Position\"] = df_melted[\"sensor_measurement\"].str.extract(r'^(RDelt|LDelt|RBicep|LBicep)')\n", + "# Extract measurement type, including EMG millivolts\n", + "df_melted[\"measurement_type\"] = df_melted[\"sensor_measurement\"].str.extract(r'_(EMG_MilliVolts|MilliVolts|ACC X|ACC Y|ACC Z|GYRO X|GYRO Y|GYRO Z)') # Drop the original sensor_measurement column\n", + "df_melted = df_melted.drop(columns=[\"sensor_measurement\"]) # Pivot the DataFrame so each measurement type becomes a separate column\n", + "df_melted[\"value\"] = df_melted[\"value\"].astype(str).str.strip() #make sure that values are clean if they're strings (no extra space)\n", + "df_melted[\"value\"] = pd.to_numeric(df_melted[\"value\"], errors=\"coerce\") #make sure all values are cast to numeric\n", + "df_melted.fillna(np.nan, inplace=True)\n", + "# Pivot the DataFrame so each measurement type becomes a separate column\n", + "df_pivoted = df_melted.pivot_table(index=['Sensor_Body_Position', 'RDelt_EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)', 'gender', 'run_num'], \n", + " columns='measurement_type', values='value')\n", + "df_pivoted.columns = df_pivoted.columns.get_level_values(0)\n", + "df_pivoted.columns = df_pivoted.columns.str.strip()\n", + "df_pivoted = df_pivoted.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bf43181-6aa6-44ba-8fc1-44dc7bbeaed3", + "metadata": {}, + "outputs": [], + "source": [ + "df_p3_exo = create_sensor_col(df_p3_exo)\n", + "df_p3_noexo = create_sensor_col(df_p3_noexo)\n", + "df_p4_exo = create_sensor_col(df_p4_exo)\n", + "df_p4_noexo = create_sensor_col(df_p4_noexo)" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "id": "01bf5481-3d22-4143-8990-fec243ae013e", + "metadata": {}, + "outputs": [], + "source": [ + "dfs = [df_p3_exo, df_p3_noexo, df_p4_exo, df_p4_noexo] #jack's list for the data cleaning he does later.\n", + "combined_df = pd.concat(dfs, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6263007-da44-4d49-b437-0754e56a2def", + "metadata": {}, + "outputs": [], + "source": [ + "# # Show the head of the data\n", + "# df_p3_exo.describe()\n", + "df_p3_noexo.head()\n", + "# df_p4_exo.head()\n", + "# df_p4_noexo.head()\n", + "# # Choose inputs\n", + "# features = df_p3_exo[['EMG 1 (mV)', 'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)']].dropna()\n", + "# features.head()\n", + "feature_sets = []\n", + "# Run functions to extract features for each dataframe\n", + "#CP: does this make sure to remove the redundant time series columns?\n", + "#can keep ACC X Time Series (s) in each sensor group, and remove any other column with 'Time Series (s)' in its name \n", + "for df in dfs:\n", + " emg_features = compute_emg_features(df['EMG 1 (mV)'])\n", + " accel_features = compute_accel_features(df['ACC X (G)'], df['ACC Y (G)'], df['ACC Z (G)'])\n", + " gyro_features = compute_gyro_features(df['GYRO X (deg/s)'], df['GYRO Y (deg/s)'], df['GYRO Z (deg/s)'])\n", + " features = {\n", + " 'emg': emg_features,\n", + " 'accel': accel_features,\n", + " 'gyro': gyro_features\n", + " }\n", + " feature_sets.append(features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccbbdce4-9c51-4b5c-b263-d23cc0c79154", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
# ---- dk.txt ---------------------------------------------------------------
import pandas as pd
import numpy as np

# Tidy IMU column name -> suffix of the per-muscle column in the wide frame.
_IMU_SOURCE_SUFFIX = {
    'ACC X': 'ACC X (G)',
    'ACC Y': 'ACC Y (G)',
    'ACC Z': 'ACC Z (G)',
    'GYRO X': 'GYRO X (deg/s)',
    'GYRO Y': 'GYRO Y (deg/s)',
    'GYRO Z': 'GYRO Z (deg/s)',
}
_TIDY_COLUMNS = ['Muscle', 'EMG_TimeSeries', 'EMG_MV', 'IMU_TimeSeries', *_IMU_SOURCE_SUFFIX]


def emg_to_imu_asof_all_muscles(df, emg_time_col='EMG_TimeSeries', imu_time_col='IMU_TimeSeries'):
    """Attach the most recent IMU sample to every EMG sample, per muscle.

    A muscle is any prefix ``<muscle>`` for which a ``<muscle>_EMG_MilliVolts``
    column exists in ``df``.  For each muscle the EMG samples are joined to
    that muscle's IMU samples with ``pandas.merge_asof(direction='backward')``.

    Args:
        df: Wide frame with one EMG time column, one IMU time column and
            ``<muscle>_...`` measurement columns.
        emg_time_col: Name of the EMG timestamp column in ``df``.
        imu_time_col: Name of the IMU timestamp column in ``df``.

    Returns:
        A DataFrame with columns ``Muscle, EMG_TimeSeries, EMG_MV,
        IMU_TimeSeries, ACC X/Y/Z, GYRO X/Y/Z``; one row per valid EMG sample.
        IMU columns are NaN when no earlier IMU sample exists or when the
        muscle has no such column.  Empty (but with these columns) when ``df``
        has no ``*_EMG_MilliVolts`` column.
    """
    muscle_names = [
        col.replace('_EMG_MilliVolts', '')
        for col in df.columns
        if '_EMG_MilliVolts' in col
    ]
    if not muscle_names:
        # pd.concat([]) raises; give callers an empty frame with the schema.
        return pd.DataFrame(columns=_TIDY_COLUMNS)

    # merge_asof requires both keys to share a dtype, so force float on both.
    emg_time = pd.to_numeric(df[emg_time_col], errors='coerce').astype(float)
    imu_time = pd.to_numeric(df[imu_time_col], errors='coerce').astype(float)

    all_muscles = []
    for muscle in muscle_names:
        emg_df = pd.DataFrame({
            'EMG_TimeSeries': emg_time,
            'EMG_MV': pd.to_numeric(df[f'{muscle}_EMG_MilliVolts'], errors='coerce'),
        }).dropna(subset=['EMG_TimeSeries', 'EMG_MV'])

        imu_df = pd.DataFrame({'IMU_TimeSeries': imu_time})
        for tidy_name, suffix in _IMU_SOURCE_SUFFIX.items():
            source = f'{muscle}_{suffix}'
            if source in df.columns:
                imu_df[tidy_name] = pd.to_numeric(df[source], errors='coerce')
            else:
                imu_df[tidy_name] = np.nan
        # merge_asof rejects null keys on the right side, so rows without an
        # IMU timestamp must be dropped before the join.
        imu_df = imu_df.dropna(subset=['IMU_TimeSeries'])

        merged = pd.merge_asof(
            emg_df.sort_values('EMG_TimeSeries'),
            imu_df.sort_values('IMU_TimeSeries'),
            left_on='EMG_TimeSeries',
            right_on='IMU_TimeSeries',
            direction='backward',
        )
        merged['Muscle'] = muscle
        all_muscles.append(merged)

    tidy = pd.concat(all_muscles, ignore_index=True)
    return tidy[_TIDY_COLUMNS]

# Usage example:
# tidy_df = emg_to_imu_asof_all_muscles(df)
# tidy_df.to_csv('emg_imu_tidy.csv', index=False)


# ---- feature_extraction.py ------------------------------------------------
import pdb
import pandas as pd
import numpy as np
# Calculations for Feature Extraction from Project_Guide

_DEFAULT_GROUP_COLS = ('BodyPart', 'run_num', 'gender', 'exo')
_FEATURE_COLS = (
    'accel_peak', 'accel_mean', 'accel_total', 'accel_range',
    'gyro_peak', 'gyro_mean', 'gyro_total', 'gyro_range',
    'emg_mean', 'emg_max', 'emg_min', 'emg_std', 'emg_rms',
)


def _vector_stats(x, y, z):
    """Return peak, mean, total (RMS of the 3-axis vector) and range of |(x, y, z)|."""
    magnitude = np.sqrt(x**2 + y**2 + z**2)
    return {
        'peak': np.max(magnitude),
        'mean': np.mean(magnitude),
        'total': np.sqrt(np.mean(x**2) + np.mean(y**2) + np.mean(z**2)),
        'range': np.max(magnitude) - np.min(magnitude),
    }


def _signal_stats(signal):
    """Return mean, max, min, std and RMS of a 1-D signal."""
    return {
        'mean': np.mean(signal),
        'max': np.max(signal),
        'min': np.min(signal),
        'std': np.std(signal),
        'rms': np.sqrt(np.mean(signal**2)),
    }


def extract_features(df, group_cols=None):
    """Summarise the filtered signals into one feature row per group.

    Args:
        df: Long frame holding the ``*_filtered`` accelerometer, gyroscope and
            EMG columns plus the grouping columns.
        group_cols: Columns to group by.  Defaults to
            ``['BodyPart', 'run_num', 'gender', 'exo']``.

    Returns:
        DataFrame with the grouping columns followed by the ``accel_*``,
        ``gyro_*`` and ``emg_*`` features.  There is exactly one row per
        distinct combination of grouping values, so the row count equals the
        number of groups, not the number of samples.
    """
    group_cols = list(_DEFAULT_GROUP_COLS if group_cols is None else group_cols)
    feature_rows = []
    for group_vals, group in df.groupby(group_cols):
        if not isinstance(group_vals, tuple):
            # Older pandas yields a scalar key for a single grouping column.
            group_vals = (group_vals,)
        accel = _vector_stats(
            group['ACC X (G)_filtered'],
            group['ACC Y (G)_filtered'],
            group['ACC Z (G)_filtered'],
        )
        gyro = _vector_stats(
            group['GYRO X (deg/s)_filtered'],
            group['GYRO Y (deg/s)_filtered'],
            group['GYRO Z (deg/s)_filtered'],
        )
        emg = _signal_stats(group['EMG_MilliVolts_filtered'])

        feature_dict = dict(zip(group_cols, group_vals))
        feature_dict.update({f'accel_{name}': value for name, value in accel.items()})
        feature_dict.update({f'gyro_{name}': value for name, value in gyro.items()})
        feature_dict.update({f'emg_{name}': value for name, value in emg.items()})
        feature_rows.append(feature_dict)
    # Passing the columns keeps the schema stable even when there are no groups.
    return pd.DataFrame(feature_rows, columns=group_cols + list(_FEATURE_COLS))


# old funcs
def compute_emg_features(df):
    """Return mean, max, min, std and RMS of ``df['EMG_MilliVolts']`` as a dict."""
    return _signal_stats(df['EMG_MilliVolts'])


def compute_accel_features(df):
    """Return peak/mean/total/range of the acceleration magnitude in ``df``.

    Reads ``'ACC X (G)'``, ``'ACC Y (G)'`` and ``'ACC Z (G)'``; ``df`` is not
    modified.
    """
    # Three separate assignments: writing them on one line separated by commas
    # is a chained tuple assignment that unpacks the Z column into the X and Y
    # columns of ``df`` instead of reading them.
    a_x = df['ACC X (G)']
    a_y = df['ACC Y (G)']
    a_z = df['ACC Z (G)']
    stats = _vector_stats(a_x, a_y, a_z)
    return {
        'peak_accel': stats['peak'],
        'mean_accel': stats['mean'],
        'total_accel': stats['total'],
        'accel_range': stats['range'],
    }


def compute_gyro_features(df):
    """Return peak/mean/total/range of the angular-velocity magnitude in ``df``.

    Reads ``'GYRO X (deg/s)'``, ``'GYRO Y (deg/s)'`` and ``'GYRO Z (deg/s)'``;
    ``df`` is not modified.
    """
    w_x = df['GYRO X (deg/s)']
    w_y = df['GYRO Y (deg/s)']
    w_z = df['GYRO Z (deg/s)']
    stats = _vector_stats(w_x, w_y, w_z)
    return {
        'peak_angular_vel': stats['peak'],
        'mean_angular_vel': stats['mean'],
        'total_angular_vel': stats['total'],
        'angular_vel_range': stats['range'],
    }

# TODO: planned frequency-domain EMG features (FFT mean frequency, median
# frequency and power) plus 'label' / 'gender' columns per feature row.
# ---- filtering.py ---------------------------------------------------------
from scipy.signal import filtfilt, butter
import numpy as np
import pandas as pd


def _zero_phase(b, a, samples):
    """Run ``filtfilt`` on ``samples``, passing through signals too short to pad."""
    # filtfilt pads each edge with 3 * max(len(a), len(b)) samples by default
    # and raises ValueError unless the input is longer than that.
    if len(samples) <= 3 * max(len(a), len(b)):
        return samples
    return filtfilt(b, a, samples)


def bandpass_filter_emg(series_signal, fs=1259, lowcut=20, highcut=450, order=4):
    """Zero-phase Butterworth band-pass filter for an EMG signal.

    Args:
        series_signal: 1-D signal (Series, list or array).
        fs: Sampling rate of ``series_signal`` in Hz.
        lowcut: Lower pass-band edge in Hz.
        highcut: Upper pass-band edge in Hz; must be below ``fs / 2``.
        order: Butterworth order.

    Returns:
        ``numpy.ndarray`` with the same length as the input.  Empty, all-NaN
        and too-short signals are returned unfiltered.

    Raises:
        ValueError: If the band does not satisfy ``0 < lowcut < highcut < fs / 2``.
    """
    arr = np.asarray(series_signal, dtype=float)
    if len(arr) == 0 or np.isnan(arr).all():  # edge case check
        return arr
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    if not 0 < low < high < 1:
        raise ValueError(
            f"band ({lowcut}, {highcut}) Hz is not inside (0, {nyq}) Hz for fs={fs}"
        )
    b, a = butter(order, [low, high], btype='band')
    return _zero_phase(b, a, arr)


# IMU Low-pass Filter (<20Hz)
def lowpass_filter_imu(series_signal, fs=148, cutoff=20, order=4):
    """Zero-phase Butterworth low-pass filter for an IMU signal.

    Args:
        series_signal: 1-D signal (Series, list or array).
        fs: Sampling rate of ``series_signal`` in Hz.
        cutoff: Cut-off frequency in Hz; must be below ``fs / 2``.
        order: Butterworth order.

    Returns:
        ``numpy.ndarray`` with the same length as the input.  Empty, all-NaN
        and too-short signals are returned unfiltered.

    Raises:
        ValueError: If ``cutoff`` is not inside ``(0, fs / 2)``.
    """
    arr = np.asarray(series_signal, dtype=float)
    if len(arr) == 0 or np.isnan(arr).all():  # edge case check
        return arr
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    if not 0 < normal_cutoff < 1:
        raise ValueError(f"cutoff {cutoff} Hz is not inside (0, {nyq}) Hz for fs={fs}")
    b, a = butter(order, normal_cutoff, btype='low')
    return _zero_phase(b, a, arr)


# ---- main.py --------------------------------------------------------------
import pandas as pd
import numpy as np
import pdb

# (csv file, run_num, gender, exo) for every recorded run.
_RUNS = (
    ("P3_Exo_1_0.csv", 2, 'male', True),       # 2nd run, male
    ("P3_NoExo_1_0.csv", 1, 'male', False),    # first run, male
    ("P4_Exo_1_0.csv", 1, 'female', True),     # 1st run female
    ("P4_NoExo_1_0.csv", 2, 'female', False),  # 2nd female
)
_GROUP_KEYS = ['BodyPart', 'run_num', 'gender', 'exo']
_IMU_SIGNAL_COLS = [
    'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)',
    'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)',
]


def overall_cleaning():
    """Load every run, clean, resample, filter, and build the ML train/test split.

    Returns:
        ``(X_train_prepared, X_test_prepared, y_train, y_test,
        preprocessing_pipeline)`` exactly as produced by
        ``preprocessing_actions``.  The pipeline is returned so new or test
        data can be preprocessed (scaled, encoded, ...) the same way as the
        training data.
    """
    # Imported lazily so this module can be imported without the whole
    # pipeline (and its data dependencies) on the path.
    from dataCleaning import read_run, column_clean, preprocessing_actions
    from dataCleaning import create_sensor_col
    from resampling import downsample
    from feature_extraction import extract_features
    from filtering import bandpass_filter_emg, lowpass_filter_imu

    dfs = []
    for filename, run_num, gender, exo in _RUNS:
        run_df = read_run(filename)
        run_df = column_clean(run_df)
        run_df = downsample(run_df)  # downsample EMG to match IMU
        # melt sensor columns into a body part sensor
        dfs.append(create_sensor_col(run_df, run_num=run_num, gender=gender, exo=exo))
    combined_df = pd.concat(dfs, ignore_index=True)

    # Filter each signal independently per (body part, run, gender, exo).
    grouped = combined_df.groupby(_GROUP_KEYS)
    for col in _IMU_SIGNAL_COLS:
        combined_df[col + '_filtered'] = grouped[col].transform(lowpass_filter_imu)
    # NOTE(review): the EMG band-pass runs with its default fs=1259 Hz although
    # the EMG columns have already been through downsample() - confirm the
    # effective sampling rate here, or filter before resampling.
    combined_df['EMG_MilliVolts_filtered'] = grouped['EMG_MilliVolts'].transform(bandpass_filter_emg)

    features_df = extract_features(combined_df)  # TODO: not used by the model yet
    # machine learning on combined_df
    # Switch the next call to features_df once extract_features returns more
    # than one row per group.
    X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline = preprocessing_actions(combined_df)
    return X_train_prepared, X_test_prepared, y_train, y_test, preprocessing_pipeline


def main():
    """Run the full pipeline and return the result of ``overall_cleaning``."""
    return overall_cleaning()


if __name__ == '__main__':
    main()


# ---- modif_cols.py --------------------------------------------------------
import pandas as pd
import numpy as np


def tidy_emg_imu_as_measured(df, body_parts=('RDelt', 'LDelt', 'RBicep', 'LBicep')):
    """Reshape wide ``<BodyPart>_<Signal>`` columns into one row per body part.

    Args:
        df: Wide frame; every column named ``<BodyPart>_<Signal>`` is treated
            as a measurement, every other column as an identifier.
        body_parts: Body-part prefixes to recognise.

    Returns:
        DataFrame indexed (as ordinary columns) by the identifier columns plus
        ``'BodyPart'``, with one numeric column per signal.  Duplicate
        identifier/body-part combinations are averaged by ``pivot_table``.
    """
    parts = tuple(body_parts)
    # Column -> (body part, signal).  Anchored at the start of the name so the
    # selection and the split always agree.
    split = {}
    for col in df.columns:
        name = str(col)
        for part in parts:
            if name.startswith(f'{part}_'):
                split[col] = (part, name[len(part) + 1:])
                break
    measurement_cols = [c for c in df.columns if c in split]
    id_vars = [c for c in df.columns if c not in split]

    df_long = df.melt(id_vars=id_vars, value_vars=measurement_cols,
                      var_name='Measurement', value_name='Value')
    df_long['BodyPart'] = df_long['Measurement'].map(lambda c: split[c][0])
    df_long['Signal'] = df_long['Measurement'].map(lambda c: split[c][1])
    # Melting mixed columns yields object dtype; the pivot's mean needs numbers.
    df_long['Value'] = pd.to_numeric(df_long['Value'], errors='coerce')

    # Pivot so each signal is a separate column
    df_wide = df_long.pivot_table(
        index=id_vars + ['BodyPart'],
        columns='Signal',
        values='Value'
    ).reset_index()
    # flatten columns if needed
    df_wide.columns.name = None
    df_wide.columns = [str(col) for col in df_wide.columns]
    return df_wide


# ---- resampling.py --------------------------------------------------------
import pandas as pd
import numpy as np
import pdb

_SENSOR_PARTS = ('RDelt', 'LDelt', 'RBicep', 'LBicep')
_IMU_AXES = (
    'ACC X (G)', 'ACC Y (G)', 'ACC Z (G)',
    'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)',
)
_EMG_COLS = [f'{part}_EMG_MilliVolts' for part in _SENSOR_PARTS]
_IMU_COLS = [f'{part}_{axis}' for part in _SENSOR_PARTS for axis in _IMU_AXES]


def upsample(df, target_period_s=0.0007941176470588235):
    """Put the IMU columns on a finer regular time grid by linear interpolation.

    Args:
        df: Wide frame with ``'IMU_TimeSeries'`` (seconds), the 24 IMU columns
            and ``'RDelt_EMG_MilliVolts'``.  Not modified.
        target_period_s: Spacing of the output grid in seconds.

    Returns:
        DataFrame indexed by a ``'time'`` TimedeltaIndex holding the IMU
        columns plus ``'RDelt_EMG_MilliVolts'`` joined on that index.
    """
    work = df.copy()  # never write helper columns into the caller's frame
    work['IMU_TimeSeries'] = pd.to_numeric(work['IMU_TimeSeries'])
    work = work.dropna(subset=['IMU_TimeSeries'])  # Drop rows where IMU timestamps are NaN
    work['time'] = pd.to_timedelta(work['IMU_TimeSeries'], unit='s')
    work = work.set_index('time')
    freq_nanoseconds = int(target_period_s * 1e9)  # Convert to integer nanoseconds
    IMU_upsampled = work[_IMU_COLS].resample(f'{freq_nanoseconds}ns').asfreq()
    IMU_upsampled = IMU_upsampled.apply(pd.to_numeric)
    # Interpolate first: forward-filling beforehand would leave no gaps for
    # the linear interpolation to fill.
    IMU_upsampled = IMU_upsampled.interpolate(method='linear').ffill()
    # NOTE(review): the EMG column is joined on the IMU-based time index -
    # confirm that is the intended alignment.
    return IMU_upsampled.join(work['RDelt_EMG_MilliVolts'])


def downsample(df, emg_period='6.75ms'):
    """Thin the EMG columns to the IMU period and trim to the IMU recording.

    Args:
        df: Wide frame with ``'EMG_TimeSeries'`` (seconds), the four EMG
            columns and the 24 IMU columns.  Not modified.
        emg_period: pandas offset string for the grid the EMG samples are
            kept on; values between grid points are linearly interpolated.

    Returns:
        DataFrame with a leading ``'time'`` column, cut after the last row
        that has at least one IMU value.
    """
    work = df.copy()
    work['EMG_TimeSeries'] = pd.to_numeric(work['EMG_TimeSeries'])
    for col in work.columns:
        # Time columns are left as they are.
        if "TimeSeries" in str(col):
            continue
        work[col] = pd.to_numeric(work[col], errors='coerce')
    work['time'] = pd.to_timedelta(work['EMG_TimeSeries'], unit='s')
    work = work.set_index('time')
    # Assignment aligns on the index: rows off the grid become NaN and are
    # then filled by the interpolation below.
    work[_EMG_COLS] = work[_EMG_COLS].resample(emg_period).asfreq()
    work[_EMG_COLS] = work[_EMG_COLS].interpolate(method='linear')
    work[_EMG_COLS] = work[_EMG_COLS].bfill().ffill()  # fill any remaining edge NaNs
    # NOTE(review): rows are trimmed by position, so the EMG kept here spans
    # the EMG timestamps of those rows - confirm this matches the IMU times.
    # Find the last index where at least one IMU value is real
    last_idx = work[_IMU_COLS].last_valid_index()
    # Trim DataFrame to that index
    work = work.loc[:last_idx]
    return work.reset_index()


## If downsample using pandas.resample doesn't work, use this alternative function that uses the rows index
def alternative(df, high_rate=1259, low_rate=148):
    """Decimate the EMG columns by row position and drop ``'EMG_TimeSeries'``.

    Every ``high_rate / low_rate``-th row (rounded to the nearest index) is
    kept and packed into the first rows of the result; the remaining EMG rows
    are NaN.  ``df`` is not modified.

    Args:
        df: Wide frame with the four EMG columns and ``'EMG_TimeSeries'``.
        high_rate: EMG sampling rate in Hz.
        low_rate: Target (IMU) sampling rate in Hz.

    Returns:
        A new DataFrame with the same index as ``df``.
    """
    step = high_rate / low_rate

    # Indexes to sample
    indices = np.round(np.arange(0, len(df), step)).astype(int)
    indices = indices[indices < len(df)]  # Ensure we stay within bounds

    # Downsample using nearest index; place the kept samples positionally so
    # the result does not depend on df having a default RangeIndex.
    decimated = df[_EMG_COLS].iloc[indices].reset_index(drop=True)
    decimated = decimated.reindex(range(len(df)))
    decimated.index = df.index

    out = df.drop(columns=['EMG_TimeSeries'])
    out[_EMG_COLS] = decimated
    return out