Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 0 additions & 79 deletions FeatureExtraction.py

This file was deleted.

82 changes: 71 additions & 11 deletions dataCleaning.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@

import pandas as pd
import numpy as np
import pdb
import pdb

from sklearn.model_selection import train_test_split
from modif_cols import tidy_emg_imu_as_measured
from resampling import upsample, downsample

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Data Labels:
# Label for EMG Data shared:
# Open CSV files to check what they look like. Use skiprows=5 and low_memory=False to load it properly (the top 5 rows are metadata)
Expand All @@ -27,13 +36,13 @@ def read_run(filename, skiprows=7): #skip the first 7 rows (freq/cycle time fiel
usecols = usecols,
on_bad_lines='skip')
df.columns = ['RDelt_EMG_TimeSeries', 'RDelt_EMG_MilliVolts', 'RDelt_IMU_Acc X Time Series(s)', 'RDelt_ACC X (G)', 'RDelt_Acc Y Time Series(s)', 'RDelt_ACC Y (G)', 'RDelt_Acc Z Time Series(s)', 'RDelt_ACC Z (G)','RDelt_GyroXTime Series(s)', 'RDelt_GYRO X (deg/s)','RDelt_GyroYTime Series(s)', 'RDelt_GYRO Y (deg/s)', 'RDelt_GyroZTime Series(s)', 'RDelt_GYRO Z (deg/s)',
'LDelt_TimeSeries', 'LDelt_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)', 'LDelt_ACC Y (G)', 'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',
'RBicep_TimeSeries', 'RBicep_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',
'LBicep_TimeSeries', 'LBicep_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'
'LDelt_TimeSeries', 'LDelt_EMG_MilliVolts', 'LDelt_Acc X Time Series(s)', 'LDelt_ACC X (G)', 'LDelt_Acc Y Time Series(s)', 'LDelt_ACC Y (G)', 'LDelt_Acc Z Time Series(s)', 'LDelt_ACC Z (G)','LDelt_GyroXTime Series(s)', 'LDelt_GYRO X (deg/s)','LDelt_GyroYTime Series(s)', 'LDelt_GYRO Y (deg/s)', 'LDelt_GyroZTime Series(s)', 'LDelt_GYRO Z (deg/s)',
'RBicep_TimeSeries', 'RBicep_EMG_MilliVolts', 'RBicep_Acc X Time Series(s)', 'RBicep_ACC X (G)', 'RBicep_Acc Y Time Series(s)', 'RBicep_ACC Y (G)', 'RBicep_Acc Z Time Series(s)', 'RBicep_ACC Z (G)','RBicep_GyroXTime Series(s)', 'RBicep_GYRO X (deg/s)','RBicep_GyroYTime Series(s)', 'RBicep_GYRO Y (deg/s)', 'RBicep_GyroZTime Series(s)', 'RBicep_GYRO Z (deg/s)',
'LBicep_TimeSeries', 'LBicep_EMG_MilliVolts', 'LBicep_Acc X Time Series(s)', 'LBicep_ACC X (G)', 'LBicep_Acc Y Time Series(s)', 'LBicep_ACC Y (G)', 'LBicep_Acc Z Time Series(s)', 'LBicep_ACC Z (G)','LBicep_GyroXTime Series(s)', 'LBicep_GYRO X (deg/s)','LBicep_GyroYTime Series(s)', 'LBicep_GYRO Y (deg/s)', 'LBicep_GyroZTime Series(s)', 'LBicep_GYRO Z (deg/s)'
]
return df
return df #raw data

def column_clean(df, run_num, gender):
def column_clean(df):
# Remove every time-series column except 'RDelt_EMG_TimeSeries' and 'RDelt_IMU_Acc X Time Series(s)', so that one time scale is kept for EMG and one for IMU
extr_time_series = [ 'RDelt_Acc Y Time Series(s)', 'RDelt_Acc Z Time Series(s)', 'RDelt_GyroXTime Series(s)',
'RDelt_GyroYTime Series(s)', 'RDelt_GyroZTime Series(s)', 'LDelt_TimeSeries', 'LDelt_Acc X Time Series(s)',
Expand All @@ -45,14 +54,65 @@ def column_clean(df, run_num, gender):
'LBicep_GyroYTime Series(s)', 'LBicep_GyroZTime Series(s)']

df = df.drop(extr_time_series, axis = 1)
df = df.rename(columns={'RDelt_EMG_TimeSeries': 'EMG_TimeSeries', 'RDelt_IMU_Acc X Time Series(s)': 'IMU_TimeSeries'})
# measurement_cols = [col for col in df.columns if (('ACC' in col or 'GYRO' in col) and 'Time Series' not in col)] #exclude mV and time cols
# df.columns = df.columns.str.strip() # Remove leading/trailing spaces (Yuxuan)
# df = df.apply(pd.to_numeric, errors='coerce') # Convert everything to numeric (Yuxuan)
df = df.replace(['', ' ', 'NA', None], np.nan) #stdize missing data
df['gender'] = gender
df['run_num'] = run_num
df.to_csv("test.csv")
return df

def preprocessing(full_df):
    """Placeholder: does nothing with ``full_df`` and returns ``None``."""

# Reshape (melt/pivot) the cleaned frame and attach the run metadata
def create_sensor_col(df, run_num, gender, exo):
    """Reshape one run with ``tidy_emg_imu_as_measured`` and tag it with metadata.

    The reshaped frame has its column labels stripped of surrounding
    whitespace and its index reset into regular columns. Three constant
    columns (``gender``, ``run_num``, ``exo``) are then appended.

    Side effect: the result is also written to ``pivoted_df.csv`` in the
    current working directory, overwriting any previous dump.

    Args:
        df: Frame passed unchanged to ``tidy_emg_imu_as_measured``.
        run_num: Value stored in the ``run_num`` column of every row.
        gender: Value stored in the ``gender`` column of every row.
        exo: Value stored in the ``exo`` column of every row.

    Returns:
        The reshaped, annotated DataFrame.
    """
    tidy = tidy_emg_imu_as_measured(df)
    tidy.columns = tidy.columns.str.strip()
    tidy = tidy.reset_index()

    # Same insertion order as before: gender, run_num, exo.
    metadata = (("gender", gender), ("run_num", run_num), ("exo", exo))
    for label, value in metadata:
        tidy[label] = value

    tidy.to_csv("pivoted_df.csv")
    return tidy

def preprocessing_actions(full_df, neural_net=False):
    """Split ``full_df`` into train/test sets and fit the preprocessing transformer.

    Numeric columns are median-imputed and then scaled: ``MinMaxScaler`` when
    ``neural_net`` is true, ``StandardScaler`` otherwise. Categorical columns
    are imputed with their most frequent value and one-hot encoded. The
    transformer is fitted on the training split only and then applied to the
    test split, so no test-set statistics leak into the fit.

    Args:
        full_df: DataFrame holding the ``*_filtered`` numeric columns listed
            below, the categorical columns ``BodyPart`` and ``gender``, and
            the target column ``exo``.
        neural_net: Selects the numeric scaler (see above).

    Returns:
        A tuple ``(X_train_prepared, X_test_prepared, y_train, y_test,
        preprocessing)`` whose last item is the fitted ``ColumnTransformer``.

    Raises:
        KeyError: If a required column is missing from ``full_df``.
    """
    num_attribs = [
        'EMG_MilliVolts_filtered',
        'ACC X (G)_filtered',
        'ACC Y (G)_filtered',
        'ACC Z (G)_filtered',
        'GYRO X (deg/s)_filtered',
        'GYRO Y (deg/s)_filtered',
        'GYRO Z (deg/s)_filtered',
        # Add any other numerical features here
    ]
    cat_attribs = [
        'BodyPart',
        'gender',
        # exo is the target variable
    ]

    # Only the scaler differs between the two modes; the step name is kept
    # as "standardize" in both so existing ``named_steps`` lookups still work.
    scaler = MinMaxScaler() if neural_net else StandardScaler()
    num_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("standardize", scaler),
    ])
    cat_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category absent from the training split
        # is encoded as all zeros instead of raising at transform time
        # (on X_test here, or later when the fitted transformer is reused).
        ("oneHot", OneHotEncoder(handle_unknown="ignore")),
    ])

    # Named ``preprocessor`` so it does not shadow a module-level function.
    preprocessor = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

    # Prepare data for modeling
    X = full_df[num_attribs + cat_attribs]
    y = full_df["exo"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    X_train_prepared = preprocessor.fit_transform(X_train)
    X_test_prepared = preprocessor.transform(X_test)
    return X_train_prepared, X_test_prepared, y_train, y_test, preprocessor
Loading