-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessing.py
More file actions
31 lines (25 loc) · 1.58 KB
/
processing.py
File metadata and controls
31 lines (25 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import csv
import os

import pandas as pd
# Directory that receives the normalized copies of the input files.
OUTPUT_DIR = 'processed_data'

# First and last columns of the contiguous block of base features.
FIRST_BASE_COLUMN = "nuc_1"
LAST_BASE_COLUMN = "nuc_46"


def is_valid_location(location, split_tag):
    """Return True when *location* contains both "data" and *split_tag*.

    This is a plain substring check on the text the user typed; it does not
    verify that the file exists or where it actually lives.
    """
    return "data" in location and split_tag in location


def build_output_path(location):
    """Return the path inside OUTPUT_DIR that mirrors the file name of *location*.

    Only the final path component is kept, so "data/train.csv",
    "./data/train.csv" and an absolute path all map to
    "processed_data/train.csv".
    """
    return os.path.join(OUTPUT_DIR, os.path.basename(location))


def scale_by_max(dataset, column):
    """Divide *column* of *dataset* by its own maximum, in place.

    The column is left untouched when its maximum is 0 or NaN (for example an
    empty column), because dividing would fill it with NaN/inf values.
    """
    peak = dataset[column].max()
    if pd.notna(peak) and peak != 0:
        # Whole-column assignment replaces the column, so an integer column
        # may become float without an in-place upcast.
        dataset[column] = dataset[column] / peak


def normalize_features(dataset):
    """Normalize the base, length and en features of *dataset* in place.

    Columns "nuc_1" through "nuc_46" (label slice, inclusive) are divided by 4;
    "length" and "en" are each divided by their own column maximum.

    Returns the same DataFrame object for convenience.
    Raises KeyError if any of the expected columns is missing.
    """
    base_columns = dataset.loc[:, FIRST_BASE_COLUMN:LAST_BASE_COLUMN].columns
    dataset[base_columns] = dataset[base_columns] / 4
    scale_by_max(dataset, "length")
    scale_by_max(dataset, "en")
    return dataset


def read_dataset(prompt, split_tag, error_message):
    """Ask the user for a CSV location, validate it and load it.

    Prints *error_message* and exits with status 1 when the location does not
    contain both "data" and *split_tag*.

    Returns a ``(location, DataFrame)`` tuple.
    """
    location = input(prompt)
    if not is_valid_location(location, split_tag):
        print(error_message)
        raise SystemExit(1)
    return location, pd.read_csv(location)


def main():
    """Prompt for the training and test sets, normalize them and save them."""
    # Retrieves both the training data and the test data.
    train_location, dataset_train = read_dataset(
        'Enter the location of the training data: ',
        "train",
        "Please enter the location of a training data set from the data directory.",
    )
    test_location, dataset_test = read_dataset(
        'Enter the location of the test data: ',
        "test",
        "Please enter the location of a test data set from the data directory.",
    )

    # NOTE(review): each set is scaled by its own maxima, so the training and
    # test features are not on an identical scale -- confirm this is intended.
    normalize_features(dataset_train)
    normalize_features(dataset_test)

    # Writes the normalized training and test data to the processed_data directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    dataset_train.to_csv(build_output_path(train_location), index=False)
    dataset_test.to_csv(build_output_path(test_location), index=False)


if __name__ == "__main__":
    main()