-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdataPreProcessing.py
More file actions
68 lines (54 loc) · 1.95 KB
/
dataPreProcessing.py
File metadata and controls
68 lines (54 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#Example: python dataPreProcessing.py "FiveFoldData" "workdata.csv"
import sys
import os
import numpy as np
from sklearn.preprocessing import Imputer
import warnings
warnings.filterwarnings('ignore')
# Command-line handling. The script needs two positional arguments in addition
# to the script name itself, so sys.argv must hold at least three entries.
if len(sys.argv) < 3:
    print("Wrong No: of Input Parameters")
    print("Required format:")
    print("Argument 1: Folder to store the Five Fold data(Will automatically create the folder if it doesn't exist)")
    print("Argument 2: Name of the data file")
    # Stop here; falling through would only raise IndexError on sys.argv.
    sys.exit(1)

NewDataFolder = sys.argv[1]  # root output folder; read as a global by saveData
DataFile = sys.argv[2]       # path of the comma-delimited input file

# Create the output root up front so the per-fold folders can be added later.
if not os.path.exists(NewDataFolder):
    os.makedirs(NewDataFolder)

# The data file is loaded at the bottom of the script, right before it is
# used; loading it here as well only read the file twice and left an extra
# file handle open.
# Uses scikit-learn's mean imputer. Replace the body of this function to plug
# in a different imputation strategy (one option: K-Means).
# NOTE(review): `Imputer` was removed from scikit-learn in 0.22 in favor of
# `sklearn.impute.SimpleImputer` -- confirm which version this script targets.
def mean_X_Filler(X):
    """Return a copy of X with missing feature values filled in.

    Entries equal to 0.0 are the ones treated as missing. They are replaced
    using the 'mean' strategy along axis 0 (column-wise, per the scikit-learn
    Imputer documentation).

    Args:
        X: 2-D numeric array of feature values.

    Returns:
        The array produced by ``Imputer.fit_transform`` on X.
    """
    return Imputer(missing_values=0.0, strategy='mean', axis=0).fit_transform(X)
def saveData(test, rest, fold):
    """Write one fold's train / validation / test splits to disk.

    The first 75% of the rows of ``rest`` (rounded down) become the training
    set and the remaining rows become the validation set, in row order.
    ``test`` is written unchanged.

    Args:
        test: 2-D array holding the held-out rows of this fold.
        rest: 2-D array holding every row that is not in ``test``.
        fold: Name of the fold sub-directory, as a string (e.g. "1").

    Side effects:
        Creates ``<NewDataFolder>/<fold>`` if it does not exist and writes
        ``train.txt``, ``validation.txt`` and ``test.txt`` into it using
        ``fmt='%f'``. ``NewDataFolder`` is the module-level output root.
    """
    # Build the fold directory path once, with os.path.join, instead of
    # repeating the "/" string concatenation for every file.
    fold_dir = os.path.join(NewDataFolder, fold)
    if not os.path.exists(fold_dir):
        os.makedirs(fold_dir)

    # 75/25 split of the non-test rows; int() truncates toward zero.
    split_at = int(.75 * len(rest))
    train = rest[:split_at, :]
    validation = rest[split_at:, :]

    np.savetxt(os.path.join(fold_dir, "train.txt"), train, fmt='%f')
    np.savetxt(os.path.join(fold_dir, "validation.txt"), validation, fmt='%f')
    np.savetxt(os.path.join(fold_dir, "test.txt"), test, fmt='%f')
def fiveFold(data):
    """Split ``data`` into five folds and save each train/validation/test set.

    The rows are cut into five consecutive chunks in their existing order
    (no shuffling is done here). For fold ``i`` (1-based), chunk ``i`` is the
    test set and the other four chunks, concatenated in order, are handed to
    ``saveData`` as ``rest``.

    When the number of rows is not a multiple of five, the leftover rows are
    spread over the leading folds (``np.array_split`` semantics) rather than
    being dropped, so every input row ends up in exactly one test set.

    Args:
        data: 2-D array whose rows are the samples to distribute.
    """
    fold_count = 5
    chunks = np.array_split(data, fold_count, axis=0)

    for test_idx, test_chunk in enumerate(chunks):
        # Everything except the held-out chunk, kept in original row order.
        remaining = [chunk for idx, chunk in enumerate(chunks) if idx != test_idx]
        rest = np.concatenate(remaining, axis=0)
        # Fold directories are named "1".."5".
        saveData(test_chunk, rest, str(test_idx + 1))
# Load the comma-delimited data file. The context manager closes the file
# handle once loadtxt has consumed it (the bare open() call never closed it).
with open(DataFile) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=",")

# All columns except the last go through imputation; the last column is set
# aside as Y and re-attached untouched.
X = dataset[:, 0:-1]
x_Filled = mean_X_Filler(X)
Y = dataset[:, -1:]  # sliced with -1: so it stays 2-D for the concatenate below

# Re-assemble the rows (imputed columns + last column) and write the folds.
new_data = np.concatenate((x_Filled, Y), axis=1)
fiveFold(new_data)