-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample_cleaner.py
More file actions
113 lines (93 loc) · 3.59 KB
/
sample_cleaner.py
File metadata and controls
113 lines (93 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import sys
import json
import openpyxl
"""
A script built to remove corruped samples from both excel and json metadata
Usage: script.py excel_file json_file corrupted_samples_file
The file containing the corrupted sample names should have one sample name
per line, like so:
AND01301
AND01395
MCN0714
"""
"""
shell command to remove the sample files from the folder
while IFS= read -r file; do cp "folder/with/samples/${file}.R"* output/location/ ; done < corrupted_samples_file.txt
while IFS= read -r file; do rm -- "folder/with/samples/${file}.R"* ; done < corrupted_samples_file.txt
"""
args = sys.argv
if len(args) == 4:
excel_path = args[1]
json_path = args[2]
corrupted_samples_file = args[3]
if not os.path.exists(excel_path):
print("Could not find excel file")
sys.exit(1)
if not os.path.exists(json_path):
print("Could not find json file")
sys.exit(1)
if not os.path.exists(corrupted_samples_file):
print("Could not find corruped samples file")
sys.exit(1)
else:
print("The script needs 3 arguments:")
print("Usage: script.py excel_file json_file corrupted_samples_file")
sys.exit(1)
def clean_excel_rows(excel_path, json_path, corrupted_samples_file):
excel_filepath = excel_path
excel_dirname = os.path.dirname(excel_filepath)
excel_filename = os.path.basename(excel_filepath)
wb_file = openpyxl.load_workbook(excel_filepath, data_only=True)
ws_metadata_lab = wb_file["METADATA_LAB"]
with open(corrupted_samples_file) as f:
lines = f.readlines()
lines = [s.strip('\n') for s in lines]
row_indexes = []
print("Cleaning json data...")
clean_json_data(json_path, lines)
for row in ws_metadata_lab.iter_rows(min_row=1, max_row=ws_metadata_lab.max_row):
flag = False
for cell in row:
for target_string in lines:
if target_string in str(cell.value):
row_indexes.append(cell.row)
flag = True
break
if flag:
break
test_labels=[]
for index in row_indexes:
label = [v.value for v in ws_metadata_lab[index]][3]
test_labels.append(label)
if test_labels == lines:
print("Correctly validated rows to delete, proceeding...")
for row_idx in sorted(row_indexes, reverse=True):
ws_metadata_lab.delete_rows(row_idx)
else:
print("Could not find the correct rows to delete, aborting")
sys.exit(1)
clean_excelname = str("clean_"+excel_filename)
output_excel_path = os.path.join(excel_dirname, clean_excelname)
wb_file.save(output_excel_path)
print("Correctly saved excel file as", output_excel_path)
return
def clean_json_data(json_path, lines):
sample_data_path = json_path
with open(sample_data_path, "r", encoding="utf-8") as fh:
json_data = json.load(fh)
for corrupted_sample in lines:
try:
del json_data[corrupted_sample]
except KeyError as e:
print(f"Sample {e} could not be found in json file")
continue
json_dirname = os.path.dirname(sample_data_path)
json_filename = os.path.basename(sample_data_path)
clean_jsonname = str("clean_"+json_filename)
output_json_path = os.path.join(json_dirname,clean_jsonname)
with open(output_json_path, "w", encoding="utf-8") as fh:
fh.write(json.dumps(json_data, indent=4, sort_keys=True, ensure_ascii=False))
print("Correctly saved json file as", output_json_path)
return
clean_excel_rows(excel_path, json_path, corrupted_samples_file)