-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path mean_std_frame_filter.py
More file actions
221 lines (175 loc) · 9.12 KB
/
mean_std_frame_filter.py
File metadata and controls
221 lines (175 loc) · 9.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import os
import h5py
import pandas as pd
import numpy as np
import yaml
def process_mean_std_filtering(input_folder, filter_params,
        clusters_extension, centers_extension, output_extension, mode):
    """Process all HDF5 files in the input folder for mean/std frame filtering.

    Walks ``input_folder`` recursively, pairs every cluster-centers file
    (``*{centers_extension}.hdf5``) with a cluster file
    (``*{clusters_extension}.hdf5``) in the same directory, filters the
    centers by mean/std frame criteria, and writes both filtered files next
    to the originals with ``output_extension`` inserted before '.hdf5'.

    Parameters
    ----------
    input_folder : str
        Root folder searched recursively for HDF5 files.
    filter_params : dict
        Thresholds passed through to ``mean_std_frame_filter``; expected to
        contain 'mean_frame' and 'std_frame' entries.
    clusters_extension, centers_extension, output_extension : str
        Filename suffixes (before '.hdf5') identifying cluster files,
        centers files, and the output files respectively.
    mode : str
        Unused here; kept for interface compatibility with callers.
    """
    centers_files = []
    for root, _, files in os.walk(input_folder):
        for fname in files:
            if fname.endswith(f'{centers_extension}.hdf5'):
                centers_files.append((root, fname))
    centers_files.sort(key=lambda item: item[1])  # deterministic order by filename
    print(f"Found {len(centers_files)} center files to process")
    for root, center_file in centers_files:
        base_name = center_file.replace(f'{centers_extension}.hdf5', '')
        candidates = [f for f in os.listdir(root)
                      if f.endswith(f'{clusters_extension}.hdf5')]
        # BUGFIX: prefer the cluster file sharing this centers file's base
        # name, so folders holding several datasets pair correctly.  Fall
        # back to any cluster file (the previous behavior) so single-dataset
        # folders with other naming schemes keep working.
        cluster_file = next((f for f in candidates if f.startswith(base_name)),
                            candidates[0] if candidates else None)
        if cluster_file:
            centers_path = os.path.join(root, center_file)
            clusters_path = os.path.join(root, cluster_file)
            out_centers_path = os.path.join(
                root, center_file.replace('.hdf5', f"{output_extension}.hdf5"))
            out_clusters_path = os.path.join(
                root, cluster_file.replace('.hdf5', f"{output_extension}.hdf5"))
            try:
                print(f"\nProcessing files:\n{center_file}\n{cluster_file}")
                # Filter the centers file by mean/std frame criteria.
                filtered_centers, unique_values, min_mean_frame, max_mean_frame = \
                    mean_std_frame_filter(centers_path, filter_params)
                # Load original metadata and record what was done to it.
                _, info1 = load_locs(centers_path)
                _, info2 = load_locs(clusters_path)
                filtering_info = {
                    'Generated by': 'Mean/Std Frame Filtering',
                    'Filter parameters': filter_params
                }
                info1.append(filtering_info)
                info2.append(filtering_info)
                filtered_centers_rec = np.rec.array(
                    filtered_centers.to_records(index=False))
                save_locs(out_centers_path, filtered_centers_rec, info1)
                print(f"Filtered centers data has been saved to '{out_centers_path}'")
                # Keep only cluster rows whose 'group' survived the centers filter.
                filter_and_save_filtered_file(
                    clusters_path, 'locs', 'group', unique_values,
                    out_clusters_path, info2
                )
                print(f"Filtered clusters data has been saved to '{out_clusters_path}'")
                print(f"Successfully processed files")
            except Exception as e:
                # Best-effort batch processing: report and continue with the
                # next file pair rather than aborting the whole run.
                print(f"Error processing {center_file} and {cluster_file}: {str(e)}")
        else:
            print(f"No matching cluster file found for {center_file}")
    print("\nAll files processed!")
def mean_std_frame_filter(input_file_path, filter_params):
    """Filter the 'locs' dataset of an HDF5 file by mean and std frame values.

    Parameters
    ----------
    input_file_path : str
        Path to an HDF5 file containing a 'locs' dataset with at least
        'frame', 'std_frame', and 'group' columns.
    filter_params : dict
        'mean_frame': [min, max] bounds on 'frame', or None to derive the
        bounds automatically as mean +/- 2*std of the 'frame' column.
        'std_frame': [min, max] strict bounds on 'std_frame', or None to
        skip that filter.
        Any additional key naming a DataFrame column is applied as an
        inclusive [min, max] range filter on that column.

    Returns
    -------
    tuple
        (filtered DataFrame, sorted unique remaining 'group' values,
         min_mean_frame, max_mean_frame)
    """
    mean_frame = filter_params.get('mean_frame')
    std_frame = filter_params.get('std_frame')
    with h5py.File(input_file_path, 'r') as hdf5_file:
        locs_data = hdf5_file['locs'][:]
    df = pd.DataFrame(locs_data)
    # Generic inclusive range filters for extra column-named parameters.
    # BUGFIX: skip 'mean_frame'/'std_frame' here -- they are handled below
    # with their dedicated strict bounds.  'std_frame' is usually also a
    # column, so it would otherwise be filtered twice with mixed
    # inclusive/strict semantics (the final result was the same, but only
    # by accident of the strict filter being the tighter one).
    for key, value in filter_params.items():
        if key in ('mean_frame', 'std_frame'):
            continue
        if value is not None and key in df.columns:
            min_val = value[0] if len(value) > 0 else None
            max_val = value[1] if len(value) > 1 else None
            if min_val is not None:
                df = df[df[key] >= min_val]
            if max_val is not None:
                df = df[df[key] <= max_val]
    if mean_frame is None:
        # Derive bounds automatically: mean +/- 2 standard deviations.
        mean_f = df['frame'].mean()
        std_f = df['frame'].std()
        min_mean_frame = mean_f - 2 * std_f
        max_mean_frame = mean_f + 2 * std_f
    else:
        min_mean_frame = mean_frame[0]
        max_mean_frame = mean_frame[1]
    # Strict bounds, matching the original comparison operators.
    filtered_df = df[(df['frame'] > min_mean_frame) & (df['frame'] < max_mean_frame)]
    # BUGFIX: tolerate std_frame=None instead of raising TypeError on
    # subscripting; None now means "no std_frame filter".
    if std_frame is not None:
        filtered_df = filtered_df[(filtered_df['std_frame'] > std_frame[0]) &
                                  (filtered_df['std_frame'] < std_frame[1])]
    unique_values = list(np.unique(filtered_df['group'].to_numpy()))
    return filtered_df, unique_values, min_mean_frame, max_mean_frame
def filter_and_save_filtered_file(input_file_path, dataset_name, column_name, values_to_keep,
                                  output_file_path, info):
    """Copy an HDF5 file, keeping only rows of ``dataset_name`` whose
    ``column_name`` value is in ``values_to_keep``.

    Every other dataset in the file is copied unchanged.  The metadata list
    ``info`` is written to the matching '.yaml' sidecar of the output file.

    Raises
    ------
    ValueError
        If ``dataset_name`` or ``column_name`` is missing from the file.
    """
    # BUGFIX: the original opened the input file, closed it, then reopened
    # it inside the output-file context and also applied
    # .astype(dataset.dtype) twice.  One read-only handle held across the
    # whole copy is sufficient -- h5py allows an independent write handle on
    # a different file at the same time.
    with h5py.File(input_file_path, 'r') as input_file:
        if dataset_name not in input_file:
            raise ValueError(f"Dataset '{dataset_name}' not found in the file.")
        dataset = input_file[dataset_name]
        if column_name not in dataset.dtype.names:
            raise ValueError(f"Column '{column_name}' not found in the dataset '{dataset_name}'.")
        # Boolean mask of the rows to retain.
        rows_to_keep = np.isin(dataset[column_name][:], values_to_keep)
        with h5py.File(output_file_path, 'w') as output_file:
            for ds_name in input_file:
                if ds_name == dataset_name:
                    # Preserve the structured dtype when saving.
                    filtered_data = dataset[...][rows_to_keep].astype(dataset.dtype)
                    output_file.create_dataset(ds_name, data=filtered_data)
                else:
                    output_file.create_dataset(ds_name, data=input_file[ds_name][:])
    save_info(output_file_path.replace('.hdf5', '.yaml'), info)
    print(f"Filtered data has been saved to '{output_file_path}'")
def load_locs(path):
    """Return (record array of localizations, metadata list) for an HDF5 file."""
    with h5py.File(path, "r") as hdf:
        raw = hdf["locs"][...]
    records = np.rec.array(raw, dtype=raw.dtype)
    return records, load_info(path)
def load_info(path):
    """Load metadata from the YAML sidecar file next to ``path``.

    The sidecar shares ``path``'s base name with a '.yaml' extension and may
    contain multiple YAML documents; the list of documents is returned.

    Raises
    ------
    FileNotFoundError
        If the sidecar file does not exist.
    """
    path_base = path.rsplit(".", 1)[0]
    filename = path_base + ".yaml"
    try:
        with open(filename, "r") as info_file:
            # NOTE(security): UnsafeLoader can construct arbitrary Python
            # objects; only load metadata files from trusted sources.
            info = list(yaml.load_all(info_file, Loader=yaml.UnsafeLoader))
    except FileNotFoundError:
        # BUGFIX: the message previously printed the literal '(unknown)'
        # instead of the missing file's name; bare `raise` also preserves
        # the original traceback (FileNotFoundError(e) discarded it).
        print(f"Could not find metadata file: {filename}")
        raise
    return info
def save_locs(path, locs, info):
    """Write localizations to an HDF5 file and metadata to its YAML sidecar."""
    with h5py.File(path, "w") as out:
        out.create_dataset("locs", data=locs)
    yaml_path = path.replace('.hdf5', '.yaml')
    save_info(yaml_path, info)
def save_info(path, info):
    """Dump the list of metadata documents to a YAML file at ``path``."""
    with open(path, "w") as out:
        yaml.dump_all(info, out, default_flow_style=False)
if __name__ == "__main__":
    # Load the run configuration.
    with open('config.yaml', 'r') as f:
        config = yaml.safe_load(f)

    filtering_cfg = config['mean_std_filtering']

    # BUGFIX: the script previously collected individual HDF5 files and
    # called process_mean_std_filtering() with keyword arguments
    # (input_file=, output_dir=, min_std_frame=, ...) that do not exist in
    # its signature, so every call raised TypeError.  The function walks
    # the folder tree itself, so it is now called once with the parameters
    # it actually accepts.
    min_mean = filtering_cfg.get('min_mean_frame')
    max_mean = filtering_cfg.get('max_mean_frame')
    filter_params = {
        # None lets the filter derive bounds as mean +/- 2*std of 'frame'.
        'mean_frame': (None if min_mean is None and max_mean is None
                       else [min_mean, max_mean]),
        'std_frame': [filtering_cfg['min_std_frame'],
                      filtering_cfg['max_std_frame']],
    }
    process_mean_std_filtering(
        input_folder=config['paths']['input_folder'],
        filter_params=filter_params,
        # NOTE(review): 'clusters_extension' was never read from the config
        # in the original script -- confirm the key name against config.yaml.
        clusters_extension=filtering_cfg.get('clusters_extension', ''),
        centers_extension=filtering_cfg['centers_extension'],
        output_extension=filtering_cfg['extension'],
        mode=config['paths']['mode'],
    )