-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_test_data.py
More file actions
431 lines (374 loc) · 16.7 KB
/
generate_test_data.py
File metadata and controls
431 lines (374 loc) · 16.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
#!/usr/bin/env python3
"""
Generate test DataSheet.xlsx and PrescriptionLines.xlsx files populated with live database data.
Reads seed data from the configured database (CURRENT_DATABASE in columns.py), converts it to
datasheet column names using types.csv, and writes the result into copies of the initialiser
templates so that the output files preserve all formatting and validation rules.
Output files are written to the test_data/ directory:
- test_data/DataSheet.xlsx — patient data (up to 5 records)
- test_data/PrescriptionLines.xlsx — prescription lines (up to 10 records)
Usage:
python generate_test_data.py
"""
import os
import sys
sys.path.append(os.path.abspath(os.path.dirname(__file__)))
import pandas as pd
from sqlalchemy import create_engine, text, inspect
from columns import CURRENT_DATABASE, Types
from openpyxl import load_workbook
def get_database_connection():
"""Connect to the database."""
db = CURRENT_DATABASE
connection_string = f'postgresql+psycopg2://{db["user_name"]}:{db["password"]}@{db["host_name"]}:{db["port_number"]}/{db["database_name"]}'
engine = create_engine(connection_string)
return engine
def export_patient_data_sample(engine, limit=5):
"""
Export a sample of patient records from the database.
Joins admin with all sub-tables (pastmedicalhistory, socialhistory, vitalstatistics,
heightandweight, visualacuity, dental, fallrisk, physiotherapy, doctorsconsultation)
and returns a flat DataFrame with CSV-prefixed column names (e.g. vs_temperature).
Args:
engine: SQLAlchemy engine connected to the sothea database.
limit (int): Maximum number of patient records to return. Defaults to 5.
Returns:
pd.DataFrame: Flat patient data with CSV-style column names.
"""
query = f"""SELECT
a.id,
a.vid,
a.family_group AS a_family_group,
a.reg_date AS a_reg_date,
a.queue_no AS a_queue_no,
a.name AS a_name,
a.khmer_name AS a_khmer_name,
a.dob AS a_dob,
a.age AS a_age,
a.gender AS a_gender,
a.village AS a_village,
a.contact_no AS a_contact_no,
a.pregnant AS a_pregnant,
a.last_menstrual_period AS a_last_menstrual_period,
a.drug_allergies AS a_drug_allergies,
a.sent_to_id AS a_sent_to_id,
pmh.cough AS pmh_cough,
pmh.fever AS pmh_fever,
pmh.blocked_nose AS pmh_blocked_nose,
pmh.sore_throat AS pmh_sore_throat,
pmh.night_sweats AS pmh_night_sweats,
pmh.unintentional_weight_loss AS pmh_unintentional_weight_loss,
pmh.tuberculosis AS pmh_tuberculosis,
pmh.tuberculosis_has_been_treated AS pmh_tuberculosis_has_been_treated,
pmh.diabetes AS pmh_diabetes,
pmh.hypertension AS pmh_hypertension,
pmh.hyperlipidemia AS pmh_hyperlipidemia,
pmh.chronic_joint_pains AS pmh_chronic_joint_pains,
pmh.chronic_muscle_aches AS pmh_chronic_muscle_aches,
pmh.sexually_transmitted_disease AS pmh_sexually_transmitted_disease,
pmh.specified_stds AS pmh_specified_stds,
pmh.others AS pmh_others,
sh.past_smoking_history AS sh_past_smoking_history,
sh.no_of_years AS sh_no_of_years,
sh.current_smoking_history AS sh_current_smoking_history,
sh.cigarettes_per_day AS sh_cigarettes_per_day,
sh.alcohol_history AS sh_alcohol_history,
sh.how_regular AS sh_how_regular,
vs.temperature AS vs_temperature,
vs.spo2 AS vs_spo2,
vs.systolic_bp1 AS vs_systolic_bp1,
vs.diastolic_bp1 AS vs_diastolic_bp1,
vs.systolic_bp2 AS vs_systolic_bp2,
vs.diastolic_bp2 AS vs_diastolic_bp2,
vs.avg_systolic_bp AS vs_avg_systolic_bp,
vs.avg_diastolic_bp AS vs_avg_diastolic_bp,
vs.hr1 AS vs_hr1,
vs.hr2 AS vs_hr2,
vs.avg_hr AS vs_avg_hr,
vs.rand_blood_glucose_mmol_l AS vs_rand_blood_glucose_mmoll,
vs.icope_high_bp AS vs_icope_high_bp,
haw.height AS haw_height,
haw.weight AS haw_weight,
haw.bmi AS haw_bmi,
haw.bmi_analysis AS haw_bmi_analysis,
haw.paeds_height AS haw_paeds_height,
haw.paeds_weight AS haw_paeds_weight,
haw.icope_lost_weight_past_months AS haw_icope_lost_weight_past_months,
haw.icope_no_desire_to_eat AS haw_icope_no_desire_to_eat,
va.l_eye_vision AS va_l_eye_vision,
va.r_eye_vision AS va_r_eye_vision,
va.sent_to_opto AS va_sent_to_opto,
va.referred_for_glasses AS va_referred_for_glasses,
va.icope_eye_problem AS va_icope_eye_problem,
va.icope_treated_for_diabetes_or_bp AS va_icope_treated_for_diabetes_or_bp,
va.additional_intervention AS va_additional_intervention,
d.fluoride_exposure AS d_fluoride_exposure,
d.diet AS d_diet,
d.bacterial_exposure AS d_bacterial_exposure,
d.oral_symptoms AS d_oral_symptoms,
d.drink_other_water AS d_drink_other_water,
d.risk_for_dental_carries AS d_risk_for_dental_carries,
d.icope_difficulty_chewing AS d_icope_difficulty_chewing,
d.icope_pain_in_mouth AS d_icope_pain_in_mouth,
d.dental_notes AS d_dental_notes,
fr.side_to_side_balance AS fr_side_to_side_balance,
fr.semi_tandem_balance AS fr_semi_tandem_balance,
fr.tandem_balance AS fr_tandem_balance,
fr.gait_speed_test AS fr_gait_speed_test,
fr.chair_stand_test AS fr_chair_stand_test,
fr.fall_risk_score AS fr_fall_risk_score,
fr.icope_complete_chair_stands AS fr_icope_complete_chair_stands,
fr.icope_chair_stands_time AS fr_icope_chair_stands_time,
dc.well AS dc_well,
dc.msk AS dc_msk,
dc.cvs AS dc_cvs,
dc.respi AS dc_respi,
dc.gu AS dc_gu,
dc.git AS dc_git,
dc.eye AS dc_eye,
dc.derm AS dc_derm,
dc.others AS dc_others,
dc.consultation_notes AS dc_consultation_notes,
dc.diagnosis AS dc_diagnosis,
dc.treatment AS dc_treatment,
dc.referral_needed AS dc_referral_needed,
dc.referral_loc AS dc_referral_loc,
dc.remarks AS dc_remarks,
p.subjective_assessment AS p_subjective_assessment,
p.pain_scale AS p_pain_scale,
p.objective_assessment AS p_objective_assessment,
p.intervention AS p_intervention,
p.evaluation AS p_evaluation
FROM admin a
LEFT JOIN pastmedicalhistory pmh ON a.id = pmh.id AND a.vid = pmh.vid
LEFT JOIN socialhistory sh ON a.id = sh.id AND a.vid = sh.vid
LEFT JOIN vitalstatistics vs ON a.id = vs.id AND a.vid = vs.vid
LEFT JOIN heightandweight haw ON a.id = haw.id AND a.vid = haw.vid
LEFT JOIN visualacuity va ON a.id = va.id AND a.vid = va.vid
LEFT JOIN dental d ON a.id = d.id AND a.vid = d.vid
LEFT JOIN fallrisk fr ON a.id = fr.id AND a.vid = fr.vid
LEFT JOIN physiotherapy p ON a.id = p.id AND a.vid = p.vid
LEFT JOIN doctorsconsultation dc ON a.id = dc.id AND a.vid = dc.vid
ORDER BY a.id, a.vid
LIMIT {limit};
"""
df = pd.read_sql(query, engine)
return df
def export_prescription_lines_sample(engine, limit=10):
"""
Export a sample of prescription lines from the database.
Dynamically detects the drug code column name (drug_code) and joins prescription_lines
with prescriptions, admin, and drugs. Returns one row per prescription line with
CSV-prefixed column names (e.g. pl_drug_code, pl_dose_amount).
Batch allocation columns are not included here; they are added as empty columns by
convert_prescription_to_datasheet_format().
Args:
engine: SQLAlchemy engine connected to the sothea database.
limit (int): Maximum number of prescription line records to return. Defaults to 10.
Returns:
pd.DataFrame: Prescription lines with CSV-style column names.
"""
# First check what columns exist in drugs table
with engine.connect() as conn:
result = conn.execute(text("SELECT column_name FROM information_schema.columns WHERE table_name = 'drugs'"))
drug_columns = [row[0] for row in result]
# Use drug_code if available, otherwise try other possible column names
drug_code_col = None
for col_name in ['drug_code', 'atc_code', 'code']:
if col_name in drug_columns:
drug_code_col = col_name
break
if drug_code_col is None:
print(" WARNING: No drug code column found, using generic_name only")
drug_code_col = "NULL"
drug_code_select = "NULL AS pl_drug_code"
else:
drug_code_select = f"d.{drug_code_col} AS pl_drug_code"
query = f"""SELECT
pl.id AS pl_id,
a.id AS patient_id,
a.vid,
{drug_code_select},
COALESCE(d.generic_name, d.brand_name) AS pl_drug_name,
pl.dose_amount AS pl_dose_amount,
pl.dose_unit AS pl_dose_unit,
pl.frequency_code AS pl_frequency_code,
pl.duration AS pl_duration,
pl.duration_unit AS pl_duration_unit,
pl.prn AS pl_prn,
pl.remarks AS pl_remarks
FROM prescription_lines pl
JOIN prescriptions p ON pl.prescription_id = p.id
JOIN admin a ON p.patient_id = a.id AND p.vid = a.vid
JOIN drugs d ON pl.drug_id = d.id
ORDER BY pl.id
LIMIT {limit};
"""
df = pd.read_sql(query, engine)
return df
def convert_to_datasheet_format(df, types):
"""
Convert a patient data DataFrame from CSV column names to datasheet column names.
Renames columns using the csv_name -> datasheet_name mapping from types.csv, then
reorders them to match the template column order (Registration ID first, then Date,
Family Group, followed by all other fields in types.csv order).
Args:
df (pd.DataFrame): Patient data with CSV-prefixed column names.
types (Types): Loaded types.csv descriptor.
Returns:
pd.DataFrame: DataFrame with datasheet column names in template order.
"""
# Create mapping from CSV names to datasheet names
csv_to_datasheet_map = {}
for category in types.categories:
if category.name.lower() != 'prescription lines':
for field in category.fields:
csv_to_datasheet_map[field.csv_name] = field.datasheet_name
# Rename columns
df_renamed = df.copy()
for csv_name, datasheet_name in csv_to_datasheet_map.items():
if csv_name in df_renamed.columns:
df_renamed = df_renamed.rename(columns={csv_name: datasheet_name})
# Reorder columns to match template order
datasheet_columns = []
id_field = None
date_field = None
family_group_field = None
other_fields = []
for category in types.categories:
if category.name.lower() != 'prescription lines':
for field in category.fields:
if field.sql_name == 'vid':
continue
if field.sql_name == 'id':
id_field = field.datasheet_name
elif field.sql_name == 'reg_date':
date_field = field.datasheet_name
elif field.sql_name == 'family_group':
family_group_field = field.datasheet_name
else:
other_fields.append(field.datasheet_name)
# Build column order: id, date, family_group, then everything else
ordered_columns = []
if id_field and id_field in df_renamed.columns:
ordered_columns.append(id_field)
if date_field and date_field in df_renamed.columns:
ordered_columns.append(date_field)
if family_group_field and family_group_field in df_renamed.columns:
ordered_columns.append(family_group_field)
# Add other columns that exist in the dataframe
for col in other_fields:
if col in df_renamed.columns:
ordered_columns.append(col)
# Select only columns that exist and are in the ordered list
existing_ordered = [col for col in ordered_columns if col in df_renamed.columns]
df_final = df_renamed[existing_ordered]
return df_final
def convert_prescription_to_datasheet_format(df, types):
"""
Convert a prescription lines DataFrame from CSV column names to datasheet column names.
Renames columns using the csv_name -> datasheet_name mapping from types.csv (Prescription
Lines category only), reorders them to match the template, then appends 10 empty batch
allocation column triplets (Batch Number N, Location N, Quantity N).
Args:
df (pd.DataFrame): Prescription lines with CSV-prefixed column names.
types (Types): Loaded types.csv descriptor.
Returns:
pd.DataFrame: DataFrame with datasheet column names, including empty batch columns.
"""
csv_to_datasheet_map = {}
for category in types.categories:
if category.name.lower() == 'prescription lines':
for field in category.fields:
csv_to_datasheet_map[field.csv_name] = field.datasheet_name
# Rename columns
df_renamed = df.copy()
for csv_name, datasheet_name in csv_to_datasheet_map.items():
if csv_name in df_renamed.columns:
df_renamed = df_renamed.rename(columns={csv_name: datasheet_name})
# Get ordered columns from types
ordered_columns = []
for category in types.categories:
if category.name.lower() == 'prescription lines':
for field in category.fields:
if field.datasheet_name in df_renamed.columns:
ordered_columns.append(field.datasheet_name)
# Select columns in order
existing_ordered = [col for col in ordered_columns if col in df_renamed.columns]
df_final = df_renamed[existing_ordered]
# Add empty batch allocation columns
for i in range(1, 11):
batch_col = f'Batch Number {i}'
location_col = f'Location {i}'
quantity_col = f'Quantity {i}'
if batch_col not in df_final.columns:
df_final[batch_col] = None
if location_col not in df_final.columns:
df_final[location_col] = None
if quantity_col not in df_final.columns:
df_final[quantity_col] = None
return df_final
def main():
"""
Generate test_data/DataSheet.xlsx and test_data/PrescriptionLines.xlsx from live database data.
Connects to the database, exports patient and prescription line samples, converts them to
datasheet format, then writes them into copies of the initialiser templates so that all
column formatting and validation rules are preserved.
"""
print("Connecting to database...")
engine = get_database_connection()
print("Loading types...")
types = Types("types.csv")
print("Exporting patient data sample...")
patient_df = export_patient_data_sample(engine, limit=5)
print(f" Exported {len(patient_df)} patient records")
if len(patient_df) == 0:
print("ERROR: No patient data found in database!")
return
print("Converting patient data to datasheet format...")
datasheet_df = convert_to_datasheet_format(patient_df, types)
print("Exporting prescription lines sample...")
prescription_df = export_prescription_lines_sample(engine, limit=10)
print(f" Exported {len(prescription_df)} prescription line records")
print("Converting prescription lines to datasheet format...")
prescription_datasheet_df = convert_prescription_to_datasheet_format(prescription_df, types)
# Load the template to preserve formatting
print("Loading templates...")
template_path = "initialiser/DataSheet.xlsx"
if not os.path.exists(template_path):
print(f"ERROR: Template not found at {template_path}")
return
wb = load_workbook(template_path)
ws = wb["DataSheet"]
ws_prescription = wb["Prescription Lines"]
# Clear existing data (keep headers)
print("Writing patient data to DataSheet...")
ws.delete_rows(2, ws.max_row) # Delete all rows except header
for idx, row in datasheet_df.iterrows():
ws.append(list(row))
print("Writing prescription lines to Prescription Lines sheet...")
ws_prescription.delete_rows(2, ws_prescription.max_row)
for idx, row in prescription_datasheet_df.iterrows():
ws_prescription.append(list(row))
output_path = "test_data/DataSheet.xlsx"
os.makedirs("test_data", exist_ok=True)
wb.save(output_path)
print(f"✓ Saved: {output_path}")
# Also create standalone PrescriptionLines.xlsx
print("Creating standalone PrescriptionLines.xlsx...")
template_prescription_path = "initialiser/PrescriptionLines.xlsx"
if os.path.exists(template_prescription_path):
wb2 = load_workbook(template_prescription_path)
ws2 = wb2["Prescription Lines"]
ws2.delete_rows(2, ws2.max_row)
for idx, row in prescription_datasheet_df.iterrows():
ws2.append(list(row))
output_path2 = "test_data/PrescriptionLines.xlsx"
wb2.save(output_path2)
print(f"✓ Saved: {output_path2}")
print("\n✓ Test files generated successfully!")
print(f" - {output_path}")
if os.path.exists("test_data/PrescriptionLines.xlsx"):
print(f" - test_data/PrescriptionLines.xlsx")
if __name__ == "__main__":
main()