Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions src/pmotools/pmo_builder/pmo_updater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/env python3

import pandas as pd
from pmotools.pmo_engine.pmo_processor import PMOProcessor
from datetime import datetime


class PMOUpdater(object):
    """
    Helpers that update the metadata of an already loaded PMO in place.
    """

    @staticmethod
    def check_if_date_yyyy_mm_or_yyyy_mm_dd(date_string: str) -> bool:
        """
        Checks if a string is in YYYY-MM or YYYY-MM-DD format.

        The month and day must be zero-padded ("2024-01-05", not "2024-1-5")
        and the value must be a real calendar date.

        :param date_string: the string to be checked
        :return: True if the string matches either format, False otherwise
            (including when the input is not a string)
        """
        if not isinstance(date_string, str):
            return False
        # strptime on its own accepts one-digit months/days, so the expected
        # total length is checked first to enforce zero padding.
        for date_format, expected_len in (("%Y-%m-%d", 10), ("%Y-%m", 7)):
            if len(date_string) != expected_len:
                continue
            try:
                datetime.strptime(date_string, date_format)
            except ValueError:
                return False
            return True
        return False  # Does not match either format

    @staticmethod
    def update_specimen_meta_with_traveler_info(
        pmo,
        traveler_info: pd.DataFrame,
        specimen_name_col: str = "specimen_name",
        travel_country_col: str = "travel_country",
        travel_start_col: str = "travel_start_date",
        travel_end_col: str = "travel_end_date",
        bed_net_usage_col: str = None,
        geo_admin1_col: str = None,
        geo_admin2_col: str = None,
        geo_admin3_col: str = None,
        lat_lon_col: str = None,
        replace_current_traveler_info: bool = False,
    ):
        """
        Update a PMO's specimen's metadata with travel info

        Each row of the traveler table is stored as one entry in the
        "travel_out_six_month" list of the matching specimen. Entries keep the
        column names of the input table as their keys, minus the specimen name
        column. All rows are validated before the PMO is touched, so a failed
        call leaves the PMO unchanged.

        :param pmo: the PMO to update, will directly modify this PMO
        :param traveler_info: the traveler info
        :param specimen_name_col: the specimen name column within the traveler input table
        :param travel_country_col: the column name containing the traveled to country
        :param travel_start_col: the column name containing the traveled start date, format YYYY-MM-DD or YYYY-MM
        :param travel_end_col: the column name containing the traveled end date, format YYYY-MM-DD or YYYY-MM
        :param bed_net_usage_col: (Optional) a number between 0 - 1 for rough frequency of bednet usage while traveling
        :param geo_admin1_col: (Optional) the column name containing the traveled to country admin level 1 info
        :param geo_admin2_col: (Optional) the column name containing the traveled to country admin level 2 info
        :param geo_admin3_col: (Optional) the column name containing the traveled to country admin level 3 info
        :param lat_lon_col: (Optional) the latitude and longitude column name containing the region traveled to latitude and longitude
        :param replace_current_traveler_info: whether to replace current travel info
        :return: a reference to the updated PMO
        :raises ValueError: if columns are missing from the table, if the table
            names specimens that are not in the PMO, or if a start/end date is
            missing or not in YYYY-MM / YYYY-MM-DD format
        """
        required_cols = [
            specimen_name_col,
            travel_country_col,
            travel_start_col,
            travel_end_col,
        ]
        # optional columns are only required when the caller names them
        required_cols.extend(
            col
            for col in (
                bed_net_usage_col,
                geo_admin1_col,
                geo_admin2_col,
                geo_admin3_col,
                lat_lon_col,
            )
            if col is not None
        )

        missing_cols = [
            col for col in required_cols if col not in traveler_info.columns
        ]
        if missing_cols:
            raise ValueError(
                "missing traveler_info columns: "
                + ",".join(missing_cols)
                + "; columns in table: "
                + ",".join(str(col) for col in traveler_info.columns)
            )

        specimen_names_in_pmo = set(PMOProcessor.get_specimen_names(pmo))
        specimen_names_in_traveler_info = set(
            traveler_info[specimen_name_col].astype(str).tolist()
        )

        # check to see if provided traveler info for a specimen that cannot be found in PMO
        missing_traveler_specs = specimen_names_in_traveler_info - specimen_names_in_pmo

        if missing_traveler_specs:
            raise ValueError(
                f"Provided traveler info for the following specimens but they are missing from the PMO: {sorted(missing_traveler_specs)}"
            )
        traveler_info_records = traveler_info[required_cols].to_dict(orient="records")
        # values are used below as positions within pmo["specimen_info"]
        spec_indexs = PMOProcessor.get_index_key_of_specimen_names(pmo)

        # Validate every record before modifying the PMO so that a bad row
        # cannot leave the PMO partially updated (or with cleared travel lists).
        for travel_rec in traveler_info_records:
            specimen_name = str(travel_rec[specimen_name_col])
            for date_col in (travel_start_col, travel_end_col):
                val = travel_rec[date_col]
                if pd.isna(val):
                    raise ValueError(
                        f"Missing required date value in column '{date_col}' for specimen '{specimen_name}'"
                    )
                val_str = str(val)
                if not PMOUpdater.check_if_date_yyyy_mm_or_yyyy_mm_dd(val_str):
                    raise ValueError(
                        f"Invalid date format in '{date_col}' for specimen '{specimen_name}': '{val_str}'. "
                        f"Expected YYYY-MM or YYYY-MM-DD"
                    )

        # prep traveler info lists, clear the list if we are replacing or start an empty list to append to if none exist already
        for specimen_name in specimen_names_in_traveler_info:
            specimen_meta = pmo["specimen_info"][spec_indexs[specimen_name]]
            if (
                replace_current_traveler_info
                or "travel_out_six_month" not in specimen_meta
            ):
                specimen_meta["travel_out_six_month"] = []

        # add in the travel records, the specimen name is the lookup key and
        # is not stored inside the record itself
        for travel_rec in traveler_info_records:
            specimen_name = str(travel_rec.pop(specimen_name_col))
            pmo["specimen_info"][spec_indexs[specimen_name]][
                "travel_out_six_month"
            ].append(travel_rec)
        return pmo
4 changes: 2 additions & 2 deletions src/pmotools/pmo_engine/pmo_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import os
from typing import NamedTuple
import copy
import pandas
import pandas as pd

from collections import defaultdict
from pmotools.pmo_engine.pmo_checker import PMOChecker

Expand Down Expand Up @@ -407,7 +407,7 @@ def list_library_sample_names_per_specimen_name(
pmodata,
select_specimen_ids: list[int] = None,
select_specimen_names: list[str] = None,
) -> pandas.DataFrame:
) -> pd.DataFrame:
"""
List all the library_sample_names per specimen_name
:param pmodata: the PMO
Expand Down
228 changes: 228 additions & 0 deletions tests/test_pmo_builder/test_pmo_updater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
#!/usr/bin/env python3

import os
import unittest
import pandas as pd
from pmotools.pmo_builder.pmo_updater import PMOUpdater


class TestPMOUpdater(unittest.TestCase):
    """Unit tests for PMOUpdater's date check and traveler-info update."""

    # keyword arguments naming the optional columns of the traveler table
    _OPTIONAL_COLUMN_KWARGS = {
        "bed_net_usage_col": "bed_net",
        "geo_admin1_col": "admin1",
        "geo_admin2_col": "admin2",
        "geo_admin3_col": "admin3",
        "lat_lon_col": "latlon",
    }

    @staticmethod
    def _bare_pmo():
        """Build a PMO holding two specimens without any travel info."""
        return {
            "specimen_info": [
                {"specimen_name": name} for name in ("spec1", "spec2")
            ],
        }

    @staticmethod
    def _optional_traveler_info():
        """Build a traveler table that carries every optional column."""
        columns = {}
        columns["specimen_name"] = ["spec1", "spec2"]
        columns["travel_country"] = ["Kenya", "Tanzania"]
        columns["travel_start_date"] = ["2024-01", "2024-02"]
        columns["travel_end_date"] = ["2024-01-20", "2024-02-15"]
        columns["bed_net"] = [0.50, 0.0]
        columns["admin1"] = ["Nairobi", "Dar es Salaam"]
        columns["admin2"] = ["SubCounty1", "SubCounty2"]
        columns["admin3"] = ["Ward1", "Ward2"]
        columns["latlon"] = ["-1.2921,36.8219", "-6.7924,39.2083"]
        return pd.DataFrame(columns)

    @staticmethod
    def _expected_pmo_with_optional():
        """Build the PMO expected after loading the optional-column table."""
        kenya_trip = {
            "travel_country": "Kenya",
            "travel_start_date": "2024-01",
            "travel_end_date": "2024-01-20",
            "bed_net": 0.5,
            "admin1": "Nairobi",
            "admin2": "SubCounty1",
            "admin3": "Ward1",
            "latlon": "-1.2921,36.8219",
        }
        tanzania_trip = {
            "travel_country": "Tanzania",
            "travel_start_date": "2024-02",
            "travel_end_date": "2024-02-15",
            "bed_net": 0.0,
            "admin1": "Dar es Salaam",
            "admin2": "SubCounty2",
            "admin3": "Ward2",
            "latlon": "-6.7924,39.2083",
        }
        return {
            "specimen_info": [
                {"specimen_name": "spec1", "travel_out_six_month": [kenya_trip]},
                {"specimen_name": "spec2", "travel_out_six_month": [tanzania_trip]},
            ]
        }

    def setUp(self):
        # directory that holds this test module
        this_file = os.path.abspath(__file__)
        self.working_dir = os.path.dirname(this_file)

    def test_check_if_date_yyyy_mm_or_yyyy_mm_dd(self):
        is_valid = PMOUpdater.check_if_date_yyyy_mm_or_yyyy_mm_dd
        for rejected in ("2023/11/24", "11-24-2023", "invalid-date"):
            self.assertFalse(is_valid(rejected))

        for accepted in ("2023-11-24", "2023-11"):
            self.assertTrue(is_valid(accepted))

    def test_update_specimen_meta_with_traveler_info(self):
        def trip(country, start, end):
            return {
                "travel_country": country,
                "travel_start_date": start,
                "travel_end_date": end,
            }

        pmo_under_test = self._bare_pmo()
        trips = pd.DataFrame(
            [
                ("spec1", "Kenya", "2024-01", "2024-02"),
                ("spec1", "Kenya", "2024-04", "2024-06"),
                ("spec2", "Tanzania", "2024-02-15", "2024-02-27"),
            ],
            columns=[
                "specimen_name",
                "travel_country",
                "travel_start_date",
                "travel_end_date",
            ],
        )

        PMOUpdater.update_specimen_meta_with_traveler_info(pmo_under_test, trips)

        expected_pmo = {
            "specimen_info": [
                {
                    "specimen_name": "spec1",
                    "travel_out_six_month": [
                        trip("Kenya", "2024-01", "2024-02"),
                        trip("Kenya", "2024-04", "2024-06"),
                    ],
                },
                {
                    "specimen_name": "spec2",
                    "travel_out_six_month": [
                        trip("Tanzania", "2024-02-15", "2024-02-27"),
                    ],
                },
            ]
        }
        self.assertEqual(expected_pmo, pmo_under_test)

    def test_update_specimen_meta_with_traveler_info_raises(self):
        pmo_under_test = self._bare_pmo()
        # the first start date ("24-01") is deliberately malformed
        bad_trips = pd.DataFrame(
            {
                "specimen_name": ["spec1", "spec2"],
                "travel_country": ["Kenya", "Tanzania"],
                "travel_start_date": ["24-01", "2024-02"],
                "travel_end_date": ["2024-02-05", "2024-03"],
            }
        )

        self.assertRaises(
            ValueError,
            PMOUpdater.update_specimen_meta_with_traveler_info,
            pmo_under_test,
            bad_trips,
        )

    def test_update_specimen_meta_with_traveler_info_with_optional(self):
        pmo_under_test = self._bare_pmo()

        PMOUpdater.update_specimen_meta_with_traveler_info(
            pmo_under_test,
            self._optional_traveler_info(),
            **self._OPTIONAL_COLUMN_KWARGS,
        )

        self.assertEqual(self._expected_pmo_with_optional(), pmo_under_test)

    def test_update_specimen_meta_with_traveler_info_with_optional_replace_old(self):
        # spec1 starts out with travel info that the update must discard
        stale_trips = [
            {
                "travel_country": "Kenya",
                "travel_start_date": start,
                "travel_end_date": end,
            }
            for start, end in (("2024-01", "2024-02"), ("2024-04", "2024-06"))
        ]
        pmo_under_test = {
            "specimen_info": [
                {"specimen_name": "spec1", "travel_out_six_month": stale_trips},
                {"specimen_name": "spec2"},
            ],
        }

        PMOUpdater.update_specimen_meta_with_traveler_info(
            pmo_under_test,
            self._optional_traveler_info(),
            replace_current_traveler_info=True,
            **self._OPTIONAL_COLUMN_KWARGS,
        )

        self.assertEqual(self._expected_pmo_with_optional(), pmo_under_test)


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()