From 01fa20c20be5a9bc365bfa5a9aa2372ee8059590 Mon Sep 17 00:00:00 2001 From: Cubix33 Date: Mon, 16 Mar 2026 11:02:31 +0000 Subject: [PATCH] #263- standardize date and time using dateparser --- requirements.txt | 3 ++- src/llm.py | 28 +++++++++++++++------------- src/main.py | 5 +++-- src/utils/normalizer.py | 15 +++++++++++++++ 4 files changed, 35 insertions(+), 16 deletions(-) create mode 100644 src/utils/normalizer.py diff --git a/requirements.txt b/requirements.txt index eaa6c81..fe33442 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ sqlmodel pytest httpx numpy<2 -ollama \ No newline at end of file +ollama +dateparser \ No newline at end of file diff --git a/src/llm.py b/src/llm.py index 70937f9..fd6563a 100644 --- a/src/llm.py +++ b/src/llm.py @@ -1,7 +1,7 @@ import json import os import requests - +from src.utils.normalizer import normalize_date class LLM: def __init__(self, transcript_text=None, target_fields=None, json=None): @@ -46,7 +46,7 @@ def build_prompt(self, current_field): def main_loop(self): # self.type_check_all() - for field in self._target_fields.keys(): + for field in self._target_fields: prompt = self.build_prompt(field) # print(prompt) # ollama_url = "http://localhost:11434/api/generate" @@ -82,27 +82,29 @@ def main_loop(self): print("--------- extracted data ---------") return self - - def add_response_to_json(self, field, value): - """ - this method adds the following value under the specified field, - or under a new field if the field doesn't exist, to the json dict - """ + + def add_response_to_json(self, field, value) -> None: value = value.strip().replace('"', "") - parsed_value = None + parsed_value = value - if value != "-1": + # Check if the field is a date field (case-insensitive check) + if "date" in field.lower() or "time" in field.lower(): + parsed_value = normalize_date(value) + elif value != "-1": parsed_value = value if ";" in value: parsed_value = self.handle_plural_values(value) if field in self._json.keys(): - self._json[field].append(parsed_value) + # If it's a list, append; if not, turn it into one + if isinstance(self._json[field], list): + self._json[field].append(parsed_value) + else: + self._json[field] = [self._json[field], parsed_value] else: self._json[field] = parsed_value - - return + def handle_plural_values(self, plural_value): """ diff --git a/src/main.py b/src/main.py index 5bb632b..54f6b0a 100644 --- a/src/main.py +++ b/src/main.py @@ -3,6 +3,7 @@ from commonforms import prepare_form from pypdf import PdfReader from controller import Controller +from typing import Union def input_fields(num_fields: int): fields = [] @@ -68,7 +69,7 @@ def run_pdf_fill_process(user_input: str, definitions: list, pdf_form_path: Unio if __name__ == "__main__": file = "./src/inputs/file.pdf" user_input = "Hi. The employee's name is John Doe. His job title is managing director. His department supervisor is Jane Doe. His phone number is 123456. His email is jdoe@ucsc.edu. The signature is , and the date is 01/02/2005" - fields = ["Employee's name", "Employee's job title", "Employee's department supervisor", "Employee's phone number", "Employee's email", "Signature", "Date"] + descriptive_fields = ["Employee's name", "Employee's job title", "Employee's department supervisor", "Employee's phone number", "Employee's email", "Signature", "Date"] prepared_pdf = "temp_outfile.pdf" prepare_form(file, prepared_pdf) @@ -80,4 +81,4 @@ def run_pdf_fill_process(user_input: str, definitions: list, pdf_form_path: Unio num_fields = 0 controller = Controller() - controller.fill_form(user_input, fields, file) + controller.fill_form(user_input, descriptive_fields, file) diff --git a/src/utils/normalizer.py b/src/utils/normalizer.py new file mode 100644 index 0000000..a343e1e --- /dev/null +++ b/src/utils/normalizer.py @@ -0,0 +1,15 @@ +import dateparser +from datetime import datetime + +def normalize_date(raw_text: str, output_format: str = "%d-%m-%Y") -> str: + # If LLM returned -1 or empty, don't try to parse + if not raw_text or raw_text == "-1": + return raw_text + + # dateparser handles "yesterday", "2 days ago", "last Friday", etc. + parsed_date = dateparser.parse(raw_text) + + if parsed_date: + return parsed_date.strftime(output_format) + + return raw_text # Fallback to raw text if parsing fails \ No newline at end of file