From 3f6b614877f72bf1b64eb5831580cb317b60462c Mon Sep 17 00:00:00 2001 From: Cubix33 Date: Thu, 19 Mar 2026 19:34:38 +0000 Subject: [PATCH 1/3] #297 - audit trail --- src/filler.py | 44 ++++++++++++++++++++++++++++++++++++++++---- src/llm.py | 46 +++++++++++++++++++++++++++++++++------------- src/main.py | 5 +++-- 3 files changed, 76 insertions(+), 19 deletions(-) diff --git a/src/filler.py b/src/filler.py index e31e535..48232f5 100644 --- a/src/filler.py +++ b/src/filler.py @@ -1,7 +1,7 @@ from pdfrw import PdfReader, PdfWriter from src.llm import LLM from datetime import datetime - +import os class Filler: def __init__(self): @@ -23,7 +23,14 @@ def fill_form(self, pdf_form: str, llm: LLM): t2j = llm.main_loop() textbox_answers = t2j.get_data() # This is a dictionary - answers_list = list(textbox_answers.values()) + answers_list = [] + for data in textbox_answers.values(): + if isinstance(data, dict): + answers_list.append(data.get("value", "")) + elif isinstance(data, list) and len(data) > 0: + answers_list.append(data[0].get("value", "")) + else: + answers_list.append(str(data)) # Read PDF pdf = PdfReader(pdf_form) @@ -45,8 +52,37 @@ def fill_form(self, pdf_form: str, llm: LLM): else: # Stop if we run out of answers break - + PdfWriter().write(output_pdf, pdf) + # --- ZERO-DEPENDENCY AUDIT TRAIL --- + # Create a text file with the same name as the PDF + audit_txt_path = output_pdf.replace(".pdf", "_audit.txt") + + with open(audit_txt_path, "w", encoding="utf-8") as f: + f.write("="*60 + "\n") + f.write("FIREFORM AI DATA EXTRACTION AUDIT TRAIL\n") + f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write("="*60 + "\n\n") + + for field, data in textbox_answers.items(): + # Parse out the value and quote safely + if isinstance(data, dict): + val = data.get("value", "N/A") + quote = data.get("quote", "N/A") + elif isinstance(data, list) and len(data) > 0: + val = data[0].get("value", "N/A") + quote = data[0].get("quote", "N/A") + else: + val, quote = str(data), "N/A" + + f.write(f"FIELD : {field}\n") + f.write(f"VALUE : {val}\n") + f.write(f"SOURCE: \"{quote}\"\n") + f.write("-" * 60 + "\n") + + print(f"\t[LOG] Audit trail saved to: {audit_txt_path}") + # ----------------------------------- + # Your main.py expects this function to return the path - return output_pdf + return output_pdf \ No newline at end of file diff --git a/src/llm.py b/src/llm.py index 70937f9..bf5dd8f 100644 --- a/src/llm.py +++ b/src/llm.py @@ -30,11 +30,10 @@ def build_prompt(self, current_field): """ prompt = f""" SYSTEM PROMPT: - You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings. - You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return - only a single string containing the identified value for the JSON field. - If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";". - If you don't identify the value in the provided text, return "-1". + You are an AI assistant designed to extract information from transcribed voice recordings. + You must return your answer STRICTLY as a valid JSON object with two keys: "value" and "quote". + - "value": The identified value for the field. If not found, use "-1". If plural, separate with ";". + - "quote": The exact sentence or phrase from the text that justifies your value. If not found, use "N/A". --- DATA: Target JSON field to find in text: {current_field} @@ -46,7 +45,7 @@ def build_prompt(self, current_field): def main_loop(self): # self.type_check_all() - for field in self._target_fields.keys(): + for field in self._target_fields: prompt = self.build_prompt(field) # print(prompt) # ollama_url = "http://localhost:11434/api/generate" @@ -56,7 +55,8 @@ def main_loop(self): payload = { "model": "mistral", "prompt": prompt, - "stream": False, # don't really know why --> look into this later. + "stream": False, # don't really know why --> look into this later. + "format": "json" } try: @@ -72,9 +72,15 @@ def main_loop(self): # parse response json_data = response.json() - parsed_response = json_data["response"] - # print(parsed_response) - self.add_response_to_json(field, parsed_response) + try: + parsed_obj = json.loads(json_data["response"]) + val = parsed_obj.get("value", "-1") + quote = parsed_obj.get("quote", "N/A") + except json.JSONDecodeError: + val = "-1" + quote = "JSON Parse Error" + + self.add_response_to_json(field, val, quote) print("----------------------------------") print("\t[LOG] Resulting JSON created from the input text:") @@ -83,11 +89,19 @@ def main_loop(self): return self - def add_response_to_json(self, field, value): + def add_response_to_json(self, field, value, quote="N/A"): """ this method adds the following value under the specified field, or under a new field if the field doesn't exist, to the json dict """ + # SAFETY CHECK: If the LLM returns a list, join it with semicolons + if isinstance(value, list): + value = ";".join([str(v) for v in value]) + # If it returns a number or boolean, cast it to a string + elif not isinstance(value, str): + value = str(value) + + # Now it's guaranteed to be a string value = value.strip().replace('"', "") parsed_value = None @@ -96,11 +110,17 @@ def add_response_to_json(self, field, value): if ";" in value: parsed_value = self.handle_plural_values(value) + + entry = {"value": parsed_value, "quote": quote} if field in self._json.keys(): - self._json[field].append(parsed_value) + # If it's already a list, append. Otherwise convert to list. + if isinstance(self._json[field], list): + self._json[field].append(entry) + else: + self._json[field] = [self._json[field], entry] else: - self._json[field] = parsed_value + self._json[field] = entry return diff --git a/src/main.py b/src/main.py index 5bb632b..54f6b0a 100644 --- a/src/main.py +++ b/src/main.py @@ -3,6 +3,7 @@ from commonforms import prepare_form from pypdf import PdfReader from controller import Controller +from typing import Union def input_fields(num_fields: int): fields = [] @@ -68,7 +69,7 @@ def run_pdf_fill_process(user_input: str, definitions: list, pdf_form_path: Unio if __name__ == "__main__": file = "./src/inputs/file.pdf" user_input = "Hi. The employee's name is John Doe. His job title is managing director. His department supervisor is Jane Doe. His phone number is 123456. His email is jdoe@ucsc.edu. The signature is , and the date is 01/02/2005" - fields = ["Employee's name", "Employee's job title", "Employee's department supervisor", "Employee's phone number", "Employee's email", "Signature", "Date"] + descriptive_fields = ["Employee's name", "Employee's job title", "Employee's department supervisor", "Employee's phone number", "Employee's email", "Signature", "Date"] prepared_pdf = "temp_outfile.pdf" prepare_form(file, prepared_pdf) @@ -80,4 +81,4 @@ def run_pdf_fill_process(user_input: str, definitions: list, pdf_form_path: Unio num_fields = 0 controller = Controller() - controller.fill_form(user_input, fields, file) + controller.fill_form(user_input, descriptive_fields, file) From 4976c6f8da8048d68c78240ccdc2ac3a8eb42ed1 Mon Sep 17 00:00:00 2001 From: Cubix33 Date: Thu, 19 Mar 2026 19:37:19 +0000 Subject: [PATCH 2/3] #297 - audit trail --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 54f6b0a..9fa14c1 100644 --- a/src/main.py +++ b/src/main.py @@ -74,7 +74,7 @@ def run_pdf_fill_process(user_input: str, definitions: list, pdf_form_path: Unio prepare_form(file, prepared_pdf) reader = PdfReader(prepared_pdf) - fields = reader.get_fields() + fields = reader.get_fields()) if(fields): num_fields = len(fields) else: From 6593d26599a92282c5fa8dceea72c9bd307e3547 Mon Sep 17 00:00:00 2001 From: Cubix33 Date: Thu, 19 Mar 2026 19:40:44 +0000 Subject: [PATCH 3/3] fixed syntax in main.py --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 9fa14c1..54f6b0a 100644 --- a/src/main.py +++ b/src/main.py @@ -74,7 +74,7 @@ def run_pdf_fill_process(user_input: str, definitions: list, pdf_form_path: Unio prepare_form(file, prepared_pdf) reader = PdfReader(prepared_pdf) - fields = reader.get_fields()) + fields = reader.get_fields() if(fields): num_fields = len(fields) else: