Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 40 additions & 4 deletions src/filler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pdfrw import PdfReader, PdfWriter
from src.llm import LLM
from datetime import datetime

import os

class Filler:
def __init__(self):
Expand All @@ -23,7 +23,14 @@ def fill_form(self, pdf_form: str, llm: LLM):
t2j = llm.main_loop()
textbox_answers = t2j.get_data() # This is a dictionary

answers_list = list(textbox_answers.values())
answers_list = []
for data in textbox_answers.values():
if isinstance(data, dict):
answers_list.append(data.get("value", ""))
elif isinstance(data, list) and len(data) > 0:
answers_list.append(data[0].get("value", ""))
else:
answers_list.append(str(data))

# Read PDF
pdf = PdfReader(pdf_form)
Expand All @@ -45,8 +52,37 @@ def fill_form(self, pdf_form: str, llm: LLM):
else:
# Stop if we run out of answers
break

PdfWriter().write(output_pdf, pdf)

# --- ZERO-DEPENDENCY AUDIT TRAIL ---
# Create a text file with the same name as the PDF
audit_txt_path = output_pdf.replace(".pdf", "_audit.txt")

with open(audit_txt_path, "w", encoding="utf-8") as f:
f.write("="*60 + "\n")
f.write("FIREFORM AI DATA EXTRACTION AUDIT TRAIL\n")
f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write("="*60 + "\n\n")

for field, data in textbox_answers.items():
# Parse out the value and quote safely
if isinstance(data, dict):
val = data.get("value", "N/A")
quote = data.get("quote", "N/A")
elif isinstance(data, list) and len(data) > 0:
val = data[0].get("value", "N/A")
quote = data[0].get("quote", "N/A")
else:
val, quote = str(data), "N/A"

f.write(f"FIELD : {field}\n")
f.write(f"VALUE : {val}\n")
f.write(f"SOURCE: \"{quote}\"\n")
f.write("-" * 60 + "\n")

print(f"\t[LOG] Audit trail saved to: {audit_txt_path}")
# -----------------------------------

# Your main.py expects this function to return the path
return output_pdf
return output_pdf
46 changes: 33 additions & 13 deletions src/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ def build_prompt(self, current_field):
"""
prompt = f"""
SYSTEM PROMPT:
You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings.
You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return
only a single string containing the identified value for the JSON field.
If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";".
If you don't identify the value in the provided text, return "-1".
You are an AI assistant designed to extract information from transcribed voice recordings.
You must return your answer STRICTLY as a valid JSON object with two keys: "value" and "quote".
- "value": The identified value for the field. If not found, use "-1". If plural, separate with ";".
- "quote": The exact sentence or phrase from the text that justifies your value. If not found, use "N/A".
---
DATA:
Target JSON field to find in text: {current_field}
Expand All @@ -46,7 +45,7 @@ def build_prompt(self, current_field):

def main_loop(self):
# self.type_check_all()
for field in self._target_fields.keys():
for field in self._target_fields:
prompt = self.build_prompt(field)
# print(prompt)
# ollama_url = "http://localhost:11434/api/generate"
Expand All @@ -56,7 +55,8 @@ def main_loop(self):
payload = {
"model": "mistral",
"prompt": prompt,
"stream": False, # don't really know why --> look into this later.
"stream": False, # don't really know why --> look into this later.
"format": "json"
}

try:
Expand All @@ -72,9 +72,15 @@ def main_loop(self):

# parse response
json_data = response.json()
parsed_response = json_data["response"]
# print(parsed_response)
self.add_response_to_json(field, parsed_response)
try:
parsed_obj = json.loads(json_data["response"])
val = parsed_obj.get("value", "-1")
quote = parsed_obj.get("quote", "N/A")
except json.JSONDecodeError:
val = "-1"
quote = "JSON Parse Error"

self.add_response_to_json(field, val, quote)

print("----------------------------------")
print("\t[LOG] Resulting JSON created from the input text:")
Expand All @@ -83,11 +89,19 @@ def main_loop(self):

return self

def add_response_to_json(self, field, value):
def add_response_to_json(self, field, value, quote="N/A"):
"""
this method adds the following value under the specified field,
or under a new field if the field doesn't exist, to the json dict
"""
# SAFETY CHECK: If the LLM returns a list, join it with semicolons
if isinstance(value, list):
value = ";".join([str(v) for v in value])
# If it returns a number or boolean, cast it to a string
elif not isinstance(value, str):
value = str(value)

# Now it's guaranteed to be a string
value = value.strip().replace('"', "")
parsed_value = None

Expand All @@ -96,11 +110,17 @@ def add_response_to_json(self, field, value):

if ";" in value:
parsed_value = self.handle_plural_values(value)

entry = {"value": parsed_value, "quote": quote}

if field in self._json.keys():
self._json[field].append(parsed_value)
# If it's already a list, append. Otherwise convert to list.
if isinstance(self._json[field], list):
self._json[field].append(entry)
else:
self._json[field] = [self._json[field], entry]
else:
self._json[field] = parsed_value
self._json[field] = entry

return

Expand Down
5 changes: 3 additions & 2 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from commonforms import prepare_form
from pypdf import PdfReader
from controller import Controller
from typing import Union

def input_fields(num_fields: int):
fields = []
Expand Down Expand Up @@ -68,7 +69,7 @@ def run_pdf_fill_process(user_input: str, definitions: list, pdf_form_path: Unio
if __name__ == "__main__":
file = "./src/inputs/file.pdf"
user_input = "Hi. The employee's name is John Doe. His job title is managing director. His department supervisor is Jane Doe. His phone number is 123456. His email is jdoe@ucsc.edu. The signature is <Mamañema>, and the date is 01/02/2005"
fields = ["Employee's name", "Employee's job title", "Employee's department supervisor", "Employee's phone number", "Employee's email", "Signature", "Date"]
descriptive_fields = ["Employee's name", "Employee's job title", "Employee's department supervisor", "Employee's phone number", "Employee's email", "Signature", "Date"]
prepared_pdf = "temp_outfile.pdf"
prepare_form(file, prepared_pdf)

Expand All @@ -80,4 +81,4 @@ def run_pdf_fill_process(user_input: str, definitions: list, pdf_form_path: Unio
num_fields = 0

controller = Controller()
controller.fill_form(user_input, fields, file)
controller.fill_form(user_input, descriptive_fields, file)