Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions eu_fact_force/ingestion/embedding.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from eu_fact_force.ingestion.models import DocumentChunk
from typing import Iterator

import structlog

from eu_fact_force.ingestion.models import DocumentChunk
from eu_fact_force.utils.decorators import tracker

LOGGER = structlog.get_logger(__name__)

MODEL_ID = "intfloat/multilingual-e5-base"
# E5 models expect "passage: " for documents to index and "query: " for search queries (asymmetric retrieval).
PASSAGE_PREFIX = "passage: "
Expand Down Expand Up @@ -37,11 +43,14 @@ def embed_query(query: str) -> list[float]:
return out.tolist() if hasattr(out, "tolist") else list(out)


def _iter_batches(items: list[DocumentChunk], batch_size: int) -> Iterator[list[DocumentChunk]]:
def _iter_batches(
items: list[DocumentChunk], batch_size: int
) -> Iterator[list[DocumentChunk]]:
for start in range(0, len(items), batch_size):
yield items[start : start + batch_size]


@tracker(ulogger=LOGGER, inputs=True, log_start=True)
def add_embeddings(chunks: list[DocumentChunk]):
"""
Add embeddings to the chunks and update in the DB.
Expand All @@ -61,5 +70,7 @@ def add_embeddings(chunks: list[DocumentChunk]):
normalize_embeddings=True,
)
for chunk, vector in zip(batch, vectors):
chunk.embedding = vector.tolist() if hasattr(vector, "tolist") else list(vector)
chunk.embedding = (
vector.tolist() if hasattr(vector, "tolist") else list(vector)
)
DocumentChunk.objects.bulk_update(batch, ["embedding"])
17 changes: 14 additions & 3 deletions eu_fact_force/ingestion/parsing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,24 @@

from __future__ import annotations

import tempfile
from contextlib import contextmanager
from pathlib import Path
import tempfile

import structlog
from django.core.files.storage import default_storage

from eu_fact_force.exploration.parsing_benchmarking.benchmarking.parsers import (
parse_docling,
)
from eu_fact_force.ingestion.chunking import MAX_CHUNK_CHARS, split_into_paragraph_chunks
from django.core.files.storage import default_storage
from eu_fact_force.ingestion.chunking import (
MAX_CHUNK_CHARS,
split_into_paragraph_chunks,
)
from eu_fact_force.utils.decorators import tracker

LOGGER = structlog.get_logger(__name__)


@contextmanager
def _source_file_local_path(source_file):
Expand Down Expand Up @@ -47,6 +57,7 @@ def _extract_text_from_source_file(source_file) -> str:
return full_text


@tracker(ulogger=LOGGER, inputs=True, log_start=True)
def parse_file(source_file) -> list[str]:
"""
Parse the source file and return paragraph-bounded text chunks.
Expand Down
124 changes: 124 additions & 0 deletions eu_fact_force/utils/decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import functools
import inspect
import logging
import time

import structlog


def _is_structlog_logger(logger) -> bool:
"""Best-effort check to detect structlog loggers."""
if logger is None:
return False
return logger.__class__.__module__.startswith("structlog")


def log_msg(logger, level, msg, extra):
# If this is a structlog logger, keep structured kwargs
if _is_structlog_logger(logger):
if level.lower() == "info":
logger.info(msg, **extra)
elif level.lower() == "debug":
logger.debug(msg, **extra)
else:
logger.log(level.upper(), msg, **extra) # type: ignore[attr-defined]
return

# For non-structlog loggers (e.g. Dagster / stdlib logging),
# stringify the extra dict and log it as part of the message.
if not isinstance(logger, logging.Logger | logging.LoggerAdapter):
logger = logging.getLogger() # root logger

logging_level = getattr(logging, level.upper(), logging.INFO)
extra_str = str(extra) if extra is not None else ""
if extra_str:
logger.log(logging_level, "%s %s", msg, extra_str)
else:
logger.log(logging_level, "%s", msg)


def _build_extra(func_name, args, kwargs, include_inputs):
extra = {"function_": func_name}
if include_inputs:
for k, v in kwargs.items():
extra["args_" + k] = v

for i, v in enumerate(args):
extra["args_" + str(i)] = v
return extra


def _log_start(ulogger, level, log_start, extra):
if log_start:
log_msg(ulogger, level, "start", extra)


def _log_end(ulogger, level, tracker_state: dict) -> None:
outputs = tracker_state["outputs"]
value = tracker_state["value"]
start_time = tracker_state["start_time"]
extra = tracker_state["extra"]
end_time = time.time()
extra["duration_"] = round((end_time - start_time), 3)
if outputs:
extra["return_"] = value
log_msg(ulogger, level, "tracker", extra)


def tracker(
_func=None, ulogger=None, inputs=False, outputs=False, log_start=False, level="info"
):
"""Log the trace of the program"""
if ulogger is None:
ulogger = structlog.get_logger()

def decorator_tracker(func):
if inspect.iscoroutinefunction(func):

@functools.wraps(func)
async def wrapper_logger(*args, **kwargs):
extra = _build_extra(func.__name__, args, kwargs, inputs)
_log_start(ulogger, level, log_start, extra)
start_time = time.time()
value = await func(*args, **kwargs)
_log_end(
ulogger,
level,
{
"outputs": outputs,
"value": value,
"start_time": start_time,
"extra": extra,
},
)

return value

return wrapper_logger
else:

@functools.wraps(func)
def wrapper_logger(*args, **kwargs):
extra = _build_extra(func.__name__, args, kwargs, inputs)
_log_start(ulogger, level, log_start, extra)
start_time = time.time()
value = func(*args, **kwargs)
_log_end(
ulogger,
level,
{
"outputs": outputs,
"value": value,
"start_time": start_time,
"extra": extra,
},
)

return value

return wrapper_logger

if _func is None:
return decorator_tracker
else:
return decorator_tracker(_func)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ dependencies = [
"docling-hierarchical-pdf>=0.1.6",
"requests>=2.32.5",
"arxiv>=2.4.1",
"structlog>=25.5.0",
]

[tool.pytest.ini_options]
Expand Down
11 changes: 11 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading