Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions services/api/src/owl/routers/gen_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,10 @@
TableMetaResponse,
TableSchemaCreate,
TableType,
URLEmbedFormData,
UserAuth,
)
from owl.url_loader import load_url_content
from owl.utils.auth import auth_user_project, has_permissions
from owl.utils.billing import BillingManager
from owl.utils.exceptions import (
Expand Down Expand Up @@ -890,6 +892,21 @@ async def embed_file_options():
return Response(content=None, headers=headers)


@router.options(
    "/v2/gen_tables/knowledge/embed_url",
    summary="Get CORS preflight options for URL embedding endpoint",
    description="Permissions: None, publicly accessible.",
)
@handle_exception
async def embed_url_options():
    """Answer CORS preflight requests for the URL embedding endpoint."""
    allowed_methods = "POST, OPTIONS"
    return Response(
        content=None,
        headers={
            "Allow": allowed_methods,
            "Access-Control-Allow-Methods": allowed_methods,
            "Access-Control-Allow-Headers": "Content-Type",
        },
    )


@router.post(
"/v2/gen_tables/knowledge/embed_file",
summary="Embed a file into a knowledge table.",
Expand Down Expand Up @@ -1015,6 +1032,117 @@ async def embed_file(
return OkResponse()


@router.post(
    "/v2/gen_tables/knowledge/embed_url",
    summary="Embed a URL into a knowledge table.",
    description="Permissions: `organization.MEMBER` OR `project.MEMBER`.",
)
@handle_exception
async def embed_url(
    *,
    request: Request,
    auth_info: Annotated[
        tuple[UserAuth, ProjectRead, OrganizationRead], Depends(auth_user_project)
    ],
    data: Annotated[URLEmbedFormData, Form()],
) -> OkResponse:
    """Fetch a URL, chunk and embed its text content, and store it in a knowledge table.

    Pipeline: permission check -> fetch + parse URL -> quota checks -> chunking ->
    title extraction -> embedding -> row insertion.

    Args:
        request: Incoming request; supplies `state.id` and `state.billing`.
        auth_info: Authenticated (user, project, organization) tuple.
        data: URL embedding form data (url, table_id, chunk_size, chunk_overlap).

    Returns:
        OkResponse on success.

    Raises:
        BadInputError: If the URL cannot be fetched or parsed, or embedding fails.
    """
    user, project, org = auth_info
    has_permissions(
        user,
        ["organization.MEMBER", "project.MEMBER"],
        organization_id=org.id,
        project_id=project.id,
    )
    # --- Fetch URL content --- #
    logger.info(f'Fetching content from URL "{data.url}".')
    try:
        file_content_str, file_name = await load_url_content(data.url)
        file_content = file_content_str.encode("utf-8")
    except ValueError as e:
        # Chain the cause so the original validation failure is preserved.
        raise BadInputError(f"Invalid URL: {e}") from e
    except Exception as e:
        logger.warning(f'Failed to fetch URL "{data.url}" due to error: {repr(e)}')
        raise BadInputError(f"Failed to fetch URL content: {str(e)}") from e

    table = await KnowledgeTable.open_table(
        project_id=project.id,
        table_id=data.table_id,
    )
    # Check quota before doing any expensive parsing or embedding work.
    request_id: str = request.state.id
    billing: BillingManager = request.state.billing
    billing.has_gen_table_quota(table)
    billing.has_db_storage_quota()
    billing.has_egress_quota()

    # --- Add into Knowledge Table --- #
    logger.info(f'{request_id} - Parsing content from "{data.url}".')
    doc_parser = GeneralDocLoader(request_id=request_id)
    try:
        chunks = await doc_parser.load_document_chunks(
            file_name, file_content, data.chunk_size, data.chunk_overlap
        )
    except BadInputError as e:
        logger.warning(f'Failed to parse content from "{data.url}" due to error: {repr(e)}')
        raise
    except Exception as e:
        logger.warning(f'Failed to parse content from "{data.url}" due to error: {repr(e)}')
        raise BadInputError(
            f'Sorry we encountered an issue while processing content from "{data.url}". '
            "Please ensure the URL is valid and contains parseable content."
        ) from e

    logger.info(f'{request_id} - Embedding content from "{data.url}" with {len(chunks):,d} chunks.')

    # --- Extract title --- #
    lm = LMEngine(
        organization=org,
        project=project,
        request=request,
    )
    # Use only an excerpt of the first few chunks to keep the title prompt bounded.
    first_page_chunks = [d.text for d in chunks[:8]]
    excerpt = "".join(first_page_chunks)[:50000]
    logger.debug(f"{request_id} - Performing title extraction.")
    title = await lm.generate_title(excerpt=excerpt, model="")

    # --- Embed --- #
    title_embed = text_embeds = None
    for col in table.column_metadata:
        if col.column_id.lower() == "title embed":
            title_response = await lm.embed_documents(
                model=col.gen_config.embedding_model,
                texts=[title],
                encoding_format="float",
            )
            title_embed = title_response.data[0].embedding
        elif col.column_id.lower() == "text embed":
            text_response = await lm.embed_documents(
                model=col.gen_config.embedding_model,
                texts=[chunk.text for chunk in chunks],
                encoding_format="float",
            )
            # Loop variable deliberately not named `data` to avoid shadowing the form payload.
            text_embeds = [item.embedding for item in text_response.data]

    if title_embed is None or text_embeds is None or len(text_embeds) == 0:
        raise BadInputError(
            "Sorry we encountered an issue during embedding. If this issue persists, please contact support."
        )
    # --- Store into Knowledge Table --- #
    row_add_data = [
        {
            "Title": title,
            "Title Embed": title_embed,
            "Text": chunk.text,
            "Text Embed": text_embed,
            # The source URL doubles as the file identifier for URL-sourced rows.
            "File ID": data.url,
            "Page": chunk.page,
        }
        for chunk, text_embed in zip(chunks, text_embeds, strict=True)
    ]
    await table.add_rows(row_add_data)
    return OkResponse()


@router.post(
"/v2/gen_tables/{table_type}/import_data",
summary="Import data into a table.",
Expand Down
24 changes: 24 additions & 0 deletions services/api/src/owl/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,30 @@ class FileEmbedFormData(BaseModel):
] = 200


class URLEmbedFormData(BaseModel):
    """Form payload for embedding the text content of a URL into a knowledge table."""

    # Absolute http(s) URL; format is enforced by `validate_url_format` below.
    url: Annotated[str, Field(description="The URL to extract content from.")]
    table_id: Annotated[SanitisedNonEmptyStr, Field(description="Knowledge Table ID.")]
    chunk_size: Annotated[
        int, Field(gt=0, description="Maximum chunk size (number of characters). Must be > 0.")
    ] = 2000
    chunk_overlap: Annotated[
        int, Field(ge=0, description="Overlap in characters between chunks. Must be >= 0.")
    ] = 200

    @field_validator("url", mode="before")
    @classmethod
    def validate_url_format(cls, v: str) -> str:
        """Validate URL format: must be a string, http or https, and of plausible length."""
        if not isinstance(v, str):
            raise ValueError("URL must be a string")
        v = v.strip()
        if not v.startswith(("http://", "https://")):
            raise ValueError("URL must start with http:// or https://")
        if len(v) < 10:  # Minimum viable URL length, e.g. "http://a.b"
            raise ValueError("URL is too short")
        return v


class TableDataImportFormData(BaseModel):
file: Annotated[UploadFile, File(description="The CSV or TSV file.")]
file_name: Annotated[str, Field(description="File name.", deprecated=True)] = ""
Expand Down
72 changes: 72 additions & 0 deletions services/api/src/owl/url_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""URL content loader for knowledge table embedding."""

from typing import Tuple
from urllib.parse import urlparse

import httpx
from bs4 import BeautifulSoup

# Maximum content size: 50MB
MAX_CONTENT_SIZE = 50 * 1024 * 1024


async def load_url_content(url: str, timeout: int = 30) -> Tuple[str, str]:
    """
    Fetch and extract text content from URL.

    Args:
        url: The URL to fetch
        timeout: Request timeout in seconds

    Returns:
        Tuple of (content_text, filename_identifier)

    Raises:
        ValueError: If the URL is invalid, cannot be fetched, the content exceeds
            the size limit, or the extracted text is empty.
    """
    try:
        async with httpx.AsyncClient(limits=httpx.Limits(max_connections=1)) as client:
            response = await client.get(
                url,
                timeout=timeout,
                follow_redirects=True,
                headers={"User-Agent": "JamAIBase/1.0"},
            )
            response.raise_for_status()
    except httpx.InvalidURL as e:
        raise ValueError(f"Invalid URL: {url}") from e
    except httpx.HTTPError as e:
        raise ValueError(f"Failed to fetch URL: {str(e)}") from e

    # Enforce the size limit outside the fetch try-block.
    # BUG FIX: the original raised the "size exceeds maximum" ValueError inside a
    # `try ... except ValueError: pass` meant for malformed Content-Length headers,
    # so the limit error was swallowed and never enforced.
    content_length = response.headers.get("content-length")
    if content_length is not None:
        try:
            declared_size = int(content_length)
        except ValueError:
            declared_size = None  # Malformed header; fall back to the actual size check.
        if declared_size is not None and declared_size > MAX_CONTENT_SIZE:
            raise ValueError(
                f"Content size ({declared_size} bytes) exceeds maximum allowed ({MAX_CONTENT_SIZE} bytes)"
            )
    # Also check the actual downloaded size: the header may be absent or wrong.
    actual_size = len(response.content)
    if actual_size > MAX_CONTENT_SIZE:
        raise ValueError(
            f"Content size ({actual_size} bytes) exceeds maximum allowed ({MAX_CONTENT_SIZE} bytes)"
        )

    soup = BeautifulSoup(response.content, "html.parser")

    # Remove non-content tags before text extraction.
    for tag in soup(["script", "style", "meta", "link"]):
        tag.decompose()

    content = soup.get_text(separator="\n", strip=True)

    # Validate extracted content is not empty
    if not content or len(content.strip()) < 10:
        raise ValueError("URL content is empty or too short")

    # Use domain as filename-like identifier
    parsed = urlparse(url)
    domain = parsed.netloc.replace("www.", "")
    filename = f"{domain}_content.txt"

    return content, filename