@router.options(
    "/v2/gen_tables/knowledge/embed_url",
    summary="Get CORS preflight options for URL embedding endpoint",
    description="Permissions: None, publicly accessible.",
)
@handle_exception
async def embed_url_options():
    """Answer CORS preflight requests for the URL embedding endpoint."""
    headers = {
        "Allow": "POST, OPTIONS",
        "Access-Control-Allow-Methods": "POST, OPTIONS",
        "Access-Control-Allow-Headers": "Content-Type",
    }
    return Response(content=None, headers=headers)


@router.post(
    "/v2/gen_tables/knowledge/embed_url",
    summary="Embed a URL into a knowledge table.",
    description="Permissions: `organization.MEMBER` OR `project.MEMBER`.",
)
@handle_exception
async def embed_url(
    *,
    request: Request,
    auth_info: Annotated[
        tuple[UserAuth, ProjectRead, OrganizationRead], Depends(auth_user_project)
    ],
    data: Annotated[URLEmbedFormData, Form()],
) -> OkResponse:
    """Fetch a URL, chunk and embed its text content, and store the rows in a knowledge table.

    Mirrors the `embed_file` pipeline: fetch -> parse into chunks -> extract title ->
    embed title/text columns -> add rows.

    Raises:
        BadInputError: If the URL is invalid, cannot be fetched, cannot be parsed,
            or embedding produces no vectors.
    """
    user, project, org = auth_info
    has_permissions(
        user,
        ["organization.MEMBER", "project.MEMBER"],
        organization_id=org.id,
        project_id=project.id,
    )
    # Open the table and check quotas BEFORE fetching the URL so we fail fast
    # and do not spend network egress on requests that would be rejected anyway.
    table = await KnowledgeTable.open_table(
        project_id=project.id,
        table_id=data.table_id,
    )
    request_id: str = request.state.id
    billing: BillingManager = request.state.billing
    billing.has_gen_table_quota(table)
    billing.has_db_storage_quota()
    billing.has_egress_quota()

    # --- Fetch URL content --- #
    logger.info(f'Fetching content from URL "{data.url}".')
    try:
        file_content_str, file_name = await load_url_content(data.url)
        file_content = file_content_str.encode("utf-8")
    except ValueError as e:
        # Chain the cause so the original validation failure is preserved.
        raise BadInputError(f"Invalid URL: {e}") from e
    except Exception as e:
        logger.warning(f'Failed to fetch URL "{data.url}" due to error: {repr(e)}')
        raise BadInputError(f"Failed to fetch URL content: {str(e)}") from e

    # --- Add into Knowledge Table --- #
    logger.info(f'{request_id} - Parsing content from "{data.url}".')
    doc_parser = GeneralDocLoader(request_id=request_id)
    try:
        chunks = await doc_parser.load_document_chunks(
            file_name, file_content, data.chunk_size, data.chunk_overlap
        )
    except BadInputError as e:
        logger.warning(f'Failed to parse content from "{data.url}" due to error: {repr(e)}')
        raise
    except Exception as e:
        logger.warning(f'Failed to parse content from "{data.url}" due to error: {repr(e)}')
        raise BadInputError(
            f'Sorry we encountered an issue while processing content from "{data.url}". '
            "Please ensure the URL is valid and contains parseable content."
        ) from e

    logger.info(
        f'{request_id} - Embedding content from "{data.url}" with {len(chunks):,d} chunks.'
    )

    # --- Extract title --- #
    lm = LMEngine(
        organization=org,
        project=project,
        request=request,
    )
    # Title extraction only needs a short excerpt: first 8 chunks, capped at 50k chars.
    first_page_chunks = [d.text for d in chunks[:8]]
    excerpt = "".join(first_page_chunks)[:50000]
    logger.debug(f"{request_id} - Performing title extraction.")
    title = await lm.generate_title(excerpt=excerpt, model="")

    # --- Embed --- #
    title_embed = text_embeds = None
    for col in table.column_metadata:
        if col.column_id.lower() == "title embed":
            title_embed = await lm.embed_documents(
                model=col.gen_config.embedding_model,
                texts=[title],
                encoding_format="float",
            )
            title_embed = title_embed.data[0].embedding
        elif col.column_id.lower() == "text embed":
            text_embeds = await lm.embed_documents(
                model=col.gen_config.embedding_model,
                texts=[chunk.text for chunk in chunks],
                encoding_format="float",
            )
            # `item`, not `data`, to avoid shadowing the form-data parameter
            # referenced below when building rows.
            text_embeds = [item.embedding for item in text_embeds.data]

    if title_embed is None or text_embeds is None or len(text_embeds) == 0:
        raise BadInputError(
            "Sorry we encountered an issue during embedding. If this issue persists, please contact support."
        )
    # --- Store into Knowledge Table --- #
    # "File ID" holds the source URL so rows can be traced back to their origin.
    row_add_data = [
        {
            "Title": title,
            "Title Embed": title_embed,
            "Text": chunk.text,
            "Text Embed": text_embed,
            "File ID": data.url,
            "Page": chunk.page,
        }
        for chunk, text_embed in zip(chunks, text_embeds, strict=True)
    ]
    await table.add_rows(row_add_data)
    return OkResponse()
Must be >= 0.") + ] = 200 + + @field_validator("url", mode="before") + @classmethod + def validate_url_format(cls, v: str) -> str: + """Validate URL format: must be http or https.""" + if not isinstance(v, str): + raise ValueError("URL must be a string") + v = v.strip() + if not v.startswith(("http://", "https://")): + raise ValueError("URL must start with http:// or https://") + if len(v) < 10: # Minimum viable URL length + raise ValueError("URL is too short") + return v + + class TableDataImportFormData(BaseModel): file: Annotated[UploadFile, File(description="The CSV or TSV file.")] file_name: Annotated[str, Field(description="File name.", deprecated=True)] = "" diff --git a/services/api/src/owl/url_loader.py b/services/api/src/owl/url_loader.py new file mode 100644 index 0000000..4b13436 --- /dev/null +++ b/services/api/src/owl/url_loader.py @@ -0,0 +1,72 @@ +"""URL content loader for knowledge table embedding.""" + +from typing import Tuple +from urllib.parse import urlparse + +import httpx +from bs4 import BeautifulSoup + +# Maximum content size: 50MB +MAX_CONTENT_SIZE = 50 * 1024 * 1024 + + +async def load_url_content(url: str, timeout: int = 30) -> Tuple[str, str]: + """ + Fetch and extract text content from URL. 
+ + Args: + url: The URL to fetch + timeout: Request timeout in seconds + + Returns: + Tuple of (content_text, filename_identifier) + + Raises: + httpx.HTTPError: If the URL cannot be fetched + ValueError: If URL is invalid or content exceeds size limit + """ + try: + async with httpx.AsyncClient(limits=httpx.Limits(max_connections=1)) as client: + response = await client.get( + url, + timeout=timeout, + follow_redirects=True, + headers={"User-Agent": "JamAIBase/1.0"}, + ) + response.raise_for_status() + + # Check Content-Length header before full download + content_length = response.headers.get("content-length") + if content_length: + try: + size = int(content_length) + if size > MAX_CONTENT_SIZE: + raise ValueError( + f"Content size ({size} bytes) exceeds maximum allowed ({MAX_CONTENT_SIZE} bytes)" + ) + except ValueError: + pass # If conversion fails, proceed with download + + except httpx.InvalidURL as e: + raise ValueError(f"Invalid URL: {url}") from e + except httpx.HTTPError as e: + raise ValueError(f"Failed to fetch URL: {str(e)}") from e + + soup = BeautifulSoup(response.content, "html.parser") + + # Remove noise + for tag in soup(["script", "style", "meta", "link"]): + tag.decompose() + + content = soup.get_text(separator="\n", strip=True) + + # Validate extracted content is not empty + if not content or len(content.strip()) < 10: + raise ValueError("URL content is empty or too short") + + # Use domain as filename-like identifier + parsed = urlparse(url) + domain = parsed.netloc.replace("www.", "") + filename = f"{domain}_content.txt" + + return content, filename