-
Notifications
You must be signed in to change notification settings - Fork 39
feat: ingest URL content into knowledge table #39
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -929,6 +929,30 @@ class FileEmbedFormData(BaseModel): | |
| ] = 200 | ||
|
|
||
|
|
||
class URLEmbedFormData(BaseModel):
    """Form data for embedding the text content of a web page into a knowledge table."""

    # The page to fetch and embed.
    url: Annotated[str, Field(description="The URL to extract content from.")]
    # Destination knowledge table.
    table_id: Annotated[SanitisedNonEmptyStr, Field(description="Knowledge Table ID.")]
    # Chunking parameters mirror the file-embed form defaults.
    chunk_size: Annotated[
        int, Field(gt=0, description="Maximum chunk size (number of characters). Must be > 0.")
    ] = 2000
    chunk_overlap: Annotated[
        int, Field(ge=0, description="Overlap in characters between chunks. Must be >= 0.")
    ] = 200

    @field_validator("url", mode="before")
    @classmethod
    def validate_url_format(cls, v: str) -> str:
        """Validate URL format: must be http or https."""
        if not isinstance(v, str):
            raise ValueError("URL must be a string")
        stripped = v.strip()
        has_scheme = stripped.startswith("http://") or stripped.startswith("https://")
        if not has_scheme:
            raise ValueError("URL must start with http:// or https://")
        # Reject degenerate inputs shorter than a minimal scheme + host.
        if len(stripped) < 10:
            raise ValueError("URL is too short")
        return stripped
|
|
||
|
|
||
| class TableDataImportFormData(BaseModel): | ||
| file: Annotated[UploadFile, File(description="The CSV or TSV file.")] | ||
| file_name: Annotated[str, Field(description="File name.", deprecated=True)] = "" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| """URL content loader for knowledge table embedding.""" | ||
|
|
||
| from typing import Tuple | ||
| from urllib.parse import urlparse | ||
|
|
||
| import httpx | ||
| from bs4 import BeautifulSoup | ||
|
|
||
| # Maximum content size: 50MB | ||
| MAX_CONTENT_SIZE = 50 * 1024 * 1024 | ||
|
|
||
|
|
||
async def load_url_content(url: str, timeout: int = 30) -> Tuple[str, str]:
    """
    Fetch and extract text content from URL.

    Args:
        url: The URL to fetch
        timeout: Request timeout in seconds

    Returns:
        Tuple of (content_text, filename_identifier)

    Raises:
        httpx.HTTPError: If the URL cannot be fetched
        ValueError: If URL is invalid or content exceeds size limit
    """
    try:
        async with httpx.AsyncClient(limits=httpx.Limits(max_connections=1)) as client:
            response = await client.get(
                url,
                timeout=timeout,
                follow_redirects=True,
                headers={"User-Agent": "JamAIBase/1.0"},
            )
            response.raise_for_status()

            # Enforce the size limit using the Content-Length header.
            # NOTE: client.get() has already buffered the full body by this point,
            # so this rejects oversized responses but does not avoid downloading
            # them; client.stream() would be needed to abort the transfer early.
            content_length = response.headers.get("content-length")
            if content_length:
                # BUG FIX: the int() conversion is isolated in its own try/except
                # so that a malformed header is tolerated WITHOUT also swallowing
                # the size-limit ValueError raised below (the original wrapped the
                # raise inside `except ValueError: pass`, disabling the limit).
                try:
                    size = int(content_length)
                except ValueError:
                    size = None  # Malformed header; proceed with the download
                if size is not None and size > MAX_CONTENT_SIZE:
                    raise ValueError(
                        f"Content size ({size} bytes) exceeds maximum allowed ({MAX_CONTENT_SIZE} bytes)"
                    )
    except httpx.InvalidURL as e:
        raise ValueError(f"Invalid URL: {url}") from e
    except httpx.HTTPError as e:
        raise ValueError(f"Failed to fetch URL: {str(e)}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Remove noise (non-content tags) before extracting text.
    for tag in soup(["script", "style", "meta", "link"]):
        tag.decompose()

    content = soup.get_text(separator="\n", strip=True)

    # Validate extracted content is not empty
    if not content or len(content.strip()) < 10:
        raise ValueError("URL content is empty or too short")

    # Use domain as filename-like identifier.
    # BUG FIX: strip "www." only as a leading prefix; str.replace() removed it
    # anywhere in the host (e.g. "awww.example.com" -> "aexample.com").
    parsed = urlparse(url)
    netloc = parsed.netloc
    domain = netloc[4:] if netloc.startswith("www.") else netloc
    filename = f"{domain}_content.txt"

    return content, filename
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If possible, I suggest merging this with
`/v2/gen_tables/knowledge/embed_file`, by merging `URLEmbedFormData` into `FileEmbedFormData`. Then we can just check if
`file` is `None` and/or `url` is `None`.