-
Notifications
You must be signed in to change notification settings - Fork 1
Persist slide images outside localStorage #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9296c8c
dc98d07
5fc4e06
81f1772
5c6b605
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -5,6 +5,7 @@ | |||||||||||||||||||
| text stay lightweight and do not require Docling. | ||||||||||||||||||||
| """ | ||||||||||||||||||||
|
|
||||||||||||||||||||
| import re | ||||||||||||||||||||
| from dataclasses import dataclass, field | ||||||||||||||||||||
| from pathlib import Path | ||||||||||||||||||||
| from typing import Any, Dict, Optional | ||||||||||||||||||||
|
|
@@ -51,7 +52,7 @@ def _parse_pdf_text(self, path: Path) -> Optional[ParsedDocument]: | |||||||||||||||||||
| try: | ||||||||||||||||||||
| from pypdf import PdfReader | ||||||||||||||||||||
| except ImportError: | ||||||||||||||||||||
| return None | ||||||||||||||||||||
| return self._parse_literal_pdf_text(path) | ||||||||||||||||||||
|
Comment on lines
52
to
+55
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The fallback literal PDF text extractor is extremely fragile. It does not handle FlateDecode stream compression (which is standard for almost all modern PDFs) or hexadecimal strings, among other limitations.
Suggested change
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| try: | ||||||||||||||||||||
| reader = PdfReader(str(path)) | ||||||||||||||||||||
|
|
@@ -60,15 +61,39 @@ def _parse_pdf_text(self, path: Path) -> Optional[ParsedDocument]: | |||||||||||||||||||
| text = page.extract_text() or "" | ||||||||||||||||||||
| if text.strip(): | ||||||||||||||||||||
| pages.append(f"<!-- page: {index} -->\n{text.strip()}") | ||||||||||||||||||||
| if not pages: | ||||||||||||||||||||
| fallback_text = self._extract_literal_pdf_text(path) | ||||||||||||||||||||
| if fallback_text: | ||||||||||||||||||||
| pages.append(f"<!-- page: 1 -->\n{fallback_text}") | ||||||||||||||||||||
| except Exception: | ||||||||||||||||||||
| return None | ||||||||||||||||||||
| return self._parse_literal_pdf_text(path) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| return ParsedDocument( | ||||||||||||||||||||
| filename=path.name, | ||||||||||||||||||||
| normalized_markdown="\n\n".join(pages), | ||||||||||||||||||||
| metadata={"parser": "pypdf", "extension": ".pdf", "pages": len(reader.pages)}, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def _parse_literal_pdf_text(self, path: Path) -> Optional[ParsedDocument]: | ||||||||||||||||||||
| fallback_text = self._extract_literal_pdf_text(path) | ||||||||||||||||||||
| if not fallback_text: | ||||||||||||||||||||
| return None | ||||||||||||||||||||
| return ParsedDocument( | ||||||||||||||||||||
| filename=path.name, | ||||||||||||||||||||
| normalized_markdown=f"<!-- page: 1 -->\n{fallback_text}", | ||||||||||||||||||||
| metadata={"parser": "pypdf", "extension": ".pdf", "pages": 1}, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
|
Comment on lines
+77
to
+85
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| def _extract_literal_pdf_text(self, path: Path) -> str: | ||||||||||||||||||||
| raw = path.read_bytes() | ||||||||||||||||||||
| chunks = [] | ||||||||||||||||||||
| for match in re.findall(rb"\((.*?)\)\s*Tj", raw, flags=re.DOTALL): | ||||||||||||||||||||
| text = match.replace(rb"\(", b"(").replace(rb"\)", b")").replace(rb"\\", b"\\") | ||||||||||||||||||||
| decoded = text.decode("utf-8", errors="ignore").strip() | ||||||||||||||||||||
| if decoded: | ||||||||||||||||||||
| chunks.append(decoded) | ||||||||||||||||||||
| return "\n".join(chunks) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def _parse_with_docling(self, path: Path) -> ParsedDocument: | ||||||||||||||||||||
| try: | ||||||||||||||||||||
| from docling.document_converter import DocumentConverter | ||||||||||||||||||||
|
|
||||||||||||||||||||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -30,7 +30,13 @@ def generate_text(self, prompt: str, system_instruction: Optional[str] = None) - | |||||||||
| if system_instruction: | ||||||||||
| messages.append({"role": "system", "content": system_instruction}) | ||||||||||
| messages.append({"role": "user", "content": prompt}) | ||||||||||
| response = client.chat.completions.create(model=profile.model, messages=messages) | ||||||||||
| kwargs: Dict[str, Any] = { | ||||||||||
| "model": profile.model, | ||||||||||
| "messages": messages, | ||||||||||
| } | ||||||||||
| if profile.thinking in {"enabled", "disabled"}: | ||||||||||
| kwargs["extra_body"] = {"thinking": {"type": profile.thinking}} | ||||||||||
|
Comment on lines
+37
to
+38
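There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar to the client configuration, passing `extra_body` with `thinking` set to `"disabled"` can break OpenAI-compatible endpoints that do not support the `thinking` parameter.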
Suggested change
|
||||||||||
| response = client.chat.completions.create(**kwargs) | ||||||||||
| if not response.choices: | ||||||||||
| return "" | ||||||||||
| return response.choices[0].message.content or "" | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| import unittest | ||
| from types import SimpleNamespace | ||
| from unittest.mock import patch | ||
|
|
||
| from src.model_profiles import ModelProfile, ModelProfileSet | ||
| from src.model_router import ModelRouter | ||
|
|
||
|
|
||
| class ModelRouterTest(unittest.TestCase): | ||
| def test_generate_text_passes_thinking_extra_body(self): | ||
| profiles = ModelProfileSet( | ||
| prompt=ModelProfile( | ||
| role="prompt", | ||
| model="text-model", | ||
| base_url="https://text.example/v1", | ||
| api_key="text-key", | ||
| thinking="enabled", | ||
| ), | ||
| image=ModelProfile( | ||
| role="image", | ||
| model="image-model", | ||
| base_url="https://image.example/v1", | ||
| api_key="image-key", | ||
| adapter="raw_chat_multimodal", | ||
| ), | ||
| edit=ModelProfile( | ||
| role="edit", | ||
| model="image-model", | ||
| base_url="https://image.example/v1", | ||
| api_key="image-key", | ||
| adapter="raw_chat_multimodal", | ||
| ), | ||
| ) | ||
|
|
||
| response = SimpleNamespace( | ||
| choices=[SimpleNamespace(message=SimpleNamespace(content="ok"))] | ||
| ) | ||
|
|
||
| with patch("src.model_router.OpenAI") as openai_cls: | ||
| create = openai_cls.return_value.chat.completions.create | ||
| create.return_value = response | ||
|
|
||
| result = ModelRouter(profiles).generate_text("hello") | ||
|
|
||
| self.assertEqual(result, "ok") | ||
| create.assert_called_once() | ||
| self.assertEqual( | ||
| create.call_args.kwargs["extra_body"], | ||
| {"thinking": {"type": "enabled"}}, | ||
| ) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| unittest.main() |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Passing `extra_body` with `"thinking": {"type": "disabled"}` by default when thinking is disabled will break compatibility with standard OpenAI-compatible endpoints that do not support the `thinking` parameter (resulting in a 400 Bad Request). We should only pass the `thinking` parameter in `extra_body` when it is explicitly set to `"enabled"`.