Skip to content

Commit 6e307d2

Browse files
author
SentienceDEV
committed
dedicated support for read tasks
1 parent 6ac3614 commit 6e307d2

File tree

2 files changed

+357
-42
lines changed

2 files changed

+357
-42
lines changed

predicate/agents/planner_executor_agent.py

Lines changed: 184 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,112 @@ def build_predicate(spec: PredicateSpec | dict[str, Any]) -> Predicate:
965965
raise ValueError(f"Unsupported predicate: {name}")
966966

967967

968+
# ---------------------------------------------------------------------------
969+
# Extraction Keywords for Markdown-based Text Extraction
970+
# ---------------------------------------------------------------------------
971+
972+
# Keywords that indicate a simple text extraction task suitable for read_markdown()
973+
# These tasks don't need LLM-based extraction - just return the page content as markdown
974+
TEXT_EXTRACTION_KEYWORDS = frozenset([
975+
# Direct extraction verbs
976+
"extract",
977+
"read",
978+
"parse",
979+
"scrape",
980+
"get",
981+
"fetch",
982+
"retrieve",
983+
"capture",
984+
"grab",
985+
"copy",
986+
"pull",
987+
# Question words that indicate reading content
988+
"what is",
989+
"what are",
990+
"what's",
991+
"show me",
992+
"tell me",
993+
"find",
994+
"list",
995+
"display",
996+
# Content-specific patterns
997+
"title",
998+
"headline",
999+
"heading",
1000+
"text",
1001+
"content",
1002+
"body",
1003+
"paragraph",
1004+
"article",
1005+
"post",
1006+
"message",
1007+
"description",
1008+
"summary",
1009+
"excerpt",
1010+
# Data extraction patterns
1011+
"price",
1012+
"cost",
1013+
"amount",
1014+
"name",
1015+
"label",
1016+
"value",
1017+
"number",
1018+
"date",
1019+
"time",
1020+
"address",
1021+
"email",
1022+
"phone",
1023+
"rating",
1024+
"review",
1025+
"comment",
1026+
"author",
1027+
"username",
1028+
# Table/list extraction
1029+
"table",
1030+
"row",
1031+
"column",
1032+
"item",
1033+
"entry",
1034+
"record",
1035+
])
1036+
1037+
1038+
def _is_text_extraction_task(task: str) -> bool:
1039+
"""
1040+
Determine if a task is a simple text extraction that can use read_markdown().
1041+
1042+
Returns True if the task contains keywords indicating text extraction,
1043+
where returning the page markdown is sufficient without LLM-based extraction.
1044+
1045+
Args:
1046+
task: The task description to analyze
1047+
1048+
Returns:
1049+
True if this is a text extraction task suitable for read_markdown()
1050+
"""
1051+
if not task:
1052+
return False
1053+
1054+
task_lower = task.lower()
1055+
1056+
# Check for extraction keyword patterns using word boundary matching
1057+
# to avoid false positives (e.g., "time" in "sentiment")
1058+
for keyword in TEXT_EXTRACTION_KEYWORDS:
1059+
# Multi-word keywords (like "what is") use substring matching
1060+
if " " in keyword:
1061+
if keyword in task_lower:
1062+
return True
1063+
else:
1064+
# Single-word keywords use word boundary matching via regex
1065+
# Match keyword at word boundaries, allowing for plurals (optional 's' or 'es')
1066+
# e.g., "title" matches "title", "titles", "title's"
1067+
pattern = rf"\b{re.escape(keyword)}(s|es)?\b"
1068+
if re.search(pattern, task_lower):
1069+
return True
1070+
1071+
return False
1072+
1073+
9681074
# ---------------------------------------------------------------------------
9691075
# Plan Normalization and Validation
9701076
# ---------------------------------------------------------------------------
@@ -4178,39 +4284,89 @@ async def _execute_step(
41784284

41794285
if action_type == "EXTRACT":
41804286
action_taken = "EXTRACT"
4181-
page = (
4182-
getattr(getattr(runtime, "backend", None), "page", None)
4183-
or getattr(getattr(runtime, "backend", None), "_page", None)
4184-
or getattr(runtime, "_legacy_page", None)
4287+
# Determine extraction query from step goal or task
4288+
extract_query = step.goal or (
4289+
self._current_task.task if self._current_task is not None else "Extract relevant data from the current page"
41854290
)
4186-
if page is None:
4187-
error = "No page available for EXTRACT"
4188-
else:
4189-
from types import SimpleNamespace
41904291

4191-
from ..read import extract_async
4292+
# Check if this is a text extraction task that can use markdown-based extraction
4293+
use_markdown_extraction = _is_text_extraction_task(extract_query)
41924294

4193-
browser_like = SimpleNamespace(page=page)
4194-
extract_query = step.goal or (
4195-
self._current_task.task if self._current_task is not None else "Extract relevant data from the current page"
4196-
)
4197-
result = await extract_async(
4198-
browser_like,
4199-
self.planner,
4200-
query=extract_query,
4201-
schema=None,
4202-
)
4203-
llm_resp = getattr(result, "llm_response", None)
4204-
if llm_resp is not None:
4205-
self._record_token_usage("extract", llm_resp)
4206-
if result.ok:
4207-
extraction_succeeded = True
4208-
extracted_data = result.data
4295+
if use_markdown_extraction:
4296+
# Step 1: Get page content as markdown (faster than snapshot-based extraction)
4297+
markdown_content = await runtime.read_markdown(max_chars=8000)
4298+
if markdown_content:
42094299
if self.config.verbose:
4210-
preview = str(result.raw or "")[:160]
4211-
print(f" [ACTION] EXTRACT ok: {preview}", flush=True)
4300+
preview = markdown_content[:160].replace("\n", " ")
4301+
print(f" [ACTION] EXTRACT - got markdown: {preview}...", flush=True)
4302+
4303+
# Step 2: Use LLM (executor) to extract specific data from markdown
4304+
extraction_prompt = f"""You are a text extraction assistant. Given the page content in markdown format, extract the specific information requested.
4305+
4306+
PAGE CONTENT (MARKDOWN):
4307+
{markdown_content}
4308+
4309+
EXTRACTION REQUEST:
4310+
{extract_query}
4311+
4312+
INSTRUCTIONS:
4313+
1. Read the markdown content carefully
4314+
2. Find and extract ONLY the specific information requested
4315+
3. Return ONLY the extracted text, nothing else
4316+
4. If the information is not found, return "NOT_FOUND"
4317+
4318+
EXTRACTED TEXT:"""
4319+
4320+
resp = self.executor.generate(
4321+
"You extract specific text from markdown content. Return only the extracted text.",
4322+
extraction_prompt,
4323+
temperature=0.0,
4324+
max_new_tokens=500,
4325+
)
4326+
self._record_token_usage("extract", resp)
4327+
4328+
extracted_text = resp.content.strip()
4329+
if extracted_text and extracted_text != "NOT_FOUND":
4330+
extraction_succeeded = True
4331+
extracted_data = {"text": extracted_text, "query": extract_query}
4332+
if self.config.verbose:
4333+
print(f" [ACTION] EXTRACT ok: {extracted_text[:160]}", flush=True)
4334+
else:
4335+
error = f"Could not find requested data: {extract_query}"
4336+
else:
4337+
error = "Failed to extract markdown from page"
4338+
else:
4339+
# Use LLM-based extraction for complex extraction tasks
4340+
page = (
4341+
getattr(getattr(runtime, "backend", None), "page", None)
4342+
or getattr(getattr(runtime, "backend", None), "_page", None)
4343+
or getattr(runtime, "_legacy_page", None)
4344+
)
4345+
if page is None:
4346+
error = "No page available for EXTRACT"
42124347
else:
4213-
error = result.error or "Extraction failed"
4348+
from types import SimpleNamespace
4349+
4350+
from ..read import extract_async
4351+
4352+
browser_like = SimpleNamespace(page=page)
4353+
result = await extract_async(
4354+
browser_like,
4355+
self.planner,
4356+
query=extract_query,
4357+
schema=None,
4358+
)
4359+
llm_resp = getattr(result, "llm_response", None)
4360+
if llm_resp is not None:
4361+
self._record_token_usage("extract", llm_resp)
4362+
if result.ok:
4363+
extraction_succeeded = True
4364+
extracted_data = result.data
4365+
if self.config.verbose:
4366+
preview = str(result.raw or "")[:160]
4367+
print(f" [ACTION] EXTRACT ok: {preview}", flush=True)
4368+
else:
4369+
error = result.error or "Extraction failed"
42144370
elif action_type in ("CLICK", "TYPE_AND_SUBMIT"):
42154371
# Try intent heuristics first (if available)
42164372
elements = getattr(ctx.snapshot, "elements", []) or []

0 commit comments

Comments
 (0)