notbadai · hnipun · Oct 27, 2025
diff --git a/websearch/MANIFEST.in b/websearch/MANIFEST.in
@@ -0,0 +1 @@
+include readme.md
diff --git a/websearch/notbadai_websearch/__init__.py b/websearch/notbadai_websearch/__init__.py
@@ -0,0 +1,90 @@
+import asyncio
+import time
+import json
+from string import Template
+from typing import List
+from pathlib import Path
+
+from notbadai_ide import api, START_METADATA, END_METADATA
+
+from .common.llm import call_llm
+from .common.utils import parse_prompt, extract_code_block
+from .common.prompt import build_context
+from .common.formatting import markdown_section
+from .websearch import WebSearch
+from .crawler import Crawler
+
+
+def websearch(query: str) -> List[str]:
+    start_time = time.time()
+    urls = WebSearch(query).search()
+    api.chat(f'{START_METADATA}{len(urls)} search results ({int(time.time() - start_time)}s): {", ".join(urls)}{END_METADATA}')
+    results = asyncio.run(Crawler(urls, query).run())
+
+    res = []
+    for result in results:
+        if result.markdown:
+            api.log(result.markdown.fit_markdown)
+            res.append(result.markdown.fit_markdown)
+        else:
+            continue
+
+    return res
+
+
+def get_prompt_template(template_path: str, **kwargs) -> str:
+    path = Path(__file__).parent / f'{template_path}.md'
+    with open(str(path)) as f:
+        template = Template(f.read())
+
+    return template.substitute(kwargs)
+
+
+def parse_query_json(content: str) -> str:
+    try:
+        data = json.loads(content.strip())
+        if 'query' not in data:
+            raise ValueError("JSON response missing 'query' field")
+        return data['query']
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Failed to parse JSON response: {e}")
+
+
+def start():
+    """Main extension function that handles chat interactions with the AI assistant."""
+    command, model, prompt = parse_prompt()
+    selection = api.get_selection()
+    chat_history = api.get_chat_history()
+    prompt = api.get_prompt()
+
+    api.chat(f'{START_METADATA}model: {model}, command: {command}{END_METADATA}')
+
+    context = build_context()
+
+    api.chat(f'{START_METADATA}With context: {len(context) :,} characters,'
+             f' selection: {bool(selection)}{END_METADATA}')
+
+    messages = [
+        {'role': 'system', 'content': get_prompt_template('query.system', model='qwen')},
+        {'role': 'user', 'content': context},
+        {'role': 'user', 'content': f'Prompt:\n\n```\n{prompt}\n```'},
+    ]
+
+    content = call_llm('qwen', messages, push_to_chat=False)
+    api.log(content)
+    search_query = parse_query_json(extract_code_block(content))
+
+    api.chat(f'{START_METADATA}Search Query: {search_query}{END_METADATA}')
+
+    results = websearch(search_query)
+    context += '\n\n' + markdown_section('Websearch Results', "\n\n".join(results))
+    api.chat(context)
+
+    messages = [
+        {'role': 'system', 'content': get_prompt_template('chat.system', model=model)},
+        {'role': 'user', 'content': context},
+        *[m.to_dict() for m in chat_history],
+        {'role': 'user', 'content': prompt},
+    ]
+
+    call_llm(model, messages)
diff --git a/websearch/notbadai_websearch/chat.system.md b/websearch/notbadai_websearch/chat.system.md
@@ -0,0 +1,37 @@
+You are an intelligent programmer, powered by ${model}. You are happy to help answer any questions that the user has (usually they will be about coding).
+
+1. When the user is asking for edits to their code, please output a simplified version of the code block that highlights the changes necessary and adds comments to indicate where unchanged code has been skipped. For example:
+
+```language:path/to/file
+// ... existing code ...
+{{ 2 lines before updated_code_1 }}
+{{ updated_code_1 }}
+{{ 2 lines after updated_code_1 }}
+// ... existing code ...
+{{ 2 lines before updated_code_2 }}
+{{ updated_code_2 }}
+{{ 2 lines after updated_code_2 }}
+// ... existing code ...
+```
+
+The user prefers to only read the updates to the code. Often this will mean that the start/end of the file will be skipped, but that's okay! Rewrite the entire file only if specifically requested. Always provide a brief explanation of the updates outside the codeblocks, unless the user specifically requests only the code.
+
+Include about two unchanged non-empty lines around each updated code segment. This helps the user identify where the updated code should be applied.
+
+Use the appropriate prefix for comments; e.g. `//` for Javascript/C and `#` for Python.
+
+2. Do not lie or make up facts.
+
+3. Format your response in markdown.
+
+4. When writing out new code blocks, please specify the language ID after the initial backticks, and the path of the file that needs to change. Like so:
+
+```python:my_folder/example.py
+{{ code }}
+```
+
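+5. When writing out code blocks for an existing file, please also specify the file path (instead of `path/to/file` in the below example) after the initial backticks and restate the method / class your codeblock belongs to, like so:
+
+```language:path/to/file
+function AIChatHistory() {
+    // ... existing code ...
+    {{ code }}
+    // ... existing code ...
+}
+```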
+
+6. The code you generate might contain triple ticks (\\`\\`\\`) which could interfere with markdown formatting. Use 4 or more ticks (\\`\\`\\`\\`) when defining your code block to be safe.
+
+7. Include all changes to a single file within a single large code block instead of multiple code blocks. Use a `... existing code ...` comment to separate segments.
diff --git a/websearch/notbadai_websearch/crawler.py b/websearch/notbadai_websearch/crawler.py
@@ -0,0 +1,32 @@
+from typing import List
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+class Crawler:
+    def __init__(self, urls: List[str], query: str):
+        self.urls = urls
+        self.query = query
+
+    async def run(self):
+        bm25_filter = BM25ContentFilter(user_query=self.query, bm25_threshold=1.2)
+        md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
+
+        crawler_config = CrawlerRunConfig(
+            markdown_generator=md_generator,
+            excluded_tags=["nav", "footer", "header", "form", "img", "a"],
+            only_text=True,
+            exclude_social_media_links=True,
+            keep_data_attributes=False,
+            cache_mode=CacheMode.BYPASS,
+            remove_overlay_elements=True,
+            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            page_timeout=20000,
+        )
+        browser_config = BrowserConfig(headless=True, text_mode=True, light_mode=True)
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            results = await crawler.arun_many(self.urls, config=crawler_config)
+            return results
diff --git a/websearch/notbadai_websearch/query.system.md b/websearch/notbadai_websearch/query.system.md
@@ -0,0 +1,23 @@
+You are an intelligent search query generator for programming and software engineering questions powered by ${model}. Your task is to create a single, effective web search query based on the user's question and the provided code context.
+
+## Guidelines
+
+1. **Analyze the Context**: Review the user prompt and any provided context (files, terminal, selection etc.) to understand what they need help with.
+
+2. **Generate One Focused Query**: Create a single search query that will help find the most relevant technical information. The query should be:
+   - Specific to the programming problem or question
+   - Include relevant technical terms, library names, frameworks, or language features
+   - Avoid overly broad or vague terms
+   - Written in a way that search engines can understand
+
+3. **Consider the Code Context**: If code context is provided, incorporate relevant:
+   - Programming languages (Python, JavaScript, TypeScript, etc.)
+   - Frameworks (FastAPI, React, Django, etc.)
+   - Libraries and packages being used
+   - Specific error messages or issues visible in the code
+
+4. **Output Format**: Return a JSON object with a single field "query" containing the search query string:
+
+```json
+{"query": "<SEARCH_QUERY>"}
+```
diff --git a/websearch/notbadai_websearch/websearch.py b/websearch/notbadai_websearch/websearch.py
@@ -0,0 +1,50 @@
+from typing import List
+from urllib.parse import urlparse
+from urllib.robotparser import RobotFileParser
+from collections import defaultdict
+
+from ddgs import DDGS
+
+
+class WebSearch:
+    def __init__(self,
+                 query: str,
+                 num_results: int = 10,
+                 discard_urls: List[str] = None,
+                 user_agent: str = "*"
+                 ):
+        self.query = query
+        self.num_results = num_results
+        self.discard_urls = discard_urls if discard_urls is not None else ["youtube.com", "britannica.com", "vimeo.com"]
+        self.user_agent = user_agent
+
+    def filter_urls_by_robots_txt(self, urls: List[str]) -> List[str]:
+        robot_urls = defaultdict(list)
+        for url in urls:
+            parsed = urlparse(url)
+            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+            robot_urls[robots_url].append(url)
+
+        allowed_urls = []
+        for robots_url, url_list in robot_urls.items():
+            try:
+                rp = RobotFileParser(robots_url)
+                rp.read()
+
+                for url in url_list:
+                    if rp.can_fetch(self.user_agent, url):
+                        allowed_urls.append(url)
+            except Exception as e:
+                allowed_urls.extend(url_list)
+
+        return allowed_urls
+
+    def search(self) -> List[str]:
+        search_term = self.query
+        for url in self.discard_urls:
+            search_term += f" -site:{url}"
+
+        results = DDGS().text(search_term, max_results=self.num_results)
+        results = [result["href"] for result in results]
+
+        return self.filter_urls_by_robots_txt(results)
diff --git a/websearch/readme.md b/websearch/readme.md
diff --git a/websearch/setup.py b/websearch/setup.py
@@ -0,0 +1,35 @@
+from setuptools import setup, find_packages
+
+with open("readme.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setup(
+    name="notbadai_websearch",
+    version="0.1.3",
+    author="NotBadAI Team",
+    author_email="contact@notbad.ai",
+    description="An intelligent programming assistant powered by AI",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/notbadai/extensions",
+    packages=find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: MIT License",
+        'Topic :: Software Development',
+        'Topic :: Software Development :: Libraries',
+        "Topic :: Software Development :: Libraries :: Python Modules",
+    ],
+    python_requires=">=3.8",
+    install_requires=[
+        "notbadai_ide",
+        "labml",
+        "requests",
+        "openai",
+        "ddgs",
+        "crawl4ai"
+    ],
+    include_package_data=True,
+    package_data={"notbadai_websearch": ["*.md", "**/*.md"]},
+)