Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions websearch/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include readme.md
90 changes: 90 additions & 0 deletions websearch/notbadai_websearch/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import asyncio
import time
import json
from string import Template
from typing import List
from pathlib import Path

from notbadai_ide import api, START_METADATA, END_METADATA

from .common.llm import call_llm
from .common.utils import parse_prompt, extract_code_block
from .common.prompt import build_context
from .common.formatting import markdown_section
from .websearch import WebSearch
from .crawler import Crawler


def websearch(query: str) -> List[str]:
    """Search the web for ``query`` and return the crawled page contents.

    The search is run first and its hit count, elapsed time (whole seconds)
    and URLs are posted to the chat as a metadata message. Every URL is then
    crawled and the filtered markdown of each result is logged and collected.

    Args:
        query: The search query to run and to filter the crawled pages with.

    Returns:
        The ``fit_markdown`` of every crawl result that has markdown, in the
        order the crawler returned them.
    """
    started_at = time.time()
    urls = WebSearch(query).search()
    elapsed = int(time.time() - started_at)
    api.chat(f'{START_METADATA}{len(urls)} search results ({elapsed}s): {", ".join(urls)}{END_METADATA}')

    pages = []
    for crawled in asyncio.run(Crawler(urls, query).run()):
        # Results without markdown carry nothing usable for the context.
        if not crawled.markdown:
            continue
        page_markdown = crawled.markdown.fit_markdown
        api.log(page_markdown)
        pages.append(page_markdown)

    return pages


def get_prompt_template(template_path: str, **kwargs) -> str:
    """Load a prompt template stored next to this module and fill it in.

    Args:
        template_path: Template name relative to this module's directory,
            without the ``.md`` suffix (e.g. ``'chat.system'``).
        **kwargs: Values for the template's placeholders. Placeholders use
            ``string.Template`` syntax, i.e. ``$name`` or ``${name}``;
            brace-only text such as ``{name}`` is left untouched.

    Returns:
        The template text with every placeholder substituted.

    Raises:
        FileNotFoundError: If the template file does not exist.
        KeyError: If the template uses a placeholder missing from ``kwargs``.
        ValueError: If the template contains a malformed ``$`` placeholder.
    """
    path = Path(__file__).parent / f'{template_path}.md'
    # Decode explicitly as UTF-8: the platform default encoding depends on
    # the locale and can mangle or reject non-ASCII prompt text.
    template = Template(path.read_text(encoding='utf-8'))

    return template.substitute(kwargs)


def parse_query_json(content: str) -> str:
    """Extract the search query from a JSON reply.

    Args:
        content: JSON text expected to be an object of the form
            ``{"query": "<search query>"}``. Surrounding whitespace is
            ignored.

    Returns:
        The string stored under the ``query`` key.

    Raises:
        ValueError: If ``content`` is not valid JSON, is not a JSON object,
            has no ``query`` field, or ``query`` is not a string.
    """
    try:
        data = json.loads(content.strip())
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse JSON response: {e}") from e

    # Valid JSON can still be a list, string, number or null; reject those
    # here so callers only ever have to handle ValueError.
    if not isinstance(data, dict):
        raise ValueError("JSON response is not an object")
    if 'query' not in data:
        raise ValueError("JSON response missing 'query' field")

    query = data['query']
    if not isinstance(query, str):
        raise ValueError("JSON response 'query' field is not a string")
    return query


def start():
    """Run one web-search-assisted chat turn.

    Steps, in order: read the command/model, selection, chat history and
    prompt; build the context; ask the 'qwen' model for a single search
    query; crawl the search results and append them to the context; post the
    extended context to the chat; finally ask the selected model to answer
    with the context, the chat history and the prompt.
    """
    # NOTE(review): the prompt returned by parse_prompt() is never used; the
    # value from api.get_prompt() below is the one sent to the models.
    # Confirm this is intended.
    command, model, _ = parse_prompt()
    selection = api.get_selection()
    chat_history = api.get_chat_history()
    prompt = api.get_prompt()

    api.chat(f'{START_METADATA}model: {model}, command: {command}{END_METADATA}')

    context = build_context()

    api.chat(f'{START_METADATA}With context: {len(context) :,} characters,'
             f' selection: {bool(selection)}{END_METADATA}')

    # Stage 1: have the query model turn context + prompt into a search query.
    query_system = get_prompt_template('query.system', model='qwen')
    query_messages = [
        {'role': 'system', 'content': query_system},
        {'role': 'user', 'content': context},
        {'role': 'user', 'content': f'Prompt:\n\n```\n{prompt}\n```'},
    ]
    reply = call_llm('qwen', query_messages, push_to_chat=False)
    api.log(reply)
    search_query = parse_query_json(extract_code_block(reply))

    api.chat(f'{START_METADATA}Search Query: {search_query}{END_METADATA}')

    # Stage 2: crawl the results and attach them to the context.
    pages = websearch(search_query)
    context += '\n\n' + markdown_section('Websearch Results', "\n\n".join(pages))
    api.chat(context)

    # Stage 3: answer with the selected model.
    chat_system = get_prompt_template('chat.system', model=model)
    history = [m.to_dict() for m in chat_history]
    answer_messages = (
        [
            {'role': 'system', 'content': chat_system},
            {'role': 'user', 'content': context},
        ]
        + history
        + [{'role': 'user', 'content': prompt}]
    )

    call_llm(model, answer_messages)
37 changes: 37 additions & 0 deletions websearch/notbadai_websearch/chat.system.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
You are an intelligent programmer, powered by ${model}. You are happy to help answer any questions that the user has (usually they will be about coding).

1. When the user is asking for edits to their code, please output a simplified version of the code block that highlights the changes necessary and adds comments to indicate where unchanged code has been skipped. For example:

```language:path/to/file
// ... existing code ...
{{ 2 lines before updated_code_1 }}
{{ updated_code_1 }}
{{ 2 lines after updated_code_1 }}
// ... existing code ...
{{ 2 lines before updated_code_2 }}
{{ updated_code_2 }}
{{ 2 lines after updated_code_2 }}
// ... existing code ...
```

The user prefers to only read the updates to the code. Often this will mean that the start/end of the file will be skipped, but that's okay! Rewrite the entire file only if specifically requested. Always provide a brief explanation of the updates outside the codeblocks, unless the user specifically requests only the code.

Include about two unchanged non-empty lines around each updated code segment. This helps the user identify where the updated code should be applied.

Use the appropriate prefix for comments; e.g. `//` for Javascript/C and `#` for Python.

2. Do not lie or make up facts.

3. Format your response in markdown.

4. When writing out new code blocks, please specify the language ID after the initial backticks, and the path of the file that needs to change. Like so:

```python:my_folder/example.py
{{ code }}
```

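5. When writing out code blocks for an existing file, please also specify the file path (instead of `path/to/file` in the example below) after the initial backticks and restate the method / class your codeblock belongs to, like so:

```language:path/to/file
function AIChatHistory() {
    // ... existing code ...
    {{ code }}
    // ... existing code ...
}
```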

6. The code you generate might contain triple ticks (\`\`\`) which could interfere with markdown formatting. Use 4 or more ticks (\`\`\`\`) when defining your code block to be safe.

7. Include all changes to a single file within a single large code block instead of multiple code blocks. Use a `... existing code ...` comment to separate segments.
32 changes: 32 additions & 0 deletions websearch/notbadai_websearch/crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import List

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


class Crawler:
    """Crawls a set of URLs and produces markdown filtered against a query."""

    def __init__(self, urls: List[str], query: str):
        """Store the URLs to crawl and the query used to filter page content."""
        self.urls, self.query = urls, query

    async def run(self):
        """Crawl every URL in ``self.urls`` and return the crawler's results.

        Markdown is generated through a BM25 content filter keyed on
        ``self.query`` (threshold 1.2). Navigation, footer, header, form,
        image and anchor tags are excluded, the cache is bypassed, and the
        browser runs headless in text/light mode.

        Returns:
            Whatever ``AsyncWebCrawler.arun_many`` returns for the URLs.
        """
        content_filter = BM25ContentFilter(user_query=self.query, bm25_threshold=1.2)
        desktop_chrome_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
        )

        run_config = CrawlerRunConfig(
            markdown_generator=DefaultMarkdownGenerator(content_filter=content_filter),
            excluded_tags=["nav", "footer", "header", "form", "img", "a"],
            only_text=True,
            exclude_social_media_links=True,
            keep_data_attributes=False,
            cache_mode=CacheMode.BYPASS,
            remove_overlay_elements=True,
            user_agent=desktop_chrome_agent,
            page_timeout=20000,  # presumably milliseconds; confirm against crawl4ai docs
        )
        browser_config = BrowserConfig(headless=True, text_mode=True, light_mode=True)

        async with AsyncWebCrawler(config=browser_config) as web_crawler:
            return await web_crawler.arun_many(self.urls, config=run_config)
23 changes: 23 additions & 0 deletions websearch/notbadai_websearch/query.system.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
You are an intelligent search query generator for programming and software engineering questions powered by ${model}. Your task is to create a single, effective web search query based on the user's question and the provided code context.

## Guidelines

1. **Analyze the Context**: Review the user prompt and any provided context (files, terminal, selection etc.) to understand what they need help with.

2. **Generate One Focused Query**: Create a single search query that will help find the most relevant technical information. The query should be:
- Specific to the programming problem or question
- Include relevant technical terms, library names, frameworks, or language features
- Avoid overly broad or vague terms
- Written in a way that search engines can understand

3. **Consider the Code Context**: If code context is provided, incorporate relevant:
- Programming languages (Python, JavaScript, TypeScript, etc.)
- Frameworks (FastAPI, React, Django, etc.)
- Libraries and packages being used
- Specific error messages or issues visible in the code

4. **Output Format**: Return a JSON object with a single field "query" containing the search query string:

```json
{"query": "<SEARCH_QUERY>"}
```
50 changes: 50 additions & 0 deletions websearch/notbadai_websearch/websearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from typing import List
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
from collections import defaultdict

from ddgs import DDGS


class WebSearch:
    """Runs a DuckDuckGo text search and keeps the URLs robots.txt allows."""

    def __init__(self,
                 query: str,
                 num_results: int = 10,
                 discard_urls: List[str] = None,
                 user_agent: str = "*"
                 ):
        """
        Args:
            query: The search query.
            num_results: Maximum number of results requested from the search.
            discard_urls: Sites excluded from the search with ``-site:``.
                ``None`` selects the default list (youtube.com,
                britannica.com, vimeo.com); pass ``[]`` to exclude nothing.
            user_agent: User agent name checked against robots.txt rules.
        """
        self.query = query
        self.num_results = num_results
        self.discard_urls = discard_urls if discard_urls is not None else ["youtube.com", "britannica.com", "vimeo.com"]
        self.user_agent = user_agent

    @staticmethod
    def _load_robots(robots_url: str):
        """Fetch and parse one robots.txt; return ``None`` if that fails."""
        try:
            rp = RobotFileParser(robots_url)
            rp.read()
        except Exception:
            # Deliberate best effort: an unreachable or unreadable robots.txt
            # must not drop search results, so the host is treated as open.
            return None
        return rp

    def filter_urls_by_robots_txt(self, urls: List[str]) -> List[str]:
        """Return the URLs that robots.txt permits ``self.user_agent`` to fetch.

        Each host's robots.txt is fetched at most once. URLs whose robots.txt
        cannot be loaded or evaluated are kept. The input order (the search
        ranking) is preserved and no URL is emitted more than it was given.

        Args:
            urls: Candidate URLs.

        Returns:
            The permitted URLs, in their original order.
        """
        parsers = {}  # robots.txt URL -> parser, or None when loading failed
        allowed_urls = []
        for url in urls:
            parsed = urlparse(url)
            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
            if robots_url not in parsers:
                parsers[robots_url] = self._load_robots(robots_url)

            rp = parsers[robots_url]
            try:
                permitted = rp is None or rp.can_fetch(self.user_agent, url)
            except Exception:
                # Same best-effort policy as for a failed robots.txt fetch.
                permitted = True

            if permitted:
                allowed_urls.append(url)

        return allowed_urls

    def search(self) -> List[str]:
        """Search for ``self.query`` and return the robots.txt-permitted URLs.

        Every entry of ``self.discard_urls`` is appended to the query as a
        ``-site:`` exclusion before searching.

        Returns:
            The ``href`` of each search result that passes the robots.txt
            filter.
        """
        exclusions = "".join(f" -site:{url}" for url in self.discard_urls)
        search_term = f"{self.query}{exclusions}"

        results = DDGS().text(search_term, max_results=self.num_results)
        hrefs = [result["href"] for result in results]

        return self.filter_urls_by_robots_txt(hrefs)
Empty file added websearch/readme.md
Empty file.
35 changes: 35 additions & 0 deletions websearch/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from setuptools import setup, find_packages

# The README is used verbatim as the package's long description.
with open("readme.md", "r", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Trove classifiers describing the package.
CLASSIFIERS = [
    "Programming Language :: Python :: 3",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Topic :: Software Development",
    "Topic :: Software Development :: Libraries",
    "Topic :: Software Development :: Libraries :: Python Modules",
]

# Runtime dependencies installed alongside the package.
INSTALL_REQUIRES = [
    "notbadai_ide",
    "labml",
    "requests",
    "openai",
    "ddgs",
    "crawl4ai",
]

setup(
    name="notbadai_websearch",
    version="0.1.3",
    author="NotBadAI Team",
    author_email="contact@notbad.ai",
    description="An intelligent programming assistant powered by AI",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/notbadai/extensions",
    packages=find_packages(),
    classifiers=CLASSIFIERS,
    python_requires=">=3.8",
    install_requires=INSTALL_REQUIRES,
    # Ship the markdown prompt templates together with the code.
    include_package_data=True,
    package_data={"notbadai_websearch": ["*.md", "**/*.md"]},
)