diff --git a/websearch/MANIFEST.in b/websearch/MANIFEST.in
new file mode 100644
index 0000000..84c71df
--- /dev/null
+++ b/websearch/MANIFEST.in
@@ -0,0 +1 @@
+include readme.md
\ No newline at end of file
diff --git a/websearch/notbadai_websearch/__init__.py b/websearch/notbadai_websearch/__init__.py
new file mode 100644
index 0000000..f500146
--- /dev/null
+++ b/websearch/notbadai_websearch/__init__.py
@@ -0,0 +1,115 @@
+import asyncio
+import time
+import json
+from string import Template
+from typing import List
+from pathlib import Path
+
+from notbadai_ide import api, START_METADATA, END_METADATA
+
+from .common.llm import call_llm
+from .common.utils import parse_prompt, extract_code_block
+from .common.prompt import build_context
+from .common.formatting import markdown_section
+from .websearch import WebSearch
+from .crawler import Crawler
+
+
+def websearch(query: str) -> List[str]:
+    """Search the web for ``query`` and return the crawled pages as markdown.
+
+    The URLs found by ``WebSearch`` are reported to the chat as metadata and
+    crawled with ``Crawler``; the filtered markdown of every page that produced
+    any is logged and returned.
+    """
+    start_time = time.time()
+    urls = WebSearch(query).search()
+    api.chat(f'{START_METADATA}{len(urls)} search results ({int(time.time() - start_time)}s): {", ".join(urls)}{END_METADATA}')
+    results = asyncio.run(Crawler(urls, query).run())
+
+    res = []
+    for result in results:
+        # A result may carry no markdown at all, or no filtered markdown; neither
+        # is useful as context, and a ``None`` would break the caller's ``join``.
+        fit_markdown = result.markdown.fit_markdown if result.markdown else None
+        if not fit_markdown:
+            continue
+        api.log(fit_markdown)
+        res.append(fit_markdown)
+
+    return res
+
+
+def get_prompt_template(template_path: str, **kwargs) -> str:
+    """Load ``<template_path>.md`` from this package and fill in its placeholders.
+
+    The file is parsed as a ``string.Template``: placeholders are written as
+    ``$name`` or ``${name}`` and each of them must be supplied in ``kwargs``.
+    """
+    path = Path(__file__).parent / f'{template_path}.md'
+    # Read as UTF-8 rather than the platform's default encoding.
+    template = Template(path.read_text(encoding='utf-8'))
+
+    return template.substitute(kwargs)
+
+
+def parse_query_json(content: str) -> str:
+    """Return the ``query`` field of the JSON object in ``content``.
+
+    Raises:
+        ValueError: If ``content`` is not valid JSON, is not a JSON object, or
+            has no ``query`` field.
+    """
+    try:
+        data = json.loads(content.strip())
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Failed to parse JSON response: {e}") from e
+
+    # Valid JSON may also be a list, string or number; only an object has fields.
+    if not isinstance(data, dict) or 'query' not in data:
+        raise ValueError("JSON response missing 'query' field")
+    return data['query']
+
+
+def start():
+    """Main extension function that handles chat interactions with the AI assistant."""
+    command, model, prompt = parse_prompt()
+    selection = api.get_selection()
+    chat_history = api.get_chat_history()
+    # NOTE(review): this replaces the prompt returned by parse_prompt() above - confirm that is intended.
+    prompt = api.get_prompt()
+
+    api.chat(f'{START_METADATA}model: {model}, command: {command}{END_METADATA}')
+
+    context = build_context()
+
+    api.chat(f'{START_METADATA}With context: {len(context) :,} characters,'
+             f' selection: {bool(selection)}{END_METADATA}')
+
+    # Step 1: ask the 'qwen' model to turn the prompt and context into a search query.
+    messages = [
+        {'role': 'system', 'content': get_prompt_template('query.system', model='qwen')},
+        {'role': 'user', 'content': context},
+        {'role': 'user', 'content': f'Prompt:\n\n```\n{prompt}\n```'},
+    ]
+
+    content = call_llm('qwen', messages, push_to_chat=False)
+    api.log(content)
+    search_query = parse_query_json(extract_code_block(content))
+
+    api.chat(f'{START_METADATA}Search Query: {search_query}{END_METADATA}')
+
+    # Step 2: run the search and append the crawled pages to the context.
+    results = websearch(search_query)
+    context += '\n\n' + markdown_section('Websearch Results', "\n\n".join(results))
+    api.chat(context)
+
+    # Step 3: answer the prompt with the selected model, given the extended context.
+    messages = [
+        {'role': 'system', 'content': get_prompt_template('chat.system', model=model)},
+        {'role': 'user', 'content': context},
+        *[m.to_dict() for m in chat_history],
+        {'role': 'user', 'content': prompt},
+    ]
+
+    call_llm(model, messages)
diff --git a/websearch/notbadai_websearch/chat.system.md b/websearch/notbadai_websearch/chat.system.md
new file mode 100644
index 0000000..d2828d4
--- /dev/null
+++ b/websearch/notbadai_websearch/chat.system.md
@@ -0,0 +1,37 @@
+You are an intelligent programmer, powered by ${model}. You are happy to help answer any questions that the user has (usually they will be about coding).
+
+1. When the user is asking for edits to their code, please output a simplified version of the code block that highlights the changes necessary and adds comments to indicate where unchanged code has been skipped. For example:
+
+```language:path/to/file
+// ... existing code ...
+{{ 2 lines before updated_code_1 }}
+{{ updated_code_1 }}
+{{ 2 lines after updated_code_1 }}
+// ... existing code ...
+{{ 2 lines before updated_code_2 }}
+{{ updated_code_2 }}
+{{ 2 lines after updated_code_2 }}
+// ... existing code ...
+```
+
+The user prefers to only read the updates to the code. Often this will mean that the start/end of the file will be skipped, but that's okay! Rewrite the entire file only if specifically requested. Always provide a brief explanation of the updates outside the codeblocks, unless the user specifically requests only the code.
+
+Include about two unchanged non-empty lines around each updated code segment. This is to help the user identify where the updated code should be applied.
+
+Use the appropriate prefix for comments; e.g. `//` for JavaScript/C and `#` for Python.
+
+2. Do not lie or make up facts.
+
+3. Format your response in markdown.
+
+4. When writing out new code blocks, please specify the language ID after the initial backticks, and the path of the file that needs to change. Like so:
+
+```python:my_folder/example.py
+{{ code }}
+```
+
+5. When writing out code blocks for an existing file, please also specify the file path (in place of `path/to/file` in the example above) after the initial backticks and restate the method / class your codeblock belongs to.
+
+6. The code you generate might contain triple ticks (\\`\\`\\`) which could interfere with markdown formatting. Use 4 or more ticks (\\`\\`\\`\\`) when defining your code block to be safe.
+
+7. Include all changes to a single file within a single large code block instead of multiple code blocks. Use `... existing code ...` comment to separate segments.
\ No newline at end of file
diff --git a/websearch/notbadai_websearch/crawler.py b/websearch/notbadai_websearch/crawler.py
new file mode 100644
index 0000000..b1eb22b
--- /dev/null
+++ b/websearch/notbadai_websearch/crawler.py
@@ -0,0 +1,42 @@
+from typing import List
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+class Crawler:
+    """Crawls a list of URLs and filters each page's content against a query."""
+
+    def __init__(self, urls: List[str], query: str):
+        self.urls = urls
+        self.query = query
+
+    async def run(self):
+        """Crawl ``self.urls`` and return what ``AsyncWebCrawler.arun_many`` yields.
+
+        The markdown generator is given a BM25 content filter built from ``self.query``.
+        """
+        if not self.urls:
+            # Nothing to crawl; do not start a browser for an empty batch.
+            return []
+
+        bm25_filter = BM25ContentFilter(user_query=self.query, bm25_threshold=1.2)
+        md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
+
+        crawler_config = CrawlerRunConfig(
+            markdown_generator=md_generator,
+            excluded_tags=["nav", "footer", "header", "form", "img", "a"],
+            only_text=True,
+            exclude_social_media_links=True,
+            keep_data_attributes=False,
+            cache_mode=CacheMode.BYPASS,
+            remove_overlay_elements=True,
+            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            page_timeout=20000,
+        )
+        browser_config = BrowserConfig(headless=True, text_mode=True, light_mode=True)
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            results = await crawler.arun_many(self.urls, config=crawler_config)
+        return results
diff --git a/websearch/notbadai_websearch/query.system.md b/websearch/notbadai_websearch/query.system.md
new file mode 100644
index 0000000..f1fbc8e
--- /dev/null
+++ b/websearch/notbadai_websearch/query.system.md
@@ -0,0 +1,23 @@
+You are an intelligent search query generator for programming and software engineering questions powered by ${model}. Your task is to create a single, effective web search query based on the user's question and the provided code context.
+
+## Guidelines
+
+1. **Analyze the Context**: Review the user prompt and any provided context (files, terminal, selection etc.) to understand what they need help with.
+
+2. **Generate One Focused Query**: Create a single search query that will help find the most relevant technical information. The query should be:
+   - Specific to the programming problem or question
+   - Include relevant technical terms, library names, frameworks, or language features
+   - Avoid overly broad or vague terms
+   - Written in a way that search engines can understand
+
+3. **Consider the Code Context**: If code context is provided, incorporate relevant:
+   - Programming languages (Python, JavaScript, TypeScript, etc.)
+   - Frameworks (FastAPI, React, Django, etc.)
+   - Libraries and packages being used
+   - Specific error messages or issues visible in the code
+
+4. **Output Format**: Return a JSON object with a single field "query" containing the search query string:
+
+```json
+{"query": ""}
+```
\ No newline at end of file
diff --git a/websearch/notbadai_websearch/websearch.py b/websearch/notbadai_websearch/websearch.py
new file mode 100644
index 0000000..b4e4608
--- /dev/null
+++ b/websearch/notbadai_websearch/websearch.py
@@ -0,0 +1,59 @@
+from typing import List, Optional
+from urllib.parse import urlparse
+from urllib.robotparser import RobotFileParser
+from collections import defaultdict
+
+from ddgs import DDGS
+
+
+class WebSearch:
+    """Text search through ``DDGS`` that returns the result URLs allowed by robots.txt."""
+
+    def __init__(self,
+                 query: str,
+                 num_results: int = 10,
+                 discard_urls: Optional[List[str]] = None,
+                 user_agent: str = "*"
+                 ):
+        self.query = query
+        self.num_results = num_results
+        self.discard_urls = discard_urls if discard_urls is not None else ["youtube.com", "britannica.com", "vimeo.com"]
+        self.user_agent = user_agent
+
+    def filter_urls_by_robots_txt(self, urls: List[str]) -> List[str]:
+        """Return the ``urls`` that robots.txt allows ``self.user_agent`` to fetch.
+
+        Each site's robots.txt is downloaded once. When it cannot be read the
+        site's URLs are kept (best effort). The order of ``urls`` is preserved so
+        that the search ranking is not lost.
+        """
+        parsers = {}  # robots.txt URL -> parser, or None if it could not be read
+        allowed_urls = []
+        for url in urls:
+            parsed = urlparse(url)
+            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+            if robots_url not in parsers:
+                rp = RobotFileParser(robots_url)
+                try:
+                    rp.read()
+                except Exception:
+                    # robots.txt could not be fetched or parsed: do not drop the site.
+                    rp = None
+                parsers[robots_url] = rp
+
+            rp = parsers[robots_url]
+            if rp is None or rp.can_fetch(self.user_agent, url):
+                allowed_urls.append(url)
+
+        return allowed_urls
+
+    def search(self) -> List[str]:
+        """Run the query, excluding ``self.discard_urls``, and return the allowed result URLs."""
+        exclusions = "".join(f" -site:{url}" for url in self.discard_urls)
+        search_term = self.query + exclusions
+
+        results = DDGS().text(search_term, max_results=self.num_results)
+        # Skip any result that comes back without a link.
+        urls = [result["href"] for result in results if result.get("href")]
+
+        return self.filter_urls_by_robots_txt(urls)
diff --git a/websearch/readme.md b/websearch/readme.md
new file mode 100644
index 0000000..e69de29
diff --git a/websearch/setup.py b/websearch/setup.py
new file mode 100644
index 0000000..40bf2a7
--- /dev/null
+++ b/websearch/setup.py
@@ -0,0 +1,35 @@
+from setuptools import setup, find_packages
+
+with open("readme.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setup(
+    name="notbadai_websearch",
+    version="0.1.3",
+    author="NotBadAI Team",
+    author_email="contact@notbad.ai",
+    description="An intelligent programming assistant powered by AI",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/notbadai/extensions",
+    packages=find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: MIT License",
+        'Topic :: Software Development',
+        'Topic :: Software Development :: Libraries',
+        "Topic :: Software Development :: Libraries :: Python Modules",
+    ],
+    python_requires=">=3.8",
+    install_requires=[
+        "notbadai_ide",
+        "labml",
+        "requests",
+        "openai",
+        "ddgs",
+        "crawl4ai"
+    ],
+    include_package_data=True,
+    package_data={"notbadai_websearch": ["*.md", "**/*.md"]},
+)