# code-next-ai/main.py at main · github-hc/code-next-ai · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from backend.retrieval import hybrid_search
from backend.retrieval import hybrid_search
import sys
import argparse
import os
os.environ["ANONYMIZED_TELEMETRY"] = "false"
import json
from pathlib import Path

def load_settings():
    """
    Load optional runtime settings from ``settings.json`` in the current
    working directory.

    Returns:
        dict: The parsed settings. An empty dict is returned when the file is
        missing, cannot be read, is not valid JSON, or its top-level value is
        not a JSON object, so callers can always use ``.get()`` on the result.
    """
    settings_path = Path("settings.json")
    try:
        with open(settings_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        # Settings are best-effort: a missing/unreadable file (OSError) or
        # malformed JSON / bad encoding (both ValueError subclasses) simply
        # means "use the defaults".
        return {}
    # A valid JSON document may still be a list, string, number, ... which
    # would break the ``.get()`` lookups done on the result.
    return loaded if isinstance(loaded, dict) else {}
# Settings are read once, at import time. Fall back to an empty dict if
# load_settings() yields nothing, so the .get() lookups below cannot fail.
SETTINGS = load_settings() or {}
# Optional debug switches; both stay off unless the loaded settings enable them.
LOG_CHUNKS = SETTINGS.get("log_chunks", False)
LOG_AST_PARSER = SETTINGS.get("log_ast_parser", False)

from backend.retrieval.scanner import FileScanner
from backend.retrieval.parser import ASTParser, TreeSitterParser
from backend.retrieval.chunker import ChunkBuilder
from backend.retrieval.embeddings import OllamaEmbedder
from backend.retrieval.vector_store import ChromaVectorStore
from backend.retrieval.semantic_search import SemanticSearch
from backend.retrieval.keyword_search import KeywordSearch
from backend.retrieval.reranker import Reranker
from backend.retrieval.hybrid_search import HybridSearch
from backend.generation.ollama_llm import OllamaLLM

def build_index(repo_path: str):
    """
    Rebuild the vector index for the repository located at ``repo_path``.

    The previous index is cleared first. Each scanned file is then parsed into
    symbols (``.py`` files through ASTParser, all other files through
    TreeSitterParser), turned into chunks, embedded chunk by chunk, and added
    to the vector store. When LOG_AST_PARSER / LOG_CHUNKS are set, symbols are
    appended to ``log.txt`` and chunks are printed and written to
    ``chunks_log.txt``.

    Returns the total number of chunks added to the store.
    """
    root = Path(repo_path).expanduser().resolve()
    print(f"[*] Scanning repository: {root} ...")

    source_files = FileScanner(root).scan()
    file_count = len(source_files)
    print(f"[*] Found {file_count} valid source files.")

    python_parser = ASTParser(log_ast_parser=LOG_AST_PARSER)
    tree_sitter_parser = TreeSitterParser()
    chunk_builder = ChunkBuilder()
    embedder = OllamaEmbedder()
    store = ChromaVectorStore()

    print("[*] Purging previous index...")
    store.clear()

    if LOG_CHUNKS:
        # Start the chunk log from scratch for this run.
        with open("chunks_log.txt", "w", encoding="utf-8") as chunk_log:
            chunk_log.write(f"=== CHUNK LOG for {root} ===\n\n")

    debug_rule = "[DEBUG] --------------------------------------------------"
    indexed = 0
    for position, source_file in enumerate(source_files, start=1):
        print(f"  [{position}/{file_count}] Parsing {source_file.name} ...")

        if source_file.suffix != '.py':
            symbols = tree_sitter_parser.parse_file(source_file)
        else:
            symbols = python_parser.parse_file(source_file)
            if LOG_AST_PARSER:
                with open("log.txt", "a", encoding="utf-8") as symbol_log:
                    symbol_log.write(f"--- Symbols for {source_file.name} ---\n")
                    symbol_log.write(json.dumps(symbols, indent=4) + "\n\n")

        # Files without symbols, or whose symbols yield no chunks, are skipped.
        chunks = chunk_builder.build_chunks(source_file, symbols) if symbols else None
        if not chunks:
            continue

        # Attach an embedding to every chunk before it is stored.
        for chunk in chunks:
            chunk.embedding = embedder.embed_text(chunk.code)
            if not LOG_CHUNKS:
                continue

            debug_entry = "\n".join([
                "",
                debug_rule,
                f"[DEBUG] Storing Chunk -> Symbol: {chunk.symbol_name}",
                f"[DEBUG] File: {chunk.file_path} (Lines {chunk.start_line}-{chunk.end_line})",
                "[DEBUG] Code:",
                f"{chunk.code}",
                debug_rule,
                "",
            ])
            print(debug_entry)
            with open("chunks_log.txt", "a", encoding="utf-8") as chunk_log:
                chunk_log.write(debug_entry + "\n")

        store.add_chunks(chunks)
        indexed += len(chunks)

    print(f"[*] Indexing complete. Indexed {indexed} code chunks into ChromaDB.")
    return indexed

def query_repo(repo_path: str, query: str, model_name: str = "qwen2.5:7b"):
    """
    Run ``query`` through the hybrid search (top 5 results) and hand the
    results to the LLM named by ``model_name``.

    Prints a banner echoing the query, then returns a tuple
    ``(token_stream, results)``: the value produced by the LLM's
    ``generate_answer_stream`` and the search results it was given.
    """
    root = Path(repo_path).expanduser().resolve()

    banner_lines = (
        "\n==================================================",
        f"QUERY:\n\"{query}\"\n",
        "RESULTS:",
    )
    print(*banner_lines, sep="\n")

    embedder = OllamaEmbedder()
    store = ChromaVectorStore()

    # Semantic + keyword retrieval, combined and reranked.
    searcher = HybridSearch(
        SemanticSearch(embedder, store),
        KeywordSearch(root),
        Reranker(),
    )
    hits = searcher.search(query, top_k=5)

    answer_tokens = OllamaLLM(model_name=model_name).generate_answer_stream(query, hits)
    return answer_tokens, hits

if __name__ == "__main__":
    # Command-line entry point: `index` builds the database, `query` searches it.
    cli = argparse.ArgumentParser(description="Local AI Coding Agent MVP")
    cli.add_argument("command", choices=["index", "query"], help="Command to run: 'index' to build DB, 'query' to search.")
    cli.add_argument("--repo", default=".", help="Path to the repository to scan/query (default: current dir)")
    cli.add_argument("--query", type=str, help="The natural language query (required for 'query' command)")

    options = cli.parse_args()

    if options.command == "index":
        build_index(options.repo)
    elif options.command == "query":
        if not options.query:
            print("Error: --query argument is required when using the 'query' command.")
            sys.exit(1)
        answer_tokens, hits = query_repo(options.repo, options.query)

        print("\n=== AI Answer ===")
        # Emit tokens as they arrive so the answer appears progressively.
        for piece in answer_tokens:
            print(piece, end="", flush=True)
        print()
        print("\n=== References ===")

        if hits:
            repo_root = Path(options.repo).expanduser().resolve()
            for rank, hit in enumerate(hits, 1):
                print(f"{rank}.")
                # Show the path relative to the repo when the file lives inside it.
                try:
                    shown_path = Path(hit['file_path']).relative_to(repo_root)
                except ValueError:
                    shown_path = hit['file_path']
                print(f"File: {shown_path}")
                print(f"Symbol: {hit['symbol_name']}")
                print(f"Lines: {hit['start_line']}-{hit['end_line']}")
                print(f"Score: {hit['score']}")
                print(f"Code Snippet:\n--------------------------------------------------\n{hit['code']}\n--------------------------------------------------")
                print()
        else:
            print("No relevant code chunks found.")
        # Closing rule is printed in both cases.
        print("==================================================")