-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathingest-code.py
More file actions
84 lines (66 loc) · 2.45 KB
/
ingest-code.py
File metadata and controls
84 lines (66 loc) · 2.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import warnings
import dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
DirectoryLoader,
UnstructuredMarkdownLoader,
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from helpers import load_env
# Suppress all warnings (langchain/transformers imports emit noisy deprecation warnings).
warnings.simplefilter("ignore")
# Environment-derived configuration (codebase path, language, suffixes, DB path).
# Keys used below: CODEBASE_PATH, CODEBASE_LANGUAGE, CODE_SUFFIXES, VECTOR_DB_PATH.
attrs = load_env()
# Shared text-splitter settings for both code and markdown chunking.
chunk_size: int = 3000
chunk_overlap: int = 400
def create_vector_database():
    """
    Build and persist a Chroma vector database for the codebase.

    Gathers chunked documents from both source files and markdown docs,
    embeds them with a code-specialized HuggingFace model, and writes the
    resulting vectors to the directory named by ``VECTOR_DB_PATH``.
    """
    # Combine code chunks and documentation chunks into a single corpus.
    all_chunks = chunk_code() + chunk_docs()

    # Code-aware embedding model; trust_remote_code is needed because the
    # jina model ships its own modeling code.
    embedder = HuggingFaceEmbeddings(
        model_name="jinaai/jina-embeddings-v2-base-code",
        model_kwargs={"trust_remote_code": True},
    )

    db = Chroma.from_documents(
        documents=all_chunks,
        embedding=embedder,
        persist_directory=attrs["VECTOR_DB_PATH"],
    )
    # Explicitly flush the store to disk, matching the original behavior.
    db.persist()
def chunk_code():
    """
    Load source files from the codebase and split them into chunks.

    Files under ``CODEBASE_PATH`` whose suffix appears in ``CODE_SUFFIXES``
    are parsed with a language-aware parser, then split along
    language-aware boundaries using the module-level chunk settings.

    Returns:
        The list of chunked Document objects ready for embedding.
    """
    documents = GenericLoader.from_filesystem(
        attrs["CODEBASE_PATH"],
        glob="**/*",
        suffixes=attrs["CODE_SUFFIXES"],
        parser=LanguageParser(language=attrs["CODEBASE_LANGUAGE"]),
    ).load()

    code_splitter = RecursiveCharacterTextSplitter.from_language(
        language=attrs["CODEBASE_LANGUAGE"],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return code_splitter.split_documents(documents)
def chunk_docs():
    """
    Load markdown documentation from the codebase and split it into chunks.

    Recursively collects ``*.md`` files under ``CODEBASE_PATH`` and splits
    them with the module-level chunk settings.

    Returns:
        The list of chunked Document objects ready for embedding.
    """
    md_loader = DirectoryLoader(
        attrs["CODEBASE_PATH"],
        glob="**/*.md",
        loader_cls=UnstructuredMarkdownLoader,
    )
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(md_loader.load())
# Script entry point: build (or rebuild) the vector database.
if __name__ == "__main__":
    create_vector_database()