-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest_data.py
More file actions
30 lines (24 loc) · 1018 Bytes
/
ingest_data.py
File metadata and controls
30 lines (24 loc) · 1018 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from get_embeddings import get_embedding
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient
from app.config import settings
async def ingest_data(
    pdf_url="https://drive.google.com/file/d/1JGkZkTcG_OC6XPCZQ0AWW5Nfa0chkP0a/view?usp=sharing",
    *,
    chunk_size=400,
    chunk_overlap=20,
):
    """Load a PDF, split it into chunks, embed each chunk and store them in Atlas.

    Every chunk is written to the ``starter`` collection of the ``rag-atlas``
    database as a document of the form ``{"text": ..., "embedding": ...}``.
    The cluster is reached through ``settings.atlas_connection_string``.

    Args:
        pdf_url: Location handed to ``PyPDFLoader``. Defaults to the PDF this
            module was originally written for.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        None. When the PDF yields no chunks the function returns before
        opening a database connection and nothing is written.

    Note:
        Although declared ``async`` (kept so existing ``await ingest_data()``
        callers continue to work), nothing in the body is awaited: loading,
        embedding and inserting are all plain synchronous calls and run to
        completion before control returns to the caller.
    """
    # NOTE(review): the default is a Google Drive "view" link, which
    # presumably serves an HTML viewer page rather than the raw PDF bytes;
    # confirm that PyPDFLoader can actually parse what this URL returns.
    pages = PyPDFLoader(pdf_url).load()

    # Split the pages into overlapping chunks small enough to embed one by one.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(pages)

    # One record per chunk: the raw text alongside its embedding.
    docs_to_insert = [
        {"text": chunk.page_content, "embedding": get_embedding(chunk.page_content)}
        for chunk in chunks
    ]

    # insert_many() rejects an empty list, so skip the round trip (and the
    # connection itself) when there is nothing to store.
    if not docs_to_insert:
        return

    # Use the client as a context manager so it is closed even if the insert
    # raises; previously the client was never closed.
    with MongoClient(settings.atlas_connection_string) as client:
        collection = client["rag-atlas"]["starter"]
        collection.insert_many(docs_to_insert)