diff --git a/.gitignore b/.gitignore index d6e2969..3f05da7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,10 @@ rag_preprocessed_data.csv rag_preprocessed_data.json .env -chroma_db/ \ No newline at end of file +chroma_db/ + +__pycache__/ +*.pyc +.DS_Store +*.egg-info/ +.venv/ diff --git a/README.md b/README.md index c954bf3..42ab472 100644 --- a/README.md +++ b/README.md @@ -44,31 +44,24 @@ pip install -r requirements.txt ## ๐Ÿš€ ์‹คํ–‰ ์ˆœ์„œ (Running Order) -1. **๋ฐ์ดํ„ฐ ์ •์ œ (Data Cleaning):** - `creation_science_data.csv`๋ฅผ ์ •์ œํ•˜์—ฌ `cleaned_creation_science_data.csv`๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. +1. **๋ฐ์ดํ„ฐ ์ •์ œ + ์ „์ฒ˜๋ฆฌ (Data Pipeline):** + `creation_science_data.csv`๋ฅผ ์ •์ œํ•˜๊ณ , ์ฒญํฌ๋กœ ๋‚˜๋ˆ„์–ด `rag_preprocessed_data.json`์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. ```bash - python data_cleaning.py + python data_pipeline.py ``` -2. **๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ (Data Preprocessing):** - ์ •์ œ๋œ CSV๋ฅผ ์ฝ์–ด ์ฒญํฌ๋กœ ๋‚˜๋ˆ„๊ณ  `rag_preprocessed_data.json`์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. - - ```bash - python data_preprocessing_for_RAG.py - ``` - -3. **๋ฒกํ„ฐ DB ์ ์žฌ (Vector DB Ingestion):** +2. **๋ฒกํ„ฐ DB ์ ์žฌ (Vector DB Ingestion):** ์ƒ์„ฑ๋œ JSON ๋ฐ์ดํ„ฐ๋ฅผ `chroma_db` ํด๋”์— ์ž„๋ฒ ๋”ฉํ•˜์—ฌ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค. (Ollama์—์„œ `qwen3-embedding:8b` ๋ชจ๋ธ์ด ์‹คํ–‰ ์ค‘์ด์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.) ```bash python ingest_vector_db.py ``` -3.1 **Google Drive์—์„œ chroma_db ํŒŒ์ผ ๋‹ค์šด** -1,2,3 ๋›ฐ์–ด ๋„˜์–ด๋„ ๋จ. +2.1 **Google Drive์—์„œ chroma_db ํŒŒ์ผ ๋‹ค์šด** +1,2 ๋›ฐ์–ด ๋„˜์–ด๋„ ๋จ. https://drive.google.com/file/d/1zdxkGgW2R2mLA_XRxAENbLTTPSnNydI2/view?usp=drive_link -4. **์• ํ”Œ๋ฆฌ์ผ€์ด์…˜ ์‹คํ–‰ (Run App):** +3. **์• ํ”Œ๋ฆฌ์ผ€์ด์…˜ ์‹คํ–‰ (Run App):** Streamlit ์›น ์ธํ„ฐํŽ˜์ด์Šค๋ฅผ ์‹คํ–‰ํ•ฉ๋‹ˆ๋‹ค. 
```bash streamlit run app.py @@ -76,9 +69,12 @@ https://drive.google.com/file/d/1zdxkGgW2R2mLA_XRxAENbLTTPSnNydI2/view?usp=drive ## ๐Ÿ“‚ ํŒŒ์ผ ๊ตฌ์กฐ (File Structure) -- `app.py`: Streamlit ์ฑ—๋ด‡ UI, Hybrid Retriever ๋ฐ Reranker ๋กœ์ง -- `data_cleaning.py`: ์ค‘๋ณต ์ œ๊ฑฐ ๋ฐ ๋…ธ์ด์ฆˆ ํ…์ŠคํŠธ ์ •์ œ -- `data_preprocessing_for_RAG.py`: RecursiveCharacterTextSplitter๋ฅผ ์ด์šฉํ•œ ์ฒญํ‚น +- `config.py`: ๋ชจ๋ธ๋ช…, ๊ฒฝ๋กœ, ํŒŒ๋ผ๋ฏธํ„ฐ ๋“ฑ ์„ค์ •๊ฐ’ ์ค‘์•™ ๊ด€๋ฆฌ +- `app.py`: Streamlit ์ฑ—๋ด‡ UI +- `text_utils.py`: ํ…์ŠคํŠธ ํด๋ฆฌ๋‹ ๋ฐ ํƒœ๊ทธ ๋ถ„๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ +- `retriever.py`: ๋ฌธ์„œ ๋กœ๋”ฉ, ๋ฒกํ„ฐDB, ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ ๋ฐ Reranker ๋กœ์ง +- `chain.py`: LLM ํ”„๋กฌํ”„ํŠธ ๋ฐ ์ƒ์„ฑ ์ฒด์ธ +- `data_pipeline.py`: ๋ฐ์ดํ„ฐ ์ •์ œ + RAG ์ „์ฒ˜๋ฆฌ/์ฒญํ‚น ํ†ตํ•ฉ ํŒŒ์ดํ”„๋ผ์ธ - `ingest_vector_db.py`: ChromaDB ์ƒ์„ฑ ๋ฐ Ollama ๊ธฐ๋ฐ˜ ์ž„๋ฒ ๋”ฉ ์ ์žฌ - `creation_science_data.csv`: ์›๋ณธ ๋ฐ์ดํ„ฐ์…‹ - `chroma_db/`: ๋ฒกํ„ฐ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค ์ €์žฅ ํด๋” diff --git a/VERSIONS.md b/VERSIONS.md index 3805f23..534c5e1 100644 --- a/VERSIONS.md +++ b/VERSIONS.md @@ -4,7 +4,38 @@ --- -## [v0.0.4] - 2026-04-13 (์ตœ์‹  ๋ฒ„์ „) +## [v0.0.5] - 2026-04-15 (์ตœ์‹  ๋ฒ„์ „) + +### ๐ŸŒŸ ์ฃผ์š” ํŠน์ง• (Features) + +- **์ฝ”๋“œ ๋ชจ๋“ˆ ๋ถ„๋ฆฌ**: ๋‹จ์ผ `app.py`(294์ค„)๋ฅผ `config.py`, `text_utils.py`, `retriever.py`, `chain.py`, `app.py`(120์ค„)๋กœ ๋ถ„๋ฆฌํ•˜์—ฌ ์œ ์ง€๋ณด์ˆ˜์„ฑ ๋ฐ ํ…Œ์ŠคํŠธ ์šฉ์ด์„ฑ ํ–ฅ์ƒ. +- **์„ค์ •๊ฐ’ ์ค‘์•™ ๊ด€๋ฆฌ**: ๋ชจ๋ธ๋ช…, ๊ฒฝ๋กœ, ํŒŒ๋ผ๋ฏธํ„ฐ ๋“ฑ ํ•˜๋“œ์ฝ”๋”ฉ๋œ ์„ค์ •๊ฐ’์„ `config.py`๋กœ ํ†ตํ•ฉํ•˜์—ฌ ํ•œ ๊ณณ์—์„œ ๊ด€๋ฆฌ. +- **์ค‘๋ณต ์ฝ”๋“œ ์ œ๊ฑฐ**: `app.py`์™€ `ingest_vector_db.py`์— ์ค‘๋ณต ์กด์žฌํ•˜๋˜ `load_documents()` ํ•จ์ˆ˜๋ฅผ `retriever.py`๋กœ ํ†ตํ•ฉ. +- **๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ ํ†ตํ•ฉ**: `data_cleaning.py`์™€ `data_preprocessing_for_RAG.py`๋ฅผ `data_pipeline.py`๋กœ ํ†ตํ•ฉํ•˜์—ฌ 1,2๋‹จ๊ณ„๋ฅผ ํ•œ ๋ฒˆ์— ์‹คํ–‰. 
+- **Reranker ๋ชจ๋ธ ์ž๋™ ์ „ํ™˜**: `config.py`์—์„œ `RERANKER_MODEL`๋งŒ ๋ณ€๊ฒฝํ•˜๋ฉด BAAI/Qwen ๋ชจ๋ธ์— ๋งž๋Š” kwargs๊ฐ€ ์ž๋™ ์ ์šฉ. +- **์˜์กด์„ฑ ๋ฒ„์ „ ๊ณ ์ •**: `requirements.txt`์— ์ตœ์†Œ ๋ฒ„์ „ ์ œ์•ฝ์„ ์ถ”๊ฐ€ํ•˜์—ฌ ํ™˜๊ฒฝ ์žฌํ˜„์„ฑ ํ–ฅ์ƒ. +- **`.gitignore` ๋ณด๊ฐ•**: `__pycache__/`, `.DS_Store`, `.venv/` ๋“ฑ ํ‘œ์ค€ Python ์ œ์™ธ ํ•ญ๋ชฉ ์ถ”๊ฐ€. + +### ๐Ÿค– ๋ชจ๋ธ ๊ตฌ์„ฑ (Models) + +- **LLM (Generation):** `qwen2.5:14b` (via Ollama) +- **Embedding:** `qwen3-embedding:8b` (via Ollama) +- **Reranker:** `BAAI/bge-reranker-v2-m3` (via HuggingFace CrossEncoder) + - Optimization: `torch.float16` ์ ์šฉ + +### ๐Ÿ“‚ ๋ณ€๊ฒฝ๋œ ํŒŒ์ผ ๊ตฌ์กฐ (File Structure) + +- `config.py` (์‹ ๊ทœ): ์„ค์ •๊ฐ’ ์ค‘์•™ ๊ด€๋ฆฌ +- `text_utils.py` (์‹ ๊ทœ): ํ…์ŠคํŠธ ํด๋ฆฌ๋‹ ์œ ํ‹ธ๋ฆฌํ‹ฐ +- `retriever.py` (์‹ ๊ทœ): ๋ฌธ์„œ ๋กœ๋”ฉ, ๊ฒ€์ƒ‰, Reranker +- `chain.py` (์‹ ๊ทœ): LLM ํ”„๋กฌํ”„ํŠธ ๋ฐ ์ƒ์„ฑ ์ฒด์ธ +- `data_pipeline.py` (์‹ ๊ทœ): ๋ฐ์ดํ„ฐ ์ •์ œ + ์ „์ฒ˜๋ฆฌ ํ†ตํ•ฉ +- `app.py` (๋ฆฌํŒฉํ„ฐ๋ง): UI ๋กœ์ง๋งŒ ๋‹ด๋‹น +- `ingest_vector_db.py` (๋ฆฌํŒฉํ„ฐ๋ง): ๊ณตํ†ต ๋ชจ๋“ˆ ์ž„ํฌํŠธ๋กœ ์ „ํ™˜ + +--- + +## [v0.0.4] - 2026-04-13 (์ด์ „ ๋ฒ„์ „) ### ๐ŸŒŸ ์ฃผ์š” ํŠน์ง• (Features) diff --git a/app.py b/app.py index ffd2f29..05c142a 100644 --- a/app.py +++ b/app.py @@ -1,199 +1,47 @@ import streamlit as st -import os, json, time, re -from pathlib import Path -import torch - -from langchain_core.documents import Document -from langchain_core.prompts import ChatPromptTemplate -from langchain_core.runnables import RunnablePassthrough -from langchain_core.output_parsers import StrOutputParser - -from langchain_chroma import Chroma -from langchain_community.retrievers import BM25Retriever -from langchain_classic.retrievers import EnsembleRetriever, ContextualCompressionRetriever -from langchain_ollama import ChatOllama, OllamaEmbeddings - -from langchain_community.cross_encoders import HuggingFaceCrossEncoder -from langchain_classic.retrievers.document_compressors import 
CrossEncoderReranker - - - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -# [NEW] ํ…์ŠคํŠธ ํด๋ฆฌ๋‹ ๋ฐ ํƒœ๊ทธ ๋ถ„๋ฆฌ ํ•จ์ˆ˜ -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -def clean_response(text: str) -> str: - """๋ถˆํ•„์š”ํ•œ XML ํƒœ๊ทธ๋ฅผ ์ œ๊ฑฐํ•˜๊ณ  ๊ณผ๋„ํ•œ ๋นˆ ์ค„์„ ์ •๋ฆฌํ•ฉ๋‹ˆ๋‹ค.""" - FORBIDDEN_TAGS = [ - "thought", "references", "conclusion", "answer", - "response", "output", "result", "context", "question", - ] - for tag in FORBIDDEN_TAGS: - text = re.sub(rf"", "", text, flags=re.IGNORECASE) - text = re.sub(r"\n{3,}", "\n\n", text) - return text.strip() - -def extract_think_and_answer(text: str): - """๋ฌธ์ž์—ด์—์„œ ๋ถ€๋ถ„๊ณผ ์‹ค์ œ ๋‹ต๋ณ€ ๋ถ€๋ถ„์„ ์™„๋ฒฝํ•˜๊ฒŒ ๋ถ„๋ฆฌํ•ฉ๋‹ˆ๋‹ค.""" - if "" in text and "" in text: - # think ํƒœ๊ทธ๊ฐ€ ์™„์ „ํžˆ ๋‹ซํžŒ ๊ฒฝ์šฐ - parts = text.split("", 1) - think_content = parts[0].split("")[-1].strip() - answer_content = clean_response(parts[1]) - return think_content, answer_content - elif "" in text: - # think ํƒœ๊ทธ๊ฐ€ ์—ด๋ ค์žˆ๊ณ  ์•„์ง ๋‹ซํžˆ์ง€ ์•Š์€ ๊ฒฝ์šฐ (์ŠคํŠธ๋ฆฌ๋ฐ ์ค‘) - think_content = text.split("")[-1].strip() - return think_content, "" - else: - # think ํƒœ๊ทธ๊ฐ€ ์•„์˜ˆ ์—†๋Š” ๊ฒฝ์šฐ - return "", clean_response(text) - - -# 1) ๋ฐ์ดํ„ฐ ๋กœ๋“œ -@st.cache_data(show_spinner=False) -def load_documents(path="rag_preprocessed_data.json"): - if not os.path.exists(path): - st.error(f"๋ฐ์ดํ„ฐ ํŒŒ์ผ({path})์ด ์—†์Šต๋‹ˆ๋‹ค.") - st.stop() - with open(path, "r", encoding="utf-8") as f: - raw = json.load(f) - docs = [] - for item in raw: - content = f"์ œ๋ชฉ: {item.get('title', '')}\n๋‚ด์šฉ: {item.get('content_chunk', '')}" - metadata = { - "chunk_id": item.get("chunk_id", ""), - "title": item.get("title", ""), - "url": item.get("url", "") - } - docs.append(Document(page_content=content.strip(), metadata=metadata)) - 
return docs - - -# 2) VectorStore ๋กœ๋“œ -@st.cache_resource(show_spinner=False) -def load_vectorstore(persist_directory="./chroma_db"): - if not Path(persist_directory).exists(): - st.error("๋ฒกํ„ฐDB๊ฐ€ ์•„์ง ์ƒ์„ฑ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ๋ฐ์ดํ„ฐ๋ฅผ ๋จผ์ € ์ž„๋ฒ ๋”ฉํ•˜์„ธ์š”.") - st.stop() - embed = OllamaEmbeddings(model="qwen3-embedding:8b") - return Chroma(persist_directory=persist_directory, embedding_function=embed) - - -# 3) Hybrid + Cross-Encoder Reranker ์ดˆ๊ธฐํ™” -@st.cache_resource(show_spinner=False) -def init_retrievers(_docs, _vector_db): - bm25 = BM25Retriever.from_documents(_docs) - bm25.k = 10 - vect = _vector_db.as_retriever(search_kwargs={"k": 10}) - hybrid = EnsembleRetriever(retrievers=[bm25, vect], weights=[0.5, 0.5]) - - # ๐Ÿ’ป ์˜ต์…˜ A: 16GB ๋ฉ”๋ชจ๋ฆฌ์šฉ - model_name = "BAAI/bge-reranker-v2-m3" - model_kwargs = {"model_kwargs": {"torch_dtype": torch.float16}} - - # ๐Ÿ–ฅ๏ธ ์˜ต์…˜ B: 32GB ๋ฉ”๋ชจ๋ฆฌ์šฉ (๋ฌด๊ฑฐ์šด Qwen3 4B ๋ชจ๋ธ, 16-bit ์‚ฌ์šฉ) - # model_name = "Qwen/Qwen3-Reranker-4B" - # model_kwargs = { - # "automodel_args": { - # "torch_dtype": torch.float16, - # "trust_remote_code": True - # } - # } - - model = HuggingFaceCrossEncoder(model_name=model_name, model_kwargs=model_kwargs) - re_ranker = CrossEncoderReranker(model=model, top_n=5) - return ContextualCompressionRetriever(base_compressor=re_ranker, base_retriever=hybrid) - -# 4) ๊ฒ€์ƒ‰๋œ ๋ฌธ์„œ ํ…์ŠคํŠธํ™” ๋ฐ ๊ณ ์œ  URL ํ•˜์ดํผ๋งํฌ ์ถ”์ถœ -def format_docs_and_extract_urls(docs): - context_parts = [] - unique_refs = {} # URL ์ค‘๋ณต ์ œ๊ฑฐ๋ฅผ ์œ„ํ•œ ๋”•์…”๋„ˆ๋ฆฌ - - for d in docs: - title = d.metadata.get('title', '์ œ๋ชฉ ์—†์Œ') - url = d.metadata.get('url', '') - - # LLM์—๊ฒŒ ์ œ๊ณตํ•  ๋ณธ๋ฌธ ๊ตฌ์„ฑ (LLM์€ ์ด ํ…์ŠคํŠธ๋งŒ ๋ณด๊ณ  ๋‹ต๋ณ€์„ ์ƒ์„ฑ) - context_parts.append(f"[์ถœ์ฒ˜: {title}]\n{d.page_content}") - - # URL์„ ๋”•์…”๋„ˆ๋ฆฌ์˜ Key๋กœ ์‚ฌ์šฉํ•˜์—ฌ ์ค‘๋ณต์„ ์ž๋™์œผ๋กœ ์ œ๊ฑฐ - # url ๊ฐ’์ด ์กด์žฌํ•˜๋Š” ๊ฒฝ์šฐ์—๋งŒ ์ถ”๊ฐ€ - if url and url not in 
unique_refs: - unique_refs[url] = title - - context_str = "\n\n---\n\n".join(context_parts) - - # ๐Ÿ”— ๋งˆํฌ๋‹ค์šด ๋ฌธ๋ฒ• [Title](URL)์„ ์‚ฌ์šฉํ•˜์—ฌ ์ œ๋ชฉ์— ํ•˜์ดํผ๋งํฌ ์”Œ์šฐ๊ธฐ - ref_list = [f"- [{title}]({url})" for url, title in unique_refs.items()] - refs_str = "\n".join(ref_list) - - return context_str, refs_str - - -# 5) ์ƒ์„ฑ(Generation) ์ฒด์ธ -@st.cache_resource(show_spinner=False) -def init_generation_chain(): - llm = ChatOllama( - model="qwen2.5:14b", - # model="gpt-oss:20b", - temperature=0.3, - top_p=0.9, - repeat_penalty=1.15, - stop=["<|im_end|>", "User:", "Question:"] - ) - - prompt = ChatPromptTemplate.from_template(""" -๋‹น์‹ ์€ ์ฐฝ์กฐ๊ณผํ•™์˜ ๊ด€์ ๊ณผ ์ฃผ์žฅ์„ ์†Œ๊ฐœํ•˜๋Š” ์ „๋ฌธ ํ•ด์„ค๊ฐ€์ž…๋‹ˆ๋‹ค. -๋‹น์‹ ์˜ ์ฃผ๋œ ๋ชฉ์ ์€ ์ œ๊ณต๋œ ์ž๋ฃŒ(Context)๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ์ฐฝ์กฐ๊ณผํ•™๊ณ„์˜ ํ•ด์„๊ณผ ์ฆ๊ฑฐ๋“ค์„ ์‚ฌ์šฉ์ž์—๊ฒŒ ์ƒ์„ธํžˆ ์ „๋‹ฌํ•˜๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค. -์•„๋ž˜์˜ ์ง€์นจ์„ ๋ฐ˜๋“œ์‹œ ์ค€์ˆ˜ํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์‹ญ์‹œ์˜ค. +import time -[์ตœ์šฐ์„  ์ง€์นจ: ์ถœ์ฒ˜ ํ‘œ๊ธฐ ๊ธˆ์ง€] -- ๐Ÿšจ ๋‹ต๋ณ€ ๋งจ ๋งˆ์ง€๋ง‰์— ์ฐธ๊ณ  ์ž๋ฃŒ๋‚˜ URL ๋งํฌ๋ฅผ ์ง์ ‘ ์ž‘์„ฑํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค. (์‹œ์Šคํ…œ์ด ์ž๋™์œผ๋กœ ์ •ํ™•ํ•œ ๊ณ ์œ  ๋งํฌ๋ฅผ 100% ์ฒจ๋ถ€ํ•  ๊ฒƒ์ž…๋‹ˆ๋‹ค.) ๋‹น์‹ ์€ ์˜ค์ง ๋‚ด์šฉ์„ ์„ค๋ช…ํ•˜๋Š” ๋ฐ์—๋งŒ ์ง‘์ค‘ํ•˜์‹ญ์‹œ์˜ค. +from text_utils import clean_response, extract_think_and_answer +from retriever import ( + load_documents, load_vectorstore, init_retrievers, + format_docs_and_extract_urls, +) +from chain import init_generation_chain -[๋‹ต๋ณ€ ๊ฐ€์ด๋“œ๋ผ์ธ] -1. ๋ฌดํ•œ ๋ฐ˜๋ณต ๊ธˆ์ง€ ๋ฐ ๊ฐ„๊ฒฐํ•œ ๊ตฌ์„ฑ: ๋™์ผํ•œ ์ฃผ์žฅ์ด๋‚˜ ๋ฌธ์žฅ์„ ๋ฌด์˜๋ฏธํ•˜๊ฒŒ ๊ณ„์† ๋ฐ˜๋ณต(๋ฌดํ•œ ๋ฃจํ”„)ํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค. ๊ฐ ์ฃผ์žฅ์€ ํ•œ ๋ฒˆ์”ฉ๋งŒ ๋ช…ํ™•ํ•˜๊ณ  ์ƒ์„ธํ•˜๊ฒŒ ์„ค๋ช…ํ•˜๋ฉฐ, Context์— ์ œ์‹œ๋œ ๋‚ด์šฉ์„ ๋ชจ๋‘ ๋‹ค๋ฃจ์—ˆ๋‹ค๋ฉด ์ถ”๊ฐ€์ ์ธ ๋ฐ˜๋ณต ์—†์ด ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ๋‹ต๋ณ€์„ ๋งˆ๋ฌด๋ฆฌํ•˜์‹ญ์‹œ์˜ค. -2. ํƒœ๋„ ๋ฐ ์–ด์กฐ: ์ฐฝ์กฐ๊ณผํ•™๊ณ„์˜ ์ฃผ์žฅ๊ณผ ํ•ด์„์„ '์†Œ๊ฐœ'ํ•˜๋Š” ์ „๋ฌธ์ ์ธ ์–ด์กฐ๋ฅผ ์œ ์ง€ํ•˜์‹ญ์‹œ์˜ค. 
- - โœ… "์ฐฝ์กฐ๊ณผํ•™์—์„œ๋Š” ~๋ผ๊ณ  ์„ค๋ช…ํ•ฉ๋‹ˆ๋‹ค", "์ฐฝ์กฐ๊ณผํ•™์ž๋“ค์€ ์ด๋ฅผ ~์˜ ๊ทผ๊ฑฐ๋กœ ์ฃผ์žฅํ•ฉ๋‹ˆ๋‹ค" - - โŒ "์ด๊ฒƒ์€ ์‚ฌ์‹ค์ž…๋‹ˆ๋‹ค", "๊ณผํ•™์ ์œผ๋กœ ์ฆ๋ช…๋˜์—ˆ์Šต๋‹ˆ๋‹ค" ๋“ฑ ๋‹จ์ •์ ์ธ ์‚ฌ์‹ค ์„ ์–ธ์€ ์ง€์–‘ํ•˜์‹ญ์‹œ์˜ค. -3. ๋‚ด์šฉ์˜ ์ง‘์ค‘: ์˜ค์ง ์ฐฝ์กฐ๊ณผํ•™ ๋‚ด๋ถ€์˜ ๋…ผ๋ฆฌ์™€ ์ œ๊ณต๋œ ์ž๋ฃŒ์˜ ์ฆ๊ฑฐ(์œ ๋ฌผ, ๊ธฐ๋ก ๋“ฑ)๋ฅผ ์ƒ์„ธํžˆ ์„ค๋ช…ํ•˜๋Š” ๋ฐ ์ง‘์ค‘ํ•˜์‹ญ์‹œ์˜ค. ๋ถ€์ •์ ์ธ ์˜๊ฒฌ์ด๋‚˜ ํƒ€ ์ด๋ก ๊ณผ์˜ ๋น„๊ต๋Š” ์ง€์–‘ํ•˜์‹ญ์‹œ์˜ค. -4. ์ƒ์„ธ ์„ค๋ช… ๋ฐ ๊ตฌ์ฒด์„ฑ: ๊ฐ ํ•ญ๋ชฉ ์„ค๋ช… ์‹œ Context์— ํฌํ•จ๋œ ๊ตฌ์ฒด์ ์ธ ์ง€๋ช…, ์œ ๋ฌผ ์ด๋ฆ„, ์—ญ์‚ฌ์  ๊ธฐ๋ก, ์ธ๋ช… ๋“ฑ์„ ๋ฐ˜๋“œ์‹œ ํฌํ•จํ•˜์—ฌ ์ตœ์†Œ 3๋ฌธ์žฅ ์ด์ƒ ์ƒ์„ธํ•˜๊ฒŒ ์ž‘์„ฑํ•˜์‹ญ์‹œ์˜ค. -5. ํƒœ๊ทธ ์‚ฌ์šฉ ๊ธˆ์ง€: , , , ๋“ฑ ์–ด๋– ํ•œ XML/HTML ํƒœ๊ทธ๋„ ํฌํ•จํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค. - -[์ถœ๋ ฅ ํ˜•์‹] -(์ฐฝ์กฐ๊ณผํ•™์  ๊ด€์ ์—์„œ ํ•ด๋‹น ์ฃผ์ œ๋ฅผ ์ •์ค‘ํ•˜๊ฒŒ ์†Œ๊ฐœํ•˜๋Š” ๋„์ž… ๋ฌธ๊ตฌ) - -- **(์ฃผ์žฅ/ํ•ด์„ 1 ์ œ๋ชฉ)**: (์ƒ์„ธ ์„ค๋ช…. ๊ตฌ์ฒด์  ์ง€๋ช… ๋ฐ ์ฆ๊ฑฐ ํฌํ•จ 3๋ฌธ์žฅ ์ด์ƒ) -- **(์ฃผ์žฅ/ํ•ด์„ 2 ์ œ๋ชฉ)**: (์ƒ์„ธ ์„ค๋ช…. 
๊ตฌ์ฒด์  ์ง€๋ช… ๋ฐ ์ฆ๊ฑฐ ํฌํ•จ 3๋ฌธ์žฅ ์ด์ƒ) +# ========================================== +# โ€”โ€” Streamlit ์•ฑ UI (์ฑ—๋ด‡ ์Šคํƒ€์ผ) โ€”โ€” +# ========================================== +st.set_page_config(page_title="Chat DDS", page_icon="๐ŸŒŽ") +st.title("๐ŸŒŽ Chat DDS ๐ŸŒŽ") -Chat History: -{chat_history} -Context: -{context} +@st.cache_data(show_spinner=False) +def get_documents(): + return load_documents() -Question: -{question} -Answer: -""") +@st.cache_resource(show_spinner=False) +def get_vectorstore(): + return load_vectorstore() - return prompt | llm | StrOutputParser() +@st.cache_resource(show_spinner=False) +def get_generation_chain(): + return init_generation_chain() -# ========================================== -# โ€”โ€” Streamlit ์•ฑ UI (์ฑ—๋ด‡ ์Šคํƒ€์ผ) โ€”โ€” -# ========================================== -st.set_page_config(page_title="Chat DDS", page_icon="๐ŸŒŽ") -st.title("๐ŸŒŽ Chat DDS ๐ŸŒŽ") -docs = load_documents() -vector_db = load_vectorstore() +try: + docs = get_documents() + vector_db = get_vectorstore() +except FileNotFoundError as e: + st.error(str(e)) + st.stop() if "rerank_retriever" not in st.session_state: st.session_state.rerank_retriever = init_retrievers(docs, vector_db) -generation_chain = init_generation_chain() +generation_chain = get_generation_chain() if "messages" not in st.session_state: st.session_state.messages = [] @@ -203,17 +51,16 @@ def init_generation_chain(): with st.chat_message(msg["role"]): if msg["role"] == "assistant": think_content, answer_content = extract_think_and_answer(msg["content"]) - + # ์‚ฌ๊ณ  ๊ณผ์ • UI ๋ Œ๋”๋ง ์ฃผ์„ ์ฒ˜๋ฆฌ # if think_content: # with st.expander("๐Ÿง  AI์˜ ์‚ฌ๊ณ  ๊ณผ์ •"): # st.markdown(think_content) - + if answer_content: st.markdown(answer_content) - # ๋งŒ์•ฝ think ๋ถ„๋ฆฌ ์—†์ด ์ „์ฒด๊ฐ€ ๋‹ต๋ณ€์œผ๋กœ ๋„˜์–ด์˜จ ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์‚ฌ๊ณ  ๊ณผ์ • ์ œ๊ฑฐ ํ›„ ๋Œ€๋น„) elif think_content and not answer_content: - st.markdown(think_content) + st.markdown(think_content) else: 
st.markdown(clean_response(msg["content"])) @@ -228,7 +75,6 @@ def init_generation_chain(): chat_history_str = "" for m in st.session_state.messages[:-1]: role_name = "User" if m["role"] == "user" else "Assistant" - # ์ปจํ…์ŠคํŠธ๋กœ ๋„˜๊ธธ ๋•Œ๋Š” AI์˜ ์‚ฌ๊ณ  ๊ณผ์ •์€ ์ œ์™ธํ•˜๊ณ  ๋‹ต๋ณ€๋งŒ ๋„˜๊น€ _, ans_content = extract_think_and_answer(m["content"]) content = ans_content if m["role"] == "assistant" else clean_response(m["content"]) chat_history_str += f"{role_name}: {content}\n" @@ -241,11 +87,10 @@ def init_generation_chain(): try: retrieved_docs = st.session_state.rerank_retriever.invoke(query) - except Exception as e: + except Exception: st.warning("โš ๏ธ Reranking ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ•˜์—ฌ ๊ธฐ๋ณธ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.") retrieved_docs = st.session_state.rerank_retriever.base_retriever.invoke(query)[:5] - # โœ… ์—ฌ๊ธฐ์„œ ๊ณ ์œ  URL ๋ฆฌ์ŠคํŠธ๋ฅผ ํ•จ๊ป˜ ๋ฐ˜ํ™˜๋ฐ›์Šต๋‹ˆ๋‹ค. context_str, refs_str = format_docs_and_extract_urls(retrieved_docs) st.write(f"โœ… {len(retrieved_docs)}๊ฐœ์˜ ํ•ต์‹ฌ ๋ฌธ์„œ๋ฅผ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.") @@ -266,7 +111,7 @@ def init_generation_chain(): for chunk in response_stream: full_response += chunk current_think, current_answer = extract_think_and_answer(full_response) - + if current_answer: answer_placeholder.markdown(current_answer + " โ–Œ") elif current_think and not current_answer: @@ -274,14 +119,12 @@ def init_generation_chain(): # โ”€โ”€ ์ŠคํŠธ๋ฆฌ๋ฐ ์ข…๋ฃŒ: ์ปค์„œ ์ œ๊ฑฐ ๋ฐ ์ถœ์ฒ˜ ๊ฐ•์ œ ๊ฒฐํ•ฉ โ”€โ”€ final_think, final_answer = extract_think_and_answer(full_response) - - # ํŒŒ์ด์ฌ ๋กœ์ง์œผ๋กœ ์ƒ์„ฑ๋œ 100% ํ™•์‹คํ•œ ์ถœ์ฒ˜ ๋ฆฌ์ŠคํŠธ(ํ•˜์ดํผ๋งํฌ)๋ฅผ ๋‹ต๋ณ€ ๋์— ๋ณ‘ํ•ฉ + if refs_str: final_answer_with_refs = final_answer + f"\n\n๐Ÿ”— **์ฐธ๊ณ  ์ž๋ฃŒ:**\n{refs_str}" else: final_answer_with_refs = final_answer - - # ํ™”๋ฉด์— ๋ Œ๋”๋ง + if final_answer_with_refs: answer_placeholder.markdown(final_answer_with_refs) else: @@ -290,5 +133,4 @@ def init_generation_chain(): end_time = time.time() 
status.update(label=f"โœ… ๋‹ต๋ณ€ ์ƒ์„ฑ ์™„๋ฃŒ! ({end_time - start_time:.2f}์ดˆ)", state="complete") - # ์ „์ฒด ๋‹ต๋ณ€ ์ €์žฅ (์›๋ณธ์— ์ถœ์ฒ˜๊ฐ€ ๊ฒฐํ•ฉ๋œ ์ƒํƒœ๋กœ ์„ธ์…˜ ๊ธฐ๋ก ์ €์žฅ) - st.session_state.messages.append({"role": "assistant", "content": final_answer_with_refs}) \ No newline at end of file + st.session_state.messages.append({"role": "assistant", "content": final_answer_with_refs}) diff --git a/chain.py b/chain.py new file mode 100644 index 0000000..5df90c0 --- /dev/null +++ b/chain.py @@ -0,0 +1,57 @@ +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.output_parsers import StrOutputParser +from langchain_ollama import ChatOllama + +from config import ( + LLM_MODEL, LLM_TEMPERATURE, LLM_TOP_P, + LLM_REPEAT_PENALTY, LLM_STOP_TOKENS, +) + +PROMPT_TEMPLATE = """ +๋‹น์‹ ์€ ์ฐฝ์กฐ๊ณผํ•™์˜ ๊ด€์ ๊ณผ ์ฃผ์žฅ์„ ์†Œ๊ฐœํ•˜๋Š” ์ „๋ฌธ ํ•ด์„ค๊ฐ€์ž…๋‹ˆ๋‹ค. +๋‹น์‹ ์˜ ์ฃผ๋œ ๋ชฉ์ ์€ ์ œ๊ณต๋œ ์ž๋ฃŒ(Context)๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ์ฐฝ์กฐ๊ณผํ•™๊ณ„์˜ ํ•ด์„๊ณผ ์ฆ๊ฑฐ๋“ค์„ ์‚ฌ์šฉ์ž์—๊ฒŒ ์ƒ์„ธํžˆ ์ „๋‹ฌํ•˜๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค. +์•„๋ž˜์˜ ์ง€์นจ์„ ๋ฐ˜๋“œ์‹œ ์ค€์ˆ˜ํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์‹ญ์‹œ์˜ค. + +[์ตœ์šฐ์„  ์ง€์นจ: ์ถœ์ฒ˜ ํ‘œ๊ธฐ ๊ธˆ์ง€] +- ๐Ÿšจ ๋‹ต๋ณ€ ๋งจ ๋งˆ์ง€๋ง‰์— ์ฐธ๊ณ  ์ž๋ฃŒ๋‚˜ URL ๋งํฌ๋ฅผ ์ง์ ‘ ์ž‘์„ฑํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค. (์‹œ์Šคํ…œ์ด ์ž๋™์œผ๋กœ ์ •ํ™•ํ•œ ๊ณ ์œ  ๋งํฌ๋ฅผ 100% ์ฒจ๋ถ€ํ•  ๊ฒƒ์ž…๋‹ˆ๋‹ค.) ๋‹น์‹ ์€ ์˜ค์ง ๋‚ด์šฉ์„ ์„ค๋ช…ํ•˜๋Š” ๋ฐ์—๋งŒ ์ง‘์ค‘ํ•˜์‹ญ์‹œ์˜ค. + +[๋‹ต๋ณ€ ๊ฐ€์ด๋“œ๋ผ์ธ] +1. ๋ฌดํ•œ ๋ฐ˜๋ณต ๊ธˆ์ง€ ๋ฐ ๊ฐ„๊ฒฐํ•œ ๊ตฌ์„ฑ: ๋™์ผํ•œ ์ฃผ์žฅ์ด๋‚˜ ๋ฌธ์žฅ์„ ๋ฌด์˜๋ฏธํ•˜๊ฒŒ ๊ณ„์† ๋ฐ˜๋ณต(๋ฌดํ•œ ๋ฃจํ”„)ํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค. ๊ฐ ์ฃผ์žฅ์€ ํ•œ ๋ฒˆ์”ฉ๋งŒ ๋ช…ํ™•ํ•˜๊ณ  ์ƒ์„ธํ•˜๊ฒŒ ์„ค๋ช…ํ•˜๋ฉฐ, Context์— ์ œ์‹œ๋œ ๋‚ด์šฉ์„ ๋ชจ๋‘ ๋‹ค๋ฃจ์—ˆ๋‹ค๋ฉด ์ถ”๊ฐ€์ ์ธ ๋ฐ˜๋ณต ์—†์ด ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ๋‹ต๋ณ€์„ ๋งˆ๋ฌด๋ฆฌํ•˜์‹ญ์‹œ์˜ค. +2. ํƒœ๋„ ๋ฐ ์–ด์กฐ: ์ฐฝ์กฐ๊ณผํ•™๊ณ„์˜ ์ฃผ์žฅ๊ณผ ํ•ด์„์„ '์†Œ๊ฐœ'ํ•˜๋Š” ์ „๋ฌธ์ ์ธ ์–ด์กฐ๋ฅผ ์œ ์ง€ํ•˜์‹ญ์‹œ์˜ค. 
+ - โœ… "์ฐฝ์กฐ๊ณผํ•™์—์„œ๋Š” ~๋ผ๊ณ  ์„ค๋ช…ํ•ฉ๋‹ˆ๋‹ค", "์ฐฝ์กฐ๊ณผํ•™์ž๋“ค์€ ์ด๋ฅผ ~์˜ ๊ทผ๊ฑฐ๋กœ ์ฃผ์žฅํ•ฉ๋‹ˆ๋‹ค" + - โŒ "์ด๊ฒƒ์€ ์‚ฌ์‹ค์ž…๋‹ˆ๋‹ค", "๊ณผํ•™์ ์œผ๋กœ ์ฆ๋ช…๋˜์—ˆ์Šต๋‹ˆ๋‹ค" ๋“ฑ ๋‹จ์ •์ ์ธ ์‚ฌ์‹ค ์„ ์–ธ์€ ์ง€์–‘ํ•˜์‹ญ์‹œ์˜ค. +3. ๋‚ด์šฉ์˜ ์ง‘์ค‘: ์˜ค์ง ์ฐฝ์กฐ๊ณผํ•™ ๋‚ด๋ถ€์˜ ๋…ผ๋ฆฌ์™€ ์ œ๊ณต๋œ ์ž๋ฃŒ์˜ ์ฆ๊ฑฐ(์œ ๋ฌผ, ๊ธฐ๋ก ๋“ฑ)๋ฅผ ์ƒ์„ธํžˆ ์„ค๋ช…ํ•˜๋Š” ๋ฐ ์ง‘์ค‘ํ•˜์‹ญ์‹œ์˜ค. ๋ถ€์ •์ ์ธ ์˜๊ฒฌ์ด๋‚˜ ํƒ€ ์ด๋ก ๊ณผ์˜ ๋น„๊ต๋Š” ์ง€์–‘ํ•˜์‹ญ์‹œ์˜ค. +4. ์ƒ์„ธ ์„ค๋ช… ๋ฐ ๊ตฌ์ฒด์„ฑ: ๊ฐ ํ•ญ๋ชฉ ์„ค๋ช… ์‹œ Context์— ํฌํ•จ๋œ ๊ตฌ์ฒด์ ์ธ ์ง€๋ช…, ์œ ๋ฌผ ์ด๋ฆ„, ์—ญ์‚ฌ์  ๊ธฐ๋ก, ์ธ๋ช… ๋“ฑ์„ ๋ฐ˜๋“œ์‹œ ํฌํ•จํ•˜์—ฌ ์ตœ์†Œ 3๋ฌธ์žฅ ์ด์ƒ ์ƒ์„ธํ•˜๊ฒŒ ์ž‘์„ฑํ•˜์‹ญ์‹œ์˜ค. +5. ํƒœ๊ทธ ์‚ฌ์šฉ ๊ธˆ์ง€: , , , ๋“ฑ ์–ด๋– ํ•œ XML/HTML ํƒœ๊ทธ๋„ ํฌํ•จํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค. + +[์ถœ๋ ฅ ํ˜•์‹] + +(์ฐฝ์กฐ๊ณผํ•™์  ๊ด€์ ์—์„œ ํ•ด๋‹น ์ฃผ์ œ๋ฅผ ์ •์ค‘ํ•˜๊ฒŒ ์†Œ๊ฐœํ•˜๋Š” ๋„์ž… ๋ฌธ๊ตฌ) + +- **(์ฃผ์žฅ/ํ•ด์„ 1 ์ œ๋ชฉ)**: (์ƒ์„ธ ์„ค๋ช…. ๊ตฌ์ฒด์  ์ง€๋ช… ๋ฐ ์ฆ๊ฑฐ ํฌํ•จ 3๋ฌธ์žฅ ์ด์ƒ) +- **(์ฃผ์žฅ/ํ•ด์„ 2 ์ œ๋ชฉ)**: (์ƒ์„ธ ์„ค๋ช…. 
๊ตฌ์ฒด์  ์ง€๋ช… ๋ฐ ์ฆ๊ฑฐ ํฌํ•จ 3๋ฌธ์žฅ ์ด์ƒ) + +Chat History: +{chat_history} + +Context: +{context} + +Question: +{question} + +Answer: +""" + + +def init_generation_chain(): + """LLM ์ƒ์„ฑ ์ฒด์ธ์„ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค.""" + llm = ChatOllama( + model=LLM_MODEL, + temperature=LLM_TEMPERATURE, + top_p=LLM_TOP_P, + repeat_penalty=LLM_REPEAT_PENALTY, + stop=LLM_STOP_TOKENS, + ) + prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE) + return prompt | llm | StrOutputParser() diff --git a/config.py b/config.py new file mode 100644 index 0000000..400256d --- /dev/null +++ b/config.py @@ -0,0 +1,31 @@ +# โ”€โ”€ Data Pipeline Paths โ”€โ”€ +RAW_CSV_PATH = "creation_science_data.csv" +CLEANED_CSV_PATH = "cleaned_creation_science_data.csv" +RAG_CSV_PATH = "rag_preprocessed_data.csv" +RAG_JSON_PATH = "rag_preprocessed_data.json" + +# โ”€โ”€ Vector DB โ”€โ”€ +CHROMA_DB_DIR = "./chroma_db" + +# โ”€โ”€ Models โ”€โ”€ +EMBEDDING_MODEL = "qwen3-embedding:8b" +LLM_MODEL = "qwen2.5:14b" + +# Reranker (16GB: "BAAI/bge-reranker-v2-m3", 32GB: "Qwen/Qwen3-Reranker-4B") +RERANKER_MODEL = "BAAI/bge-reranker-v2-m3" +RERANKER_TOP_N = 5 + +# โ”€โ”€ Retriever Parameters โ”€โ”€ +BM25_K = 10 # keyword search +VECTOR_K = 10 # context(vector) search +ENSEMBLE_WEIGHTS = [0.5, 0.5] # [BM25, Vector] + +# โ”€โ”€ LLM Parameters โ”€โ”€ +LLM_TEMPERATURE = 0.3 +LLM_TOP_P = 0.9 +LLM_REPEAT_PENALTY = 1.15 +LLM_STOP_TOKENS = ["<|im_end|>", "User:", "Question:"] + +# โ”€โ”€ Chunking Parameters โ”€โ”€ +CHUNK_SIZE = 1000 +CHUNK_OVERLAP = 200 diff --git a/data_cleaning.py b/data_cleaning.py deleted file mode 100644 index 5e43316..0000000 --- a/data_cleaning.py +++ /dev/null @@ -1,53 +0,0 @@ -import pandas as pd -import re -import os - -def run_cleaning_process(): - input_file = 'creation_science_data.csv' - output_file = 'cleaned_creation_science_data.csv' - - if not os.path.exists(input_file): - print(f"์˜ค๋ฅ˜: {input_file} ํŒŒ์ผ์ด ํ˜„์žฌ ๊ฒฝ๋กœ์— ์—†์Šต๋‹ˆ๋‹ค.") - return - - 
print(f"[{input_file}] ๋ฐ์ดํ„ฐ ์ •์ œ๋ฅผ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค...") - - df = pd.read_csv(input_file) - initial_count = len(df) - - df = df.drop_duplicates(subset=['title', 'content'], keep='first') - - def clean_content(text): - if not isinstance(text, str): - return "" - - # ์ˆ˜์ • 1: .* ๋ฅผ ์ œ๊ฑฐํ•˜์—ฌ ๋’ค์— ์˜ค๋Š” ๋ชจ๋“  ํ…์ŠคํŠธ๊ฐ€ ์‚ญ์ œ๋˜๋Š” ๊ฒƒ์„ ๋ฐฉ์ง€ - noise_pattern = r"(์•Œ๋ฆผ ๋’ค๋กœ|์•Œ๋ฆผ ์„ค์ •|๋”๋ณด๊ธฐ ๊ฒŒ์‹œ๋ฌผ|๋งˆ์ดํŽ˜์ด์ง€|๋กœ๊ทธ์•„์›ƒ|์ฐพ์•„์˜ค์‹œ๋Š”๊ธธ|์ž๋ฃŒ์‹คMAP|์ฐฝ์กฐ๊ณผํ•™์Šค์ฟจ|E-Book|๊ธฐ๋„์›”๋ ฅ|๋ฌธ์˜๊ฒŒ์‹œํŒ|ํ›„์›๊ธฐ๊ด€|์ „์ฒด๋ณด๊ธฐ|์ถ”์ฒœ์‚ฌ์ดํŠธ|๋กœ๊ทธ์ธ์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค)" - - # ์ˆ˜์ • 2: flags=re.DOTALL ์ œ๊ฑฐ (ํ•ด๋‹น ํ‚ค์›Œ๋“œ๋งŒ ์‚ญ์ œํ•˜๋„๋ก ๋ณ€๊ฒฝ) - text = re.sub(noise_pattern, "", text) - - # ์—ฐ์†๋œ ๊ณต๋ฐฑ ๋ฐ ์ค„๋ฐ”๊ฟˆ ํ•˜๋‚˜๋กœ ํ†ต์ผ - text = re.sub(r'\s+', ' ', text).strip() - return text - - df['content'] = df['content'].apply(clean_content) - - # ์ž„๋ฒ ๋”ฉ์— ๋ถ€์ ํ•ฉํ•œ ์งง์€ ํ…์ŠคํŠธ ์ œ๊ฑฐ (100์ž ๋ฏธ๋งŒ) - df = df[df['content'].str.len() > 100] - - df.to_csv(output_file, index=False, encoding='utf-8-sig') - - final_count = len(df) - deleted_count = initial_count - final_count - - print("-" * 40) - print(f"์ •์ œ ์ž‘์—…์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!") - print(f"๊ธฐ์กด ํ–‰ ๊ฐœ์ˆ˜: {initial_count:,}๊ฐœ") - print(f"์ •์ œ ํ›„ ํ–‰ ๊ฐœ์ˆ˜: {final_count:,}๊ฐœ") - print(f"์‚ญ์ œ๋œ ํ–‰ ๊ฐœ์ˆ˜: {deleted_count:,}๊ฐœ (์•ฝ {deleted_count/initial_count*100:.1f}% ๊ฐ์†Œ)") - print(f"์ตœ์ข… ํŒŒ์ผ ์ €์žฅ ๊ฒฝ๋กœ: {os.path.abspath(output_file)}") - print("-" * 40) - -if __name__ == "__main__": - run_cleaning_process() \ No newline at end of file diff --git a/data_pipeline.py b/data_pipeline.py new file mode 100644 index 0000000..da178bd --- /dev/null +++ b/data_pipeline.py @@ -0,0 +1,116 @@ +""" +๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ: 1๋‹จ๊ณ„(์ •์ œ) + 2๋‹จ๊ณ„(RAG ์ „์ฒ˜๋ฆฌ/์ฒญํ‚น)๋ฅผ ํ†ตํ•ฉ ์‹คํ–‰ํ•ฉ๋‹ˆ๋‹ค. +๋ฒกํ„ฐDB ์ ์žฌ(3๋‹จ๊ณ„)๋Š” ์‹œ๊ฐ„์ด ์˜ค๋ž˜ ๊ฑธ๋ฆฌ๋ฏ€๋กœ ingest_vector_db.py์—์„œ ๋ณ„๋„ ์‹คํ–‰ํ•ฉ๋‹ˆ๋‹ค. 
+""" + +import os +import re + +import pandas as pd +from langchain_text_splitters import RecursiveCharacterTextSplitter + +from config import ( + RAW_CSV_PATH, CLEANED_CSV_PATH, + RAG_CSV_PATH, RAG_JSON_PATH, + CHUNK_SIZE, CHUNK_OVERLAP, +) + + +# โ”€โ”€ 1๋‹จ๊ณ„: ์›๋ณธ CSV ๋ฐ์ดํ„ฐ ์ •์ œ โ”€โ”€ + +def clean_data(input_file=RAW_CSV_PATH, output_file=CLEANED_CSV_PATH): + if not os.path.exists(input_file): + print(f"์˜ค๋ฅ˜: {input_file} ํŒŒ์ผ์ด ํ˜„์žฌ ๊ฒฝ๋กœ์— ์—†์Šต๋‹ˆ๋‹ค.") + return None + + print(f"[{input_file}] ๋ฐ์ดํ„ฐ ์ •์ œ๋ฅผ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค...") + + df = pd.read_csv(input_file) + initial_count = len(df) + + df = df.drop_duplicates(subset=['title', 'content'], keep='first') + + def _clean_content(text): + if not isinstance(text, str): + return "" + noise_pattern = ( + r"(์•Œ๋ฆผ ๋’ค๋กœ|์•Œ๋ฆผ ์„ค์ •|๋”๋ณด๊ธฐ ๊ฒŒ์‹œ๋ฌผ|๋งˆ์ดํŽ˜์ด์ง€|๋กœ๊ทธ์•„์›ƒ|์ฐพ์•„์˜ค์‹œ๋Š”๊ธธ" + r"|์ž๋ฃŒ์‹คMAP|์ฐฝ์กฐ๊ณผํ•™์Šค์ฟจ|E-Book|๊ธฐ๋„์›”๋ ฅ|๋ฌธ์˜๊ฒŒ์‹œํŒ|ํ›„์›๊ธฐ๊ด€" + r"|์ „์ฒด๋ณด๊ธฐ|์ถ”์ฒœ์‚ฌ์ดํŠธ|๋กœ๊ทธ์ธ์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค)" + ) + text = re.sub(noise_pattern, "", text) + text = re.sub(r'\s+', ' ', text).strip() + return text + + df['content'] = df['content'].apply(_clean_content) + df = df[df['content'].str.len() > 100] + + df.to_csv(output_file, index=False, encoding='utf-8-sig') + + final_count = len(df) + deleted_count = initial_count - final_count + + print("-" * 40) + print(f"์ •์ œ ์ž‘์—…์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!") + print(f"๊ธฐ์กด ํ–‰ ๊ฐœ์ˆ˜: {initial_count:,}๊ฐœ") + print(f"์ •์ œ ํ›„ ํ–‰ ๊ฐœ์ˆ˜: {final_count:,}๊ฐœ") + print(f"์‚ญ์ œ๋œ ํ–‰ ๊ฐœ์ˆ˜: {deleted_count:,}๊ฐœ (์•ฝ {deleted_count/initial_count*100:.1f}% ๊ฐ์†Œ)") + print(f"์ตœ์ข… ํŒŒ์ผ ์ €์žฅ ๊ฒฝ๋กœ: {os.path.abspath(output_file)}") + print("-" * 40) + + return df + + +# โ”€โ”€ 2๋‹จ๊ณ„: ์ •์ œ๋œ CSV๋ฅผ ์ฒญํ‚นํ•˜์—ฌ RAG์šฉ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ โ”€โ”€ + +def preprocess_for_rag(input_file=CLEANED_CSV_PATH, output_csv=RAG_CSV_PATH, output_json=RAG_JSON_PATH): + df = pd.read_csv(input_file) + + def 
_clean_text(text): + if not isinstance(text, str): + return "" + text = re.sub(r'\r\n', '\n', text) + text = re.sub(r'\n+', '\n', text) + text = re.sub(r'\s+', ' ', text) + return text.strip() + + df['cleaned_content'] = df['content'].apply(_clean_text) + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + length_function=len, + separators=["\n\n", "\n", ".", "?", "!", " ", ""] + ) + + processed_data = [] + for index, row in df.iterrows(): + if not row['cleaned_content']: + continue + chunks = text_splitter.split_text(row['cleaned_content']) + for i, chunk in enumerate(chunks): + processed_data.append({ + "chunk_id": f"doc_{index}_chunk_{i}", + "title": row['title'], + "url": row['url'], + "reference_urls": row['reference_urls'], + "content_chunk": chunk, + }) + + final_df = pd.DataFrame(processed_data) + final_df.to_csv(output_csv, index=False, encoding='utf-8-sig') + final_df.to_json(output_json, orient="records", force_ascii=False, indent=4) + + print(f"์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ! ์›๋ณธ ๋ฌธ์„œ {len(df)}๊ฐœ๊ฐ€ {len(final_df)}๊ฐœ์˜ ์ฒญํฌ๋กœ ๋ถ„ํ• ๋˜์–ด ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + + +# โ”€โ”€ ํ†ตํ•ฉ ์‹คํ–‰ โ”€โ”€ + +def main(): + result = clean_data() + if result is not None: + preprocess_for_rag() + + +if __name__ == "__main__": + main() diff --git a/data_preprocessing_for_RAG.py b/data_preprocessing_for_RAG.py deleted file mode 100644 index b616430..0000000 --- a/data_preprocessing_for_RAG.py +++ /dev/null @@ -1,62 +0,0 @@ -import pandas as pd -import re -from langchain_text_splitters import RecursiveCharacterTextSplitter - -# 1. ๋ฐ์ดํ„ฐ ๋กœ๋“œ -file_path = "cleaned_creation_science_data.csv" -df = pd.read_csv(file_path) - -# 2. 
ํ…์ŠคํŠธ ์ •์ œ ํ•จ์ˆ˜ (Cleaning) -def clean_text(text): - if not isinstance(text, str): - return "" - # ๋ถˆํ•„์š”ํ•œ ์ด์Šค์ผ€์ดํ”„ ๋ฌธ์ž๋‚˜ ๊ณผ๋„ํ•œ ๊ณต๋ฐฑ, ๊ฐœํ–‰๋ฌธ์ž ์ •์ œ - text = re.sub(r'\r\n', '\n', text) - text = re.sub(r'\n+', '\n', text) - text = re.sub(r'\s+', ' ', text) - return text.strip() - -# content ์ปฌ๋Ÿผ ์ •์ œ ์ ์šฉ -df['cleaned_content'] = df['content'].apply(clean_text) - -# 3. ํ…์ŠคํŠธ ์ฒญํ‚น (Chunking) ์„ค์ • -# RecursiveCharacterTextSplitter๋Š” ๋ฌธ๋‹จ -> ๋ฌธ์žฅ -> ๋‹จ์–ด ์ˆœ์œผ๋กœ ๋ฌธ๋งฅ์ด ๋Š๊ธฐ์ง€ ์•Š๊ฒŒ ๋ถ„ํ• ํ•ด์ค๋‹ˆ๋‹ค. -text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, # ํ•˜๋‚˜์˜ ์ฒญํฌ ํฌ๊ธฐ (๊ธ€์ž ์ˆ˜ ๊ธฐ์ค€. ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ์— ๋”ฐ๋ผ 500~1000 ๊ถŒ์žฅ) - chunk_overlap=200, # ์ฒญํฌ ๊ฐ„ ๋ฌธ๋งฅ์ด ๋Š๊ธฐ์ง€ ์•Š๋„๋ก ๊ฒน์น˜๊ฒŒ ํ•  ๊ธ€์ž ์ˆ˜ - length_function=len, - separators=["\n\n", "\n", ".", "?", "!", " ", ""] -) - -processed_data = [] - -# 4. ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ ์ˆœํšŒํ•˜๋ฉฐ ์ฒญํ‚น ๋ฐ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ฒฐํ•ฉ -for index, row in df.iterrows(): - # ์ •์ œ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ๊ฑด๋„ˆ๋œ€ - if not row['cleaned_content']: - continue - - # ํ…์ŠคํŠธ ๋ถ„ํ•  - chunks = text_splitter.split_text(row['cleaned_content']) - - for i, chunk in enumerate(chunks): - # ๊ฐ ์ฒญํฌ ๋‹จ์œ„๋กœ ์ƒˆ๋กœ์šด ๋”•์…”๋„ˆ๋ฆฌ ์ƒ์„ฑ (๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋ณด์กด) - processed_data.append({ - "chunk_id": f"doc_{index}_chunk_{i}", # ๊ณ ์œ  ID ๋ถ€์—ฌ - "title": row['title'], - "url": row['url'], - "reference_urls": row['reference_urls'], - "content_chunk": chunk # ๋ถ„ํ• ๋œ ํ…์ŠคํŠธ - }) - -# 5. ์ตœ์ข… ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ ์ƒ์„ฑ -final_df = pd.DataFrame(processed_data) - -# 6. 
์ตœ์ข… ํ˜•ํƒœ ์ €์žฅ (CSV ๋ฐ JSON) -# ํ•œ๊ธ€ ๊นจ์ง ๋ฐฉ์ง€๋ฅผ ์œ„ํ•ด utf-8-sig ์ธ์ฝ”๋”ฉ ์‚ฌ์šฉ -final_df.to_csv("rag_preprocessed_data.csv", index=False, encoding='utf-8-sig') - -# RAG ์‹œ์Šคํ…œ์— ๋”ฐ๋ผ JSON ํ˜•ํƒœ๋ฅผ ์š”๊ตฌํ•˜๋Š” ๊ฒฝ์šฐ๊ฐ€ ๋งŽ์œผ๋ฏ€๋กœ JSON์œผ๋กœ๋„ ์ €์žฅ -final_df.to_json("rag_preprocessed_data.json", orient="records", force_ascii=False, indent=4) - -print(f"์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ! ์›๋ณธ ๋ฌธ์„œ {len(df)}๊ฐœ๊ฐ€ {len(final_df)}๊ฐœ์˜ ์ฒญํฌ๋กœ ๋ถ„ํ• ๋˜์–ด ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") \ No newline at end of file diff --git a/ingest_vector_db.py b/ingest_vector_db.py index e80dab2..0bdf258 100644 --- a/ingest_vector_db.py +++ b/ingest_vector_db.py @@ -1,37 +1,12 @@ import argparse -import json from pathlib import Path -from langchain_core.documents import Document from langchain_chroma import Chroma - -# ๋กœ์ปฌ ์ž„๋ฒ ๋”ฉ์„ ์œ„ํ•œ Ollama ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์ž„ํฌํŠธ from langchain_ollama import OllamaEmbeddings - -# ์ง„ํ–‰ ์ƒํ™ฉ ํ‘œ์‹œ๋ฅผ ์œ„ํ•œ tqdm ์ž„ํฌํŠธ from tqdm import tqdm - -def load_documents(path: str) -> list[Document]: - """ - ๋ฏธ๋ฆฌ ์ „์ฒ˜๋ฆฌ๋œ ์ฐฝ์กฐ๊ณผํ•™ JSON ๋ฐ์ดํ„ฐ๋ฅผ ์ฝ์–ด LangChain Document ๋ฆฌ์ŠคํŠธ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค. 
- """ - with open(path, "r", encoding="utf-8") as f: - raw = json.load(f) - - docs = [] - for item in raw: - # ์ด๋ฏธ ์ฒญํฌ ๋ถ„ํ• ์ด ๋˜์–ด ์žˆ์œผ๋ฏ€๋กœ content_chunk๋ฅผ ๋ฉ”์ธ ํ…์ŠคํŠธ๋กœ ์‚ฌ์šฉ - content = f"์ œ๋ชฉ: {item.get('title', '')}\n๋‚ด์šฉ: {item.get('content_chunk', '')}" - - # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ตฌ์„ฑ (์ถœ์ฒ˜ URL๊ณผ ๊ณ ์œ  ID ํฌํ•จ) - metadata = { - "chunk_id": item.get("chunk_id", ""), - "title": item.get("title", ""), - "url": item.get("url", "") - } - docs.append(Document(page_content=content.strip(), metadata=metadata)) - return docs +from config import EMBEDDING_MODEL, RAG_JSON_PATH, CHROMA_DB_DIR +from retriever import load_documents def ingest(json_path: str, persist_directory: str, batch_size: int = 100) -> None: @@ -47,22 +22,17 @@ def ingest(json_path: str, persist_directory: str, batch_size: int = 100) -> Non ids = [doc.metadata.get("chunk_id") or f"doc_{i}" for i, doc in enumerate(docs)] # 3. ๋กœ์ปฌ ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ์„ค์ • - print("Ollama qwen3-embedding:8b ๋ชจ๋ธ์„ ์ค€๋น„ ์ค‘์ž…๋‹ˆ๋‹ค...") - # embedding = OllamaEmbeddings(model="bge-m3") - embedding = OllamaEmbeddings(model="qwen3-embedding:8b") - - # 4. Chroma ๋ฒกํ„ฐDB ๊ฐ์ฒด ์ƒ์„ฑ (๋ฐ์ดํ„ฐ๋Š” ์•„์ง ๋„ฃ์ง€ ์•Š์Œ) + print(f"Ollama {EMBEDDING_MODEL} ๋ชจ๋ธ์„ ์ค€๋น„ ์ค‘์ž…๋‹ˆ๋‹ค...") + embedding = OllamaEmbeddings(model=EMBEDDING_MODEL) + + # 4. Chroma ๋ฒกํ„ฐDB ๊ฐ์ฒด ์ƒ์„ฑ db = Chroma(persist_directory=persist_directory, embedding_function=embedding) # 5. ๋ฐฐ์น˜ ๋‹จ์œ„๋กœ ์ž˜๋ผ์„œ DB์— ์ ์žฌํ•˜๋ฉฐ ์ง„ํ–‰ ์ƒํ™ฉ(tqdm) ํ‘œ์‹œ print("\n๋ณธ๊ฒฉ์ ์ธ ์ž„๋ฒ ๋”ฉ ๋ฐ DB ์ ์žฌ๋ฅผ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค:") - - # range(0, ์ „์ฒด๊ฐœ์ˆ˜, ๋ฐฐ์น˜ํฌ๊ธฐ)๋ฅผ tqdm์œผ๋กœ ๊ฐ์‹ธ์„œ ๋ฃจํ”„๋ฅผ ๋•๋‹ˆ๋‹ค. 
for i in tqdm(range(0, len(docs), batch_size), desc="์ž„๋ฒ ๋”ฉ ์ง„ํ–‰๋ฅ ", unit="batch"): batch_docs = docs[i : i + batch_size] batch_ids = ids[i : i + batch_size] - - # ๋ฐฐ์น˜๋งŒํผ DB์— ์ถ”๊ฐ€ db.add_documents(documents=batch_docs, ids=batch_ids) print(f"\nโœ… Ingest completed: ์ด {len(docs)}๊ฐœ ์ฒญํฌ ์ €์žฅ ์™„๋ฃŒ -> {persist_directory}") @@ -70,19 +40,18 @@ def ingest(json_path: str, persist_directory: str, batch_size: int = 100) -> Non def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="์ „์ฒ˜๋ฆฌ๋œ JSON ๋ฐ์ดํ„ฐ๋ฅผ Chroma ๋ฒกํ„ฐDB์— ์ ์žฌํ•˜๋Š” ์Šคํฌ๋ฆฝํŠธ (์ง„ํ–‰ ์ƒํ™ฉ ํ‘œ์‹œ)" + description="์ „์ฒ˜๋ฆฌ๋œ JSON ๋ฐ์ดํ„ฐ๋ฅผ Chroma ๋ฒกํ„ฐDB์— ์ ์žฌํ•˜๋Š” ์Šคํฌ๋ฆฝํŠธ" ) parser.add_argument( "--json-path", - default="rag_preprocessed_data.json", - help="์ ์žฌํ•  JSON ํŒŒ์ผ ๊ฒฝ๋กœ (๊ธฐ๋ณธ๊ฐ’: rag_preprocessed_data.json)", + default=RAG_JSON_PATH, + help=f"์ ์žฌํ•  JSON ํŒŒ์ผ ๊ฒฝ๋กœ (๊ธฐ๋ณธ๊ฐ’: {RAG_JSON_PATH})", ) parser.add_argument( "--persist-directory", - default="./chroma_db", - help="Chroma DB ์ €์žฅ ๊ฒฝ๋กœ (๊ธฐ๋ณธ๊ฐ’: ./chroma_db)", + default=CHROMA_DB_DIR, + help=f"Chroma DB ์ €์žฅ ๊ฒฝ๋กœ (๊ธฐ๋ณธ๊ฐ’: {CHROMA_DB_DIR})", ) - # ๋ฐฐ์น˜ ์‚ฌ์ด์ฆˆ๋ฅผ ์ธ์ž๋กœ ๋ฐ›์„ ์ˆ˜ ์žˆ๊ฒŒ ์ถ”๊ฐ€ (๊ธฐ๋ณธ 100) parser.add_argument( "--batch-size", type=int, @@ -95,7 +64,7 @@ def parse_args() -> argparse.Namespace: if __name__ == "__main__": args = parse_args() ingest( - json_path=args.json_path, - persist_directory=args.persist_directory, - batch_size=args.batch_size - ) \ No newline at end of file + json_path=args.json_path, + persist_directory=args.persist_directory, + batch_size=args.batch_size, + ) diff --git a/requirements.txt b/requirements.txt index d231fb5..2ef4455 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,17 @@ -streamlit -pandas -langchain -langchain-core -langchain-community -langchain-ollama -langchain-text-splitters -langchain-classic -langchain-chroma -chromadb -tqdm -rank_bm25 
-sentence-transformers -transformers -torch -bitsandbytes -accelerate +streamlit>=1.30 +pandas>=2.0 +langchain>=0.3 +langchain-core>=0.3 +langchain-community>=0.3 +langchain-ollama>=0.2 +langchain-text-splitters>=0.3 +langchain-classic>=0.1 +langchain-chroma>=0.2 +chromadb>=0.5 +tqdm>=4.60 +rank_bm25>=0.2 +sentence-transformers>=3.0 +transformers>=4.40 +torch>=2.0 +bitsandbytes>=0.43 +accelerate>=0.30 diff --git a/retriever.py b/retriever.py new file mode 100644 index 0000000..f41f855 --- /dev/null +++ b/retriever.py @@ -0,0 +1,86 @@ +import json +import os +from pathlib import Path + +from langchain_core.documents import Document +from langchain_chroma import Chroma +from langchain_ollama import OllamaEmbeddings + +from config import ( + RAG_JSON_PATH, CHROMA_DB_DIR, EMBEDDING_MODEL, + RERANKER_MODEL, RERANKER_TOP_N, + BM25_K, VECTOR_K, ENSEMBLE_WEIGHTS, +) + + +def load_documents(path=RAG_JSON_PATH): + """๋ฏธ๋ฆฌ ์ „์ฒ˜๋ฆฌ๋œ JSON ๋ฐ์ดํ„ฐ๋ฅผ ์ฝ์–ด LangChain Document ๋ฆฌ์ŠคํŠธ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + if not os.path.exists(path): + raise FileNotFoundError(f"๋ฐ์ดํ„ฐ ํŒŒ์ผ({path})์ด ์—†์Šต๋‹ˆ๋‹ค.") + with open(path, "r", encoding="utf-8") as f: + raw = json.load(f) + docs = [] + for item in raw: + content = f"์ œ๋ชฉ: {item.get('title', '')}\n๋‚ด์šฉ: {item.get('content_chunk', '')}" + metadata = { + "chunk_id": item.get("chunk_id", ""), + "title": item.get("title", ""), + "url": item.get("url", "") + } + docs.append(Document(page_content=content.strip(), metadata=metadata)) + return docs + + +def load_vectorstore(persist_directory=CHROMA_DB_DIR): + """Chroma ๋ฒกํ„ฐDB๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.""" + if not Path(persist_directory).exists(): + raise FileNotFoundError("๋ฒกํ„ฐDB๊ฐ€ ์•„์ง ์ƒ์„ฑ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. 
๋ฐ์ดํ„ฐ๋ฅผ ๋จผ์ € ์ž„๋ฒ ๋”ฉํ•˜์„ธ์š”.") + embed = OllamaEmbeddings(model=EMBEDDING_MODEL) + return Chroma(persist_directory=persist_directory, embedding_function=embed) + + +def init_retrievers(docs, vector_db): + """Hybrid (BM25 + Vector) ๊ฒ€์ƒ‰ + Cross-Encoder Reranker๋ฅผ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค.""" + import torch + from langchain_community.retrievers import BM25Retriever + from langchain_classic.retrievers import EnsembleRetriever, ContextualCompressionRetriever + from langchain_community.cross_encoders import HuggingFaceCrossEncoder + from langchain_classic.retrievers.document_compressors import CrossEncoderReranker + + bm25 = BM25Retriever.from_documents(docs) + bm25.k = BM25_K + vect = vector_db.as_retriever(search_kwargs={"k": VECTOR_K}) + hybrid = EnsembleRetriever(retrievers=[bm25, vect], weights=ENSEMBLE_WEIGHTS) + + if "Qwen" in RERANKER_MODEL: + model_kwargs = { + "automodel_args": { + "torch_dtype": torch.float16, + "trust_remote_code": True, + } + } + else: + model_kwargs = {"model_kwargs": {"torch_dtype": torch.float16}} + + model = HuggingFaceCrossEncoder(model_name=RERANKER_MODEL, model_kwargs=model_kwargs) + re_ranker = CrossEncoderReranker(model=model, top_n=RERANKER_TOP_N) + return ContextualCompressionRetriever(base_compressor=re_ranker, base_retriever=hybrid) + + +def format_docs_and_extract_urls(docs): + """๊ฒ€์ƒ‰๋œ ๋ฌธ์„œ๋ฅผ ํ…์ŠคํŠธํ™”ํ•˜๊ณ  ๊ณ ์œ  URL ํ•˜์ดํผ๋งํฌ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.""" + context_parts = [] + unique_refs = {} + + for d in docs: + title = d.metadata.get('title', '์ œ๋ชฉ ์—†์Œ') + url = d.metadata.get('url', '') + context_parts.append(f"[์ถœ์ฒ˜: {title}]\n{d.page_content}") + if url and url not in unique_refs: + unique_refs[url] = title + + context_str = "\n\n---\n\n".join(context_parts) + ref_list = [f"- [{title}]({url})" for url, title in unique_refs.items()] + refs_str = "\n".join(ref_list) + + return context_str, refs_str diff --git a/text_utils.py b/text_utils.py new file mode 100644 index 0000000..0b5c3ab --- 
# text_utils.py -- new file added by this patch (diff header: /dev/null -> b/text_utils.py).
"""Helpers for post-processing raw LLM output.

Provides removal of stray XML-style wrapper tags and separation of the
model's reasoning section from the user-facing answer.
"""
import re


# Wrapper tags the model sometimes emits around parts of its reply.
# Only the tag markers are removed; the text between them is kept.
FORBIDDEN_TAGS = [
    "thought", "references", "conclusion", "answer",
    "response", "output", "result", "context", "question",
]

# Delimiters of the reasoning section.
_THINK_OPEN = "<think>"
_THINK_CLOSE = "</think>"


def clean_response(text: str) -> str:
    """Remove unwanted XML-style tags and collapse excessive blank lines.

    Args:
        text: Raw answer text produced by the model.

    Returns:
        The text with every opening/closing tag named in ``FORBIDDEN_TAGS``
        removed (case-insensitively), runs of three or more newlines reduced
        to exactly two, and leading/trailing whitespace stripped.
    """
    for tag in FORBIDDEN_TAGS:
        # Matches both "<tag>" and "</tag>"; the tag name is escaped so it is
        # always treated literally.
        text = re.sub(rf"</?{re.escape(tag)}>", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def extract_think_and_answer(text: str) -> tuple[str, str]:
    """Split model output into its reasoning part and its answer part.

    Args:
        text: Raw model output, possibly containing a
            ``<think>...</think>`` section (which may still be unterminated
            while the output is being streamed).

    Returns:
        A ``(think_content, answer_content)`` tuple:

        * both delimiters present -- the reasoning between them and the
          cleaned text after the first closing delimiter;
        * only the opening delimiter present -- the reasoning so far and an
          empty answer;
        * otherwise -- an empty reasoning string and the cleaned full text.
    """
    if _THINK_OPEN in text and _THINK_CLOSE in text:
        # Split on the first closing delimiter: everything after it is answer.
        parts = text.split(_THINK_CLOSE, 1)
        think_content = parts[0].split(_THINK_OPEN)[-1].strip()
        answer_content = clean_response(parts[1])
        return think_content, answer_content
    elif _THINK_OPEN in text:
        # Reasoning section opened but not closed yet: no answer available.
        think_content = text.split(_THINK_OPEN)[-1].strip()
        return think_content, ""
    else:
        return "", clean_response(text)