From 55db04b35599d54be863c8e13c0ed0f2646c422b Mon Sep 17 00:00:00 2001
From: Gyuho-Han <0203ho@naver.com>
Date: Wed, 15 Apr 2026 11:36:35 +0900
Subject: [PATCH] =?UTF-8?q?Edit:=20=EC=BD=94=EB=93=9C=20=EB=A6=AC=ED=8E=99?=
=?UTF-8?q?=ED=86=A0=EB=A7=81=20=EB=B0=8F=20v0.0.5=20=EC=97=85=EB=8D=B0?=
=?UTF-8?q?=EC=9D=B4=ED=8A=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 8 +-
README.md | 30 ++---
VERSIONS.md | 33 ++++-
app.py | 230 ++++++----------------------------
chain.py | 57 +++++++++
config.py | 31 +++++
data_cleaning.py | 53 --------
data_pipeline.py | 116 +++++++++++++++++
data_preprocessing_for_RAG.py | 62 ---------
ingest_vector_db.py | 61 +++------
requirements.txt | 34 ++---
retriever.py | 86 +++++++++++++
text_utils.py | 29 +++++
13 files changed, 439 insertions(+), 391 deletions(-)
create mode 100644 chain.py
create mode 100644 config.py
delete mode 100644 data_cleaning.py
create mode 100644 data_pipeline.py
delete mode 100644 data_preprocessing_for_RAG.py
create mode 100644 retriever.py
create mode 100644 text_utils.py
diff --git a/.gitignore b/.gitignore
index d6e2969..3f05da7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,10 @@
rag_preprocessed_data.csv
rag_preprocessed_data.json
.env
-chroma_db/
\ No newline at end of file
+chroma_db/
+
+__pycache__/
+*.pyc
+.DS_Store
+*.egg-info/
+.venv/
diff --git a/README.md b/README.md
index c954bf3..42ab472 100644
--- a/README.md
+++ b/README.md
@@ -44,31 +44,24 @@ pip install -r requirements.txt
## ๐ ์คํ ์์ (Running Order)
-1. **๋ฐ์ดํฐ ์ ์ (Data Cleaning):**
- `creation_science_data.csv`๋ฅผ ์ ์ ํ์ฌ `cleaned_creation_science_data.csv`๋ฅผ ์์ฑํฉ๋๋ค.
+1. **๋ฐ์ดํฐ ์ ์ + ์ ์ฒ๋ฆฌ (Data Pipeline):**
+ `creation_science_data.csv`๋ฅผ ์ ์ ํ๊ณ , ์ฒญํฌ๋ก ๋๋์ด `rag_preprocessed_data.json`์ ์์ฑํฉ๋๋ค.
```bash
- python data_cleaning.py
+ python data_pipeline.py
```
-2. **๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ (Data Preprocessing):**
- ์ ์ ๋ CSV๋ฅผ ์ฝ์ด ์ฒญํฌ๋ก ๋๋๊ณ `rag_preprocessed_data.json`์ ์์ฑํฉ๋๋ค.
-
- ```bash
- python data_preprocessing_for_RAG.py
- ```
-
-3. **๋ฒกํฐ DB ์ ์ฌ (Vector DB Ingestion):**
+2. **๋ฒกํฐ DB ์ ์ฌ (Vector DB Ingestion):**
์์ฑ๋ JSON ๋ฐ์ดํฐ๋ฅผ `chroma_db` ํด๋์ ์๋ฒ ๋ฉํ์ฌ ์ ์ฅํฉ๋๋ค. (Ollama์์ `qwen3-embedding:8b` ๋ชจ๋ธ์ด ์คํ ์ค์ด์ด์ผ ํฉ๋๋ค.)
```bash
python ingest_vector_db.py
```
-3.1 **Google Drive์์ chroma_db ํ์ผ ๋ค์ด**
-1,2,3 ๋ฐ์ด ๋์ด๋ ๋จ.
+2.1 **Google Drive์์ chroma_db ํ์ผ ๋ค์ด**
+1,2 ๋ฐ์ด ๋์ด๋ ๋จ.
https://drive.google.com/file/d/1zdxkGgW2R2mLA_XRxAENbLTTPSnNydI2/view?usp=drive_link
-4. **์ ํ๋ฆฌ์ผ์ด์
์คํ (Run App):**
+3. **์ ํ๋ฆฌ์ผ์ด์
์คํ (Run App):**
Streamlit ์น ์ธํฐํ์ด์ค๋ฅผ ์คํํฉ๋๋ค.
```bash
streamlit run app.py
@@ -76,9 +69,12 @@ https://drive.google.com/file/d/1zdxkGgW2R2mLA_XRxAENbLTTPSnNydI2/view?usp=drive
## ๐ ํ์ผ ๊ตฌ์กฐ (File Structure)
-- `app.py`: Streamlit ์ฑ๋ด UI, Hybrid Retriever ๋ฐ Reranker ๋ก์ง
-- `data_cleaning.py`: ์ค๋ณต ์ ๊ฑฐ ๋ฐ ๋
ธ์ด์ฆ ํ
์คํธ ์ ์
-- `data_preprocessing_for_RAG.py`: RecursiveCharacterTextSplitter๋ฅผ ์ด์ฉํ ์ฒญํน
+- `config.py`: ๋ชจ๋ธ๋ช
, ๊ฒฝ๋ก, ํ๋ผ๋ฏธํฐ ๋ฑ ์ค์ ๊ฐ ์ค์ ๊ด๋ฆฌ
+- `app.py`: Streamlit ์ฑ๋ด UI
+- `text_utils.py`: ํ
์คํธ ํด๋ฆฌ๋ ๋ฐ ํ๊ทธ ๋ถ๋ฆฌ ์ ํธ๋ฆฌํฐ
+- `retriever.py`: ๋ฌธ์ ๋ก๋ฉ, ๋ฒกํฐDB, ํ์ด๋ธ๋ฆฌ๋ ๊ฒ์ ๋ฐ Reranker ๋ก์ง
+- `chain.py`: LLM ํ๋กฌํํธ ๋ฐ ์์ฑ ์ฒด์ธ
+- `data_pipeline.py`: ๋ฐ์ดํฐ ์ ์ + RAG ์ ์ฒ๋ฆฌ/์ฒญํน ํตํฉ ํ์ดํ๋ผ์ธ
- `ingest_vector_db.py`: ChromaDB ์์ฑ ๋ฐ Ollama ๊ธฐ๋ฐ ์๋ฒ ๋ฉ ์ ์ฌ
- `creation_science_data.csv`: ์๋ณธ ๋ฐ์ดํฐ์
- `chroma_db/`: ๋ฒกํฐ ๋ฐ์ดํฐ๋ฒ ์ด์ค ์ ์ฅ ํด๋
diff --git a/VERSIONS.md b/VERSIONS.md
index 3805f23..534c5e1 100644
--- a/VERSIONS.md
+++ b/VERSIONS.md
@@ -4,7 +4,38 @@
---
-## [v0.0.4] - 2026-04-13 (์ต์ ๋ฒ์ )
+## [v0.0.5] - 2026-04-15 (์ต์ ๋ฒ์ )
+
+### ๐ ์ฃผ์ ํน์ง (Features)
+
+- **์ฝ๋ ๋ชจ๋ ๋ถ๋ฆฌ**: ๋จ์ผ `app.py`(294์ค)๋ฅผ `config.py`, `text_utils.py`, `retriever.py`, `chain.py`, `app.py`(120์ค)๋ก ๋ถ๋ฆฌํ์ฌ ์ ์ง๋ณด์์ฑ ๋ฐ ํ
์คํธ ์ฉ์ด์ฑ ํฅ์.
+- **์ค์ ๊ฐ ์ค์ ๊ด๋ฆฌ**: ๋ชจ๋ธ๋ช
, ๊ฒฝ๋ก, ํ๋ผ๋ฏธํฐ ๋ฑ ํ๋์ฝ๋ฉ๋ ์ค์ ๊ฐ์ `config.py`๋ก ํตํฉํ์ฌ ํ ๊ณณ์์ ๊ด๋ฆฌ.
+- **์ค๋ณต ์ฝ๋ ์ ๊ฑฐ**: `app.py`์ `ingest_vector_db.py`์ ์ค๋ณต ์กด์ฌํ๋ `load_documents()` ํจ์๋ฅผ `retriever.py`๋ก ํตํฉ.
+- **๋ฐ์ดํฐ ํ์ดํ๋ผ์ธ ํตํฉ**: `data_cleaning.py`์ `data_preprocessing_for_RAG.py`๋ฅผ `data_pipeline.py`๋ก ํตํฉํ์ฌ 1,2๋จ๊ณ๋ฅผ ํ ๋ฒ์ ์คํ.
+- **Reranker ๋ชจ๋ธ ์๋ ์ ํ**: `config.py`์์ `RERANKER_MODEL`๋ง ๋ณ๊ฒฝํ๋ฉด BAAI/Qwen ๋ชจ๋ธ์ ๋ง๋ kwargs๊ฐ ์๋ ์ ์ฉ.
+- **์์กด์ฑ ๋ฒ์ ๊ณ ์ **: `requirements.txt`์ ์ต์ ๋ฒ์ ์ ์ฝ์ ์ถ๊ฐํ์ฌ ํ๊ฒฝ ์ฌํ์ฑ ํฅ์.
+- **`.gitignore` ๋ณด๊ฐ**: `__pycache__/`, `.DS_Store`, `.venv/` ๋ฑ ํ์ค Python ์ ์ธ ํญ๋ชฉ ์ถ๊ฐ.
+
+### ๐ค ๋ชจ๋ธ ๊ตฌ์ฑ (Models)
+
+- **LLM (Generation):** `qwen2.5:14b` (via Ollama)
+- **Embedding:** `qwen3-embedding:8b` (via Ollama)
+- **Reranker:** `BAAI/bge-reranker-v2-m3` (via HuggingFace CrossEncoder)
+ - Optimization: `torch.float16` ์ ์ฉ
+
+### ๐ ๋ณ๊ฒฝ๋ ํ์ผ ๊ตฌ์กฐ (File Structure)
+
+- `config.py` (์ ๊ท): ์ค์ ๊ฐ ์ค์ ๊ด๋ฆฌ
+- `text_utils.py` (์ ๊ท): ํ
์คํธ ํด๋ฆฌ๋ ์ ํธ๋ฆฌํฐ
+- `retriever.py` (์ ๊ท): ๋ฌธ์ ๋ก๋ฉ, ๊ฒ์, Reranker
+- `chain.py` (์ ๊ท): LLM ํ๋กฌํํธ ๋ฐ ์์ฑ ์ฒด์ธ
+- `data_pipeline.py` (์ ๊ท): ๋ฐ์ดํฐ ์ ์ + ์ ์ฒ๋ฆฌ ํตํฉ
+- `app.py` (๋ฆฌํฉํฐ๋ง): UI ๋ก์ง๋ง ๋ด๋น
+- `ingest_vector_db.py` (๋ฆฌํฉํฐ๋ง): ๊ณตํต ๋ชจ๋ ์ํฌํธ๋ก ์ ํ
+
+---
+
+## [v0.0.4] - 2026-04-13 (์ด์ ๋ฒ์ )
### ๐ ์ฃผ์ ํน์ง (Features)
diff --git a/app.py b/app.py
index ffd2f29..05c142a 100644
--- a/app.py
+++ b/app.py
@@ -1,199 +1,47 @@
import streamlit as st
-import os, json, time, re
-from pathlib import Path
-import torch
-
-from langchain_core.documents import Document
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.runnables import RunnablePassthrough
-from langchain_core.output_parsers import StrOutputParser
-
-from langchain_chroma import Chroma
-from langchain_community.retrievers import BM25Retriever
-from langchain_classic.retrievers import EnsembleRetriever, ContextualCompressionRetriever
-from langchain_ollama import ChatOllama, OllamaEmbeddings
-
-from langchain_community.cross_encoders import HuggingFaceCrossEncoder
-from langchain_classic.retrievers.document_compressors import CrossEncoderReranker
-
-
-
-# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
-# [NEW] ํ
์คํธ ํด๋ฆฌ๋ ๋ฐ ํ๊ทธ ๋ถ๋ฆฌ ํจ์
-# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
-def clean_response(text: str) -> str:
- """๋ถํ์ํ XML ํ๊ทธ๋ฅผ ์ ๊ฑฐํ๊ณ ๊ณผ๋ํ ๋น ์ค์ ์ ๋ฆฌํฉ๋๋ค."""
- FORBIDDEN_TAGS = [
- "thought", "references", "conclusion", "answer",
- "response", "output", "result", "context", "question",
- ]
- for tag in FORBIDDEN_TAGS:
-        text = re.sub(rf"</?{tag}>", "", text, flags=re.IGNORECASE)
- text = re.sub(r"\n{3,}", "\n\n", text)
- return text.strip()
-
-def extract_think_and_answer(text: str):
- """๋ฌธ์์ด์์ ๋ถ๋ถ๊ณผ ์ค์ ๋ต๋ณ ๋ถ๋ถ์ ์๋ฒฝํ๊ฒ ๋ถ๋ฆฌํฉ๋๋ค."""
-    if "<think>" in text and "</think>" in text:
- # think ํ๊ทธ๊ฐ ์์ ํ ๋ซํ ๊ฒฝ์ฐ
-        parts = text.split("</think>", 1)
-        think_content = parts[0].split("<think>")[-1].strip()
-        answer_content = clean_response(parts[1])
-        return think_content, answer_content
-    elif "<think>" in text:
- # think ํ๊ทธ๊ฐ ์ด๋ ค์๊ณ ์์ง ๋ซํ์ง ์์ ๊ฒฝ์ฐ (์คํธ๋ฆฌ๋ฐ ์ค)
-        think_content = text.split("<think>")[-1].strip()
- return think_content, ""
- else:
- # think ํ๊ทธ๊ฐ ์์ ์๋ ๊ฒฝ์ฐ
- return "", clean_response(text)
-
-
-# 1) ๋ฐ์ดํฐ ๋ก๋
-@st.cache_data(show_spinner=False)
-def load_documents(path="rag_preprocessed_data.json"):
- if not os.path.exists(path):
- st.error(f"๋ฐ์ดํฐ ํ์ผ({path})์ด ์์ต๋๋ค.")
- st.stop()
- with open(path, "r", encoding="utf-8") as f:
- raw = json.load(f)
- docs = []
- for item in raw:
- content = f"์ ๋ชฉ: {item.get('title', '')}\n๋ด์ฉ: {item.get('content_chunk', '')}"
- metadata = {
- "chunk_id": item.get("chunk_id", ""),
- "title": item.get("title", ""),
- "url": item.get("url", "")
- }
- docs.append(Document(page_content=content.strip(), metadata=metadata))
- return docs
-
-
-# 2) VectorStore ๋ก๋
-@st.cache_resource(show_spinner=False)
-def load_vectorstore(persist_directory="./chroma_db"):
- if not Path(persist_directory).exists():
- st.error("๋ฒกํฐDB๊ฐ ์์ง ์์ฑ๋์ง ์์์ต๋๋ค. ๋ฐ์ดํฐ๋ฅผ ๋จผ์ ์๋ฒ ๋ฉํ์ธ์.")
- st.stop()
- embed = OllamaEmbeddings(model="qwen3-embedding:8b")
- return Chroma(persist_directory=persist_directory, embedding_function=embed)
-
-
-# 3) Hybrid + Cross-Encoder Reranker ์ด๊ธฐํ
-@st.cache_resource(show_spinner=False)
-def init_retrievers(_docs, _vector_db):
- bm25 = BM25Retriever.from_documents(_docs)
- bm25.k = 10
- vect = _vector_db.as_retriever(search_kwargs={"k": 10})
- hybrid = EnsembleRetriever(retrievers=[bm25, vect], weights=[0.5, 0.5])
-
- # ๐ป ์ต์
A: 16GB ๋ฉ๋ชจ๋ฆฌ์ฉ
- model_name = "BAAI/bge-reranker-v2-m3"
- model_kwargs = {"model_kwargs": {"torch_dtype": torch.float16}}
-
- # ๐ฅ๏ธ ์ต์
B: 32GB ๋ฉ๋ชจ๋ฆฌ์ฉ (๋ฌด๊ฑฐ์ด Qwen3 4B ๋ชจ๋ธ, 16-bit ์ฌ์ฉ)
- # model_name = "Qwen/Qwen3-Reranker-4B"
- # model_kwargs = {
- # "automodel_args": {
- # "torch_dtype": torch.float16,
- # "trust_remote_code": True
- # }
- # }
-
- model = HuggingFaceCrossEncoder(model_name=model_name, model_kwargs=model_kwargs)
- re_ranker = CrossEncoderReranker(model=model, top_n=5)
- return ContextualCompressionRetriever(base_compressor=re_ranker, base_retriever=hybrid)
-
-# 4) ๊ฒ์๋ ๋ฌธ์ ํ
์คํธํ ๋ฐ ๊ณ ์ URL ํ์ดํผ๋งํฌ ์ถ์ถ
-def format_docs_and_extract_urls(docs):
- context_parts = []
- unique_refs = {} # URL ์ค๋ณต ์ ๊ฑฐ๋ฅผ ์ํ ๋์
๋๋ฆฌ
-
- for d in docs:
- title = d.metadata.get('title', '์ ๋ชฉ ์์')
- url = d.metadata.get('url', '')
-
- # LLM์๊ฒ ์ ๊ณตํ ๋ณธ๋ฌธ ๊ตฌ์ฑ (LLM์ ์ด ํ
์คํธ๋ง ๋ณด๊ณ ๋ต๋ณ์ ์์ฑ)
- context_parts.append(f"[์ถ์ฒ: {title}]\n{d.page_content}")
-
- # URL์ ๋์
๋๋ฆฌ์ Key๋ก ์ฌ์ฉํ์ฌ ์ค๋ณต์ ์๋์ผ๋ก ์ ๊ฑฐ
- # url ๊ฐ์ด ์กด์ฌํ๋ ๊ฒฝ์ฐ์๋ง ์ถ๊ฐ
- if url and url not in unique_refs:
- unique_refs[url] = title
-
- context_str = "\n\n---\n\n".join(context_parts)
-
- # ๐ ๋งํฌ๋ค์ด ๋ฌธ๋ฒ [Title](URL)์ ์ฌ์ฉํ์ฌ ์ ๋ชฉ์ ํ์ดํผ๋งํฌ ์์ฐ๊ธฐ
- ref_list = [f"- [{title}]({url})" for url, title in unique_refs.items()]
- refs_str = "\n".join(ref_list)
-
- return context_str, refs_str
-
-
-# 5) ์์ฑ(Generation) ์ฒด์ธ
-@st.cache_resource(show_spinner=False)
-def init_generation_chain():
- llm = ChatOllama(
- model="qwen2.5:14b",
- # model="gpt-oss:20b",
- temperature=0.3,
- top_p=0.9,
- repeat_penalty=1.15,
- stop=["<|im_end|>", "User:", "Question:"]
- )
-
- prompt = ChatPromptTemplate.from_template("""
-๋น์ ์ ์ฐฝ์กฐ๊ณผํ์ ๊ด์ ๊ณผ ์ฃผ์ฅ์ ์๊ฐํ๋ ์ ๋ฌธ ํด์ค๊ฐ์
๋๋ค.
-๋น์ ์ ์ฃผ๋ ๋ชฉ์ ์ ์ ๊ณต๋ ์๋ฃ(Context)๋ฅผ ๋ฐํ์ผ๋ก ์ฐฝ์กฐ๊ณผํ๊ณ์ ํด์๊ณผ ์ฆ๊ฑฐ๋ค์ ์ฌ์ฉ์์๊ฒ ์์ธํ ์ ๋ฌํ๋ ๊ฒ์
๋๋ค.
-์๋์ ์ง์นจ์ ๋ฐ๋์ ์ค์ํ์ฌ ๋ต๋ณํ์ญ์์ค.
+import time
-[์ต์ฐ์ ์ง์นจ: ์ถ์ฒ ํ๊ธฐ ๊ธ์ง]
-- ๐จ ๋ต๋ณ ๋งจ ๋ง์ง๋ง์ ์ฐธ๊ณ ์๋ฃ๋ URL ๋งํฌ๋ฅผ ์ง์ ์์ฑํ์ง ๋ง์ญ์์ค. (์์คํ
์ด ์๋์ผ๋ก ์ ํํ ๊ณ ์ ๋งํฌ๋ฅผ 100% ์ฒจ๋ถํ ๊ฒ์
๋๋ค.) ๋น์ ์ ์ค์ง ๋ด์ฉ์ ์ค๋ช
ํ๋ ๋ฐ์๋ง ์ง์คํ์ญ์์ค.
+from text_utils import clean_response, extract_think_and_answer
+from retriever import (
+ load_documents, load_vectorstore, init_retrievers,
+ format_docs_and_extract_urls,
+)
+from chain import init_generation_chain
-[๋ต๋ณ ๊ฐ์ด๋๋ผ์ธ]
-1. ๋ฌดํ ๋ฐ๋ณต ๊ธ์ง ๋ฐ ๊ฐ๊ฒฐํ ๊ตฌ์ฑ: ๋์ผํ ์ฃผ์ฅ์ด๋ ๋ฌธ์ฅ์ ๋ฌด์๋ฏธํ๊ฒ ๊ณ์ ๋ฐ๋ณต(๋ฌดํ ๋ฃจํ)ํ์ง ๋ง์ญ์์ค. ๊ฐ ์ฃผ์ฅ์ ํ ๋ฒ์ฉ๋ง ๋ช
ํํ๊ณ ์์ธํ๊ฒ ์ค๋ช
ํ๋ฉฐ, Context์ ์ ์๋ ๋ด์ฉ์ ๋ชจ๋ ๋ค๋ฃจ์๋ค๋ฉด ์ถ๊ฐ์ ์ธ ๋ฐ๋ณต ์์ด ์์ฐ์ค๋ฝ๊ฒ ๋ต๋ณ์ ๋ง๋ฌด๋ฆฌํ์ญ์์ค.
-2. ํ๋ ๋ฐ ์ด์กฐ: ์ฐฝ์กฐ๊ณผํ๊ณ์ ์ฃผ์ฅ๊ณผ ํด์์ '์๊ฐ'ํ๋ ์ ๋ฌธ์ ์ธ ์ด์กฐ๋ฅผ ์ ์งํ์ญ์์ค.
- - โ
"์ฐฝ์กฐ๊ณผํ์์๋ ~๋ผ๊ณ ์ค๋ช
ํฉ๋๋ค", "์ฐฝ์กฐ๊ณผํ์๋ค์ ์ด๋ฅผ ~์ ๊ทผ๊ฑฐ๋ก ์ฃผ์ฅํฉ๋๋ค"
- - โ "์ด๊ฒ์ ์ฌ์ค์
๋๋ค", "๊ณผํ์ ์ผ๋ก ์ฆ๋ช
๋์์ต๋๋ค" ๋ฑ ๋จ์ ์ ์ธ ์ฌ์ค ์ ์ธ์ ์ง์ํ์ญ์์ค.
-3. ๋ด์ฉ์ ์ง์ค: ์ค์ง ์ฐฝ์กฐ๊ณผํ ๋ด๋ถ์ ๋
ผ๋ฆฌ์ ์ ๊ณต๋ ์๋ฃ์ ์ฆ๊ฑฐ(์ ๋ฌผ, ๊ธฐ๋ก ๋ฑ)๋ฅผ ์์ธํ ์ค๋ช
ํ๋ ๋ฐ ์ง์คํ์ญ์์ค. ๋ถ์ ์ ์ธ ์๊ฒฌ์ด๋ ํ ์ด๋ก ๊ณผ์ ๋น๊ต๋ ์ง์ํ์ญ์์ค.
-4. ์์ธ ์ค๋ช
๋ฐ ๊ตฌ์ฒด์ฑ: ๊ฐ ํญ๋ชฉ ์ค๋ช
์ Context์ ํฌํจ๋ ๊ตฌ์ฒด์ ์ธ ์ง๋ช
, ์ ๋ฌผ ์ด๋ฆ, ์ญ์ฌ์ ๊ธฐ๋ก, ์ธ๋ช
๋ฑ์ ๋ฐ๋์ ํฌํจํ์ฌ ์ต์ 3๋ฌธ์ฅ ์ด์ ์์ธํ๊ฒ ์์ฑํ์ญ์์ค.
-5. ํ๊ทธ ์ฌ์ฉ ๊ธ์ง: , , , ๋ฑ ์ด๋ ํ XML/HTML ํ๊ทธ๋ ํฌํจํ์ง ๋ง์ญ์์ค.
-
-[์ถ๋ ฅ ํ์]
-(์ฐฝ์กฐ๊ณผํ์ ๊ด์ ์์ ํด๋น ์ฃผ์ ๋ฅผ ์ ์คํ๊ฒ ์๊ฐํ๋ ๋์
๋ฌธ๊ตฌ)
-
-- **(์ฃผ์ฅ/ํด์ 1 ์ ๋ชฉ)**: (์์ธ ์ค๋ช
. ๊ตฌ์ฒด์ ์ง๋ช
๋ฐ ์ฆ๊ฑฐ ํฌํจ 3๋ฌธ์ฅ ์ด์)
-- **(์ฃผ์ฅ/ํด์ 2 ์ ๋ชฉ)**: (์์ธ ์ค๋ช
. ๊ตฌ์ฒด์ ์ง๋ช
๋ฐ ์ฆ๊ฑฐ ํฌํจ 3๋ฌธ์ฅ ์ด์)
+# ==========================================
+# โโ Streamlit ์ฑ UI (์ฑ๋ด ์คํ์ผ) โโ
+# ==========================================
+st.set_page_config(page_title="Chat DDS", page_icon="๐")
+st.title("๐ Chat DDS ๐")
-Chat History:
-{chat_history}
-Context:
-{context}
+@st.cache_data(show_spinner=False)
+def get_documents():
+ return load_documents()
-Question:
-{question}
-Answer:
-""")
+@st.cache_resource(show_spinner=False)
+def get_vectorstore():
+ return load_vectorstore()
- return prompt | llm | StrOutputParser()
+@st.cache_resource(show_spinner=False)
+def get_generation_chain():
+ return init_generation_chain()
-# ==========================================
-# โโ Streamlit ์ฑ UI (์ฑ๋ด ์คํ์ผ) โโ
-# ==========================================
-st.set_page_config(page_title="Chat DDS", page_icon="๐")
-st.title("๐ Chat DDS ๐")
-docs = load_documents()
-vector_db = load_vectorstore()
+try:
+ docs = get_documents()
+ vector_db = get_vectorstore()
+except FileNotFoundError as e:
+ st.error(str(e))
+ st.stop()
if "rerank_retriever" not in st.session_state:
st.session_state.rerank_retriever = init_retrievers(docs, vector_db)
-generation_chain = init_generation_chain()
+generation_chain = get_generation_chain()
if "messages" not in st.session_state:
st.session_state.messages = []
@@ -203,17 +51,16 @@ def init_generation_chain():
with st.chat_message(msg["role"]):
if msg["role"] == "assistant":
think_content, answer_content = extract_think_and_answer(msg["content"])
-
+
# ์ฌ๊ณ ๊ณผ์ UI ๋ ๋๋ง ์ฃผ์ ์ฒ๋ฆฌ
# if think_content:
# with st.expander("๐ง AI์ ์ฌ๊ณ ๊ณผ์ "):
# st.markdown(think_content)
-
+
if answer_content:
st.markdown(answer_content)
- # ๋ง์ฝ think ๋ถ๋ฆฌ ์์ด ์ ์ฒด๊ฐ ๋ต๋ณ์ผ๋ก ๋์ด์จ ๊ฒฝ์ฐ ์ฒ๋ฆฌ (์ฌ๊ณ ๊ณผ์ ์ ๊ฑฐ ํ ๋๋น)
elif think_content and not answer_content:
- st.markdown(think_content)
+ st.markdown(think_content)
else:
st.markdown(clean_response(msg["content"]))
@@ -228,7 +75,6 @@ def init_generation_chain():
chat_history_str = ""
for m in st.session_state.messages[:-1]:
role_name = "User" if m["role"] == "user" else "Assistant"
- # ์ปจํ
์คํธ๋ก ๋๊ธธ ๋๋ AI์ ์ฌ๊ณ ๊ณผ์ ์ ์ ์ธํ๊ณ ๋ต๋ณ๋ง ๋๊น
_, ans_content = extract_think_and_answer(m["content"])
content = ans_content if m["role"] == "assistant" else clean_response(m["content"])
chat_history_str += f"{role_name}: {content}\n"
@@ -241,11 +87,10 @@ def init_generation_chain():
try:
retrieved_docs = st.session_state.rerank_retriever.invoke(query)
- except Exception as e:
+ except Exception:
st.warning("โ ๏ธ Reranking ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ฌ ๊ธฐ๋ณธ ๊ฒ์ ๊ฒฐ๊ณผ๋ฅผ ์ฌ์ฉํฉ๋๋ค.")
retrieved_docs = st.session_state.rerank_retriever.base_retriever.invoke(query)[:5]
- # โ
์ฌ๊ธฐ์ ๊ณ ์ URL ๋ฆฌ์คํธ๋ฅผ ํจ๊ป ๋ฐํ๋ฐ์ต๋๋ค.
context_str, refs_str = format_docs_and_extract_urls(retrieved_docs)
st.write(f"โ
{len(retrieved_docs)}๊ฐ์ ํต์ฌ ๋ฌธ์๋ฅผ ์ฐพ์์ต๋๋ค.")
@@ -266,7 +111,7 @@ def init_generation_chain():
for chunk in response_stream:
full_response += chunk
current_think, current_answer = extract_think_and_answer(full_response)
-
+
if current_answer:
answer_placeholder.markdown(current_answer + " โ")
elif current_think and not current_answer:
@@ -274,14 +119,12 @@ def init_generation_chain():
# โโ ์คํธ๋ฆฌ๋ฐ ์ข
๋ฃ: ์ปค์ ์ ๊ฑฐ ๋ฐ ์ถ์ฒ ๊ฐ์ ๊ฒฐํฉ โโ
final_think, final_answer = extract_think_and_answer(full_response)
-
- # ํ์ด์ฌ ๋ก์ง์ผ๋ก ์์ฑ๋ 100% ํ์คํ ์ถ์ฒ ๋ฆฌ์คํธ(ํ์ดํผ๋งํฌ)๋ฅผ ๋ต๋ณ ๋์ ๋ณํฉ
+
if refs_str:
final_answer_with_refs = final_answer + f"\n\n๐ **์ฐธ๊ณ ์๋ฃ:**\n{refs_str}"
else:
final_answer_with_refs = final_answer
-
- # ํ๋ฉด์ ๋ ๋๋ง
+
if final_answer_with_refs:
answer_placeholder.markdown(final_answer_with_refs)
else:
@@ -290,5 +133,4 @@ def init_generation_chain():
end_time = time.time()
status.update(label=f"โ
๋ต๋ณ ์์ฑ ์๋ฃ! ({end_time - start_time:.2f}์ด)", state="complete")
- # ์ ์ฒด ๋ต๋ณ ์ ์ฅ (์๋ณธ์ ์ถ์ฒ๊ฐ ๊ฒฐํฉ๋ ์ํ๋ก ์ธ์
๊ธฐ๋ก ์ ์ฅ)
- st.session_state.messages.append({"role": "assistant", "content": final_answer_with_refs})
\ No newline at end of file
+ st.session_state.messages.append({"role": "assistant", "content": final_answer_with_refs})
diff --git a/chain.py b/chain.py
new file mode 100644
index 0000000..5df90c0
--- /dev/null
+++ b/chain.py
@@ -0,0 +1,57 @@
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_ollama import ChatOllama
+
+from config import (
+ LLM_MODEL, LLM_TEMPERATURE, LLM_TOP_P,
+ LLM_REPEAT_PENALTY, LLM_STOP_TOKENS,
+)
+
+PROMPT_TEMPLATE = """
+๋น์ ์ ์ฐฝ์กฐ๊ณผํ์ ๊ด์ ๊ณผ ์ฃผ์ฅ์ ์๊ฐํ๋ ์ ๋ฌธ ํด์ค๊ฐ์
๋๋ค.
+๋น์ ์ ์ฃผ๋ ๋ชฉ์ ์ ์ ๊ณต๋ ์๋ฃ(Context)๋ฅผ ๋ฐํ์ผ๋ก ์ฐฝ์กฐ๊ณผํ๊ณ์ ํด์๊ณผ ์ฆ๊ฑฐ๋ค์ ์ฌ์ฉ์์๊ฒ ์์ธํ ์ ๋ฌํ๋ ๊ฒ์
๋๋ค.
+์๋์ ์ง์นจ์ ๋ฐ๋์ ์ค์ํ์ฌ ๋ต๋ณํ์ญ์์ค.
+
+[์ต์ฐ์ ์ง์นจ: ์ถ์ฒ ํ๊ธฐ ๊ธ์ง]
+- ๐จ ๋ต๋ณ ๋งจ ๋ง์ง๋ง์ ์ฐธ๊ณ ์๋ฃ๋ URL ๋งํฌ๋ฅผ ์ง์ ์์ฑํ์ง ๋ง์ญ์์ค. (์์คํ
์ด ์๋์ผ๋ก ์ ํํ ๊ณ ์ ๋งํฌ๋ฅผ 100% ์ฒจ๋ถํ ๊ฒ์
๋๋ค.) ๋น์ ์ ์ค์ง ๋ด์ฉ์ ์ค๋ช
ํ๋ ๋ฐ์๋ง ์ง์คํ์ญ์์ค.
+
+[๋ต๋ณ ๊ฐ์ด๋๋ผ์ธ]
+1. ๋ฌดํ ๋ฐ๋ณต ๊ธ์ง ๋ฐ ๊ฐ๊ฒฐํ ๊ตฌ์ฑ: ๋์ผํ ์ฃผ์ฅ์ด๋ ๋ฌธ์ฅ์ ๋ฌด์๋ฏธํ๊ฒ ๊ณ์ ๋ฐ๋ณต(๋ฌดํ ๋ฃจํ)ํ์ง ๋ง์ญ์์ค. ๊ฐ ์ฃผ์ฅ์ ํ ๋ฒ์ฉ๋ง ๋ช
ํํ๊ณ ์์ธํ๊ฒ ์ค๋ช
ํ๋ฉฐ, Context์ ์ ์๋ ๋ด์ฉ์ ๋ชจ๋ ๋ค๋ฃจ์๋ค๋ฉด ์ถ๊ฐ์ ์ธ ๋ฐ๋ณต ์์ด ์์ฐ์ค๋ฝ๊ฒ ๋ต๋ณ์ ๋ง๋ฌด๋ฆฌํ์ญ์์ค.
+2. ํ๋ ๋ฐ ์ด์กฐ: ์ฐฝ์กฐ๊ณผํ๊ณ์ ์ฃผ์ฅ๊ณผ ํด์์ '์๊ฐ'ํ๋ ์ ๋ฌธ์ ์ธ ์ด์กฐ๋ฅผ ์ ์งํ์ญ์์ค.
+ - โ
"์ฐฝ์กฐ๊ณผํ์์๋ ~๋ผ๊ณ ์ค๋ช
ํฉ๋๋ค", "์ฐฝ์กฐ๊ณผํ์๋ค์ ์ด๋ฅผ ~์ ๊ทผ๊ฑฐ๋ก ์ฃผ์ฅํฉ๋๋ค"
+ - โ "์ด๊ฒ์ ์ฌ์ค์
๋๋ค", "๊ณผํ์ ์ผ๋ก ์ฆ๋ช
๋์์ต๋๋ค" ๋ฑ ๋จ์ ์ ์ธ ์ฌ์ค ์ ์ธ์ ์ง์ํ์ญ์์ค.
+3. ๋ด์ฉ์ ์ง์ค: ์ค์ง ์ฐฝ์กฐ๊ณผํ ๋ด๋ถ์ ๋
ผ๋ฆฌ์ ์ ๊ณต๋ ์๋ฃ์ ์ฆ๊ฑฐ(์ ๋ฌผ, ๊ธฐ๋ก ๋ฑ)๋ฅผ ์์ธํ ์ค๋ช
ํ๋ ๋ฐ ์ง์คํ์ญ์์ค. ๋ถ์ ์ ์ธ ์๊ฒฌ์ด๋ ํ ์ด๋ก ๊ณผ์ ๋น๊ต๋ ์ง์ํ์ญ์์ค.
+4. ์์ธ ์ค๋ช
๋ฐ ๊ตฌ์ฒด์ฑ: ๊ฐ ํญ๋ชฉ ์ค๋ช
์ Context์ ํฌํจ๋ ๊ตฌ์ฒด์ ์ธ ์ง๋ช
, ์ ๋ฌผ ์ด๋ฆ, ์ญ์ฌ์ ๊ธฐ๋ก, ์ธ๋ช
๋ฑ์ ๋ฐ๋์ ํฌํจํ์ฌ ์ต์ 3๋ฌธ์ฅ ์ด์ ์์ธํ๊ฒ ์์ฑํ์ญ์์ค.
+5. ํ๊ทธ ์ฌ์ฉ ๊ธ์ง: , , , ๋ฑ ์ด๋ ํ XML/HTML ํ๊ทธ๋ ํฌํจํ์ง ๋ง์ญ์์ค.
+
+[์ถ๋ ฅ ํ์]
+
+(์ฐฝ์กฐ๊ณผํ์ ๊ด์ ์์ ํด๋น ์ฃผ์ ๋ฅผ ์ ์คํ๊ฒ ์๊ฐํ๋ ๋์
๋ฌธ๊ตฌ)
+
+- **(์ฃผ์ฅ/ํด์ 1 ์ ๋ชฉ)**: (์์ธ ์ค๋ช
. ๊ตฌ์ฒด์ ์ง๋ช
๋ฐ ์ฆ๊ฑฐ ํฌํจ 3๋ฌธ์ฅ ์ด์)
+- **(์ฃผ์ฅ/ํด์ 2 ์ ๋ชฉ)**: (์์ธ ์ค๋ช
. ๊ตฌ์ฒด์ ์ง๋ช
๋ฐ ์ฆ๊ฑฐ ํฌํจ 3๋ฌธ์ฅ ์ด์)
+
+Chat History:
+{chat_history}
+
+Context:
+{context}
+
+Question:
+{question}
+
+Answer:
+"""
+
+
+def init_generation_chain():
+ """LLM ์์ฑ ์ฒด์ธ์ ์ด๊ธฐํํฉ๋๋ค."""
+ llm = ChatOllama(
+ model=LLM_MODEL,
+ temperature=LLM_TEMPERATURE,
+ top_p=LLM_TOP_P,
+ repeat_penalty=LLM_REPEAT_PENALTY,
+ stop=LLM_STOP_TOKENS,
+ )
+ prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+ return prompt | llm | StrOutputParser()
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..400256d
--- /dev/null
+++ b/config.py
@@ -0,0 +1,31 @@
+# โโ Data Pipeline Paths โโ
+RAW_CSV_PATH = "creation_science_data.csv"
+CLEANED_CSV_PATH = "cleaned_creation_science_data.csv"
+RAG_CSV_PATH = "rag_preprocessed_data.csv"
+RAG_JSON_PATH = "rag_preprocessed_data.json"
+
+# โโ Vector DB โโ
+CHROMA_DB_DIR = "./chroma_db"
+
+# โโ Models โโ
+EMBEDDING_MODEL = "qwen3-embedding:8b"
+LLM_MODEL = "qwen2.5:14b"
+
+# Reranker (16GB: "BAAI/bge-reranker-v2-m3", 32GB: "Qwen/Qwen3-Reranker-4B")
+RERANKER_MODEL = "BAAI/bge-reranker-v2-m3"
+RERANKER_TOP_N = 5
+
+# โโ Retriever Parameters โโ
+BM25_K = 10 # keyword search
+VECTOR_K = 10 # context(vector) search
+ENSEMBLE_WEIGHTS = [0.5, 0.5] # [BM25, Vector]
+
+# โโ LLM Parameters โโ
+LLM_TEMPERATURE = 0.3
+LLM_TOP_P = 0.9
+LLM_REPEAT_PENALTY = 1.15
+LLM_STOP_TOKENS = ["<|im_end|>", "User:", "Question:"]
+
+# โโ Chunking Parameters โโ
+CHUNK_SIZE = 1000
+CHUNK_OVERLAP = 200
diff --git a/data_cleaning.py b/data_cleaning.py
deleted file mode 100644
index 5e43316..0000000
--- a/data_cleaning.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import pandas as pd
-import re
-import os
-
-def run_cleaning_process():
- input_file = 'creation_science_data.csv'
- output_file = 'cleaned_creation_science_data.csv'
-
- if not os.path.exists(input_file):
- print(f"์ค๋ฅ: {input_file} ํ์ผ์ด ํ์ฌ ๊ฒฝ๋ก์ ์์ต๋๋ค.")
- return
-
- print(f"[{input_file}] ๋ฐ์ดํฐ ์ ์ ๋ฅผ ์์ํฉ๋๋ค...")
-
- df = pd.read_csv(input_file)
- initial_count = len(df)
-
- df = df.drop_duplicates(subset=['title', 'content'], keep='first')
-
- def clean_content(text):
- if not isinstance(text, str):
- return ""
-
- # ์์ 1: .* ๋ฅผ ์ ๊ฑฐํ์ฌ ๋ค์ ์ค๋ ๋ชจ๋ ํ
์คํธ๊ฐ ์ญ์ ๋๋ ๊ฒ์ ๋ฐฉ์ง
- noise_pattern = r"(์๋ฆผ ๋ค๋ก|์๋ฆผ ์ค์ |๋๋ณด๊ธฐ ๊ฒ์๋ฌผ|๋ง์ดํ์ด์ง|๋ก๊ทธ์์|์ฐพ์์ค์๋๊ธธ|์๋ฃ์คMAP|์ฐฝ์กฐ๊ณผํ์ค์ฟจ|E-Book|๊ธฐ๋์๋ ฅ|๋ฌธ์๊ฒ์ํ|ํ์๊ธฐ๊ด|์ ์ฒด๋ณด๊ธฐ|์ถ์ฒ์ฌ์ดํธ|๋ก๊ทธ์ธ์ด ํ์ํฉ๋๋ค)"
-
- # ์์ 2: flags=re.DOTALL ์ ๊ฑฐ (ํด๋น ํค์๋๋ง ์ญ์ ํ๋๋ก ๋ณ๊ฒฝ)
- text = re.sub(noise_pattern, "", text)
-
- # ์ฐ์๋ ๊ณต๋ฐฑ ๋ฐ ์ค๋ฐ๊ฟ ํ๋๋ก ํต์ผ
- text = re.sub(r'\s+', ' ', text).strip()
- return text
-
- df['content'] = df['content'].apply(clean_content)
-
- # ์๋ฒ ๋ฉ์ ๋ถ์ ํฉํ ์งง์ ํ
์คํธ ์ ๊ฑฐ (100์ ๋ฏธ๋ง)
- df = df[df['content'].str.len() > 100]
-
- df.to_csv(output_file, index=False, encoding='utf-8-sig')
-
- final_count = len(df)
- deleted_count = initial_count - final_count
-
- print("-" * 40)
- print(f"์ ์ ์์
์ด ์๋ฃ๋์์ต๋๋ค!")
- print(f"๊ธฐ์กด ํ ๊ฐ์: {initial_count:,}๊ฐ")
- print(f"์ ์ ํ ํ ๊ฐ์: {final_count:,}๊ฐ")
- print(f"์ญ์ ๋ ํ ๊ฐ์: {deleted_count:,}๊ฐ (์ฝ {deleted_count/initial_count*100:.1f}% ๊ฐ์)")
- print(f"์ต์ข
ํ์ผ ์ ์ฅ ๊ฒฝ๋ก: {os.path.abspath(output_file)}")
- print("-" * 40)
-
-if __name__ == "__main__":
- run_cleaning_process()
\ No newline at end of file
diff --git a/data_pipeline.py b/data_pipeline.py
new file mode 100644
index 0000000..da178bd
--- /dev/null
+++ b/data_pipeline.py
@@ -0,0 +1,116 @@
+"""
+๋ฐ์ดํฐ ํ์ดํ๋ผ์ธ: 1๋จ๊ณ(์ ์ ) + 2๋จ๊ณ(RAG ์ ์ฒ๋ฆฌ/์ฒญํน)๋ฅผ ํตํฉ ์คํํฉ๋๋ค.
+๋ฒกํฐDB ์ ์ฌ(3๋จ๊ณ)๋ ์๊ฐ์ด ์ค๋ ๊ฑธ๋ฆฌ๋ฏ๋ก ingest_vector_db.py์์ ๋ณ๋ ์คํํฉ๋๋ค.
+"""
+
+import os
+import re
+
+import pandas as pd
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from config import (
+ RAW_CSV_PATH, CLEANED_CSV_PATH,
+ RAG_CSV_PATH, RAG_JSON_PATH,
+ CHUNK_SIZE, CHUNK_OVERLAP,
+)
+
+
+# โโ 1๋จ๊ณ: ์๋ณธ CSV ๋ฐ์ดํฐ ์ ์ โโ
+
+def clean_data(input_file=RAW_CSV_PATH, output_file=CLEANED_CSV_PATH):
+ if not os.path.exists(input_file):
+ print(f"์ค๋ฅ: {input_file} ํ์ผ์ด ํ์ฌ ๊ฒฝ๋ก์ ์์ต๋๋ค.")
+ return None
+
+ print(f"[{input_file}] ๋ฐ์ดํฐ ์ ์ ๋ฅผ ์์ํฉ๋๋ค...")
+
+ df = pd.read_csv(input_file)
+ initial_count = len(df)
+
+ df = df.drop_duplicates(subset=['title', 'content'], keep='first')
+
+ def _clean_content(text):
+ if not isinstance(text, str):
+ return ""
+ noise_pattern = (
+ r"(์๋ฆผ ๋ค๋ก|์๋ฆผ ์ค์ |๋๋ณด๊ธฐ ๊ฒ์๋ฌผ|๋ง์ดํ์ด์ง|๋ก๊ทธ์์|์ฐพ์์ค์๋๊ธธ"
+ r"|์๋ฃ์คMAP|์ฐฝ์กฐ๊ณผํ์ค์ฟจ|E-Book|๊ธฐ๋์๋ ฅ|๋ฌธ์๊ฒ์ํ|ํ์๊ธฐ๊ด"
+ r"|์ ์ฒด๋ณด๊ธฐ|์ถ์ฒ์ฌ์ดํธ|๋ก๊ทธ์ธ์ด ํ์ํฉ๋๋ค)"
+ )
+ text = re.sub(noise_pattern, "", text)
+ text = re.sub(r'\s+', ' ', text).strip()
+ return text
+
+ df['content'] = df['content'].apply(_clean_content)
+ df = df[df['content'].str.len() > 100]
+
+ df.to_csv(output_file, index=False, encoding='utf-8-sig')
+
+ final_count = len(df)
+ deleted_count = initial_count - final_count
+
+ print("-" * 40)
+ print(f"์ ์ ์์
์ด ์๋ฃ๋์์ต๋๋ค!")
+ print(f"๊ธฐ์กด ํ ๊ฐ์: {initial_count:,}๊ฐ")
+ print(f"์ ์ ํ ํ ๊ฐ์: {final_count:,}๊ฐ")
+ print(f"์ญ์ ๋ ํ ๊ฐ์: {deleted_count:,}๊ฐ (์ฝ {deleted_count/initial_count*100:.1f}% ๊ฐ์)")
+ print(f"์ต์ข
ํ์ผ ์ ์ฅ ๊ฒฝ๋ก: {os.path.abspath(output_file)}")
+ print("-" * 40)
+
+ return df
+
+
+# โโ 2๋จ๊ณ: ์ ์ ๋ CSV๋ฅผ ์ฒญํนํ์ฌ RAG์ฉ ๋ฐ์ดํฐ ์์ฑ โโ
+
+def preprocess_for_rag(input_file=CLEANED_CSV_PATH, output_csv=RAG_CSV_PATH, output_json=RAG_JSON_PATH):
+ df = pd.read_csv(input_file)
+
+ def _clean_text(text):
+ if not isinstance(text, str):
+ return ""
+ text = re.sub(r'\r\n', '\n', text)
+ text = re.sub(r'\n+', '\n', text)
+ text = re.sub(r'\s+', ' ', text)
+ return text.strip()
+
+ df['cleaned_content'] = df['content'].apply(_clean_text)
+
+ text_splitter = RecursiveCharacterTextSplitter(
+ chunk_size=CHUNK_SIZE,
+ chunk_overlap=CHUNK_OVERLAP,
+ length_function=len,
+ separators=["\n\n", "\n", ".", "?", "!", " ", ""]
+ )
+
+ processed_data = []
+ for index, row in df.iterrows():
+ if not row['cleaned_content']:
+ continue
+ chunks = text_splitter.split_text(row['cleaned_content'])
+ for i, chunk in enumerate(chunks):
+ processed_data.append({
+ "chunk_id": f"doc_{index}_chunk_{i}",
+ "title": row['title'],
+ "url": row['url'],
+ "reference_urls": row['reference_urls'],
+ "content_chunk": chunk,
+ })
+
+ final_df = pd.DataFrame(processed_data)
+ final_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
+ final_df.to_json(output_json, orient="records", force_ascii=False, indent=4)
+
+ print(f"์ ์ฒ๋ฆฌ ์๋ฃ! ์๋ณธ ๋ฌธ์ {len(df)}๊ฐ๊ฐ {len(final_df)}๊ฐ์ ์ฒญํฌ๋ก ๋ถํ ๋์ด ์ ์ฅ๋์์ต๋๋ค.")
+
+
+# โโ ํตํฉ ์คํ โโ
+
+def main():
+ result = clean_data()
+ if result is not None:
+ preprocess_for_rag()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/data_preprocessing_for_RAG.py b/data_preprocessing_for_RAG.py
deleted file mode 100644
index b616430..0000000
--- a/data_preprocessing_for_RAG.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import pandas as pd
-import re
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-
-# 1. ๋ฐ์ดํฐ ๋ก๋
-file_path = "cleaned_creation_science_data.csv"
-df = pd.read_csv(file_path)
-
-# 2. ํ
์คํธ ์ ์ ํจ์ (Cleaning)
-def clean_text(text):
- if not isinstance(text, str):
- return ""
- # ๋ถํ์ํ ์ด์ค์ผ์ดํ ๋ฌธ์๋ ๊ณผ๋ํ ๊ณต๋ฐฑ, ๊ฐํ๋ฌธ์ ์ ์
- text = re.sub(r'\r\n', '\n', text)
- text = re.sub(r'\n+', '\n', text)
- text = re.sub(r'\s+', ' ', text)
- return text.strip()
-
-# content ์ปฌ๋ผ ์ ์ ์ ์ฉ
-df['cleaned_content'] = df['content'].apply(clean_text)
-
-# 3. ํ
์คํธ ์ฒญํน (Chunking) ์ค์
-# RecursiveCharacterTextSplitter๋ ๋ฌธ๋จ -> ๋ฌธ์ฅ -> ๋จ์ด ์์ผ๋ก ๋ฌธ๋งฅ์ด ๋๊ธฐ์ง ์๊ฒ ๋ถํ ํด์ค๋๋ค.
-text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=1000, # ํ๋์ ์ฒญํฌ ํฌ๊ธฐ (๊ธ์ ์ ๊ธฐ์ค. ์๋ฒ ๋ฉ ๋ชจ๋ธ์ ๋ฐ๋ผ 500~1000 ๊ถ์ฅ)
- chunk_overlap=200, # ์ฒญํฌ ๊ฐ ๋ฌธ๋งฅ์ด ๋๊ธฐ์ง ์๋๋ก ๊ฒน์น๊ฒ ํ ๊ธ์ ์
- length_function=len,
- separators=["\n\n", "\n", ".", "?", "!", " ", ""]
-)
-
-processed_data = []
-
-# 4. ๋ฐ์ดํฐํ๋ ์ ์ํํ๋ฉฐ ์ฒญํน ๋ฐ ๋ฉํ๋ฐ์ดํฐ ๊ฒฐํฉ
-for index, row in df.iterrows():
- # ์ ์ ๋ ํ
์คํธ๊ฐ ์๋ ๊ฒฝ์ฐ ๊ฑด๋๋
- if not row['cleaned_content']:
- continue
-
- # ํ
์คํธ ๋ถํ
- chunks = text_splitter.split_text(row['cleaned_content'])
-
- for i, chunk in enumerate(chunks):
- # ๊ฐ ์ฒญํฌ ๋จ์๋ก ์๋ก์ด ๋์
๋๋ฆฌ ์์ฑ (๋ฉํ๋ฐ์ดํฐ ๋ณด์กด)
- processed_data.append({
- "chunk_id": f"doc_{index}_chunk_{i}", # ๊ณ ์ ID ๋ถ์ฌ
- "title": row['title'],
- "url": row['url'],
- "reference_urls": row['reference_urls'],
- "content_chunk": chunk # ๋ถํ ๋ ํ
์คํธ
- })
-
-# 5. ์ต์ข
๋ฐ์ดํฐํ๋ ์ ์์ฑ
-final_df = pd.DataFrame(processed_data)
-
-# 6. ์ต์ข
ํํ ์ ์ฅ (CSV ๋ฐ JSON)
-# ํ๊ธ ๊นจ์ง ๋ฐฉ์ง๋ฅผ ์ํด utf-8-sig ์ธ์ฝ๋ฉ ์ฌ์ฉ
-final_df.to_csv("rag_preprocessed_data.csv", index=False, encoding='utf-8-sig')
-
-# RAG ์์คํ
์ ๋ฐ๋ผ JSON ํํ๋ฅผ ์๊ตฌํ๋ ๊ฒฝ์ฐ๊ฐ ๋ง์ผ๋ฏ๋ก JSON์ผ๋ก๋ ์ ์ฅ
-final_df.to_json("rag_preprocessed_data.json", orient="records", force_ascii=False, indent=4)
-
-print(f"์ ์ฒ๋ฆฌ ์๋ฃ! ์๋ณธ ๋ฌธ์ {len(df)}๊ฐ๊ฐ {len(final_df)}๊ฐ์ ์ฒญํฌ๋ก ๋ถํ ๋์ด ์ ์ฅ๋์์ต๋๋ค.")
\ No newline at end of file
diff --git a/ingest_vector_db.py b/ingest_vector_db.py
index e80dab2..0bdf258 100644
--- a/ingest_vector_db.py
+++ b/ingest_vector_db.py
@@ -1,37 +1,12 @@
import argparse
-import json
from pathlib import Path
-from langchain_core.documents import Document
from langchain_chroma import Chroma
-
-# ๋ก์ปฌ ์๋ฒ ๋ฉ์ ์ํ Ollama ๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ํฌํธ
from langchain_ollama import OllamaEmbeddings
-
-# ์งํ ์ํฉ ํ์๋ฅผ ์ํ tqdm ์ํฌํธ
from tqdm import tqdm
-
-def load_documents(path: str) -> list[Document]:
- """
- ๋ฏธ๋ฆฌ ์ ์ฒ๋ฆฌ๋ ์ฐฝ์กฐ๊ณผํ JSON ๋ฐ์ดํฐ๋ฅผ ์ฝ์ด LangChain Document ๋ฆฌ์คํธ๋ก ๋ณํํฉ๋๋ค.
- """
- with open(path, "r", encoding="utf-8") as f:
- raw = json.load(f)
-
- docs = []
- for item in raw:
- # ์ด๋ฏธ ์ฒญํฌ ๋ถํ ์ด ๋์ด ์์ผ๋ฏ๋ก content_chunk๋ฅผ ๋ฉ์ธ ํ
์คํธ๋ก ์ฌ์ฉ
- content = f"์ ๋ชฉ: {item.get('title', '')}\n๋ด์ฉ: {item.get('content_chunk', '')}"
-
- # ๋ฉํ๋ฐ์ดํฐ ๊ตฌ์ฑ (์ถ์ฒ URL๊ณผ ๊ณ ์ ID ํฌํจ)
- metadata = {
- "chunk_id": item.get("chunk_id", ""),
- "title": item.get("title", ""),
- "url": item.get("url", "")
- }
- docs.append(Document(page_content=content.strip(), metadata=metadata))
- return docs
+from config import EMBEDDING_MODEL, RAG_JSON_PATH, CHROMA_DB_DIR
+from retriever import load_documents
def ingest(json_path: str, persist_directory: str, batch_size: int = 100) -> None:
@@ -47,22 +22,17 @@ def ingest(json_path: str, persist_directory: str, batch_size: int = 100) -> Non
ids = [doc.metadata.get("chunk_id") or f"doc_{i}" for i, doc in enumerate(docs)]
# 3. ๋ก์ปฌ ์๋ฒ ๋ฉ ๋ชจ๋ธ ์ค์
- print("Ollama qwen3-embedding:8b ๋ชจ๋ธ์ ์ค๋น ์ค์
๋๋ค...")
- # embedding = OllamaEmbeddings(model="bge-m3")
- embedding = OllamaEmbeddings(model="qwen3-embedding:8b")
-
- # 4. Chroma ๋ฒกํฐDB ๊ฐ์ฒด ์์ฑ (๋ฐ์ดํฐ๋ ์์ง ๋ฃ์ง ์์)
+ print(f"Ollama {EMBEDDING_MODEL} ๋ชจ๋ธ์ ์ค๋น ์ค์
๋๋ค...")
+ embedding = OllamaEmbeddings(model=EMBEDDING_MODEL)
+
+ # 4. Chroma ๋ฒกํฐDB ๊ฐ์ฒด ์์ฑ
db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
# 5. ๋ฐฐ์น ๋จ์๋ก ์๋ผ์ DB์ ์ ์ฌํ๋ฉฐ ์งํ ์ํฉ(tqdm) ํ์
print("\n๋ณธ๊ฒฉ์ ์ธ ์๋ฒ ๋ฉ ๋ฐ DB ์ ์ฌ๋ฅผ ์์ํฉ๋๋ค:")
-
- # range(0, ์ ์ฒด๊ฐ์, ๋ฐฐ์นํฌ๊ธฐ)๋ฅผ tqdm์ผ๋ก ๊ฐ์ธ์ ๋ฃจํ๋ฅผ ๋๋๋ค.
for i in tqdm(range(0, len(docs), batch_size), desc="์๋ฒ ๋ฉ ์งํ๋ฅ ", unit="batch"):
batch_docs = docs[i : i + batch_size]
batch_ids = ids[i : i + batch_size]
-
- # ๋ฐฐ์น๋งํผ DB์ ์ถ๊ฐ
db.add_documents(documents=batch_docs, ids=batch_ids)
print(f"\nโ
Ingest completed: ์ด {len(docs)}๊ฐ ์ฒญํฌ ์ ์ฅ ์๋ฃ -> {persist_directory}")
@@ -70,19 +40,18 @@ def ingest(json_path: str, persist_directory: str, batch_size: int = 100) -> Non
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
- description="์ ์ฒ๋ฆฌ๋ JSON ๋ฐ์ดํฐ๋ฅผ Chroma ๋ฒกํฐDB์ ์ ์ฌํ๋ ์คํฌ๋ฆฝํธ (์งํ ์ํฉ ํ์)"
+ description="์ ์ฒ๋ฆฌ๋ JSON ๋ฐ์ดํฐ๋ฅผ Chroma ๋ฒกํฐDB์ ์ ์ฌํ๋ ์คํฌ๋ฆฝํธ"
)
parser.add_argument(
"--json-path",
- default="rag_preprocessed_data.json",
- help="์ ์ฌํ JSON ํ์ผ ๊ฒฝ๋ก (๊ธฐ๋ณธ๊ฐ: rag_preprocessed_data.json)",
+ default=RAG_JSON_PATH,
+ help=f"์ ์ฌํ JSON ํ์ผ ๊ฒฝ๋ก (๊ธฐ๋ณธ๊ฐ: {RAG_JSON_PATH})",
)
parser.add_argument(
"--persist-directory",
- default="./chroma_db",
- help="Chroma DB ์ ์ฅ ๊ฒฝ๋ก (๊ธฐ๋ณธ๊ฐ: ./chroma_db)",
+ default=CHROMA_DB_DIR,
+ help=f"Chroma DB ์ ์ฅ ๊ฒฝ๋ก (๊ธฐ๋ณธ๊ฐ: {CHROMA_DB_DIR})",
)
- # ๋ฐฐ์น ์ฌ์ด์ฆ๋ฅผ ์ธ์๋ก ๋ฐ์ ์ ์๊ฒ ์ถ๊ฐ (๊ธฐ๋ณธ 100)
parser.add_argument(
"--batch-size",
type=int,
@@ -95,7 +64,7 @@ def parse_args() -> argparse.Namespace:
if __name__ == "__main__":
args = parse_args()
ingest(
- json_path=args.json_path,
- persist_directory=args.persist_directory,
- batch_size=args.batch_size
- )
\ No newline at end of file
+ json_path=args.json_path,
+ persist_directory=args.persist_directory,
+ batch_size=args.batch_size,
+ )
diff --git a/requirements.txt b/requirements.txt
index d231fb5..2ef4455 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,17 @@
-streamlit
-pandas
-langchain
-langchain-core
-langchain-community
-langchain-ollama
-langchain-text-splitters
-langchain-classic
-langchain-chroma
-chromadb
-tqdm
-rank_bm25
-sentence-transformers
-transformers
-torch
-bitsandbytes
-accelerate
+streamlit>=1.30
+pandas>=2.0
+langchain>=0.3
+langchain-core>=0.3
+langchain-community>=0.3
+langchain-ollama>=0.2
+langchain-text-splitters>=0.3
+langchain-classic>=0.1
+langchain-chroma>=0.2
+chromadb>=0.5
+tqdm>=4.60
+rank_bm25>=0.2
+sentence-transformers>=3.0
+transformers>=4.40
+torch>=2.0
+bitsandbytes>=0.43
+accelerate>=0.30
diff --git a/retriever.py b/retriever.py
new file mode 100644
index 0000000..f41f855
--- /dev/null
+++ b/retriever.py
@@ -0,0 +1,86 @@
+import json
+import os
+from pathlib import Path
+
+from langchain_core.documents import Document
+from langchain_chroma import Chroma
+from langchain_ollama import OllamaEmbeddings
+
+from config import (
+ RAG_JSON_PATH, CHROMA_DB_DIR, EMBEDDING_MODEL,
+ RERANKER_MODEL, RERANKER_TOP_N,
+ BM25_K, VECTOR_K, ENSEMBLE_WEIGHTS,
+)
+
+
+def load_documents(path=RAG_JSON_PATH):
+ """๋ฏธ๋ฆฌ ์ ์ฒ๋ฆฌ๋ JSON ๋ฐ์ดํฐ๋ฅผ ์ฝ์ด LangChain Document ๋ฆฌ์คํธ๋ก ๋ณํํฉ๋๋ค."""
+ if not os.path.exists(path):
+ raise FileNotFoundError(f"๋ฐ์ดํฐ ํ์ผ({path})์ด ์์ต๋๋ค.")
+ with open(path, "r", encoding="utf-8") as f:
+ raw = json.load(f)
+ docs = []
+ for item in raw:
+ content = f"์ ๋ชฉ: {item.get('title', '')}\n๋ด์ฉ: {item.get('content_chunk', '')}"
+ metadata = {
+ "chunk_id": item.get("chunk_id", ""),
+ "title": item.get("title", ""),
+ "url": item.get("url", "")
+ }
+ docs.append(Document(page_content=content.strip(), metadata=metadata))
+ return docs
+
+
+def load_vectorstore(persist_directory=CHROMA_DB_DIR):
+ """Chroma ๋ฒกํฐDB๋ฅผ ๋ก๋ํฉ๋๋ค."""
+ if not Path(persist_directory).exists():
+ raise FileNotFoundError("๋ฒกํฐDB๊ฐ ์์ง ์์ฑ๋์ง ์์์ต๋๋ค. ๋ฐ์ดํฐ๋ฅผ ๋จผ์ ์๋ฒ ๋ฉํ์ธ์.")
+ embed = OllamaEmbeddings(model=EMBEDDING_MODEL)
+ return Chroma(persist_directory=persist_directory, embedding_function=embed)
+
+
+def init_retrievers(docs, vector_db):
+ """Hybrid (BM25 + Vector) ๊ฒ์ + Cross-Encoder Reranker๋ฅผ ์ด๊ธฐํํฉ๋๋ค."""
+ import torch
+ from langchain_community.retrievers import BM25Retriever
+ from langchain_classic.retrievers import EnsembleRetriever, ContextualCompressionRetriever
+ from langchain_community.cross_encoders import HuggingFaceCrossEncoder
+ from langchain_classic.retrievers.document_compressors import CrossEncoderReranker
+
+ bm25 = BM25Retriever.from_documents(docs)
+ bm25.k = BM25_K
+ vect = vector_db.as_retriever(search_kwargs={"k": VECTOR_K})
+ hybrid = EnsembleRetriever(retrievers=[bm25, vect], weights=ENSEMBLE_WEIGHTS)
+
+ if "Qwen" in RERANKER_MODEL:
+ model_kwargs = {
+ "automodel_args": {
+ "torch_dtype": torch.float16,
+ "trust_remote_code": True,
+ }
+ }
+ else:
+ model_kwargs = {"model_kwargs": {"torch_dtype": torch.float16}}
+
+ model = HuggingFaceCrossEncoder(model_name=RERANKER_MODEL, model_kwargs=model_kwargs)
+ re_ranker = CrossEncoderReranker(model=model, top_n=RERANKER_TOP_N)
+ return ContextualCompressionRetriever(base_compressor=re_ranker, base_retriever=hybrid)
+
+
+def format_docs_and_extract_urls(docs):
+ """๊ฒ์๋ ๋ฌธ์๋ฅผ ํ
์คํธํํ๊ณ ๊ณ ์ URL ํ์ดํผ๋งํฌ๋ฅผ ์ถ์ถํฉ๋๋ค."""
+ context_parts = []
+ unique_refs = {}
+
+ for d in docs:
+ title = d.metadata.get('title', '์ ๋ชฉ ์์')
+ url = d.metadata.get('url', '')
+ context_parts.append(f"[์ถ์ฒ: {title}]\n{d.page_content}")
+ if url and url not in unique_refs:
+ unique_refs[url] = title
+
+ context_str = "\n\n---\n\n".join(context_parts)
+ ref_list = [f"- [{title}]({url})" for url, title in unique_refs.items()]
+ refs_str = "\n".join(ref_list)
+
+ return context_str, refs_str
diff --git a/text_utils.py b/text_utils.py
new file mode 100644
index 0000000..0b5c3ab
--- /dev/null
+++ b/text_utils.py
@@ -0,0 +1,29 @@
+import re
+
+
+FORBIDDEN_TAGS = [
+ "thought", "references", "conclusion", "answer",
+ "response", "output", "result", "context", "question",
+]
+
+
+def clean_response(text: str) -> str:
+ """๋ถํ์ํ XML ํ๊ทธ๋ฅผ ์ ๊ฑฐํ๊ณ ๊ณผ๋ํ ๋น ์ค์ ์ ๋ฆฌํฉ๋๋ค."""
+ for tag in FORBIDDEN_TAGS:
+        text = re.sub(rf"</?{tag}>", "", text, flags=re.IGNORECASE)
+ text = re.sub(r"\n{3,}", "\n\n", text)
+ return text.strip()
+
+
+def extract_think_and_answer(text: str):
+ """๋ฌธ์์ด์์ ๋ถ๋ถ๊ณผ ์ค์ ๋ต๋ณ ๋ถ๋ถ์ ์๋ฒฝํ๊ฒ ๋ถ๋ฆฌํฉ๋๋ค."""
+    if "<think>" in text and "</think>" in text:
+        parts = text.split("</think>", 1)
+        think_content = parts[0].split("<think>")[-1].strip()
+        answer_content = clean_response(parts[1])
+        return think_content, answer_content
+    elif "<think>" in text:
+        think_content = text.split("<think>")[-1].strip()
+ return think_content, ""
+ else:
+ return "", clean_response(text)