From 7d63bbfdb4467551fee6105aebf77f5658d0e2d0 Mon Sep 17 00:00:00 2001 From: zhizhi <928570418@qq.com> Date: Fri, 6 Feb 2026 09:28:47 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20the=20data=20processing?= =?UTF-8?q?=20image=20that=20lacked=20tiktoken=20word=20segmentation=20fil?= =?UTF-8?q?es.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pyproject.toml | 3 ++- make/data_process/Dockerfile | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index af0d10ef4..65e27107a 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -25,7 +25,8 @@ data-process = [ "celery>=5.3.6", "flower>=2.0.1", "nest_asyncio>=1.5.6", - "unstructured[csv,docx,pdf,pptx,xlsx,md]==0.18.14" + "unstructured[csv,docx,pdf,pptx,xlsx,md]==0.18.14", + "huggingface_hub>=0.19.0,<0.21.0" ] test = [ "pytest", diff --git a/make/data_process/Dockerfile b/make/data_process/Dockerfile index e7550fe2d..35d7a6c48 100644 --- a/make/data_process/Dockerfile +++ b/make/data_process/Dockerfile @@ -42,6 +42,9 @@ RUN uv sync --no-cache-dir --extra data-process $(test -n "$MIRROR" && echo "-i COPY sdk /opt/sdk RUN uv pip install --no-cache-dir /opt/sdk $(test -n "$MIRROR" && echo "-i $MIRROR") && \ uv cache clean + +# Pre-download tiktoken cl100k_base model to avoid network issues during runtime +RUN uv run python -c "import tiktoken; enc = tiktoken.get_encoding('cl100k_base')" # Layer 3: copy backend code COPY backend /opt/backend