From f7548036cb035a7ffe67180751f9c7a612d01f51 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 11 Feb 2026 16:51:27 +0000 Subject: [PATCH 1/8] Add PyPI publishing support - Configure pyproject.toml for PyPI package 'vector-benchmark' - Add console script entry point 'vector-db-benchmark' - Add GitHub Actions workflow for automated PyPI publishing - Update help text examples to use 'vector-db-benchmark' and 'redis-*' - Add __init__.py files to datasets and experiments for proper packaging - Improve download messages to show destination folder - Add Redis Performance Team to authors - Add package metadata: description, keywords, classifiers, homepage, etc. --- .github/workflows/publish-pypi.yml | 43 ++++++++++++++++++++++++++ benchmark/dataset.py | 14 +++++++-- datasets/__init__.py | 2 ++ experiments/__init__.py | 2 ++ experiments/configurations/__init__.py | 2 ++ pyproject.toml | 34 ++++++++++++++++++-- run.py | 13 +++++--- 7 files changed, 100 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/publish-pypi.yml mode change 100644 => 100755 benchmark/dataset.py create mode 100644 datasets/__init__.py create mode 100644 experiments/__init__.py create mode 100644 experiments/configurations/__init__.py mode change 100644 => 100755 pyproject.toml mode change 100644 => 100755 run.py diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml new file mode 100644 index 000000000..9b90a3244 --- /dev/null +++ b/.github/workflows/publish-pypi.yml @@ -0,0 +1,43 @@ +name: Publish to PyPI +on: + release: + types: [published] + workflow_dispatch: # Allow manual triggering + +jobs: + pypi: + name: Publish to PyPI + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/vector-benchmark + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + version: latest + virtualenvs-create: true + virtualenvs-in-project: true + + - name: Build package + run: poetry build + + - name: Check package + run: | + pip install twine + twine check dist/* + + - name: Publish to PyPI + env: + POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }} + run: poetry publish diff --git a/benchmark/dataset.py b/benchmark/dataset.py old mode 100644 new mode 100755 index 3b992ecb1..fe07f3da1 --- a/benchmark/dataset.py +++ b/benchmark/dataset.py @@ -178,7 +178,7 @@ def download(self): except botocore.exceptions.NoCredentialsError: print("Credentials not found, downloading without boto3") if not downloaded_withboto: - print(f"Downloading from URL {self.config.link}...") + print(f"Downloading from URL {self.config.link} to {target_path}...") tmp_path, _ = download_with_headers( self.config.link, None, show_progress ) @@ -190,7 +190,7 @@ def _download_file(self, relative_path: str, url: str): print(f"{target_path} already exists") return - print(f"Downloading from {url} to {target_path}") + print(f"Downloading from {url} to {target_path}...") tmp_path, _ = download_with_headers(url, None, show_progress) self._extract_or_move_file(tmp_path, target_path) @@ -201,6 +201,14 @@ def _extract_or_move_file(self, tmp_path, target_path): with tarfile.open(tmp_path) as file: file.extractall(target_path) os.remove(tmp_path) + elif tmp_path.endswith(".bz2"): + print(f"Decompressing bz2: {tmp_path} -> {target_path}") + import bz2 + Path(target_path).parent.mkdir(exist_ok=True) + with bz2.BZ2File(tmp_path, 'rb') as f_in: + with open(target_path, 'wb') as f_out: + f_out.write(f_in.read()) + os.remove(tmp_path) else: print(f"Moving: {tmp_path} -> {target_path}") Path(target_path).parent.mkdir(exist_ok=True) @@ -213,7 +221,7 @@ def _download_from_s3(self, link, target_path): tmp_path = f"/tmp/{os.path.basename(s3_key)}" print( - f"Downloading from S3: {link}... bucket_name={bucket_name}, s3_key={s3_key}" + f"Downloading from S3: {link} to {target_path}... (bucket={bucket_name}, key={s3_key})" ) object_info = s3.head_object(Bucket=bucket_name, Key=s3_key) total_size = object_info["ContentLength"] diff --git a/datasets/__init__.py b/datasets/__init__.py new file mode 100644 index 000000000..a84fcfd10 --- /dev/null +++ b/datasets/__init__.py @@ -0,0 +1,2 @@ +# This file makes the datasets directory a Python package + diff --git a/experiments/__init__.py b/experiments/__init__.py new file mode 100644 index 000000000..e576b6ca0 --- /dev/null +++ b/experiments/__init__.py @@ -0,0 +1,2 @@ +# This file makes the experiments directory a Python package + diff --git a/experiments/configurations/__init__.py b/experiments/configurations/__init__.py new file mode 100644 index 000000000..385ee10fd --- /dev/null +++ b/experiments/configurations/__init__.py @@ -0,0 +1,2 @@ +# This file makes the configurations directory a Python package + diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 index 0109cfb80..ef48bf06f --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,33 @@ [tool.poetry] -name = "vector-db-benchmark" +name = "vector-benchmark" version = "0.1.0" -description = "" -authors = ["Kacper Łukawski "] +description = "Benchmark suite for vector databases with Redis support" +authors = ["Kacper Łukawski ", "Redis Performance Team "] +readme = "README.md" +license = "LICENSE" +homepage = "https://github.com/redislabs/vector-db-benchmark" +repository = "https://github.com/redislabs/vector-db-benchmark" +keywords = ["vector", "database", "benchmark", "redis", "similarity-search"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Topic :: Software Development :: Testing", + "Topic :: Database", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +packages = [ + { include = "benchmark" }, + { include = "dataset_reader" }, + { include = "engine" }, + { include = "datasets" }, + { include = "experiments" }, + { include = "run.py" }, +] [tool.poetry.dependencies] python = ">=3.9,<3.14" @@ -29,6 +54,9 @@ boto3 = "^1.39.4" pre-commit = "^2.20.0" pytest = "^7.1" +[tool.poetry.scripts] +vector-db-benchmark = "run:app" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/run.py b/run.py old mode 100644 new mode 100755 index 0a1bd8767..b5eba3313 --- a/run.py +++ b/run.py @@ -36,10 +36,15 @@ def run( verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed information when using --describe"), ): """ - Example: - python3 run.py --engines *-m-16-* --engines qdrant-* --datasets glove-* - python3 run.py --describe datasets - python3 run.py --describe engines --verbose + Run vector database benchmarks. + + Examples: + + vector-db-benchmark --engines redis-* --datasets glove-* + + vector-db-benchmark --describe datasets + + vector-db-benchmark --describe engines --verbose """ # Handle describe option first if describe: From 6dbff9c9bc10ea439ff20ba0c0fb666ac42792b7 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 11 Feb 2026 17:18:02 +0000 Subject: [PATCH 2/8] Fix pyproject.toml package configuration for Docker builds - Move run.py from packages to include list - run.py is a module file, not a package directory - Fixes Poetry error: '/code/benchmark does not contain any element' --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7436255de..53320650b 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,8 @@ packages = [ { include = "engine" }, { include = "datasets" }, { include = "experiments" }, - { include = "run.py" }, ] +include = ["run.py"] [tool.poetry.dependencies] python = ">=3.9,<3.14" From e8dc0121c9b193a6cbcacca2a93972e21767ec9b Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 11 Feb 2026 17:36:02 +0000 Subject: [PATCH 3/8] Fix Docker build for PyPI package structure - Update pyproject.toml to properly separate packages from data files - Move datasets and experiments from packages to include list - Specify exact files to include to avoid large dataset files - Update Dockerfile to copy source files before poetry install - Fix .dockerignore to allow README.md and LICENSE (needed by Poetry) - Fix empty continuation line warning in Dockerfile - Verified Docker build succeeds and container runs correctly --- .dockerignore | 6 ++++-- Dockerfile | 12 ++++++++++-- pyproject.toml | 11 ++++++++--- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/.dockerignore b/.dockerignore index 95bc6558b..94e8dbe26 100644 --- a/.dockerignore +++ b/.dockerignore @@ -84,9 +84,11 @@ Thumbs.db .github/ # Documentation -README.md -LICENSE +# README.md and LICENSE are needed for Poetry package metadata +!README.md +!LICENSE *.md +!README.md docs/ # Temporary files diff --git a/Dockerfile b/Dockerfile index d2b702fc3..c9d3d606e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,6 +30,15 @@ WORKDIR /code # Copy dependency files first for better caching COPY poetry.lock pyproject.toml /code/ +COPY README.md /code/ + +# Copy package directories needed by Poetry +COPY benchmark /code/benchmark +COPY dataset_reader /code/dataset_reader +COPY engine /code/engine +COPY datasets /code/datasets +COPY experiments /code/experiments +COPY run.py /code/run.py # Configure Poetry and install dependencies RUN poetry config virtualenvs.create false \ @@ -38,7 +47,7 @@ RUN poetry config virtualenvs.create false \ # Install additional dependencies RUN pip install "boto3" -# Copy source code +# Copy remaining source code COPY . /code # Store Git information @@ -76,7 +85,6 @@ COPY --from=builder /code /app # Create directories with proper permissions RUN mkdir -p /app/results /app/datasets && \ - chmod -R 777 /app/results /app/datasets && \ chmod -R 755 /app diff --git a/pyproject.toml b/pyproject.toml index 53320650b..a65bed57b 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,10 +24,15 @@ packages = [ { include = "benchmark" }, { include = "dataset_reader" }, { include = "engine" }, - { include = "datasets" }, - { include = "experiments" }, ] -include = ["run.py"] +include = [ + "run.py", + "datasets/__init__.py", + "datasets/datasets.json", + "datasets/random-100/**/*", + "experiments/__init__.py", + "experiments/configurations/**/*", +] [tool.poetry.dependencies] python = ">=3.9,<3.14" From 91ec2c25ae2590af344b9484011349717c65088f Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 11 Feb 2026 17:40:43 +0000 Subject: [PATCH 4/8] Update Dockerfile to use vector-db-benchmark command - Change working directory from /app to /code for consistency with Poetry .pth file - Update entrypoint script to use /code paths - Change default CMD from 'run.py' to 'vector-db-benchmark' command - Simplify entrypoint to just ensure results directory permissions - Now uses the Poetry-installed console script instead of direct Python execution --- Dockerfile | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index c9d3d606e..94fb592ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -74,30 +74,28 @@ RUN apt-get update && apt-get install -y \ # Set working directory -WORKDIR /app +WORKDIR /code # Copy Python environment from builder COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages COPY --from=builder /usr/local/bin /usr/local/bin # Copy application code -COPY --from=builder /code /app +COPY --from=builder /code /code # Create directories with proper permissions -RUN mkdir -p /app/results /app/datasets && \ - chmod -R 777 /app/results /app/datasets && \ - chmod -R 755 /app +RUN mkdir -p /code/results /code/datasets && \ + chmod -R 777 /code/results /code/datasets && \ + chmod -R 755 /code # Create entrypoint script to handle user permissions RUN echo '#!/bin/bash\n\ # Handle user permissions for volume mounts\n\ -if [ "$1" = "run.py" ]; then\n\ - # Ensure results directory is writable\n\ - mkdir -p /app/results\n\ - chmod 777 /app/results\n\ -fi\n\ -exec python "$@"' > /app/entrypoint.sh && \ - chmod +x /app/entrypoint.sh +# Ensure results directory is writable\n\ +mkdir -p /code/results\n\ +chmod 777 /code/results\n\ +exec "$@"' > /code/entrypoint.sh && \ + chmod +x /code/entrypoint.sh # Health check @@ -108,10 +106,8 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ EXPOSE 6379 6380 # Set entrypoint - -ENTRYPOINT ["/app/entrypoint.sh"] - +ENTRYPOINT ["/code/entrypoint.sh"] # Default command (show help) -CMD ["run.py", "--help"] +CMD ["vector-db-benchmark", "--help"] From 5a0769cb694c96b4f17a2c327209d6b2d7622370 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 11 Feb 2026 18:14:40 +0000 Subject: [PATCH 5/8] Update test scripts to use vector-db-benchmark command - Update GitHub Actions workflow to use vector-db-benchmark instead of run.py - Update docker-test.sh to use vector-db-benchmark command - Add --entrypoint python for Python-specific tests - Update volume mount paths from /app to /code - Ensures tests work with Poetry-installed console script --- .github/workflows/docker-build-pr.yml | 10 +++++----- docker-test.sh | 18 +++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/docker-build-pr.yml b/.github/workflows/docker-build-pr.yml index b96a9e82a..e963d60f7 100644 --- a/.github/workflows/docker-build-pr.yml +++ b/.github/workflows/docker-build-pr.yml @@ -93,22 +93,22 @@ jobs: # Test help command echo "Testing --help command..." - docker run --rm ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} run.py --help + docker run --rm ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} vector-db-benchmark --help # Test Python environment echo "Testing Python environment..." - docker run --rm ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} -c "import sys; print(f'Python {sys.version}'); import redis; print('Redis module available')" + docker run --rm --entrypoint python ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} -c "import sys; print(f'Python {sys.version}'); import redis; print('Redis module available')" # Test Redis connectivity echo "Testing Redis connectivity..." - docker run --rm --network host ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} \ + docker run --rm --network host --entrypoint python ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} \ -c "import redis; r = redis.Redis(host='localhost', port=6379); r.ping(); print('Redis connection successful')" # Test benchmark execution with specific configuration echo "Testing benchmark execution with redis-m-16-ef-64 configuration..." mkdir -p ./test-results - docker run --rm --network host -v "$(pwd)/test-results:/app/results" ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} \ - run.py --host localhost --engines redis --dataset random-100 --experiment redis-m-16-ef-64 --skip-upload --skip-search || echo "Benchmark test completed (expected to fail without proper dataset setup)" + docker run --rm --network host -v "$(pwd)/test-results:/code/results" ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} \ + vector-db-benchmark --host localhost --engines redis --dataset random-100 --experiment redis-m-16-ef-64 --skip-upload --skip-search || echo "Benchmark test completed (expected to fail without proper dataset setup)" echo "✅ Docker image tests passed!" diff --git a/docker-test.sh b/docker-test.sh index db5719650..0dd4e4841 100755 --- a/docker-test.sh +++ b/docker-test.sh @@ -61,7 +61,7 @@ print_step "Testing basic functionality..." # Test help command print_info "Testing --help command..." -if docker run --rm "$FULL_IMAGE_NAME" run.py --help > /dev/null; then +if docker run --rm "$FULL_IMAGE_NAME" vector-db-benchmark --help > /dev/null; then print_info "✅ Help command works" else print_error "❌ Help command failed" @@ -70,7 +70,7 @@ fi # Test Python environment print_info "Testing Python environment..." -if docker run --rm "$FULL_IMAGE_NAME" -c "import sys; print(f'Python {sys.version}'); import redis; print('Redis module available')" > /dev/null; then +if docker run --rm --entrypoint python "$FULL_IMAGE_NAME" -c "import sys; print(f'Python {sys.version}'); import redis; print('Redis module available')" > /dev/null; then print_info "✅ Python environment works" else print_error "❌ Python environment test failed" @@ -91,14 +91,14 @@ if docker run -d --name "$REDIS_CONTAINER_NAME" -p 6379:6379 redis:8.2-rc1-bookw sleep 5 # Test basic connection - if timeout 10 docker run --rm --network=host "$FULL_IMAGE_NAME" \ + if timeout 10 docker run --rm --network=host --entrypoint python "$FULL_IMAGE_NAME" \ -c "import redis; r = redis.Redis(host='localhost', port=6379); r.ping(); print('Redis connection successful')" > /dev/null 2>&1; then print_info "✅ Redis connectivity test passed" # Test benchmark execution with specific configuration print_info "Testing benchmark execution with redis-default-simple configuration..." - if timeout 120 docker run --rm --network=host -v "$(pwd)/results:/app/results" "$FULL_IMAGE_NAME" \ - run.py --host localhost --engines redis --dataset random-100 --experiment redis-default-simple > /dev/null 2>&1; then + if timeout 120 docker run --rm --network=host -v "$(pwd)/results:/code/results" "$FULL_IMAGE_NAME" \ + vector-db-benchmark --host localhost --engines redis --dataset random-100 --experiment redis-default-simple > /dev/null 2>&1; then print_info "✅ Benchmark execution test passed" else print_warning "⚠️ Benchmark execution test failed (this may be expected without proper dataset setup)" @@ -118,8 +118,8 @@ fi # Step 4: Test file output permissions print_step "Testing file output permissions..." TEMP_DIR=$(mktemp -d) -if docker run --rm -v "$TEMP_DIR:/app/results" "$FULL_IMAGE_NAME" \ - -c "import os; os.makedirs('/app/results', exist_ok=True); open('/app/results/test.txt', 'w').write('test'); print('File write successful')" > /dev/null 2>&1; then +if docker run --rm -v "$TEMP_DIR:/code/results" --entrypoint python "$FULL_IMAGE_NAME" \ + -c "import os; os.makedirs('/code/results', exist_ok=True); open('/code/results/test.txt', 'w').write('test'); print('File write successful')" > /dev/null 2>&1; then if [ -f "$TEMP_DIR/test.txt" ]; then print_info "✅ File output test passed" else @@ -137,8 +137,8 @@ print_info "Image size: $IMAGE_SIZE" # Step 6: Test benchmark configuration loading print_step "Testing benchmark configuration loading..." -if docker run --rm "$FULL_IMAGE_NAME" \ - -c "import json; import os; print('Configuration loading test'); print(os.listdir('/app'))" > /dev/null 2>&1; then +if docker run --rm --entrypoint python "$FULL_IMAGE_NAME" \ + -c "import json; import os; print('Configuration loading test'); print(os.listdir('/code'))" > /dev/null 2>&1; then print_info "✅ Configuration loading test passed" else print_warning "⚠️ Configuration loading test completed with warnings" From 867c4636bda877d962b3aef5c1843a0bc9869599 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 11 Feb 2026 21:31:12 +0000 Subject: [PATCH 6/8] Update PyPI package metadata to avoid trademark concerns - Remove original author from PyPI authors field - Update description to mention this is a fork - Keeps Redis Performance Team as the package maintainer - Original attribution preserved in LICENSE and git history per Apache 2.0 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a65bed57b..760175b96 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [tool.poetry] name = "vector-benchmark" version = "0.1.0" -description = "Benchmark suite for vector databases with Redis support" -authors = ["Kacper Łukawski ", "Redis Performance Team "] +description = "Benchmark suite for vector databases with Redis support. Forked from the original vector-db-benchmark project." +authors = ["Redis Performance Team "] readme = "README.md" license = "LICENSE" homepage = "https://github.com/redislabs/vector-db-benchmark" From b19fea2e143c444dc4d510a1acfe1db1ba43cac6 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 11 Feb 2026 21:33:07 +0000 Subject: [PATCH 7/8] Fixes per PR review: removed duplicate --- .dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 94e8dbe26..2e64290e4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -88,7 +88,6 @@ Thumbs.db !README.md !LICENSE *.md -!README.md docs/ # Temporary files From 39c34813821bf74661f55f2af68edbf42989cb77 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 11 Feb 2026 21:36:46 +0000 Subject: [PATCH 8/8] Fixed dockerignore order --- .dockerignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 2e64290e4..ee178ab9e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -85,9 +85,9 @@ Thumbs.db # Documentation # README.md and LICENSE are needed for Poetry package metadata +*.md !README.md !LICENSE -*.md docs/ # Temporary files