diff --git a/.dockerignore b/.dockerignore index 95bc6558b..ee178ab9e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -84,9 +84,10 @@ Thumbs.db .github/ # Documentation -README.md -LICENSE +# README.md and LICENSE are needed for Poetry package metadata *.md +!README.md +!LICENSE docs/ # Temporary files diff --git a/.github/workflows/docker-build-pr.yml b/.github/workflows/docker-build-pr.yml index b96a9e82a..e963d60f7 100644 --- a/.github/workflows/docker-build-pr.yml +++ b/.github/workflows/docker-build-pr.yml @@ -93,22 +93,22 @@ jobs: # Test help command echo "Testing --help command..." - docker run --rm ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} run.py --help + docker run --rm ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} vector-db-benchmark --help # Test Python environment echo "Testing Python environment..." - docker run --rm ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} -c "import sys; print(f'Python {sys.version}'); import redis; print('Redis module available')" + docker run --rm --entrypoint python ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} -c "import sys; print(f'Python {sys.version}'); import redis; print('Redis module available')" # Test Redis connectivity echo "Testing Redis connectivity..." - docker run --rm --network host ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} \ + docker run --rm --network host --entrypoint python ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} \ -c "import redis; r = redis.Redis(host='localhost', port=6379); r.ping(); print('Redis connection successful')" # Test benchmark execution with specific configuration echo "Testing benchmark execution with redis-m-16-ef-64 configuration..." mkdir -p ./test-results - docker run --rm --network host -v "$(pwd)/test-results:/app/results" ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} \ - run.py --host localhost --engines redis --dataset random-100 --experiment redis-m-16-ef-64 --skip-upload --skip-search || echo "Benchmark test completed (expected to fail without proper dataset setup)" + docker run --rm --network host -v "$(pwd)/test-results:/code/results" ${{ env.IMAGE_NAME }}:pr-${{ github.event.number }} \ + vector-db-benchmark --host localhost --engines redis --dataset random-100 --experiment redis-m-16-ef-64 --skip-upload --skip-search || echo "Benchmark test completed (expected to fail without proper dataset setup)" echo "✅ Docker image tests passed!" diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml new file mode 100644 index 000000000..9b90a3244 --- /dev/null +++ b/.github/workflows/publish-pypi.yml @@ -0,0 +1,43 @@ +name: Publish to PyPI +on: + release: + types: [published] + workflow_dispatch: # Allow manual triggering + +jobs: + pypi: + name: Publish to PyPI + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/vector-benchmark + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + version: latest + virtualenvs-create: true + virtualenvs-in-project: true + + - name: Build package + run: poetry build + + - name: Check package + run: | + pip install twine + twine check dist/* + + - name: Publish to PyPI + env: + POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }} + run: poetry publish diff --git a/Dockerfile b/Dockerfile index d2b702fc3..94fb592ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,6 +30,15 @@ WORKDIR /code # Copy dependency files first for better caching COPY poetry.lock pyproject.toml /code/ +COPY README.md /code/ + +# Copy package directories needed by Poetry +COPY benchmark /code/benchmark +COPY dataset_reader /code/dataset_reader +COPY engine /code/engine +COPY datasets /code/datasets +COPY experiments /code/experiments +COPY run.py /code/run.py # Configure Poetry and install dependencies RUN poetry config virtualenvs.create false \ @@ -38,7 +47,7 @@ RUN poetry config virtualenvs.create false \ # Install additional dependencies RUN pip install "boto3" -# Copy source code +# Copy remaining source code COPY . /code # Store Git information @@ -65,31 +74,28 @@ RUN apt-get update && apt-get install -y \ # Set working directory -WORKDIR /app +WORKDIR /code # Copy Python environment from builder COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages COPY --from=builder /usr/local/bin /usr/local/bin # Copy application code -COPY --from=builder /code /app +COPY --from=builder /code /code # Create directories with proper permissions -RUN mkdir -p /app/results /app/datasets && \ - - chmod -R 777 /app/results /app/datasets && \ - chmod -R 755 /app +RUN mkdir -p /code/results /code/datasets && \ + chmod -R 777 /code/results /code/datasets && \ + chmod -R 755 /code # Create entrypoint script to handle user permissions RUN echo '#!/bin/bash\n\ # Handle user permissions for volume mounts\n\ -if [ "$1" = "run.py" ]; then\n\ - # Ensure results directory is writable\n\ - mkdir -p /app/results\n\ - chmod 777 /app/results\n\ -fi\n\ -exec python "$@"' > /app/entrypoint.sh && \ - chmod +x /app/entrypoint.sh +# Ensure results directory is writable\n\ +mkdir -p /code/results\n\ +chmod 777 /code/results\n\ +exec "$@"' > /code/entrypoint.sh && \ + chmod +x /code/entrypoint.sh # Health check @@ -100,10 +106,8 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ EXPOSE 6379 6380 # Set entrypoint - -ENTRYPOINT ["/app/entrypoint.sh"] - +ENTRYPOINT ["/code/entrypoint.sh"] # Default command (show help) -CMD ["run.py", "--help"] +CMD ["vector-db-benchmark", "--help"] diff --git a/benchmark/dataset.py b/benchmark/dataset.py old mode 100644 new mode 100755 index 968f37d0b..11c307ad1 --- a/benchmark/dataset.py +++ b/benchmark/dataset.py @@ -179,7 +179,7 @@ def download(self): except botocore.exceptions.NoCredentialsError: print("Credentials not found, downloading without boto3") if not downloaded_withboto: - print(f"Downloading from URL {self.config.link}...") + print(f"Downloading from URL {self.config.link} to {target_path}...") tmp_path, _ = download_with_headers( self.config.link, None, show_progress ) @@ -191,7 +191,7 @@ def _download_file(self, relative_path: str, url: str): print(f"{target_path} already exists") return - print(f"Downloading from {url} to {target_path}") + print(f"Downloading from {url} to {target_path}...") tmp_path, _ = download_with_headers(url, None, show_progress) self._extract_or_move_file(tmp_path, target_path) @@ -210,7 +210,7 @@ def _extract_or_move_file(self, tmp_path, target_path): final_target_path = str(target_path)[:-4] # Remove .bz2 else: final_target_path = target_path - + with bz2.BZ2File(tmp_path, 'rb') as f_in: with open(final_target_path, 'wb') as f_out: shutil.copyfileobj(f_in, f_out) @@ -227,7 +227,7 @@ def _download_from_s3(self, link, target_path): tmp_path = f"/tmp/{os.path.basename(s3_key)}" print( - f"Downloading from S3: {link}... bucket_name={bucket_name}, s3_key={s3_key}" + f"Downloading from S3: {link} to {target_path}... (bucket={bucket_name}, key={s3_key})" ) object_info = s3.head_object(Bucket=bucket_name, Key=s3_key) total_size = object_info["ContentLength"] diff --git a/datasets/__init__.py b/datasets/__init__.py new file mode 100644 index 000000000..a84fcfd10 --- /dev/null +++ b/datasets/__init__.py @@ -0,0 +1,2 @@ +# This file makes the datasets directory a Python package + diff --git a/docker-test.sh b/docker-test.sh index db5719650..0dd4e4841 100755 --- a/docker-test.sh +++ b/docker-test.sh @@ -61,7 +61,7 @@ print_step "Testing basic functionality..." # Test help command print_info "Testing --help command..." -if docker run --rm "$FULL_IMAGE_NAME" run.py --help > /dev/null; then +if docker run --rm "$FULL_IMAGE_NAME" vector-db-benchmark --help > /dev/null; then print_info "✅ Help command works" else print_error "❌ Help command failed" @@ -70,7 +70,7 @@ fi # Test Python environment print_info "Testing Python environment..." -if docker run --rm "$FULL_IMAGE_NAME" -c "import sys; print(f'Python {sys.version}'); import redis; print('Redis module available')" > /dev/null; then +if docker run --rm --entrypoint python "$FULL_IMAGE_NAME" -c "import sys; print(f'Python {sys.version}'); import redis; print('Redis module available')" > /dev/null; then print_info "✅ Python environment works" else print_error "❌ Python environment test failed" @@ -91,14 +91,14 @@ if docker run -d --name "$REDIS_CONTAINER_NAME" -p 6379:6379 redis:8.2-rc1-bookw sleep 5 # Test basic connection - if timeout 10 docker run --rm --network=host "$FULL_IMAGE_NAME" \ + if timeout 10 docker run --rm --network=host --entrypoint python "$FULL_IMAGE_NAME" \ -c "import redis; r = redis.Redis(host='localhost', port=6379); r.ping(); print('Redis connection successful')" > /dev/null 2>&1; then print_info "✅ Redis connectivity test passed" # Test benchmark execution with specific configuration print_info "Testing benchmark execution with redis-default-simple configuration..." - if timeout 120 docker run --rm --network=host -v "$(pwd)/results:/app/results" "$FULL_IMAGE_NAME" \ - run.py --host localhost --engines redis --dataset random-100 --experiment redis-default-simple > /dev/null 2>&1; then + if timeout 120 docker run --rm --network=host -v "$(pwd)/results:/code/results" "$FULL_IMAGE_NAME" \ + vector-db-benchmark --host localhost --engines redis --dataset random-100 --experiment redis-default-simple > /dev/null 2>&1; then print_info "✅ Benchmark execution test passed" else print_warning "⚠️ Benchmark execution test failed (this may be expected without proper dataset setup)" @@ -118,8 +118,8 @@ fi # Step 4: Test file output permissions print_step "Testing file output permissions..." TEMP_DIR=$(mktemp -d) -if docker run --rm -v "$TEMP_DIR:/app/results" "$FULL_IMAGE_NAME" \ - -c "import os; os.makedirs('/app/results', exist_ok=True); open('/app/results/test.txt', 'w').write('test'); print('File write successful')" > /dev/null 2>&1; then +if docker run --rm -v "$TEMP_DIR:/code/results" --entrypoint python "$FULL_IMAGE_NAME" \ + -c "import os; os.makedirs('/code/results', exist_ok=True); open('/code/results/test.txt', 'w').write('test'); print('File write successful')" > /dev/null 2>&1; then if [ -f "$TEMP_DIR/test.txt" ]; then print_info "✅ File output test passed" else @@ -137,8 +137,8 @@ print_info "Image size: $IMAGE_SIZE" # Step 6: Test benchmark configuration loading print_step "Testing benchmark configuration loading..." -if docker run --rm "$FULL_IMAGE_NAME" \ - -c "import json; import os; print('Configuration loading test'); print(os.listdir('/app'))" > /dev/null 2>&1; then +if docker run --rm --entrypoint python "$FULL_IMAGE_NAME" \ + -c "import json; import os; print('Configuration loading test'); print(os.listdir('/code'))" > /dev/null 2>&1; then print_info "✅ Configuration loading test passed" else print_warning "⚠️ Configuration loading test completed with warnings" diff --git a/experiments/__init__.py b/experiments/__init__.py new file mode 100644 index 000000000..e576b6ca0 --- /dev/null +++ b/experiments/__init__.py @@ -0,0 +1,2 @@ +# This file makes the experiments directory a Python package + diff --git a/experiments/configurations/__init__.py b/experiments/configurations/__init__.py new file mode 100644 index 000000000..385ee10fd --- /dev/null +++ b/experiments/configurations/__init__.py @@ -0,0 +1,2 @@ +# This file makes the configurations directory a Python package + diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 index 538c2cba2..760175b96 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,38 @@ [tool.poetry] -name = "vector-db-benchmark" +name = "vector-benchmark" version = "0.1.0" -description = "" -authors = ["Kacper Łukawski "] +description = "Benchmark suite for vector databases with Redis support. Forked from the original vector-db-benchmark project." +authors = ["Redis Performance Team "] +readme = "README.md" +license = "LICENSE" +homepage = "https://github.com/redislabs/vector-db-benchmark" +repository = "https://github.com/redislabs/vector-db-benchmark" +keywords = ["vector", "database", "benchmark", "redis", "similarity-search"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Topic :: Software Development :: Testing", + "Topic :: Database", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +packages = [ + { include = "benchmark" }, + { include = "dataset_reader" }, + { include = "engine" }, +] +include = [ + "run.py", + "datasets/__init__.py", + "datasets/datasets.json", + "datasets/random-100/**/*", + "experiments/__init__.py", + "experiments/configurations/**/*", +] [tool.poetry.dependencies] python = ">=3.9,<3.14" @@ -29,6 +59,9 @@ boto3 = "^1.39.4" pre-commit = "^2.20.0" pytest = "^7.1" +[tool.poetry.scripts] +vector-db-benchmark = "run:app" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/run.py b/run.py old mode 100644 new mode 100755 index b155a31de..978cdeed9 --- a/run.py +++ b/run.py @@ -83,16 +83,15 @@ def run( verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed information when using --describe"), ): """ + Run vector database benchmarks. + Examples: - # Use pattern matching to select engines (original behavior) - python3 run.py --engines *-m-16-* --engines qdrant-* --datasets glove-* - - # Use engines from a specific JSON file - python3 run.py --engines-file my_engines.json --datasets glove-* - - # Describe available options - python3 run.py --describe datasets - python3 run.py --describe engines --verbose + + vector-db-benchmark --engines redis-* --datasets glove-* + + vector-db-benchmark --describe datasets + + vector-db-benchmark --describe engines --verbose """ # Handle describe option first if describe: