diff --git a/contrib/Makefile b/contrib/Makefile index abd780f2774..f3bce58556d 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -49,7 +49,8 @@ SUBDIRS = \ tsm_system_rows \ tsm_system_time \ unaccent \ - vacuumlo + vacuumlo \ + orioledb ifeq ($(with_ssl),openssl) SUBDIRS += pgcrypto sslinfo diff --git a/contrib/orioledb/.dockerignore b/contrib/orioledb/.dockerignore new file mode 100644 index 00000000000..abf63d2d8f5 --- /dev/null +++ b/contrib/orioledb/.dockerignore @@ -0,0 +1,100 @@ +# Exclude files and directories to minimize +# the Docker build context size. +# This practice limits the scope of COPY commands in the Dockerfile. +# When not using multi-layer builds, it can effectively reduce the final image size. + +# The first part of this file should be the same as the .gitignore file +# The second part is the extra not needed content for .dockerignore + +######################################################## +# first part: .gitignore contents +######################################################## + +# Global excludes across all subdirectories +**/*.o +**/*.obj +**/*.bc +**/*.so +**/*.so.[0-9] +**/*.so.[0-9].[0-9] +**/*.so.[0-9].[0-9][0-9] +**/*.sl +**/*.sl.[0-9] +**/*.sl.[0-9].[0-9] +**/*.sl.[0-9].[0-9][0-9] +**/*.dylib +**/*.dll +**/*.exp +**/*.a +**/*.mo +**/*.pot +**/objfiles.txt +**/.deps/ +**/*.gcno +**/*.gcda +**/*.gcov +**/*.gcov.out +**/lcov*.info +**/coverage/ +**/coverage-html-stamp +**/*.vcproj +**/*.vcxproj +**/win32ver.rc +**/*.exe +**/lib*dll.def +**/lib*.pc + +# Local excludes in root directory +test/t/__pycache__/ +test/__pycache__/ +test/log/ +log_docker_build/ +test/results/ +test/tmp_check/ +test/tmp_check_iso/ +test/output_iso/ +include/utils/stopevents_defs.h +include/utils/stopevents_data.h +orioledb.typedefs +ci/antithesis + +# Ignore generated scripts +sql/orioledb--1.0.sql +sql/orioledb--1.4--1.5.sql +sql/orioledb--1.5--1.6.sql +sql/orioledb--1.6--1.7.sql + +####################################################### +# 
second part: extra .dockerignore contents +####################################################### + +# Exclude version control and continuous integration (CI) directories +.git +.github +.gitattributes +.gitignore +.style.yapf + +# Exclude Dockerfiles +docker/Dockerfile +docker/Dockerfile.ubuntu + +# Exclude OrioleDB Docker test definitions and code +# as they are not needed inside the Docker image. +test/ +ci/local_docker_matrix.sh +ci/docker_matrix.sh + +# Documentation files, which are not needed inside the Docker image. +doc/ +**/*.md + +# Exclude some files that are not needed inside the Docker image. +# but sometimes left in the directory +docker-postgis +wal2json* +**/*.log +**/_*.* + +# Misc +make.flags diff --git a/contrib/orioledb/.gitattributes b/contrib/orioledb/.gitattributes new file mode 100644 index 00000000000..e9eefdb3ac8 --- /dev/null +++ b/contrib/orioledb/.gitattributes @@ -0,0 +1 @@ +*.svg -diff diff --git a/contrib/orioledb/.github/FUNDING.yml b/contrib/orioledb/.github/FUNDING.yml new file mode 100644 index 00000000000..60e6f50e23c --- /dev/null +++ b/contrib/orioledb/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding model platforms + +github: [orioledb] diff --git a/contrib/orioledb/.github/workflows/antithesis.yml b/contrib/orioledb/.github/workflows/antithesis.yml new file mode 100644 index 00000000000..bf77a9981d5 --- /dev/null +++ b/contrib/orioledb/.github/workflows/antithesis.yml @@ -0,0 +1,170 @@ +name: antithesis + +on: + workflow_dispatch: + inputs: + long: + description: "3 hours" + required: true + type: boolean + default: true + +env: + ANTITHESIS_REPOSITORY: https://us-central1-docker.pkg.dev + ORIOLEDB_REPOSITORY: us-central1-docker.pkg.dev/molten-verve-216720/orioledb-repository + +jobs: + config_build_push: + name: Build config docker images and push to Antithesis repository + runs-on: ubuntu-24.04 + steps: + - name: Checkout extension code into workspace directory + uses: actions/checkout@v6 + + - name: 
Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to Antithesis Registry + uses: docker/login-action@v2 + with: + registry: ${{ env.ANTITHESIS_REPOSITORY }} + username: _json_key + password: ${{ secrets.ANTITHESIS_JSON_KEY }} + + - name: Build regression/isolation config docker image and push to Antithesis repository + uses: docker/build-push-action@v3 + with: + context: . + platforms: linux/amd64 + cache-from: type=gha + cache-to: type=gha,mode=max + push: true + file: ci/antithesis/Dockerfile.regress_config + tags: | + ${{ env.ORIOLEDB_REPOSITORY }}/orioledb-config:antithesis-latest + build-args: | + PGTAG=${{ env.PGTAG }} + + - name: Build testgres config docker image and push to Antithesis repository + uses: docker/build-push-action@v3 + with: + context: . + platforms: linux/amd64 + cache-from: type=gha + cache-to: type=gha,mode=max + push: true + file: ci/antithesis/Dockerfile.testgres_config + tags: | + ${{ env.ORIOLEDB_REPOSITORY }}/orioledb-config:antithesis-testgres-latest + build-args: | + PGTAG=${{ env.PGTAG }} + app_build_push: + name: Build app/workload docker images and push to Antithesis repository + runs-on: ubuntu-24.04 + strategy: + fail-fast: true + matrix: + pg_version: [17] + steps: + - name: Checkout extension code into workspace directory + uses: actions/checkout@v6 + + - name: Get the required tag name + shell: bash + run: | + echo "PGTAG=$(grep '^${{ matrix.pg_version }}: ' .pgtags | cut -d' ' -f2-)" >> $GITHUB_ENV + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to Antithesis Registry + uses: docker/login-action@v2 + with: + registry: ${{ env.ANTITHESIS_REPOSITORY }} + username: _json_key + password: ${{ secrets.ANTITHESIS_JSON_KEY }} + + - name: Build regression/isolation app docker image and push to Antithesis repository + uses: docker/build-push-action@v3 + with: + context: . 
+ platforms: linux/amd64 + cache-from: type=gha + cache-to: type=gha,mode=max + push: true + file: ci/antithesis/Dockerfile.regress_app + tags: | + ${{ env.ORIOLEDB_REPOSITORY }}/orioledb:antithesis-pg${{ matrix.pg_version }}-latest + build-args: | + PGTAG=${{ env.PGTAG }} + + - name: Build regression/isolation workload docker image and push to Antithesis repository + uses: docker/build-push-action@v3 + with: + context: . + platforms: linux/amd64 + cache-from: type=gha + cache-to: type=gha,mode=max + push: true + file: ci/antithesis/Dockerfile.regress_workload + tags: | + ${{ env.ORIOLEDB_REPOSITORY }}/orioledb-workload:antithesis-pg${{ matrix.pg_version }}-latest + build-args: | + PGTAG=${{ env.PGTAG }} + + - name: Build testgres app/workload docker image and push to Antithesis repository + uses: docker/build-push-action@v3 + with: + context: . + platforms: linux/amd64 + cache-from: type=gha + cache-to: type=gha,mode=max + push: true + file: ci/antithesis/Dockerfile.testgres_app_workload + tags: | + ${{ env.ORIOLEDB_REPOSITORY }}/orioledb:antithesis-testgres-pg${{ matrix.pg_version }}-latest + build-args: | + PGTAG=${{ env.PGTAG }} + + regress_webhook: + name: Run regression/isolation tests with fault injection to test system resiliency + runs-on: ubuntu-24.04 + needs: + - config_build_push + - app_build_push + strategy: + fail-fast: true + matrix: + pg_version: [13, 14, 15] + steps: + - name: determine endpoint + run: | + if [ '${{ inputs.long == true }}' = 'true' ]; then + echo "ENDPOINT='fault-tolerance-test__orioledb__network-faults__antithesis-pg'" >> $GITHUB_ENV + else + echo "ENDPOINT='fault-tolerance-test__orioledb-short__no-faults__antithesis-pg'" >> $GITHUB_ENV + fi + - name: fault-tolerance-test regress + run: | + curl -X POST https://orioledb.antithesis.com/api/v1/launch_experiment/${{ env.ENDPOINT }}${{ matrix.pg_version }}-latest -u '${{ secrets.ANTITHESIS_API_USER }}' + testgres_webhook: + name: Run randomized testgres tests without any test harness 
+ runs-on: ubuntu-24.04 + needs: + - config_build_push + - app_build_push + strategy: + fail-fast: true + matrix: + pg_version: [13, 14, 15] + steps: + - name: determine endpoint + run: | + if [ '${{ inputs.long == true }}' = 'true' ]; then + echo "ENDPOINT='fault-tolerance-test__orioledb__thread-pause__antithesis-testgres-pg'" >> $GITHUB_ENV + else + echo "ENDPOINT='fault-tolerance-test__orioledb-short__no-faults__antithesis-testgres-pg'" >> $GITHUB_ENV + fi + - name: fault-tolerance-test testgres + run: | + curl -X POST https://orioledb.antithesis.com/api/v1/launch_experiment/${{ env.ENDPOINT }}${{ matrix.pg_version }}-latest -u '${{ secrets.ANTITHESIS_API_USER }}' diff --git a/contrib/orioledb/.github/workflows/benchmark.yml b/contrib/orioledb/.github/workflows/benchmark.yml new file mode 100644 index 00000000000..5e7cfaa8094 --- /dev/null +++ b/contrib/orioledb/.github/workflows/benchmark.yml @@ -0,0 +1,125 @@ +name: benchmark + +on: + workflow_dispatch: + inputs: + instance_type: + description: "EC2 Instance Type" + required: true + default: "c5d.metal" + pg_version: + description: "PostgreSQL version" + required: true + default: "16" + compiler: + description: "Compiler" + required: true + default: "clang" + args: + description: "Benchmark script args" + required: true + default: "--shared_buffers=32GB --max_wal_size=4GB --clients=160,180,200,220,240 --max_connections=300 --time=60 --engines=orioledb,builtin --tests=read-only-9,read-write-proc --scale=1000 --base_dir=/mnt" +jobs: + start-runner: + name: Start self-hosted EC2 runner + runs-on: ubuntu-latest + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + - name: Start 
EC2 runner + id: start-ec2-runner + uses: orioledb/ec2-github-runner@v3 + with: + mode: start + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + runner-user: ubuntu + ec2-image-id: ami-02e2a649d220de1f5 + ec2-instance-type: ${{ github.event.inputs.instance_type }} + subnet-id: subnet-04b4e8ee77472631f + security-group-id: sg-0294fb158210dc0df + aws-resource-tags: > # optional, requires additional permissions + [ + {"Key": "Name", "Value": "ec2-github-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"} + ] + run-benchmark: + name: Do the job on the runner + needs: start-runner # required to start the main job when the runner is ready + runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner + strategy: + fail-fast: false + env: + LLVM_VER: 18 + CHECK_TYPE: debug + COMPILER: ${{ github.event.inputs.compiler }} + steps: + - name: Checkout extension code into workspace directory + uses: actions/checkout@v6 + with: + path: orioledb + - name: Get the required tag name + shell: bash + run: | + echo "PGTAG=$(grep '^${{ github.event.inputs.pg_version }}: ' orioledb/.pgtags | cut -d' ' -f2-)" >> $GITHUB_ENV + - name: Checkout PostgreSQL code into workspace directory + uses: actions/checkout@v6 + with: + repository: orioledb/postgres + ref: ${{ env.PGTAG }} + path: postgresql + - name: Setup prerequisites + run: bash ./orioledb/ci/prerequisites.sh + - name: Build + run: bash ./orioledb/ci/build.sh + - name: Install post build prerequisites + run: bash ./orioledb/ci/post_build_prerequisites.sh + - name: Benchmark + run: | + ulimit -n 65535 + sudo systemctl daemon-reload + if [ -b /dev/nvme1n1 ]; then + sudo mkfs -t ext4 /dev/nvme1n1 + sudo mount /dev/nvme1n1 /mnt + fi + sudo chown ubuntu:ubuntu /mnt + sudo sysctl -w vm.nr_hugepages=`grep MemTotal /proc/meminfo | awk '{print int($2/2048/2)}'` + sudo sh -c 'echo "RemoveIPC=no" >> /etc/systemd/logind.conf' + sudo apt-get -y remove -qq unattended-upgrades + sudo sh 
-c 'echo "/mnt/%t_%p.core" > /proc/sys/kernel/core_pattern' + PATH="$GITHUB_WORKSPACE/pgsql/bin:$PATH" python3 ./orioledb/ci/pgbench.py ${{ github.event.inputs.args }} --results_dir=$GITHUB_WORKSPACE/results + - name: Check for core dumps + run: bash ./orioledb/ci/check_bench_cores.sh + if: ${{ success() || failure() }} + - name: Upload results + uses: actions/upload-artifact@v7 + with: + name: results + path: results/ + stop-runner: + name: Stop self-hosted EC2 runner + needs: + - start-runner # required to get output from the start-runner job + - run-benchmark # required to wait when the main job is done + runs-on: ubuntu-latest + if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + - name: Stop EC2 runner + uses: machulav/ec2-github-runner@v2 + with: + mode: stop + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + label: ${{ needs.start-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} diff --git a/contrib/orioledb/.github/workflows/check.yml b/contrib/orioledb/.github/workflows/check.yml new file mode 100644 index 00000000000..138575e60d3 --- /dev/null +++ b/contrib/orioledb/.github/workflows/check.yml @@ -0,0 +1,206 @@ +name: check + +on: + push: + pull_request: + +jobs: + matrix: + runs-on: blacksmith-2vcpu-ubuntu-2404 + outputs: + matrix: ${{ steps.gen.outputs.matrix }} + steps: + - id: gen + shell: python # run script directly using the Python interpreter + run: | + import os, json, itertools + + # Get event type and branch name + event = os.environ["GITHUB_EVENT_NAME"] + ref = os.environ.get("GITHUB_REF_NAME", "") + + # Full matrix: used for main branch or pull requests + full = dict( + cpu = ["amd64", 
"arm64"], + pg_version = [16, 17], + compiler = ["clang", "gcc"], + check_type = [ + "normal", "debug", "sanitize", "check_page", + "valgrind_1", "valgrind_2", "pg_tests", "dm_log_writes", + ], + # dm-log-writes needs a kernel module that isn't available on + # the arm64 runners, so skip it there. + exclude = [ + {"cpu": "arm64", "check_type": "dm_log_writes"}, + ], + ) + + # Reduced matrix: used for all other push events + branch_combos = [ + ("amd64", 16, "gcc"), + ("amd64", 17, "clang"), + ("arm64", 16, "clang"), + ("arm64", 17, "gcc"), + ] + + # Reverse matrix used for pull requests + reverse_combos = [ + ("amd64", 16, "clang"), + ("amd64", 17, "gcc"), + ("arm64", 16, "gcc"), + ("arm64", 17, "clang"), + ] + + # Make matrix for the PR + if os.environ["GITHUB_EVENT_NAME"] == "pull_request": + with open(os.environ["GITHUB_EVENT_PATH"], "r") as f: + event = json.load(f) + + head_repo = event["pull_request"]["head"]["repo"]["full_name"] + base_repo = event["pull_request"]["base"]["repo"]["full_name"] + + # If the PR is from a fork use full matrix + if head_repo != base_repo: + matrix = full + # If the PR is from the main repository use reduced reverse matrix + else: + matrix = { + "include": [ + {"cpu": c, "pg_version": pg, "compiler": comp, "check_type": ct} + for (c, pg, comp), ct in itertools.product(reverse_combos, full["check_type"]) + if not (c == "arm64" and ct == "dm_log_writes") + ] + } + # Make matrix for branches + else: + # Use full matrix for the main branch + if ref == "main": + matrix = full + # Use reduced matrix for other branches + else: + matrix = { + "include": [ + {"cpu": c, "pg_version": pg, "compiler": comp, "check_type": ct} + for (c, pg, comp), ct in itertools.product(branch_combos, full["check_type"]) + if not (c == "arm64" and ct == "dm_log_writes") + ] + } + + # Export as job output + with open(os.environ["GITHUB_OUTPUT"], "a") as f: + f.write(f"matrix={json.dumps(matrix)}\n") + + check: + needs: matrix + strategy: + fail-fast: false + 
matrix: ${{ fromJson(needs.matrix.outputs.matrix) }} + + # Select runner based on CPU architecture + runs-on: ${{ matrix.cpu == 'arm64' && 'blacksmith-8vcpu-ubuntu-2404-arm' || 'blacksmith-8vcpu-ubuntu-2404' }} + env: + LLVM_VER: 18 + CPU: ${{ matrix.cpu }} + CHECK_TYPE: ${{ matrix.check_type }} + PG_VERSION: ${{ matrix.pg_version }} + COMPILER: ${{ matrix.compiler }} + steps: + - name: Checkout extension code into workspace directory + uses: actions/checkout@v6 + with: + path: orioledb + - name: Get the required tag name + shell: bash + run: | + echo "PGTAG=$(grep '^${{ matrix.pg_version }}: ' orioledb/.pgtags | cut -d' ' -f2-)" >> $GITHUB_ENV + - name: Checkout PostgreSQL code into workspace directory + uses: actions/checkout@v6 + with: + repository: orioledb/postgres + ref: ${{ env.PGTAG }} + path: postgresql + - name: Setup prerequisites + run: bash ./orioledb/ci/prerequisites.sh + - name: Build + run: bash ./orioledb/ci/build.sh + - name: Install post build prerequisites + run: bash ./orioledb/ci/post_build_prerequisites.sh + + - name: Check + timeout-minutes: ${{ startsWith(matrix.check_type, 'valgrind_') && 150 || (matrix.check_type == 'dm_log_writes' && 60 || (matrix.check_type == 'sanitize' && 40 || 20)) }} + run: bash ./orioledb/ci/check.sh + - name: Check output + run: bash ./orioledb/ci/check_output.sh + if: ${{ success() || failure() }} + + - name: Upload regression tests diffs + uses: actions/upload-artifact@v7 + if: ${{ failure() }} + with: + name: ${{ matrix.pg_version }}_${{ matrix.cpu }}_${{ matrix.compiler }}_${{ matrix.check_type }}_regression.diffs + path: | + ./orioledb/test/**/regression.diffs + ./orioledb/test/**/isolation_filtered.diffs + ./orioledb/test/log/*.log + ./postgresql/src/test/**/regression.diffs + ./postgresql/src/test/**/isolation_filtered.diffs + ./postgresql/dump_diff_*.txt + retention-days: 2 + if-no-files-found: ignore + - name: Upload tests logs + uses: actions/upload-artifact@v7 + if: ${{ failure() }} + with: + name: ${{ 
matrix.pg_version }}_${{ matrix.cpu }}_${{ matrix.compiler }}_${{ matrix.check_type }}_testgres.logs + path: | + ./orioledb/test/tmp_check_t/*/logs/*log + ./orioledb/test/tmp_check/log/*log + ./orioledb/test/tmp_check_iso/log/*log + ./postgresql/pg.log + ./postgresql/rep_pg.log + ./postgresql/src/test/**/tmp_check/*.dump + ./postgresql/src/test/**/tmp_check/log/* + retention-days: 2 + if-no-files-found: ignore + + - name: Show stuck processes + run: bash ./orioledb/ci/list_stuck.sh + if: ${{ always() }} + + - name: Run lcov + if: ${{ matrix.check_type != 'sanitize' && matrix.check_type != 'check_page' }} + run: bash ./orioledb/ci/lcov.sh + - name: Create artifact for coverage.info + if: ${{ matrix.check_type != 'sanitize' && matrix.check_type != 'check_page' }} + uses: actions/upload-artifact@v7 + with: + name: ${{ matrix.pg_version }}_${{ matrix.cpu }}_${{ matrix.compiler }}_${{ matrix.check_type }}_coverage.info + path: ./orioledb/coverage.info + retention-days: 1 + overwrite: true + + finish: + needs: check + runs-on: blacksmith-2vcpu-ubuntu-2404 + steps: + - name: Checkout extension code into workspace directory + uses: actions/checkout@v6 + with: + path: orioledb + - name: Retrieve saved coverage.infos + uses: actions/download-artifact@v8 + - name: Merge coverage files + run: bash ./orioledb/ci/lcov_merge.sh + - name: coveralls + uses: coverallsapp/github-action@v2 + with: + files: ./orioledb/coverage.info + + cleanup: + needs: finish + runs-on: blacksmith-2vcpu-ubuntu-2404 + steps: + - name: remove artifacts + uses: geekyeggo/delete-artifact@v6 + with: + name: "*coverage.info" diff --git a/contrib/orioledb/.github/workflows/docker.yml b/contrib/orioledb/.github/workflows/docker.yml new file mode 100644 index 00000000000..b3666330f68 --- /dev/null +++ b/contrib/orioledb/.github/workflows/docker.yml @@ -0,0 +1,136 @@ +name: dockerhub + +on: + release: + types: [published] + workflow_dispatch: + inputs: + tag: + description: "Dockerhub tags" + required: false + 
default: "" + +env: + REGISTRY_IMAGE: orioledb/orioledb + +jobs: + build: + strategy: + fail-fast: false + matrix: + cpu: [amd64, arm64] + postgres: [16, 17] + distr: [alpine, ubuntu] + + # Define runner and distr_version depending on cpu and distr + include: + - cpu: amd64 + distr: alpine + runner: ubuntu-24.04 + distr_version: 3.21 + - cpu: amd64 + distr: ubuntu + runner: ubuntu-24.04 + distr_version: noble + - cpu: arm64 + distr: alpine + runner: ubuntu-24.04-arm + distr_version: 3.21 + - cpu: arm64 + distr: ubuntu + runner: ubuntu-24.04-arm + distr_version: noble + + runs-on: ${{ matrix.runner }} + + steps: + - uses: actions/checkout@v6 + + - uses: docker/setup-buildx-action@v3 + + - uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY_IMAGE }} + flavor: | + latest=true + suffix=-pg${{ matrix.postgres }}${{ matrix.distr == 'ubuntu' && '-ubuntu' || '' }},onlatest=true + tags: | + ${{ inputs.tag && format('type=raw,value={0}', inputs.tag) || '' }} + + - name: Prepare Dockerfiles + run: | + cp docker/Dockerfile ${{ runner.temp }}/Dockerfile + cp docker/Dockerfile.ubuntu ${{ runner.temp }}/Dockerfile.ubuntu + + # Build for one platform and push digest only + - id: build + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ${{ matrix.distr == 'ubuntu' && format('{0}/Dockerfile.ubuntu', runner.temp) || format('{0}/Dockerfile', runner.temp) }} + platforms: linux/${{ matrix.cpu }} + build-args: | + PG_MAJOR=${{ matrix.postgres }} + ALPINE_VERSION=${{ matrix.distr == 'alpine' && matrix.distr_version || '' }} + UBUNTU_VERSION=${{ matrix.distr == 'ubuntu' && matrix.distr_version || '' }} + outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push=true,push-by-digest=true,name-canonical=true + labels: ${{ steps.meta.outputs.labels }} + provenance: false # (as before) + sbom: false + + # Export digest and meta + - name: Export digest + meta + run: | + mkdir -p ${{ runner.temp }}/out + echo "${{ steps.build.outputs.digest }}" \ + > ${{ runner.temp }}/out/${{ matrix.cpu }}-${{ matrix.postgres }}-${{ matrix.distr }}.digest + echo '${{ steps.meta.outputs.json }}' \ + > ${{ runner.temp }}/out/${{ matrix.cpu }}-${{ matrix.postgres }}-${{ matrix.distr }}.meta.json + - uses: actions/upload-artifact@v7 + with: + name: digests-${{ matrix.cpu }}-${{ matrix.postgres }}-${{ matrix.distr }} + path: ${{ runner.temp }}/out/* + retention-days: 1 + + # Merge manifests for the digests built + manifest: + needs: build + runs-on: ubuntu-24.04 + + steps: + - uses: actions/download-artifact@v8 + with: + path: digests + merge-multiple: true + pattern: digests-* + + - uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - uses: docker/setup-buildx-action@v3 + + - name: Create & push multi-arch manifests + shell: bash + run: | + set -euo pipefail + declare -A TAGS + while IFS= read -r -d '' meta; do + digest_file="${meta%.meta.json}.digest" + digest=$(cat "$digest_file") + while IFS= read -r tag; do + TAGS["$tag"]="${TAGS[$tag]:-} $REGISTRY_IMAGE@$digest" + done < <(jq -r '.tags[]' "$meta") + done < <(find digests -type f -name '*.meta.json' -print0) + + for tag in "${!TAGS[@]}"; do + echo "⮑ $tag ⇒ ${TAGS[$tag]}" + docker buildx 
imagetools create -t "$tag" ${TAGS[$tag]} + done diff --git a/contrib/orioledb/.github/workflows/dockertest.yml b/contrib/orioledb/.github/workflows/dockertest.yml new file mode 100644 index 00000000000..b878b64b4af --- /dev/null +++ b/contrib/orioledb/.github/workflows/dockertest.yml @@ -0,0 +1,70 @@ +name: dockerTEST + +on: + push: + pull_request: + +defaults: + run: + shell: bash + +jobs: + test-docker-builds: + strategy: + fail-fast: true + matrix: + postgres: [16, 17] + compiler: [clang] + distr: [alpine, ubuntu] + include: + - distr-version: "3.21" + distr: alpine + - distr-version: "noble" + distr: ubuntu + + # Only run on push events OR pull requests from forks + if: | + github.event_name == 'push' || + github.event.pull_request.head.repo.full_name != github.repository + runs-on: blacksmith-4vcpu-ubuntu-2404 + continue-on-error: ${{ matrix.distr-version == 'edge' || matrix.distr-version == 'devel' }} + + steps: + - name: Checkout source + uses: actions/checkout@v6 + + - name: docker build orioletest:${{ matrix.postgres }}-${{ matrix.compiler }}-${{ matrix.distr }}-${{ matrix.distr-version }} + uses: docker/build-push-action@v6 + # https://github.com/docker/build-push-action + with: + context: . 
+ file: ${{ matrix.distr == 'ubuntu' && './docker/Dockerfile.ubuntu' || './docker/Dockerfile' }} + platforms: linux/amd64 + push: false + pull: true + tags: orioletest:${{ matrix.postgres }}-${{ matrix.compiler }}-${{ matrix.distr }}-${{ matrix.distr-version }} + build-args: | + ALPINE_VERSION=${{ matrix.distr == 'alpine' && matrix.distr-version || '' }} + UBUNTU_VERSION=${{ matrix.distr == 'ubuntu' && matrix.distr-version || '' }} + PG_MAJOR=${{ matrix.postgres }} + BUILD_CC_COMPILER=${{ matrix.compiler }} + DOCKER_PG_LLVM_DEPS=llvm-dev clang + DEBUG_MODE=false + + # docker image testing with https://github.com/docker-library/official-images.git + # to check if the image is compatible with the official-images test suite + # the special orioledb test config is in docker/orioledb-config.sh + # Read more: ./test/README.md + - name: Run Docker-official-postgres tests + minimal orioledb test + run: | + OFFIMG_LOCAL_CLONE=./log_docker_build/official-images + OFFIMG_REPO_URL=https://github.com/docker-library/official-images.git + mkdir -p "$OFFIMG_LOCAL_CLONE" + git clone --depth=1 --branch=master "$OFFIMG_REPO_URL" "$OFFIMG_LOCAL_CLONE" + "${OFFIMG_LOCAL_CLONE}/test/run.sh" \ + -c "${OFFIMG_LOCAL_CLONE}/test/config.sh" \ + -c "docker/orioledb-config.sh" \ + orioletest:${{ matrix.postgres }}-${{ matrix.compiler }}-${{ matrix.distr }}-${{ matrix.distr-version }} + + # if you want to push the tested image + # check this example: https://docs.docker.com/build/ci/github-actions/test-before-push/ diff --git a/contrib/orioledb/.github/workflows/perf-test.yml b/contrib/orioledb/.github/workflows/perf-test.yml new file mode 100644 index 00000000000..7171f05ce78 --- /dev/null +++ b/contrib/orioledb/.github/workflows/perf-test.yml @@ -0,0 +1,108 @@ +name: TPC-C Performance Test + +on: + pull_request: + workflow_dispatch: + inputs: + pg_version: + description: "PostgreSQL version" + required: false + default: "17" + compiler: + description: "Compiler" + required: false + 
default: "gcc" + bench_duration: + description: "Duration per benchmark run" + required: false + default: "5m" + bench_runs: + description: "Number of benchmark runs" + required: false + default: "1" + warehouses: + description: "Number of warehouses / TPC-C scale factor (comma-separated for multiple, e.g. 1,10,100)" + required: false + default: "1" + vus_scale: + description: "VU scale multiplier (1 = 99 VUs, 0.5 ≈ 50, 0.1 ≈ 11)" + required: false + default: "1" + pool_size: + description: "Connection pool size (max and min connections)" + required: false + default: "100" + +jobs: + setup: + name: Generate run matrix + runs-on: ubuntu-latest + outputs: + run-matrix: ${{ steps.gen.outputs.run-matrix }} + steps: + - id: gen + env: + RUNS: ${{ inputs.bench_runs || '1' }} + WAREHOUSES: ${{ inputs.warehouses || '1' }} + run: | + echo "run-matrix=$(python3 -c " + import json, os + runs = int(os.environ['RUNS']) + whs = [s.strip() for s in os.environ['WAREHOUSES'].split(',')] + print(json.dumps({'run': list(range(1, runs+1)), 'warehouses': whs})) + ")" >> $GITHUB_OUTPUT + + bench-head: + name: "Bench head ${{ matrix.warehouses }}W #${{ matrix.run }}" + needs: setup + runs-on: perf-runner + strategy: + matrix: ${{ fromJson(needs.setup.outputs.run-matrix) }} + max-parallel: 1 + env: + LLVM_VER: 18 + CHECK_TYPE: normal + COMPILER: ${{ inputs.compiler || 'gcc' }} + PG_VERSION: ${{ inputs.pg_version || '17' }} + DURATION: ${{ inputs.bench_duration || '5m' }} + WAREHOUSES: ${{ matrix.warehouses }} + VUS_SCALE: ${{ inputs.vus_scale || '1' }} + POOL_SIZE: ${{ inputs.pool_size || '100' }} + steps: + - name: Checkout extension code (head branch) + uses: actions/checkout@v6 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: orioledb + - name: Get the required tag name + shell: bash + run: | + echo "PGTAG=$(grep '^${{ env.PG_VERSION }}: ' orioledb/.pgtags | cut -d' ' -f2-)" >> $GITHUB_ENV + - name: Checkout PostgreSQL + uses: actions/checkout@v6 + with: + repository: 
orioledb/postgres + ref: ${{ env.PGTAG }} + path: postgresql + - name: Setup prerequisites + run: bash ./orioledb/ci/prerequisites.sh + - name: Build (with local cache) + run: bash ./orioledb/ci/perf_build.sh + - name: Start PostgreSQL + run: bash ./orioledb/ci/perf_pg_start.sh + - name: Verify PostgreSQL is ready + run: | + export PATH="$GITHUB_WORKSPACE/pgsql/bin:$PATH" + pg_isready -t 5 + psql -d postgres -c "SELECT orioledb_version();" + - name: Resolve current user + run: echo "PGUSER=$(whoami)" >> $GITHUB_ENV + - name: Run TPC-C benchmark + uses: stroppy-io/stroppy-action@main + with: + preset: tpcc + driver-url: postgres://${{ env.PGUSER }}@localhost:5432/postgres?sslmode=disable + artifact-name: perf-results-head-${{ matrix.warehouses }}W-${{ matrix.run }} + - name: Stop PostgreSQL + if: always() + run: bash ./orioledb/ci/perf_pg_stop.sh diff --git a/contrib/orioledb/.github/workflows/pgindent.yml b/contrib/orioledb/.github/workflows/pgindent.yml new file mode 100644 index 00000000000..6e7ed7ce622 --- /dev/null +++ b/contrib/orioledb/.github/workflows/pgindent.yml @@ -0,0 +1,47 @@ +name: pgindent + +on: + push: + pull_request: + +jobs: + pgindent: + # Only run on push events OR pull requests from forks + if: | + github.event_name == 'push' || + github.event.pull_request.head.repo.full_name != github.repository + runs-on: + - blacksmith-4vcpu-ubuntu-2404 + strategy: + fail-fast: false + matrix: + pg_version: [17] + env: + LLVM_VER: 18 + CPU: ${{ matrix.cpu }} + CHECK_TYPE: pgindent + PG_VERSION: ${{ matrix.pg_version }} + COMPILER: gcc + steps: + - name: Checkout extension code into workspace directory + uses: actions/checkout@v6 + with: + path: orioledb + - name: Get the required tag name + shell: bash + run: | + echo "PGTAG=$(grep '^${{ matrix.pg_version }}: ' orioledb/.pgtags | cut -d' ' -f2-)" >> $GITHUB_ENV + - name: Checkout PostgreSQL code into workspace directory + uses: actions/checkout@v6 + with: + repository: orioledb/postgres + ref: ${{ env.PGTAG 
}} + path: postgresql + - name: Setup prerequisites + run: bash ./orioledb/ci/prerequisites.sh + - name: Build + run: bash ./orioledb/ci/build.sh + - name: Install post build prerequisites + run: bash ./orioledb/ci/post_build_prerequisites.sh + - name: PGIndent + run: bash ./orioledb/ci/pgindent.sh diff --git a/contrib/orioledb/.github/workflows/rpm.yml b/contrib/orioledb/.github/workflows/rpm.yml new file mode 100644 index 00000000000..03106785329 --- /dev/null +++ b/contrib/orioledb/.github/workflows/rpm.yml @@ -0,0 +1,67 @@ +name: rpm + +on: + workflow_dispatch: + inputs: + version: + description: 'Version to build' + required: true + default: '14' + target: + description: 'Target to build' + required: true + default: 'nosignbuild14' + +jobs: + build_rpms: + runs-on: [self-hosted, X64, CentOS] + strategy: + fail-fast: true + steps: + - name: Setup prerequisites + run: | + sudo yum install -y llvm llvm-devel clang clang-devel python3-devel \ + libxml2-devel libxslt-devel libuuid-devel \ + zlib-devel flex bison curl git rpmdevtools \ + glibc-devel perl readline-devel pgdg-srpm-macros \ + libicu-devel llvm-toolset-7-clang \ + krb5-devel e2fsprogs-devel openldap-devel \ + pam-devel perl-ExtUtils-Embed tcl-devel \ + systemtap-sdt-devel libselinux-devel \ + openssl-devel systemd-devel llvm-toolset-7-clang \ + llvm5.0-devel gcc gcc-c++ libzstd-devel lz4-devel + - name: Checkout PGRPMs into workspace directory + uses: actions/checkout@v6 + with: + repository: orioledb/pgrpms + ref: orioledb + - name: Checkout extension code into workspace directory + uses: actions/checkout@v6 + with: + path: orioledb + - name: Get the required tag name + shell: bash + run: | + echo "PGTAG=$(grep '^${{ github.event.inputs.version }}: ' orioledb/.pgtags | cut -d' ' -f2-)" >> $GITHUB_ENV + - name: Checkout PostgreSQL code into workspace directory + uses: actions/checkout@v6 + with: + repository: orioledb/postgres + ref: ${{ env.PGTAG }} + path: rpm/redhat/${{ github.event.inputs.version 
}}/postgresql-${{ github.event.inputs.version }}/EL-7/pgsrc + - name: Archive sources + run: | + mv orioledb rpm/redhat/${{ github.event.inputs.version }}/postgresql-${{ github.event.inputs.version }}/EL-7/pgsrc/contrib + cd rpm/redhat/${{ github.event.inputs.version }}/postgresql-${{ github.event.inputs.version }}/EL-7 + tar -czf orioledb.tar.gz -C pgsrc . + rm -rf pgsrc + - name: Build + run: | + rm -rf ~/rpm${{ github.event.inputs.version }} ~/rpmbuild + cd rpm/redhat/${{ github.event.inputs.version }}/postgresql-${{ github.event.inputs.version }}/EL-7 + CC=/opt/rh/llvm-toolset-7/root/usr/bin/clang make ${{ github.event.inputs.target }} + - name: Upload image + uses: actions/upload-artifact@v7 + with: + name: rpms + path: ~/rpm${{ github.event.inputs.version }}/RPMS/x86_64/*.rpm diff --git a/contrib/orioledb/.github/workflows/static.yml b/contrib/orioledb/.github/workflows/static.yml new file mode 100644 index 00000000000..45b4b969ef9 --- /dev/null +++ b/contrib/orioledb/.github/workflows/static.yml @@ -0,0 +1,55 @@ +name: static + +on: + push: + pull_request: + +jobs: + static: + # Only run on push events OR pull requests from forks + if: | + github.event_name == 'push' || + github.event.pull_request.head.repo.full_name != github.repository + runs-on: + - blacksmith-4vcpu-ubuntu-2404 + strategy: + fail-fast: false + matrix: + pg_version: [16, 17] + compiler: [clang, gcc] + env: + LLVM_VER: 18 + CPU: ${{ matrix.cpu }} + CHECK_TYPE: static + PG_VERSION: ${{ matrix.pg_version }} + COMPILER: ${{ matrix.compiler }} + steps: + - name: Checkout extension code into workspace directory + uses: actions/checkout@v6 + with: + path: orioledb + - name: Get the required tag name + shell: bash + run: | + echo "PGTAG=$(grep '^${{ matrix.pg_version }}: ' orioledb/.pgtags | cut -d' ' -f2-)" >> $GITHUB_ENV + - name: Checkout PostgreSQL code into workspace directory + uses: actions/checkout@v6 + with: + repository: orioledb/postgres + ref: ${{ env.PGTAG }} + path: postgresql + - 
name: Setup prerequisites + run: bash ./orioledb/ci/prerequisites.sh + - name: Build + run: bash ./orioledb/ci/build.sh + - name: Static analysis + run: bash ./orioledb/ci/static.sh + - name: Upload clang reports + uses: actions/upload-artifact@v7 + if: ${{ failure() && matrix.compiler == 'clang' }} + with: + name: ${{ matrix.pg_version }}_${{ matrix.compiler }}_report + path: | + /tmp/scan-build-* + retention-days: 2 + if-no-files-found: ignore diff --git a/contrib/orioledb/.gitignore b/contrib/orioledb/.gitignore new file mode 100644 index 00000000000..21e69d55418 --- /dev/null +++ b/contrib/orioledb/.gitignore @@ -0,0 +1,61 @@ +# When adding new definitions here, +# please also include them in the first part of the .dockerignore file. + +# Global excludes across all subdirectories +*.o +*.obj +*.bc +*.so +*.so.[0-9] +*.so.[0-9].[0-9] +*.so.[0-9].[0-9][0-9] +*.sl +*.sl.[0-9] +*.sl.[0-9].[0-9] +*.sl.[0-9].[0-9][0-9] +*.dylib +*.dll +*.exp +*.a +*.mo +*.pot +objfiles.txt +.deps/ +*.gcno +*.gcda +*.gcov +*.gcov.out +lcov*.info +coverage/ +coverage-html-stamp +*.vcproj +*.vcxproj +win32ver.rc +*.exe +lib*dll.def +lib*.pc +make.flags + +# Local excludes in root directory +/test/t/__pycache__/ +/test/__pycache__/ +/test/log/ +/log_docker_build/ +/test/results/ +/test/tmp_check/ +/test/tmp_check_iso/ +/test/tmp_check_t/ +/test/output_iso/ +/include/utils/stopevents_defs.h +/include/utils/stopevents_data.h +/orioledb.typedefs +!ci/antithesis/libvoidstar.so + +# Ignore generated scripts (also update .dockerignore when adding new entries here) +sql/orioledb--1.0.sql +sql/orioledb--1.4--1.5.sql +sql/orioledb--1.5--1.6.sql +sql/orioledb--1.6--1.7.sql +sql/orioledb--1.7--1.8.sql +# Exclude the PostGIS Docker build directory +/docker-postgis diff --git a/contrib/orioledb/.pgtags b/contrib/orioledb/.pgtags new file mode 100644 index 00000000000..4c27372a8b7 --- /dev/null +++ b/contrib/orioledb/.pgtags @@ -0,0 +1,2 @@ +17: 5091ee89c8b46933b7dceada9be92ac77be172e4 +16: 
9619f2a58655819a858d95fca4a6ef9bbe11b2b0 diff --git a/contrib/orioledb/.style.yapf b/contrib/orioledb/.style.yapf new file mode 100644 index 00000000000..0261df5dc86 --- /dev/null +++ b/contrib/orioledb/.style.yapf @@ -0,0 +1,3 @@ +[style] +based_on_style = pep8 +use_tabs = True \ No newline at end of file diff --git a/contrib/orioledb/LICENSE-APACHE.txt b/contrib/orioledb/LICENSE-APACHE.txt new file mode 100644 index 00000000000..0a5d1dc844d --- /dev/null +++ b/contrib/orioledb/LICENSE-APACHE.txt @@ -0,0 +1,203 @@ +Copyright 2025-2026 Supabase + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/contrib/orioledb/LICENSE-POSTGRESQL.txt b/contrib/orioledb/LICENSE-POSTGRESQL.txt new file mode 100644 index 00000000000..12b08eb40db --- /dev/null +++ b/contrib/orioledb/LICENSE-POSTGRESQL.txt @@ -0,0 +1,18 @@ +Copyright (c) 2025-2026, Supabase Inc. + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose, without fee, and without a written agreement +is hereby granted, provided that the above copyright notice and this +paragraph and the following two paragraphs appear in all copies. + +IN NO EVENT SHALL SUPABASE INC. BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, +SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, +ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF +SUPABASE INC. HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +SUPABASE INC. SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" +BASIS, AND SUPABASE INC. HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, +UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+ diff --git a/contrib/orioledb/Makefile b/contrib/orioledb/Makefile new file mode 100644 index 00000000000..d55e17298b4 --- /dev/null +++ b/contrib/orioledb/Makefile @@ -0,0 +1,386 @@ +# contrib/orioledb/Makefile +# Import make flags if make.flags file is present +-include make.flags +export + +MODULE_big = orioledb +EXTENSION = orioledb +PGFILEDESC = "orioledb - orioledb transactional storage engine via TableAm" +SHLIB_LINK += -lzstd -lcurl -lssl -lcrypto + +DATA_built = $(patsubst %_prod.sql,%.sql,$(wildcard sql/*_prod.sql)) +DATA = $(filter-out $(wildcard sql/*_*.sql) $(DATA_built), $(wildcard sql/*sql)) + +EXTRA_CLEAN = include/utils/stopevents_defs.h \ + include/utils/stopevents_data.h +OBJS = src/btree/btree.o \ + src/btree/build.o \ + src/btree/check.o \ + src/btree/fastpath.o \ + src/btree/find.o \ + src/btree/insert.o \ + src/btree/io.o \ + src/btree/iterator.o \ + src/btree/merge.o \ + src/btree/modify.o \ + src/btree/page_chunks.o \ + src/btree/page_contents.o \ + src/btree/page_state.o \ + src/btree/print.o \ + src/btree/scan.o \ + src/btree/split.o \ + src/btree/undo.o \ + src/catalog/ddl.o \ + src/catalog/free_extents.o \ + src/catalog/indices.o \ + src/catalog/o_aggregate_cache.o \ + src/catalog/o_amop_cache.o \ + src/catalog/o_amproc_cache.o \ + src/catalog/o_class_cache.o \ + src/catalog/o_enum_cache.o \ + src/catalog/o_collation_cache.o \ + src/catalog/o_database_cache.o \ + src/catalog/o_indices.o \ + src/catalog/o_operator_cache.o \ + src/catalog/o_opclass_cache.o \ + src/catalog/o_proc_cache.o \ + src/catalog/o_range_cache.o \ + src/catalog/o_sys_cache.o \ + src/catalog/o_tables.o \ + src/catalog/o_type_cache.o \ + src/catalog/o_tablespace_cache.o \ + src/catalog/sys_trees.o \ + src/checkpoint/checkpoint.o \ + src/checkpoint/control.o \ + src/indexam/handler.o \ + src/orioledb.o \ + src/recovery/logical.o \ + src/recovery/recovery.o \ + src/recovery/wal.o \ + src/recovery/wal_reader.o \ + src/recovery/worker.o \ + src/rewind/rewind.o \ + 
src/s3/archive.o \ + src/s3/checkpoint.o \ + src/s3/control.o \ + src/s3/checksum.o \ + src/s3/headers.o \ + src/s3/queue.o \ + src/s3/requests.o \ + src/s3/worker.o \ + src/tableam/bitmap_scan.o \ + src/tableam/descr.o \ + src/tableam/func.o \ + src/tableam/handler.o \ + src/tableam/index_scan.o \ + src/tableam/key_range.o \ + src/tableam/key_bitmap.o \ + src/tableam/operations.o \ + src/tableam/scan.o \ + src/tableam/tree.o \ + src/tableam/vacuum.o \ + src/transam/undo.o \ + src/transam/oxid.o \ + src/tuple/format.o \ + src/tuple/toast.o \ + src/tuple/slot.o \ + src/tuple/sort.o \ + src/workers/bgwriter.o \ + src/workers/interrupt.o \ + src/utils/compress.o \ + src/utils/o_buffers.o \ + src/utils/orphaned.o \ + src/utils/page_pool.o \ + src/utils/planner.o \ + src/utils/seq_buf.o \ + src/utils/stopevent.o \ + src/utils/ucm.o \ + $(WIN32RES) + +REGRESSCHECKS = btree_sys_check \ + alter_type \ + alter_storage \ + alter_index \ + bitmap_scan \ + btree_compression \ + btree_print \ + createas \ + database \ + ddl \ + exclude \ + explain \ + fillfactor \ + foreign_keys \ + generated \ + getsomeattrs \ + index_bridging \ + indices \ + indices_build \ + inherits \ + ioc \ + joins \ + nulls \ + opclass \ + parallel_scan \ + partial \ + partition \ + primary_key \ + row_level_locks \ + row_security \ + sanitizers \ + serializable \ + stats \ + subquery \ + subtransactions \ + tableam \ + tablespace \ + temp \ + toast \ + toast_column_compress \ + trigger \ + truncate \ + types \ + rewind +ISOLATIONCHECKS = bitmap_hist_scan \ + btree_iterate \ + btree_iterate_split \ + btree_print_backend_id \ + btree_scan \ + concurrent_update_delete \ + fkeys \ + included \ + insert_fails \ + ioc_deadlock \ + ioc_lost_update \ + isol_ddl \ + isol_merge \ + isol_rc \ + isol_rr \ + isol_rr_bscan \ + isol_rr_fk \ + isol_rr_seqscan \ + isol_serializable \ + load_refind_page \ + merge \ + partition_move \ + rightlink \ + rll \ + rll_deadlock \ + rll_mix \ + rll_subtrans \ + table_lock_test \ 
+ concurrent_truncate \ + uniq +TESTGRESCHECKS_PART_1 = test/t/checkpointer_test.py \ + test/t/correlation_test.py \ + test/t/eviction_bgwriter_test.py \ + test/t/eviction_compression_test.py \ + test/t/eviction_test.py \ + test/t/file_operations_test.py \ + test/t/files_test.py \ + test/t/functions_test.py \ + test/t/index_bridging_test.py \ + test/t/incomplete_split_test.py \ + test/t/merge_test.py \ + test/t/o_tables_test.py \ + test/t/orphaned_test.py \ + test/t/o_tables_2_test.py \ + test/t/recovery_test.py \ + test/t/recovery_opclass_test.py \ + test/t/recovery_worker_test.py \ + test/t/replication_test.py \ + test/t/types_test.py \ + test/t/undo_eviction_test.py \ + test/t/rewind_xid_test.py \ + test/t/rewind_xid_evict_large_test.py \ + test/t/page_fit_items_test.py +TESTGRESCHECKS_PART_2 = test/t/checkpoint_concurrent_test.py \ + test/t/checkpoint_eviction_test.py \ + test/t/checkpoint_same_trx_test.py \ + test/t/checkpoint_split1_test.py \ + test/t/checkpoint_split2_test.py \ + test/t/checkpoint_split3_test.py \ + test/t/checkpoint_update_compress_test.py \ + test/t/checkpoint_update_test.py \ + test/t/ddl_test.py \ + test/t/eviction_full_memory_test.py \ + test/t/include_indices_test.py \ + test/t/indices_build_test.py \ + test/t/logical_test.py \ + test/t/logical_xid_subxacts_test.py \ + test/t/merge_into_test.py \ + test/t/not_supported_yet_test.py \ + test/t/pg_dump_restore_test.py \ + test/t/parallel_test.py \ + test/t/reindex_test.py \ + test/t/s3_test.py \ + test/t/schema_test.py \ + test/t/temp_local_pool_test.py \ + test/t/toast_index_test.py \ + test/t/trigger_test.py \ + test/t/unlogged_test.py \ + test/t/vacuum_test.py \ + test/t/transaction_test.py \ + test/t/page_pool_test.py +TESTGRESCHECKS_PART_3 = test/t/rewind_time_test.py + +PG_REGRESS_ARGS=--no-locale --inputdir=test --outputdir=test --temp-instance=./test/tmp_check +PG_ISOLATION_REGRESS_ARGS=--no-locale --inputdir=test --outputdir=test/output_iso --temp-instance=./test/tmp_check_iso + 
+ifdef IS_DEV +sql/%.sql: + @cat sql/$*_prod.sql sql/$*_dev.sql > $@ +else +sql/%.sql: + @cat sql/$*_prod.sql > $@ +endif + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +override PG_CPPFLAGS += -I$(CURDIR)/include +include $(PGXS) + +# NO_INSTALL=1 skips installation steps before running tests. +# Useful for CI environments, non-root testing, and package-managed installations. +# Example: make NO_INSTALL=1 USE_PGXS=1 installcheck +# See: https://www.postgresql.org/docs/17/extend-pgxs.html#EXTEND-PGXS-NO-INSTALL +ifdef NO_INSTALL +INSTALL_REQUIREMENT = +TEMP_INSTALL_COMMAND = +else +INSTALL_REQUIREMENT = | install +TEMP_INSTALL_COMMAND = $(with_temp_install) +endif + +regresscheck: $(INSTALL_REQUIREMENT) + $(pg_regress_check) \ + --temp-config test/orioledb_regression.conf \ + $(PG_REGRESS_ARGS) \ + $(REGRESSCHECKS) + +isolationcheck: $(INSTALL_REQUIREMENT) + $(pg_isolation_regress_check) \ + --temp-config test/orioledb_isolation.conf \ + $(PG_ISOLATION_REGRESS_ARGS) \ + $(ISOLATIONCHECKS) + +$(TESTGRESCHECKS_PART_1) $(TESTGRESCHECKS_PART_2) $(TESTGRESCHECKS_PART_3): $(INSTALL_REQUIREMENT) + $(TEMP_INSTALL_COMMAND) \ + python3 -W ignore::DeprecationWarning -m unittest -v $@ + +ifdef IS_DEV +installcheck: regresscheck isolationcheck testgrescheck + echo "All checks are successful!" +else +installcheck: + echo "Checks skipped! Build and run installcheck with IS_DEV=1" +endif + +else +subdir = contrib/orioledb +top_builddir = ../.. 
+override PG_CPPFLAGS += -I$(top_srcdir)/$(subdir)/include +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk + +regresscheck: | submake-regress submake-orioledb temp-install + $(pg_regress_check) \ + --temp-config $(top_srcdir)/contrib/orioledb/test/orioledb_regression.conf \ + $(PG_REGRESS_ARGS) \ + $(REGRESSCHECKS) + +isolationcheck: | submake-isolation submake-orioledb temp-install + $(pg_isolation_regress_check) \ + --temp-config $(top_srcdir)/contrib/orioledb/test/orioledb_isolation.conf \ + $(PG_ISOLATION_REGRESS_ARGS) \ + $(ISOLATIONCHECKS) + +$(TESTGRESCHECKS_PART_1) $(TESTGRESCHECKS_PART_2) $(TESTGRESCHECKS_PART_3): | submake-orioledb temp-install + PG_CONFIG="$(abs_top_builddir)/tmp_install$(bindir)/pg_config" \ + $(with_temp_install) \ + python3 -m unittest -v $@ + +ifdef IS_DEV +check: regresscheck isolationcheck testgrescheck + echo "All checks are successful!" +else +check: + echo "Checks skipped! Build and run check with IS_DEV=1" +endif +endif + +# Retrieve the current commit hash from the Git repository. +# If the .git environment does not exist (e.g., in a Docker environment or a non-Git setup), +# fallback to a default "fake" commit hash (all zeros) to avoid errors. 
+COMMIT_HASH := $(shell git rev-parse HEAD 2>/dev/null) +ifeq ($(strip $(COMMIT_HASH)),) + COMMIT_HASH := 0000000000000000000000000000000000000000 +endif +override CFLAGS_SL += -DCOMMIT_HASH=$(COMMIT_HASH) -Wno-error=deprecated-declarations + +ifdef VALGRIND +override with_temp_install += PGCTLTIMEOUT=3000 PG_TEST_TIMEOUT_DEFAULT=500 \ + valgrind --vgdb=yes --leak-check=no \ + --num-callers=20 --suppressions=$(CURDIR)/valgrind.supp --time-stamp=yes \ + --log-file=$(CURDIR)/pid-%p.log --trace-children=yes \ + --trace-children-skip=*/initdb +else +override with_temp_install += PGCTLTIMEOUT=900 +endif + +ifdef USE_DM_LOG_WRITES +override with_temp_install += USE_DM_LOG_WRITES=1 +endif + +include/utils/stopevents_data.h: include/utils/stopevents_defs.h + +include/utils/stopevents_defs.h: stopevents.txt stopevents_gen.py + python3 stopevents_gen.py + + +ifndef ORIOLEDB_PATCHSET_VERSION +ORIOLEDB_PATCHSET_VERSION=1 +endif + +check_patchset_version: + @python3 check_patchset_version.py $(MAJORVERSION) $(ORIOLEDB_PATCHSET_VERSION) + +$(OBJS): include/utils/stopevents_defs.h check_patchset_version + +submake-regress: + $(MAKE) -C $(top_builddir)/src/test/regress all + +submake-isolation: + $(MAKE) -C $(top_builddir)/src/test/isolation all + +submake-orioledb: + $(MAKE) -C $(top_builddir)/contrib/orioledb + +testgrescheck: $(TESTGRESCHECKS_PART_1) $(TESTGRESCHECKS_PART_2) $(TESTGRESCHECKS_PART_3) + +testgrescheck_part_1: $(TESTGRESCHECKS_PART_1) + +testgrescheck_part_2: $(TESTGRESCHECKS_PART_2) + +testgrescheck_part_3: $(TESTGRESCHECKS_PART_3) + +temp-install: EXTRA_INSTALL=contrib/orioledb + +orioledb.typedefs: $(OBJS) + ./typedefs_gen.py + +pgindent: orioledb.typedefs + pgindent --typedefs=orioledb.typedefs \ + src/*.c \ + src/*/*.c \ + include/*.h \ + include/*/*.h + +yapf: + yapf -i test/t/*.py + yapf -i *.py + +.PHONY: submake-orioledb submake-regress check \ + regresscheck isolationcheck testgrescheck pgindent \ + $(TESTGRESCHECKS_PART_1) $(TESTGRESCHECKS_PART_2) 
$(TESTGRESCHECKS_PART_3) diff --git a/contrib/orioledb/PATENTS.txt b/contrib/orioledb/PATENTS.txt new file mode 100644 index 00000000000..6ae0b5c9efb --- /dev/null +++ b/contrib/orioledb/PATENTS.txt @@ -0,0 +1,23 @@ +Additional IP Rights Grant (Patents) +------------------------------------ + +"These implementations" means the copyrightable works that implement +OrioleDB. Supabase hereby grants to you a perpetual, worldwide, +non-exclusive, no-charge, royalty-free, irrevocable (except as stated +in this section) patent license to make, have made, use, offer to +sell, sell, import, transfer, and otherwise run, modify, and propagate +the contents of these implementations of OrioleDB, where such License +applies only to those patent claims, both currently owned by Supabase +and acquired in the future, licensable by Supabase that are +necessarily infringed by these implementations of OrioleDB. This grant +does not include claims that would be infringed only as a consequence +of further modification of these implementations. If you or your agent +or exclusive licensee institute or order or agree to the institution +of patent litigation or any other patent enforcement activity against +any entity (including a cross-claim or counterclaim in a lawsuit) +alleging that any of these implementations of OrioleDB or any code +incorporated within any of these implementations of OrioleDB +constitute direct or contributory patent infringement, or inducement +of patent infringement, then any patent rights granted to you under +this License for these implementations of OrioleDB shall terminate as +of the date such litigation is filed. 
diff --git a/contrib/orioledb/README.md b/contrib/orioledb/README.md new file mode 100644 index 00000000000..02397caedbe --- /dev/null +++ b/contrib/orioledb/README.md @@ -0,0 +1,174 @@ +# OrioleDB – a cloud-native storage engine for PostgreSQL +(A solution to PostgreSQL’s wicked problems) + +[![check status](https://github.com/orioledb/orioledb/actions/workflows/check.yml/badge.svg)](https://github.com/orioledb/orioledb/actions) +[![Coverage Status](https://coveralls.io/repos/github/orioledb/orioledb/badge.svg?branch=main)](https://coveralls.io/github/orioledb/orioledb?branch=main) +[![dockerhub](https://github.com/orioledb/orioledb/actions/workflows/docker.yml/badge.svg)](https://hub.docker.com/r/orioledb/orioledb/tags) + + +OrioleDB is pronounced as _OR-ee-ohl-DEE-BEE_ (IPA _/ˈɔːr.i.oʊl diː biː/_) and +is named after the [golden oriole](https://en.wikipedia.org/wiki/Golden_oriole), +a bird which inhabits a range of habitats and migrates in spring, symbolizing renewal. + +OrioleDB is a new storage engine for PostgreSQL, bringing a modern approach to +database capacity, capabilities and performance to the world's most-loved +database platform. + +OrioleDB consists of an extension, building on the innovative table access +method framework and other standard Postgres extension interfaces. By extending +and enhancing the current table access methods, OrioleDB opens the door to +a future of more powerful storage models that are optimized for cloud and +modern hardware architectures. + +1. Designed for modern hardware. The OrioleDB design avoids legacy CPU bottlenecks + on modern servers containing dozens or even hundreds of CPU cores, providing + optimized usage of modern storage technologies such as SSD and NVRAM. + +2. Reduced maintenance needs. OrioleDB implements the concepts of undo log + and page merging, eliminating the need for dedicated garbage collection + processes.
Additionally, OrioleDB implements default 64-bit transaction + identifiers, thus eliminating the well-known and painful wraparound problem. + +3. Designed to be distributed. OrioleDB implements a row-level write-ahead + log with support for parallel apply. This log architecture is optimized + for raft consensus-based replication allowing the implementation of + active-active multimaster. + +The key technical differentiators of OrioleDB are as follows: + +1. No buffer mapping and lock-less page reading. In-memory pages in OrioleDB + are connected with direct links to the storage pages. This eliminates the + need for in-buffer mapping along with its related bottlenecks. Additionally, + in OrioleDB in-memory page reading doesn't involve atomic operations. + Together, these design decisions bring vertical scalability for Postgres + to a whole new level. + +2. MVCC is based on the UNDO log concept. In OrioleDB, old versions of tuples + do not cause bloat in the main storage system, but are evicted into the undo + log, which comprises undo chains. Page-level undo records allow the system + to easily reclaim space occupied by deleted tuples as soon as possible. + Together with page merging, these mechanisms eliminate bloat in the majority + of cases. Dedicated VACUUMing of tables is not needed either, removing + a significant and common cause of system performance deterioration and + database outages. + +3. Copy-on-write checkpoints and row-level WAL. OrioleDB utilizes + copy-on-write checkpoints, which provide a structurally consistent snapshot + of data at every moment in time. This is friendly for modern SSDs and allows + row-level WAL logging. In turn, row-level WAL logging is easy to + parallelize (done), compact and suitable for active-active + multimaster (planned).
+ +See [introduction](doc/intro.mdx), [getting started](doc/usage/getting-started.mdx), and [architecture](doc/architecture/overview.mdx) + documentation as well as +[PostgresBuild 2021 slides](https://www.slideshare.net/AlexanderKorotkov/solving-postgresql-wicked-problems). To start development, see [OrioleDB development quickstart](doc/contributing/local-builds.mdx) and [project structure](doc/contributing/structure.mdx). + +## License + +OrioleDB is dual-licensed under the Apache License 2.0 and the PostgreSQL License. + +You may choose either license to govern your use of this work. +1. [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0) +2. [PostgreSQL License](https://www.postgresql.org/about/licence/) + +`SPDX-License-Identifier: Apache-2.0 OR PostgreSQL` + +All contributions to OrioleDB are made under both the Apache License 2.0 and +the PostgreSQL License. + +See [LICENSE-APACHE.txt](./LICENSE-APACHE.txt) and +[LICENSE-POSTGRESQL.txt](./LICENSE-POSTGRESQL.txt) for details. + +**Patent Grant:** Supabase provides a separate patent grant for OrioleDB. +See [PATENTS.txt](./PATENTS.txt) for details. + +## Status + +OrioleDB now has public beta status. It is recommended for experiments, +testing, benchmarking, etc., but is not recommended for production usage. +If you are interested in OrioleDB's benefits in production, please +[contact us](mailto:sales@orioledb.com). + +## Installation + +### Use docker container + +We provide docker images for `amd64` and `arm64v8` architectures under Alpine Linux. + +``` +docker pull orioledb/orioledb:latest-pg17 +``` +For example, it can be started the same way as a postgres server: +```bash +# !Don't forget to set default locale to C, POSIX or use icu-locale +docker run --name some-postgres -e POSTGRES_PASSWORD=... -e POSTGRES_INITDB_ARGS="--locale=C" -d -p5432:5432 orioledb/orioledb:latest-pg17 +``` + +See [our dockerhub](https://hub.docker.com/r/orioledb/orioledb) for details on our docker container usage. 
See [the docker build guide](doc/contributing/docker-builds.mdx) for information on how to build the docker images locally. + +### Build from source + +Before building and installing OrioleDB, make sure you have the following: + + * [PostgreSQL with extensibility patches](https://github.com/orioledb/postgres): [16 (tag: patches16_34)](https://github.com/orioledb/postgres/tree/patches16_34) or [17 (tag: patches17_6)](https://github.com/orioledb/postgres/tree/patches17_6); + * Development package of libzstd; + * python 3.5+ with testgres package. + +A typical installation procedure may look like this: + +```bash + $ git clone https://github.com/orioledb/orioledb + $ cd orioledb + # Make sure that postgres bin directory is in PATH before running + $ make USE_PGXS=1 + # IS_DEV=1 is needed for the tests to succeed + $ make USE_PGXS=1 install IS_DEV=1 + $ make USE_PGXS=1 installcheck +``` + +Before you start working with OrioleDB, the following line must be added to +`postgresql.conf`. This change requires a restart of +the PostgreSQL database server. + +``` +shared_preload_libraries = 'orioledb.so' +``` + +## Collations +OrioleDB tables support only ICU, C, and POSIX collations. + +To avoid writing COLLATE for every "text" column of your tables, you have the following options: +### Create the whole cluster with one of these collations +```bash +initdb --locale=C -D.. +# OR +initdb --locale=POSIX -D.. +# OR +initdb --locale-provider=icu --icu-locale=en -D... +``` + +### Create a new database with a default collation from template0 +```bash +createdb --locale=C --template template0 ... +# OR +createdb --locale=POSIX --template template0 ... +# OR +createdb --locale-provider=icu --icu-locale=en --template template0 ... +``` +Or using `CREATE DATABASE` with `LOCALE` or `ICU_LOCALE` parameters. + +## Setup + +Run the following SQL query on the database to enable the OrioleDB engine. 
#!/usr/bin/env python3
"""Check that a PostgreSQL patchset version matches the one OrioleDB expects.

Usage: check_patchset_version.py MAJOR_VERSION PATCHSET_VERSION

The expected patchset for each PostgreSQL major version is read from the
``.pgtags`` file in the current working directory.  Each relevant line has
the form ``<major>: <tag-or-commit>``, e.g. ``17: patches17_6``.

The script exits with status 0 when the versions match and with status 1
on any mismatch or error.
"""
import re
import sys

PGTAGS_FILE = '.pgtags'


def _read_expected(major_version):
    """Return the tag/commit recorded for ``major_version``, or None.

    Only the first whitespace-separated token after ``<major>:`` is used,
    so entries written without a space after the colon are accepted too.
    Lines that carry no value at all are ignored.
    """
    prefix = f'{major_version}:'
    with open(PGTAGS_FILE, 'r') as f:
        for line in f:
            if not line.startswith(prefix):
                continue
            fields = line[len(prefix):].split()
            if fields:
                return fields[0]
    return None


def main():
    """Compare the version given on the command line with ``.pgtags``.

    Reads ``sys.argv[1]`` (PostgreSQL major version) and ``sys.argv[2]``
    (patchset version: a number, a ``git describe --tags`` suffix, or a
    full tag/commit).  Calls ``sys.exit(1)`` on usage errors, an
    unreadable ``.pgtags``, an unknown major version, or a mismatch;
    returns normally when the versions agree.
    """
    if len(sys.argv) != 3:
        # Report the problem instead of failing silently.
        print(
            "usage: check_patchset_version.py MAJOR_VERSION PATCHSET_VERSION",
            file=sys.stderr)
        sys.exit(1)

    major_version = sys.argv[1]
    provided_version = sys.argv[2]

    try:
        expected = _read_expected(major_version)
    except OSError as exc:
        print(f"Cannot read {PGTAGS_FILE}: {exc}", file=sys.stderr)
        sys.exit(1)

    if expected is None:
        print(f"No version found for PostgreSQL {major_version}")
        sys.exit(1)

    # Extract numeric version if format is like "patches17_14"
    if '_' in expected:
        expected_num = expected.split('_')[1]
        tag_format = f"tag 'patches{major_version}_{expected_num}'"
    else:
        expected_num = expected
        tag_format = f"commit '{expected}'"

    # A short all-digit string is a patchset number; six or more digits
    # are treated as a (possibly abbreviated) commit hash.
    is_numeric = (re.fullmatch(r'\d+', provided_version) is not None
                  and len(provided_version) < 6)
    # Suffix produced by `git describe --tags`: <num>-<commits>-g<hash>
    is_describe_tag = re.fullmatch(r'\d+-\d+-g[0-9a-f]+',
                                   provided_version) is not None

    # Numbers and describe suffixes are compared with the part after '_';
    # anything else is compared with the full tag or commit.
    if is_numeric or is_describe_tag:
        actual_mismatch = (expected_num != provided_version)
    else:
        actual_mismatch = (expected != provided_version)

    if actual_mismatch:
        print(
            f"Wrong orioledb patchset version: expected {expected_num}, got {provided_version}"
        )
        print(
            f"Rebuild and install patched orioledb/postgres using {tag_format}"
        )
        sys.exit(1)


if __name__ == '__main__':
    main()
b/contrib/orioledb/ci/antithesis/Dockerfile.regress_app new file mode 100644 index 00000000000..0c51b43f9f0 --- /dev/null +++ b/contrib/orioledb/ci/antithesis/Dockerfile.regress_app @@ -0,0 +1,205 @@ +FROM debian:bullseye-slim + +ARG PGTAG + +ENV TZ=UTC + +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get full-upgrade -y; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + locales \ + tzdata \ + gosu \ + ; \ + localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8; + +ENV LANG=en_US.utf8 + +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC \ + apt-get -o Dpkg::Options::="--force-confdef" \ + -o Dpkg::Options::="--force-confold" \ + -y install \ + build-essential flex bison pkg-config \ + libreadline-dev make gdb libipc-run-perl \ + libicu-dev python3 python3-dev python3-pip \ + python3-setuptools python3-testresources \ + libzstd1 libzstd-dev llvm-13 clang-13 clang-tools-13 \ + curl wget liblz4-1 liblz4-dev libuuid1 uuid-dev git \ + ; + +RUN mkdir /docker-entrypoint-initdb.d + +RUN set -eux; \ + groupadd -r postgres --gid=999; \ + useradd -r -g postgres --uid=999 --home-dir=/var/lib/postgresql --shell=/bin/bash postgres; \ + mkdir -p /var/lib/postgresql; \ + chown postgres:postgres /var/lib/postgresql; \ +# also create the postgres user's home directory with appropriate permissions +# see https://github.com/docker-library/postgres/issues/274 + mkdir -p /var/lib/postgresql; \ + chown -R postgres:postgres /var/lib/postgresql + +RUN mkdir -p /usr/src/postgresql/contrib/orioledb +COPY . 
/usr/src/postgresql/contrib/orioledb +RUN cp /usr/src/postgresql/contrib/orioledb/ci/antithesis/libvoidstar.so /usr/lib + +ENV PGTAG $PGTAG +RUN [ -z "$PGTAG" ] && echo "PGTAG is required" && exit 1 || true +RUN MAJORVERSION=$(echo $PGTAG | \ + sed 's/[^[:digit:]]\+\([[:digit:]]\+\)_.*/\1/'); \ + EXPECTED_PATCHVERSION=$(echo $PGTAG | cut -d'_' -f2); \ + CURRENT_PATCHVERSION=$(grep "$MAJORVERSION" \ + /usr/src/postgresql/contrib/orioledb/.pgtags | \ + cut -d'_' -f2); \ + [ $EXPECTED_PATCHVERSION -ne $CURRENT_PATCHVERSION ] && \ + echo "patchset version in orioledb/.pgtags differs from" \ + "PGTAG variable for version $MAJORVERSION:" \ + "expected $EXPECTED_PATCHVERSION," \ + "got $CURRENT_PATCHVERSION" && exit 1 || true + +RUN set -eux; \ + curl -o postgresql.tar.gz \ + --header "Accept: application/vnd.github.v3.raw" \ + --remote-name \ + --location https://github.com/orioledb/postgres/tarball/$PGTAG; \ + mkdir -p /usr/src/postgresql; \ + tar \ + --extract \ + --file postgresql.tar.gz \ + --directory /usr/src/postgresql \ + --strip-components 1 \ + ; \ + rm postgresql.tar.gz; \ + cd /usr/src/postgresql; \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ +# explicitly update autoconf config.guess and config.sub so they support more arches/libcs + wget -O config/config.guess 'https://git.savannah.gnu.org/cgit/config.git/plain/config.guess?id=7d3d27baf8107b630586c962c057e22149653deb'; \ + wget -O config/config.sub 'https://git.savannah.gnu.org/cgit/config.git/plain/config.sub?id=7d3d27baf8107b630586c962c057e22149653deb'; \ +# configure options taken from: +# https://anonscm.debian.org/cgit/pkg-postgresql/postgresql.git/tree/debian/rules?h=9.5 + ( CC=clang-13 LLVM_CONFIG=llvm-config-13 CLANG=clang-13 \ + LLVM_SYMBOLIZER=llvm-symbolizer-13 \ + LDFLAGS="-lvoidstar -Wl,--build-id" \ + CFLAGS_SL="-fsanitize-coverage=trace-pc-guard" \ + LDFLAGS_SL="-lvoidstar -Wl,--build-id" \ + ./configure \ + --build="$gnuArch" \ +# 
"/usr/src/postgresql/src/backend/access/common/tupconvert.c:105: undefined reference to `libintl_gettext'" +# --enable-nls \ + --enable-integer-datetimes \ + --enable-thread-safety \ + --enable-tap-tests \ +# skip debugging info -- we want tiny size instead + --enable-debug \ + --enable-cassert \ + --disable-rpath \ + --with-uuid=e2fs \ + --with-gnu-ld \ + --with-pgport=5432 \ + --with-system-tzdata=/usr/share/zoneinfo \ + --prefix=/usr/local \ + --with-includes=/usr/local/include \ + --with-libraries=/usr/local/lib \ + --with-icu \ + --with-llvm \ + --with-lz4 \ + || cat config.log ); \ + echo "ORIOLEDB_PATCHSET_VERSION = `echo $PGTAG | cut -d'_' -f2`" >> src/Makefile.global; \ + echo '\n%.o: %.c\n\t$(CC) -c $(CFLAGS) -fsanitize-coverage=trace-pc-guard $(CPPFLAGS) -o $@ $<' >> src/Makefile.global; \ + make -j "$(nproc)"; \ + make -C contrib -j "$(nproc)"; \ + make install; \ + make -C contrib install; \ + make -C contrib/orioledb -j "$(nproc)" install; \ + \ + ldd $(which postgres) | grep "libvoidstar"; \ + nm $(which postgres) | grep "sanitizer_cov_trace_pc_guard"; \ + ldd $(which psql) | grep "libvoidstar"; \ + nm $(which psql) | grep "sanitizer_cov_trace_pc_guard"; \ + ldd contrib/orioledb/orioledb.so | grep "libvoidstar"; \ + nm contrib/orioledb/orioledb.so | grep "sanitizer_cov_trace_pc_guard"; \ + \ + chown -R postgres:postgres /usr/src/postgresql; \ + cd /; \ + rm -rf \ + /usr/local/share/doc \ + /usr/local/share/man \ + ; \ + \ + postgres --version; \ + pip3 install --upgrade psycopg2 six testgres; \ + apt-get -y remove \ + build-essential flex bison pkg-config \ + libreadline-dev libipc-run-perl \ + libicu-dev python3-dev python3-pip \ + libzstd-dev \ + curl wget liblz4-dev uuid-dev git \ + ; \ + apt-get -y autoremove; \ + rm -rf /var/cache/apt/archives /var/lib/apt/lists/*; \ + apt-get clean; + +# make the sample config easier to munge (and "correct by default") +RUN set -eux; \ + cp -v /usr/local/share/postgresql/postgresql.conf.sample 
/usr/local/share/postgresql/postgresql.conf.sample.orig; \ + sed -ri "s!^#?(listen_addresses)\s*=\s*\S+.*!\1 = '*'!" /usr/local/share/postgresql/postgresql.conf.sample; \ + echo "shared_preload_libraries = 'orioledb'" >> /usr/local/share/postgresql/postgresql.conf.sample; \ + echo "orioledb.shared_pool_size = 512MB" >> /usr/local/share/postgresql/postgresql.conf.sample; \ + echo "orioledb.undo_size = 256MB" >> /usr/local/share/postgresql/postgresql.conf.sample; \ + echo "max_wal_size = 8GB" >> /usr/local/share/postgresql/postgresql.conf.sample; \ + echo "checkpoint_timeout = 86400" >> /usr/local/share/postgresql/postgresql.conf.sample; \ + echo "orioledb.debug_disable_bgwriter = true" >> /usr/local/share/postgresql/postgresql.conf.sample; \ + grep -F "listen_addresses = '*'" /usr/local/share/postgresql/postgresql.conf.sample + +RUN mkdir -p /var/run/postgresql && chown -R postgres:postgres /var/run/postgresql && chmod 2777 /var/run/postgresql + +ENV PGDATA /var/lib/postgresql/data +# this 777 will be replaced by 700 at runtime (allows semi-arbitrary "--user" values) +RUN mkdir -p "$PGDATA" && chown -R postgres:postgres "$PGDATA" && chmod 777 "$PGDATA" +VOLUME /var/lib/postgresql/data + +RUN cp /usr/src/postgresql/contrib/orioledb/ci/antithesis/entrypoint-regress-app.sh /usr/local/bin/ +ENTRYPOINT ["entrypoint-regress-app.sh"] + +ENV PG_MAX_WAL_SENDERS 8 +ENV PG_WAL_KEEP_SIZE 128 +RUN cp /usr/src/postgresql/contrib/orioledb/ci/antithesis/setup-replication.sh /docker-entrypoint-initdb.d/ + +RUN chmod +x /docker-entrypoint-initdb.d/setup-replication.sh \ + /usr/local/bin/entrypoint-regress-app.sh + +# We set the default STOPSIGNAL to SIGINT, which corresponds to what PostgreSQL +# calls "Fast Shutdown mode" wherein new connections are disallowed and any +# in-progress transactions are aborted, allowing PostgreSQL to stop cleanly and +# flush tables to disk, which is the best compromise available to avoid data +# corruption. 
+# +# Users who know their applications do not keep open long-lived idle connections +# may want to use a value of SIGTERM instead, which corresponds to "Smart +# Shutdown mode" in which any existing sessions are allowed to finish and the +# server stops when all sessions are terminated. +# +# See https://www.postgresql.org/docs/12/server-shutdown.html for more details +# about available PostgreSQL server shutdown signals. +# +# See also https://www.postgresql.org/docs/12/server-start.html for further +# justification of this as the default value, namely that the example (and +# shipped) systemd service files use the "Fast Shutdown mode" for service +# termination. +# +STOPSIGNAL SIGINT +# +# An additional setting that is recommended for all users regardless of this +# value is the runtime "--stop-timeout" (or your orchestrator/runtime's +# equivalent) for controlling how long to wait between sending the defined +# STOPSIGNAL and sending SIGKILL (which is likely to cause data corruption). +# +# The default in most runtimes (such as Docker) is 10 seconds, and the +# documentation at https://www.postgresql.org/docs/12/server-start.html notes +# that even 90 seconds may not be long enough in many instances. + +EXPOSE 5432 +CMD ["postgres"] \ No newline at end of file diff --git a/contrib/orioledb/ci/antithesis/Dockerfile.regress_config b/contrib/orioledb/ci/antithesis/Dockerfile.regress_config new file mode 100644 index 00000000000..a14f7582d3d --- /dev/null +++ b/contrib/orioledb/ci/antithesis/Dockerfile.regress_config @@ -0,0 +1,3 @@ +FROM scratch + +COPY ci/antithesis/docker-compose.yml . 
\ No newline at end of file diff --git a/contrib/orioledb/ci/antithesis/Dockerfile.regress_workload b/contrib/orioledb/ci/antithesis/Dockerfile.regress_workload new file mode 100644 index 00000000000..84b23f9d091 --- /dev/null +++ b/contrib/orioledb/ci/antithesis/Dockerfile.regress_workload @@ -0,0 +1,180 @@ +FROM debian:bullseye-slim + +ARG PGTAG + +ENV TZ=UTC + +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get full-upgrade -y; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + locales \ + tzdata \ + gosu \ + ; \ + localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8; + +ENV LANG=en_US.utf8 + +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC \ + apt-get -o Dpkg::Options::="--force-confdef" \ + -o Dpkg::Options::="--force-confold" \ + -y install \ + build-essential flex bison pkg-config \ + libreadline-dev make gdb libipc-run-perl \ + libicu-dev python3 python3-dev python3-pip \ + python3-setuptools python3-testresources \ + libzstd1 libzstd-dev llvm-13 clang-13 clang-tools-13 \ + curl wget liblz4-1 liblz4-dev libuuid1 uuid-dev git \ + ; + +RUN mkdir /docker-entrypoint-initdb.d + +RUN set -eux; \ + groupadd -r postgres --gid=999; \ + useradd -r -g postgres --uid=999 --home-dir=/var/lib/postgresql --shell=/bin/bash postgres; \ + mkdir -p /var/lib/postgresql; \ + chown postgres:postgres /var/lib/postgresql; \ +# also create the postgres user's home directory with appropriate permissions +# see https://github.com/docker-library/postgres/issues/274 + mkdir -p /var/lib/postgresql; \ + chown -R postgres:postgres /var/lib/postgresql + +RUN mkdir -p /usr/src/postgresql/contrib/orioledb +COPY . 
/usr/src/postgresql/contrib/orioledb +RUN cp /usr/src/postgresql/contrib/orioledb/ci/antithesis/libvoidstar.so /usr/lib + +ENV PGTAG $PGTAG +RUN [ -z "$PGTAG" ] && echo "PGTAG is required" && exit 1 || true +RUN MAJORVERSION=$(echo $PGTAG | \ + sed 's/[^[:digit:]]\+\([[:digit:]]\+\)_.*/\1/'); \ + EXPECTED_PATCHVERSION=$(echo $PGTAG | cut -d'_' -f2); \ + CURRENT_PATCHVERSION=$(grep "$MAJORVERSION" \ + /usr/src/postgresql/contrib/orioledb/.pgtags | \ + cut -d'_' -f2); \ + [ $EXPECTED_PATCHVERSION -ne $CURRENT_PATCHVERSION ] && \ + echo "patchset version in orioledb/.pgtags differs from" \ + "PGTAG variable for version $MAJORVERSION:" \ + "expected $EXPECTED_PATCHVERSION," \ + "got $CURRENT_PATCHVERSION" && exit 1 || true + +RUN set -eux; \ + curl -o postgresql.tar.gz \ + --header "Accept: application/vnd.github.v3.raw" \ + --remote-name \ + --location https://github.com/orioledb/postgres/tarball/$PGTAG; \ + mkdir -p /usr/src/postgresql; \ + tar \ + --extract \ + --file postgresql.tar.gz \ + --directory /usr/src/postgresql \ + --strip-components 1 \ + ; \ + rm postgresql.tar.gz; \ + cd /usr/src/postgresql; \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ +# explicitly update autoconf config.guess and config.sub so they support more arches/libcs + wget -O config/config.guess 'https://git.savannah.gnu.org/cgit/config.git/plain/config.guess?id=7d3d27baf8107b630586c962c057e22149653deb'; \ + wget -O config/config.sub 'https://git.savannah.gnu.org/cgit/config.git/plain/config.sub?id=7d3d27baf8107b630586c962c057e22149653deb'; \ +# configure options taken from: +# https://anonscm.debian.org/cgit/pkg-postgresql/postgresql.git/tree/debian/rules?h=9.5 + ( CC=clang-13 LLVM_CONFIG=llvm-config-13 CLANG=clang-13 \ + LLVM_SYMBOLIZER=llvm-symbolizer-13 \ + LDFLAGS="-lvoidstar -Wl,--build-id" \ + CFLAGS_SL="-fsanitize-coverage=trace-pc-guard" \ + LDFLAGS_SL="-lvoidstar -Wl,--build-id" \ + ./configure \ + --build="$gnuArch" \ +# 
"/usr/src/postgresql/src/backend/access/common/tupconvert.c:105: undefined reference to `libintl_gettext'" +# --enable-nls \ + --enable-integer-datetimes \ + --enable-thread-safety \ + --enable-tap-tests \ +# skip debugging info -- we want tiny size instead + --enable-debug \ + --enable-cassert \ + --disable-rpath \ + --with-uuid=e2fs \ + --with-gnu-ld \ + --with-pgport=5432 \ + --with-system-tzdata=/usr/share/zoneinfo \ + --prefix=/usr/local \ + --with-includes=/usr/local/include \ + --with-libraries=/usr/local/lib \ + --with-icu \ + --with-llvm \ + --with-lz4 \ + || cat config.log ); \ + echo "ORIOLEDB_PATCHSET_VERSION = `echo $PGTAG | cut -d'_' -f2`" >> src/Makefile.global; \ + echo '\n%.o: %.c\n\t$(CC) -c $(CFLAGS) -fsanitize-coverage=trace-pc-guard $(CPPFLAGS) -o $@ $<' >> src/Makefile.global; \ + make -j "$(nproc)"; \ + make -C contrib -j "$(nproc)"; \ + make install; \ + make -C contrib install; \ + make -C contrib/orioledb -j "$(nproc)" install; \ + \ + ldd $(which postgres) | grep "libvoidstar"; \ + nm $(which postgres) | grep "sanitizer_cov_trace_pc_guard"; \ + ldd $(which psql) | grep "libvoidstar"; \ + nm $(which psql) | grep "sanitizer_cov_trace_pc_guard"; \ + ldd contrib/orioledb/orioledb.so | grep "libvoidstar"; \ + nm contrib/orioledb/orioledb.so | grep "sanitizer_cov_trace_pc_guard"; \ + \ + chown -R postgres:postgres /usr/src/postgresql; \ + cd /; \ + rm -rf \ + /usr/local/share/doc \ + /usr/local/share/man \ + ; \ + \ + postgres --version; \ + pip3 install --upgrade psycopg2 six testgres; \ + apt-get -y remove \ + build-essential flex bison pkg-config \ + libreadline-dev libipc-run-perl \ + libicu-dev python3-dev python3-pip \ + libzstd-dev \ + curl wget liblz4-dev uuid-dev git \ + ; \ + apt-get -y autoremove; \ + rm -rf /var/cache/apt/archives /var/lib/apt/lists/*; \ + apt-get clean; + +RUN cp /usr/src/postgresql/contrib/orioledb/ci/antithesis/entrypoint-regress-workload.sh /usr/local/bin/ +ENTRYPOINT ["entrypoint-regress-workload.sh"] + +RUN 
chmod +x /usr/local/bin/entrypoint-regress-workload.sh + +# We set the default STOPSIGNAL to SIGINT, which corresponds to what PostgreSQL +# calls "Fast Shutdown mode" wherein new connections are disallowed and any +# in-progress transactions are aborted, allowing PostgreSQL to stop cleanly and +# flush tables to disk, which is the best compromise available to avoid data +# corruption. +# +# Users who know their applications do not keep open long-lived idle connections +# may want to use a value of SIGTERM instead, which corresponds to "Smart +# Shutdown mode" in which any existing sessions are allowed to finish and the +# server stops when all sessions are terminated. +# +# See https://www.postgresql.org/docs/12/server-shutdown.html for more details +# about available PostgreSQL server shutdown signals. +# +# See also https://www.postgresql.org/docs/12/server-start.html for further +# justification of this as the default value, namely that the example (and +# shipped) systemd service files use the "Fast Shutdown mode" for service +# termination. +# +STOPSIGNAL SIGINT +# +# An additional setting that is recommended for all users regardless of this +# value is the runtime "--stop-timeout" (or your orchestrator/runtime's +# equivalent) for controlling how long to wait between sending the defined +# STOPSIGNAL and sending SIGKILL (which is likely to cause data corruption). +# +# The default in most runtimes (such as Docker) is 10 seconds, and the +# documentation at https://www.postgresql.org/docs/12/server-start.html notes +# that even 90 seconds may not be long enough in many instances. 
+ +CMD ["workload"] \ No newline at end of file diff --git a/contrib/orioledb/ci/antithesis/Dockerfile.testgres_app_workload b/contrib/orioledb/ci/antithesis/Dockerfile.testgres_app_workload new file mode 100644 index 00000000000..7888b82ce56 --- /dev/null +++ b/contrib/orioledb/ci/antithesis/Dockerfile.testgres_app_workload @@ -0,0 +1,180 @@ +FROM debian:bullseye-slim + +ARG PGTAG + +ENV TZ=UTC + +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get full-upgrade -y; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + locales \ + tzdata \ + gosu \ + ; \ + localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8; + +ENV LANG=en_US.utf8 + +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC \ + apt-get -o Dpkg::Options::="--force-confdef" \ + -o Dpkg::Options::="--force-confold" \ + -y install \ + build-essential flex bison pkg-config \ + libreadline-dev make gdb libipc-run-perl \ + libicu-dev python3 python3-dev python3-pip \ + python3-setuptools python3-testresources \ + libzstd1 libzstd-dev llvm-13 clang-13 clang-tools-13 \ + curl wget liblz4-1 liblz4-dev libuuid1 uuid-dev git \ + ; + +RUN mkdir /docker-entrypoint-initdb.d + +RUN set -eux; \ + groupadd -r postgres --gid=999; \ + useradd -r -g postgres --uid=999 --home-dir=/var/lib/postgresql --shell=/bin/bash postgres; \ + mkdir -p /var/lib/postgresql; \ + chown postgres:postgres /var/lib/postgresql; \ +# also create the postgres user's home directory with appropriate permissions +# see https://github.com/docker-library/postgres/issues/274 + mkdir -p /var/lib/postgresql; \ + chown -R postgres:postgres /var/lib/postgresql + +RUN mkdir -p /usr/src/postgresql/contrib/orioledb +COPY . 
/usr/src/postgresql/contrib/orioledb +RUN cp /usr/src/postgresql/contrib/orioledb/ci/antithesis/libvoidstar.so /usr/lib + +ENV PGTAG $PGTAG +RUN [ -z "$PGTAG" ] && echo "PGTAG is required" && exit 1 || true +RUN MAJORVERSION=$(echo $PGTAG | \ + sed 's/[^[:digit:]]\+\([[:digit:]]\+\)_.*/\1/'); \ + EXPECTED_PATCHVERSION=$(echo $PGTAG | cut -d'_' -f2); \ + CURRENT_PATCHVERSION=$(grep "$MAJORVERSION" \ + /usr/src/postgresql/contrib/orioledb/.pgtags | \ + cut -d'_' -f2); \ + [ $EXPECTED_PATCHVERSION -ne $CURRENT_PATCHVERSION ] && \ + echo "patchset version in orioledb/.pgtags differs from" \ + "PGTAG variable for version $MAJORVERSION:" \ + "expected $EXPECTED_PATCHVERSION," \ + "got $CURRENT_PATCHVERSION" && exit 1 || true + +RUN set -eux; \ + curl -o postgresql.tar.gz \ + --header "Accept: application/vnd.github.v3.raw" \ + --remote-name \ + --location https://github.com/orioledb/postgres/tarball/$PGTAG; \ + mkdir -p /usr/src/postgresql; \ + tar \ + --extract \ + --file postgresql.tar.gz \ + --directory /usr/src/postgresql \ + --strip-components 1 \ + ; \ + rm postgresql.tar.gz; \ + cd /usr/src/postgresql; \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ +# explicitly update autoconf config.guess and config.sub so they support more arches/libcs + wget -O config/config.guess 'https://git.savannah.gnu.org/cgit/config.git/plain/config.guess?id=7d3d27baf8107b630586c962c057e22149653deb'; \ + wget -O config/config.sub 'https://git.savannah.gnu.org/cgit/config.git/plain/config.sub?id=7d3d27baf8107b630586c962c057e22149653deb'; \ +# configure options taken from: +# https://anonscm.debian.org/cgit/pkg-postgresql/postgresql.git/tree/debian/rules?h=9.5 + ( CC=clang-13 LLVM_CONFIG=llvm-config-13 CLANG=clang-13 \ + LLVM_SYMBOLIZER=llvm-symbolizer-13 \ + LDFLAGS="-lvoidstar -Wl,--build-id" \ + CFLAGS_SL="-fsanitize-coverage=trace-pc-guard" \ + LDFLAGS_SL="-lvoidstar -Wl,--build-id" \ + ./configure \ + --build="$gnuArch" \ +# 
"/usr/src/postgresql/src/backend/access/common/tupconvert.c:105: undefined reference to `libintl_gettext'" +# --enable-nls \ + --enable-integer-datetimes \ + --enable-thread-safety \ + --enable-tap-tests \ +# skip debugging info -- we want tiny size instead + --enable-debug \ + --enable-cassert \ + --disable-rpath \ + --with-uuid=e2fs \ + --with-gnu-ld \ + --with-pgport=5432 \ + --with-system-tzdata=/usr/share/zoneinfo \ + --prefix=/usr/local \ + --with-includes=/usr/local/include \ + --with-libraries=/usr/local/lib \ + --with-icu \ + --with-llvm \ + --with-lz4 \ + || cat config.log ); \ + echo "ORIOLEDB_PATCHSET_VERSION = `echo $PGTAG | cut -d'_' -f2`" >> src/Makefile.global; \ + echo '\n%.o: %.c\n\t$(CC) -c $(CFLAGS) -fsanitize-coverage=trace-pc-guard $(CPPFLAGS) -o $@ $<' >> src/Makefile.global; \ + make -j "$(nproc)"; \ + make -C contrib -j "$(nproc)"; \ + make install; \ + make -C contrib install; \ + make -C contrib/orioledb -j "$(nproc)" install; \ + \ + ldd $(which postgres) | grep "libvoidstar"; \ + nm $(which postgres) | grep "sanitizer_cov_trace_pc_guard"; \ + ldd $(which psql) | grep "libvoidstar"; \ + nm $(which psql) | grep "sanitizer_cov_trace_pc_guard"; \ + ldd contrib/orioledb/orioledb.so | grep "libvoidstar"; \ + nm contrib/orioledb/orioledb.so | grep "sanitizer_cov_trace_pc_guard"; \ + \ + chown -R postgres:postgres /usr/src/postgresql; \ + cd /; \ + rm -rf \ + /usr/local/share/doc \ + /usr/local/share/man \ + ; \ + \ + postgres --version; \ + pip3 install --upgrade psycopg2 six testgres; \ + apt-get -y remove \ + build-essential flex bison pkg-config \ + libreadline-dev libipc-run-perl \ + libicu-dev python3-dev python3-pip \ + libzstd-dev \ + curl wget liblz4-dev uuid-dev git \ + ; \ + apt-get -y autoremove; \ + rm -rf /var/cache/apt/archives /var/lib/apt/lists/*; \ + apt-get clean; + +RUN cp /usr/src/postgresql/contrib/orioledb/ci/antithesis/entrypoint-testgres-workload.sh /usr/local/bin/ +ENTRYPOINT ["entrypoint-testgres-workload.sh"] + +RUN 
chmod +x /usr/local/bin/entrypoint-testgres-workload.sh + +# We set the default STOPSIGNAL to SIGINT, which corresponds to what PostgreSQL +# calls "Fast Shutdown mode" wherein new connections are disallowed and any +# in-progress transactions are aborted, allowing PostgreSQL to stop cleanly and +# flush tables to disk, which is the best compromise available to avoid data +# corruption. +# +# Users who know their applications do not keep open long-lived idle connections +# may want to use a value of SIGTERM instead, which corresponds to "Smart +# Shutdown mode" in which any existing sessions are allowed to finish and the +# server stops when all sessions are terminated. +# +# See https://www.postgresql.org/docs/12/server-shutdown.html for more details +# about available PostgreSQL server shutdown signals. +# +# See also https://www.postgresql.org/docs/12/server-start.html for further +# justification of this as the default value, namely that the example (and +# shipped) systemd service files use the "Fast Shutdown mode" for service +# termination. +# +STOPSIGNAL SIGINT +# +# An additional setting that is recommended for all users regardless of this +# value is the runtime "--stop-timeout" (or your orchestrator/runtime's +# equivalent) for controlling how long to wait between sending the defined +# STOPSIGNAL and sending SIGKILL (which is likely to cause data corruption). +# +# The default in most runtimes (such as Docker) is 10 seconds, and the +# documentation at https://www.postgresql.org/docs/12/server-start.html notes +# that even 90 seconds may not be long enough in many instances. 
+ +CMD ["workload"] \ No newline at end of file diff --git a/contrib/orioledb/ci/antithesis/Dockerfile.testgres_config b/contrib/orioledb/ci/antithesis/Dockerfile.testgres_config new file mode 100644 index 00000000000..a81eb76a83c --- /dev/null +++ b/contrib/orioledb/ci/antithesis/Dockerfile.testgres_config @@ -0,0 +1,3 @@ +FROM scratch + +COPY ci/antithesis/docker-compose-testgres.yml ./docker-compose.yml \ No newline at end of file diff --git a/contrib/orioledb/ci/antithesis/docker-compose-testgres.yml b/contrib/orioledb/ci/antithesis/docker-compose-testgres.yml new file mode 100644 index 00000000000..2f4f7e01d92 --- /dev/null +++ b/contrib/orioledb/ci/antithesis/docker-compose-testgres.yml @@ -0,0 +1,17 @@ +version: '3.0' + +services: + primary: + container_name: primary + hostname: primary + restart: always + image: "orioledb:antithesis-testgres-pg${PG_VERSION:-14}-latest" + +# The subnet provided here is an example +# An alternate /24 can be used +networks: + antithesis-net: + driver: bridge + ipam: + config: + - subnet: 10.20.20.0/24 diff --git a/contrib/orioledb/ci/antithesis/docker-compose.yml b/contrib/orioledb/ci/antithesis/docker-compose.yml new file mode 100644 index 00000000000..4330a016adf --- /dev/null +++ b/contrib/orioledb/ci/antithesis/docker-compose.yml @@ -0,0 +1,66 @@ +version: '3.0' + +volumes: + postgres-primary: + driver: local + postgres-standby: + driver: local + +services: + primary: + container_name: primary + hostname: primary + restart: always + image: "orioledb:antithesis-pg${PG_VERSION:-14}-latest" + ports: + - '5432:5432' + volumes: + - 'postgres-primary:/var/lib/postgresql/data' + environment: + PGUSER: postgres + PGPASSWORD: mysecretpassword + networks: + antithesis-net: + ipv4_address: 10.20.20.2 + + standby: + hostname: standby + restart: always + image: "orioledb:antithesis-pg${PG_VERSION:-14}-latest" + ports: + - '5432' + volumes: + - 'postgres-standby:/var/lib/postgresql/data' + environment: + PGUSER: postgres + PGPASSWORD: 
mysecretpassword + REPLICATE_FROM: primary + depends_on: + - primary + networks: + antithesis-net: + ipv4_address: 10.20.20.4 + + workload: + container_name: workload + hostname: workload + restart: always + image: "orioledb-workload:antithesis-pg${PG_VERSION:-14}-latest" + environment: + PGHOST: primary + PGUSER: postgres + PGPASSWORD: mysecretpassword + depends_on: + - primary + networks: + antithesis-net: + ipv4_address: 10.20.20.6 + +# The subnet provided here is an example +# An alternate /24 can be used +networks: + antithesis-net: + driver: bridge + ipam: + config: + - subnet: 10.20.20.0/24 diff --git a/contrib/orioledb/ci/antithesis/entrypoint-regress-app.sh b/contrib/orioledb/ci/antithesis/entrypoint-regress-app.sh new file mode 100755 index 00000000000..7d4fd5b89d3 --- /dev/null +++ b/contrib/orioledb/ci/antithesis/entrypoint-regress-app.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# Backwards compatibility for old variable names (deprecated) +if [ "x$PGUSER" != "x" ]; then + POSTGRES_USER=$PGUSER +fi +if [ "x$PGPASSWORD" != "x" ]; then + POSTGRES_PASSWORD=$PGPASSWORD +fi + +# Forwards-compatibility for old variable names (pg_basebackup uses them) +if [ "x$PGPASSWORD" = "x" ]; then + export PGPASSWORD=$POSTGRES_PASSWORD +fi + +# Based on official postgres package's entrypoint script (https://hub.docker.com/_/postgres/) +# Modified to be able to set up a replica. The docker-entrypoint-initdb.d hook provided is inadequate. + +set -e + +if [ "${1:0:1}" = '-' ]; then + set -- postgres "$@" +fi + +if [ "$1" = 'postgres' ]; then + mkdir -p "$PGDATA" + chmod 700 "$PGDATA" + chown -R postgres "$PGDATA" + + mkdir -p /run/postgresql + chmod g+s /run/postgresql + chown -R postgres /run/postgresql + + # look specifically for PG_VERSION, as it is expected in the DB dir + if [ ! 
-s "$PGDATA/PG_VERSION" ]; then + if [ "x$REPLICATE_FROM" == "x" ]; then + eval "gosu postgres initdb $POSTGRES_INITDB_ARGS" + else + ( + trap loop_exit SIGINT + loop_exit() { + echo "Waiting stopped manually" + exit + } + until pg_isready -h ${REPLICATE_FROM} + do + echo "Waiting for primary to ping..." + sleep 1s + done + ) + ( + trap loop_exit SIGINT + loop_exit() { + echo "Waiting stopped manually" + exit + } + until gosu postgres pg_basebackup -h ${REPLICATE_FROM} -D ${PGDATA} -U ${POSTGRES_USER} -vP -w + do + echo "Waiting for primary to connect..." + sleep 1s + done + ) + fi + + # check password first so we can output the warning before postgres + # messes it up + if [ "$POSTGRES_PASSWORD" ]; then + pass="PASSWORD '$POSTGRES_PASSWORD'" + authMethod=md5 + else + # The - option suppresses leading tabs but *not* spaces. :) + cat >&2 <<-'EOWARN' + **************************************************** + WARNING: No password has been set for the database. + This will allow anyone with access to the + Postgres port to access your database. In + Docker's default configuration, this is + effectively any other container on the same + system. + Use "-e POSTGRES_PASSWORD=password" to set + it in "docker run". 
+ **************************************************** + EOWARN + + pass= + authMethod=trust + fi + + { echo; echo "host replication all 0.0.0.0/0 $authMethod"; } | gosu postgres tee -a "$PGDATA/pg_hba.conf" > /dev/null + { echo; echo "host all all 0.0.0.0/0 $authMethod"; } | gosu postgres tee -a "$PGDATA/pg_hba.conf" > /dev/null + + if [ "x$REPLICATE_FROM" == "x" ]; then + + # internal start of server in order to allow set-up using psql-client + # does not listen on external TCP/IP and waits until start finishes + gosu postgres pg_ctl -D "$PGDATA" \ + -o "-c listen_addresses='localhost'" \ + -w start + + : ${POSTGRES_USER:=postgres} + : ${POSTGRES_DB:=$POSTGRES_USER} + export POSTGRES_USER POSTGRES_DB + + psql=( psql -v ON_ERROR_STOP=1 ) + + if [ "$POSTGRES_DB" != 'postgres' ]; then + "${psql[@]}" --username postgres <<-EOSQL + CREATE DATABASE "$POSTGRES_DB" ; + EOSQL + echo + fi + + if [ "$POSTGRES_USER" = 'postgres' ]; then + op='ALTER' + else + op='CREATE' + fi + "${psql[@]}" --username postgres <<-EOSQL + $op USER "$POSTGRES_USER" WITH SUPERUSER $pass ; + EOSQL + echo + + fi + + psql+=( --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" ) + + echo + for f in /docker-entrypoint-initdb.d/*; do + case "$f" in + *.sh) echo "$0: running $f"; . "$f" ;; + *.sql) echo "$0: running $f"; "${psql[@]}" < "$f"; echo ;; + *.sql.gz) echo "$0: running $f"; gunzip -c "$f" | "${psql[@]}"; echo ;; + *) echo "$0: ignoring $f" ;; + esac + echo + done + + if [ "x$REPLICATE_FROM" == "x" ]; then + gosu postgres pg_ctl -D "$PGDATA" -m fast -w stop + fi + + echo + echo 'PostgreSQL init process complete; ready for start up.' 
+ echo + fi + + exec gosu postgres "$@" +fi + +exec "$@" \ No newline at end of file diff --git a/contrib/orioledb/ci/antithesis/entrypoint-regress-workload.sh b/contrib/orioledb/ci/antithesis/entrypoint-regress-workload.sh new file mode 100755 index 00000000000..b4b7cdc1753 --- /dev/null +++ b/contrib/orioledb/ci/antithesis/entrypoint-regress-workload.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +set -e + +if [ "${1}" == "workload" ]; then + until pg_isready; do + echo "Waiting for server to connect..."; + sleep 1s; + done; + + cd /usr/src/postgresql/contrib/orioledb/ + MAJORVERSION=$( pg_config --version | sed 's/.* \([[:digit:]]\+\).*/\1/' ) + REGRESSION_TESTS=( $(ls -1 sql) ) + ISOLATION_TESTS=( $(ls -1 specs) ) + STATUS=0 + + pg_config --version + + # Filter out btree_sys_check because it depends on test order and will always fail + # Filter out composite_pk_quals because it is not a test + for index in "${!REGRESSION_TESTS[@]}" ; do + case ${REGRESSION_TESTS[index]} in + "btree_sys_check.sql" | \ + "composite_pk_quals.sql") + unset -v 'REGRESSION_TESTS[$index]' + ;; + esac + done + REGRESSION_TESTS=("${REGRESSION_TESTS[@]}") + + # Filter out version specific tests + if [ $MAJORVERSION -lt 14 ]; then + for index in "${!REGRESSION_TESTS[@]}" ; do + if [ ${REGRESSION_TESTS[index]} == "toast_column_compress.sql" ]; then + unset -v 'REGRESSION_TESTS[$index]' + fi + done + REGRESSION_TESTS=("${REGRESSION_TESTS[@]}") + fi + + if [ $MAJORVERSION -lt 15 ]; then + for index in "${!ISOLATION_TESTS[@]}" ; do + if [ ${ISOLATION_TESTS[index]} == "isol_merge.spec" ]; then + unset -v 'ISOLATION_TESTS[$index]' + fi + done + ISOLATION_TESTS=("${ISOLATION_TESTS[@]}") + fi + + function run_regression_test() + { + REGRESSION_TEST=$1 + echo "Regression test '$REGRESSION_TEST' started" + ../../src/test/regress/pg_regress \ + --inputdir=$(pwd) \ + --bindir='$(pg_config --bindir)' \ + $REGRESSION_TEST || STATUS=$? 
+ if [ $STATUS -ne 0 ]; then + echo "regression.diffs contents start" + cat regression.diffs + echo "regression.diffs contents end" + fi + echo "Regression test '$REGRESSION_TEST' ended" + } + + while true; do + RANDOM_NUMBER=$(od -vAn -N1 -tu1 < /dev/urandom) + REGRESSION_TEST_NUM=$(( $RANDOM_NUMBER % ${#REGRESSION_TESTS[@]} )) + ISOLATION_TEST_NUM=$(( $RANDOM_NUMBER % ${#ISOLATION_TESTS[@]} )) + REGRESSION_TEST=${REGRESSION_TESTS[$REGRESSION_TEST_NUM]%.*} + ISOLATION_TEST=${ISOLATION_TESTS[$ISOLATION_TEST_NUM]%.*} + STATUS=0 # reset per iteration: a stale failure would make us cat a diffs file that no longer exists and die under 'set -e' + run_regression_test $REGRESSION_TEST + + echo "Isolation test '$ISOLATION_TEST' started" + ../../src/test/isolation/pg_isolation_regress \ + --inputdir=$(pwd) --outputdir=output_iso \ + --bindir='$(pg_config --bindir)' \ + $ISOLATION_TEST && STATUS=0 || STATUS=$? # STATUS must reflect this run only, not the regression test above + if [ $STATUS -ne 0 ]; then + echo "output_iso/regression.diffs contents start" + cat output_iso/regression.diffs + echo "output_iso/regression.diffs contents end" + fi + echo "Isolation test '$ISOLATION_TEST' ended" + done +else + # An unknown command (debugging the container?): forward it as is, keeping each argument intact + exec "$@" +fi \ No newline at end of file diff --git a/contrib/orioledb/ci/antithesis/entrypoint-testgres-workload.sh b/contrib/orioledb/ci/antithesis/entrypoint-testgres-workload.sh new file mode 100755 index 00000000000..2fe97ea3cf6 --- /dev/null +++ b/contrib/orioledb/ci/antithesis/entrypoint-testgres-workload.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -e + +if [ "${1}" == "workload" ]; then + RANDOM_NUMBER=$(od -vAn -N1 -tu1 < /dev/urandom) + cd /usr/src/postgresql/contrib/orioledb + MAJORVERSION=$( pg_config --version | sed 's/.* \([[:digit:]]\+\).*/\1/' ) + TESTGRES_TESTS=( $(ls -1 t) ) + + # Filter out base_test because it does not contain any tests + for index in "${!TESTGRES_TESTS[@]}" ; do + case ${TESTGRES_TESTS[index]} in + *base_test.py) + unset -v 'TESTGRES_TESTS[$index]' + ;; + esac + done + TESTGRES_TESTS=("${TESTGRES_TESTS[@]}") + + # Filter out version specific tests + if [ $MAJORVERSION -lt 15 ]; then + 
for index in "${!TESTGRES_TESTS[@]}" ; do + if [ ${TESTGRES_TESTS[index]} == "merge_into_test.py" ]; then + unset -v 'TESTGRES_TESTS[$index]' + fi + done + TESTGRES_TESTS=("${TESTGRES_TESTS[@]}") + fi + + while true; do + RANDOM_NUMBER=$(od -vAn -N1 -tu1 < /dev/urandom) + TESTGRES_TEST_FILE_NUM=$(( $RANDOM_NUMBER % ${#TESTGRES_TESTS[@]} )) + TESTGRES_TEST_FILE=${TESTGRES_TESTS[$TESTGRES_TEST_FILE_NUM]} + TEST_CLASS=$(cat t/$TESTGRES_TEST_FILE| sed -n 's/class \(.*\)(.*BaseTest):.*/\1/p') + TESTS=( $(cat t/$TESTGRES_TEST_FILE | sed -n '/def test_/p' | sed 's/.*def \(test_.*\)(.*/\1/') ) + TEST_NUM=$(( $RANDOM_NUMBER % ${#TESTS[@]} )) + TEST="t.${TESTGRES_TEST_FILE%.*}.$TEST_CLASS.${TESTS[$TEST_NUM]}" + echo "Testgres test '$TEST' started" + STATUS=0 # reset per iteration so an earlier failure is not reported again + gosu postgres make testgrescheck_part_1 TESTGRESCHECKS_PART_1="$TEST" || STATUS=$? # capture make's status directly; '|| ...' also keeps 'set -e' from aborting the loop + echo "Testgres test '$TEST' ended" + if [ $STATUS -ne 0 ]; then + echo "output_iso/regression.diffs contents start" + cat output_iso/regression.diffs || true # the diffs file may not exist for testgres runs; do not abort the workload + echo "output_iso/regression.diffs contents end" + fi + done +else + # An unknown command (debugging the container?): forward it as is, keeping each argument intact + exec "$@" +fi \ No newline at end of file diff --git a/contrib/orioledb/ci/antithesis/libvoidstar.so b/contrib/orioledb/ci/antithesis/libvoidstar.so new file mode 100644 index 00000000000..0f8a0f23c3f Binary files /dev/null and b/contrib/orioledb/ci/antithesis/libvoidstar.so differ diff --git a/contrib/orioledb/ci/antithesis/setup-replication.sh b/contrib/orioledb/ci/antithesis/setup-replication.sh new file mode 100755 index 00000000000..018129ae92a --- /dev/null +++ b/contrib/orioledb/ci/antithesis/setup-replication.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +if [ "x$REPLICATE_FROM" == "x" ]; then + +cat >> ${PGDATA}/postgresql.conf < ${PGDATA}/postgresql.conf <> src/Makefile.global; \ +fi ; +make -sj `nproc` +make -sj `nproc` install +make -C contrib -sj `nproc` +make -C contrib -sj `nproc` install + +if [ $PG_VERSION = "17" ]; then + make -C 
src/test/modules/injection_points -sj `nproc` install +fi +cd .. + +if [ $CHECK_TYPE = "static" ] && [ $COMPILER = "clang" ]; then + sed -i.bak "s/ -Werror=unguarded-availability-new//g" pgsql/lib/pgxs/src/Makefile.global +fi + +export PATH="$GITHUB_WORKSPACE/pgsql/bin:$PATH" + +cd orioledb +if [ $CHECK_TYPE = "sanitize" ]; then + make -j `nproc` USE_PGXS=1 IS_DEV=1 CFLAGS_SL="$(pg_config --cflags_sl) -Werror -fno-omit-frame-pointer -fsanitize=alignment -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=nonnull-attribute -fstack-protector" LDFLAGS_SL="-lubsan -fsanitize=address -fsanitize=undefined -lasan" +elif [ $CHECK_TYPE = "check_page" ]; then + make -j `nproc` USE_PGXS=1 IS_DEV=1 CFLAGS_SL="$(pg_config --cflags_sl) -Werror -DCHECK_PAGE_STRUCT -DCHECK_PAGE_STATS" +elif [ $CHECK_TYPE = "valgrind_1" ] || [ $CHECK_TYPE = "valgrind_2" ]; then + make -j `nproc` USE_PGXS=1 IS_DEV=1 CFLAGS_SL="$(pg_config --cflags_sl) -Werror -coverage -fprofile-update=atomic -flto" +elif [ $CHECK_TYPE != "static" ]; then + make -j `nproc` USE_PGXS=1 IS_DEV=1 CFLAGS_SL="$(pg_config --cflags_sl) -Werror -coverage -fprofile-update=atomic" +fi +if [ $CHECK_TYPE != "static" ]; then + make -j `nproc` USE_PGXS=1 IS_DEV=1 install +fi +cd .. 
diff --git a/contrib/orioledb/ci/check.sh b/contrib/orioledb/ci/check.sh new file mode 100755 index 00000000000..5e9a05a981a --- /dev/null +++ b/contrib/orioledb/ci/check.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +set -eu +export PATH="$GITHUB_WORKSPACE/pgsql/bin:$GITHUB_WORKSPACE/python3-venv/bin:$PATH" + +# remove the soft limit on core dump size +ulimit -c unlimited -S +# set the core dump file name pattern (directory is unique per commit and run) +mkdir -p /tmp/cores-$GITHUB_SHA-$TIMESTAMP +sudo sh -c "echo \"/tmp/cores-$GITHUB_SHA-$TIMESTAMP/%t_%p.core\" > /proc/sys/kernel/core_pattern" + +# remember the number of oom-killer entries in syslog before the tests; check_output.sh compares against ./ooms.tmp +[ -f /var/log/system.log ] && syslogfile=/var/log/system.log || syslogfile=/var/log/syslog +[ -f $syslogfile ] && cat $syslogfile | grep oom-kill | wc -l > ./ooms.tmp \ + || { echo "Syslog file not found"; status=1; } + + +status=0 # NOTE(review): this overwrites the status=1 set by the syslog check above -- confirm that is intended + +cd orioledb +if [ $CHECK_TYPE = "valgrind_1" ]; then + make USE_PGXS=1 IS_DEV=1 VALGRIND=1 regresscheck isolationcheck testgrescheck_part_1 -j $(nproc) || status=$? +elif [ $CHECK_TYPE = "valgrind_2" ]; then + make USE_PGXS=1 IS_DEV=1 VALGRIND=1 testgrescheck_part_2 -j $(nproc) || status=$? +elif [ $CHECK_TYPE = "sanitize" ]; then + if [ $COMPILER = "clang" ]; then + FAKE_STACK=1 + else + FAKE_STACK=0 # it is really slow for gcc + fi + + UBSAN_OPTIONS="log_path=$PWD/ubsan.log" \ + ASAN_OPTIONS=$(cat <<-END + verify_asan_link_order=0: + detect_stack_use_after_return=$FAKE_STACK: + detect_leaks=0: + abort_on_error=1: + disable_coredump=0: + strict_string_checks=1: + check_initialization_order=1: + strict_init_order=1: + detect_odr_violation=0: + log_path=$PWD/asan.log: + max_uar_stack_size_log=25: + END + ) \ + make USE_PGXS=1 IS_DEV=1 installcheck -j $(nproc) || status=$? 
+elif [ $CHECK_TYPE = "pg_tests" ]; then + cd ../postgresql + cat src/test/regress/parallel_schedule | sed "s/indirect_toast//" >$GITHUB_WORKSPACE/parallel_schedule_no_segfaults + # Backport float tests patch + wget -O float-patch.patch "https://git.postgresql.org/gitweb/?p=postgresql.git;a=patch;h=da83b1ea10c2b7937d4c9e922465321749c6785b" + git apply float-patch.patch + # Initialize the data directory and preload OrioleDB (not yet the default AM at this point) + initdb -N --encoding=UTF-8 --locale=C -D $GITHUB_WORKSPACE/pgsql/pgdata + + echo "wal_level = logical" >> $GITHUB_WORKSPACE/pgsql/pgdata/postgresql.conf + echo "shared_preload_libraries = 'orioledb'" >> $GITHUB_WORKSPACE/pgsql/pgdata/postgresql.conf + # We don't support SSI. Run the regression/isolation tests in 'error' + # mode to catch the explicit errors. + echo "orioledb.serializable = 'error'" >> $GITHUB_WORKSPACE/pgsql/pgdata/postgresql.conf + pg_ctl -D $GITHUB_WORKSPACE/pgsql/pgdata -l pg.log start + make -C src/test/regress installcheck -j $(nproc) || status=$? + make -C src/test/isolation installcheck -j $(nproc) || status=$? + make -C src/test/subscription installcheck -j $(nproc) || status=$? 
+ + if [ $status -eq 0 ]; then + echo "default_table_access_method = 'orioledb'" >> $GITHUB_WORKSPACE/pgsql/pgdata/postgresql.conf + git apply patches/limit.patch + if [ $PG_VERSION = "17" ]; then + git apply patches/subscription_enable_oriole.diff + git apply patches/027_stream_regress.patch + fi + pg_ctl -D $GITHUB_WORKSPACE/pgsql/pgdata -l pg.log restart + + pg_basebackup -D $GITHUB_WORKSPACE/pgsql/rep_pgdata -Fp -Xs -P + touch $GITHUB_WORKSPACE/pgsql/rep_pgdata/standby.signal + echo "port = 5433" >> $GITHUB_WORKSPACE/pgsql/rep_pgdata/postgresql.conf + echo "primary_conninfo = 'host=/tmp port=5432'" >> $GITHUB_WORKSPACE/pgsql/rep_pgdata/postgresql.conf + echo "allow_in_place_tablespaces = true" >> $GITHUB_WORKSPACE/pgsql/rep_pgdata/postgresql.conf + pg_ctl -D $GITHUB_WORKSPACE/pgsql/rep_pgdata -l rep_pg.log start + + cd src/test/regress + make installcheck-tests EXTRA_REGRESS_OPTS="--load-extension=orioledb --schedule=$GITHUB_WORKSPACE/parallel_schedule_no_segfaults" TESTS="" -j $(nproc) || true + cd ../../.. 
+ + # Run PostgreSQL regression tests; failures are tolerated here and judged from the filtered diff below + make -C src/test/regress EXTRA_REGRESS_OPTS="--load-extension=orioledb" installcheck-oriole -j $(nproc) || true + if [ -f src/test/regress/regression.diffs ]; then + python3 ../orioledb/ci/filter_regression_diff.py --diff src/test/regress/regression.diffs > src/test/regress_filtered.diffs + rm src/test/regress/regression.diffs + [ -s src/test/regress_filtered.diffs ] || rm -f src/test/regress_filtered.diffs src/test/regress/regression.diffs + fi + + echo "orioledb.strict_mode = true" >> $GITHUB_WORKSPACE/pgsql/pgdata/postgresql.conf + pg_ctl -D $GITHUB_WORKSPACE/pgsql/pgdata -l pg.log restart + make -C src/test/isolation EXTRA_REGRESS_OPTS="--load-extension=orioledb" installcheck -j $(nproc) || true + if [ -f src/test/isolation/output_iso/regression.diffs ]; then + python3 ../orioledb/ci/filter_isolation_diff.py --diff src/test/isolation/output_iso/regression.diffs > src/test/isolation_filtered.diffs + [ -s src/test/isolation_filtered.diffs ] || rm src/test/isolation_filtered.diffs src/test/isolation/output_iso/regression.diffs + fi + + psql postgres -p 5432 -c 'CREATE EXTENSION orioledb;' || true + + # Wait (up to 60 one-second attempts) for the replica to synchronize with the primary after the tests + replica_synced=0 + for i in $(seq 1 60); do + primary_lsn=$(psql postgres -p 5432 -tA -c "SELECT pg_current_wal_lsn();" 2>/dev/null || echo "N/A") + replica_lsn=$(psql postgres -p 5433 -tA -c "SELECT pg_last_wal_replay_lsn();" 2>/dev/null || echo "N/A") + if [ "$primary_lsn" != "N/A" ] && [ "$replica_lsn" != "N/A" ] && \ + [ "$primary_lsn" = "$replica_lsn" ]; then + echo "Replica synchronized (primary: $primary_lsn, replica: $replica_lsn)" + replica_synced=1 + break + fi + echo "Waiting for replica to synchronize... 
($i/60): primary=$primary_lsn replica=$replica_lsn" + sleep 1 + done + if [ $replica_synced -eq 0 ]; then + echo "ERROR: Replica failed to synchronize within 60 seconds" + exit 1 + fi + + echo "=== Replica xid_meta ===" + psql postgres -p 5433 -x -c "SELECT * FROM orioledb_get_xid_meta();" || true + echo "=== Replica undo_meta ===" + psql postgres -p 5433 -x -c "SELECT * FROM orioledb_get_undo_meta();" || true + echo "=== Replica proc retain undo locations ===" + psql postgres -p 5433 -c "SELECT * FROM orioledb_get_proc_retain_undo_locations();" || true + + echo "=== Checking xid_meta: nextXid == runXmin + 1 ===" + xid_check=$(psql postgres -p 5433 -tA -c "SELECT nextxid = runxmin + 1 FROM orioledb_get_xid_meta();" 2>/dev/null || echo "error") + if [ "$xid_check" != "t" ]; then + echo "ERROR: nextXid != runXmin + 1" + psql postgres -p 5433 -x -c "SELECT * FROM orioledb_get_xid_meta();" || true + status=1 + fi + + echo "=== Checking undo_meta: lastUsedLocation == minProcRetainLocation ===" + undo_check=$(psql postgres -p 5433 -tA -c "SELECT bool_and(lastusedlocation = minprocretainlocation) FROM orioledb_get_undo_meta();" 2>/dev/null || echo "error") + if [ "$undo_check" != "t" ]; then + echo "ERROR: lastUsedLocation != minProcRetainLocation for some undo type" + psql postgres -p 5433 -x -c "SELECT * FROM orioledb_get_undo_meta();" || true + status=1 + fi + + echo "=== Comparing primary and replica data ===" + for db in regression isolation_regression; do + if ! psql -p 5432 -tA -c "SELECT 1" "$db" >/dev/null 2>&1; then + echo "Skipping $db: database does not exist" + continue + fi + + echo "--- Comparing $db schema ---" + pg_dump -s -p 5432 "$db" | grep -v '^\\\(un\)\{0,1\}restrict' > "/tmp/primary_${db}_schema.sql" + pg_dump -s -p 5433 "$db" | grep -v '^\\\(un\)\{0,1\}restrict' > "/tmp/replica_${db}_schema.sql" + if ! 
diff -u "/tmp/primary_${db}_schema.sql" "/tmp/replica_${db}_schema.sql" > "dump_diff_${db}_schema.txt"; then + echo "ERROR: $db schema differs between primary and replica" + head -200 "dump_diff_${db}_schema.txt" + status=1 + else + echo "$db schema matches" + rm -f "dump_diff_${db}_schema.txt" + fi + + echo "--- Comparing $db data ---" + # Build --exclude-table flags for unlogged tables + exclude_flags="" + for tbl in $(psql -p 5432 -tA -c "SELECT schemaname || '.' || tablename FROM pg_tables WHERE tableowner = current_user AND tablename IN (SELECT relname FROM pg_class WHERE relpersistence = 'u')" "$db" 2>/dev/null); do + exclude_flags="$exclude_flags --exclude-table=$tbl" + done + pg_dump -a -Fd --compress=0 -p 5432 $exclude_flags -f "/tmp/primary_${db}_data" "$db" + pg_dump -a -Fd --compress=0 -p 5433 $exclude_flags -f "/tmp/replica_${db}_data" "$db" + python3 ../orioledb/ci/sort_dump.py "/tmp/primary_${db}_data" "/tmp/primary_${db}_sorted" + python3 ../orioledb/ci/sort_dump.py "/tmp/replica_${db}_data" "/tmp/replica_${db}_sorted" + if ! diff -ru "/tmp/primary_${db}_sorted" "/tmp/replica_${db}_sorted" > "dump_diff_${db}_data.txt"; then + echo "ERROR: $db data differs between primary and replica" + head -200 "dump_diff_${db}_data.txt" + status=1 + else + echo "$db data matches" + rm -f "dump_diff_${db}_data.txt" + fi + done + + pg_ctl -D $GITHUB_WORKSPACE/pgsql/rep_pgdata -l rep_pg.log stop + if [ $PG_VERSION = "17" ]; then + make -C src/test/subscription installcheck-oriole -j $(nproc) || status=$? + make -C src/test/recovery installcheck-oriole -j $(nproc) || status=$? + fi + fi + pg_ctl -D $GITHUB_WORKSPACE/pgsql/pgdata -l pg.log stop +elif [ $CHECK_TYPE = "dm_log_writes" ]; then + # Run only the recovery tests with OS buffer loss simulation enabled. + # Each crash point uses dm-log-writes: only writes that reached the + # block device before the mark survive, discarding OS-buffered data. 
+ make USE_PGXS=1 IS_DEV=1 USE_DM_LOG_WRITES=1 test/t/recovery_test.py || status=$? +else + make USE_PGXS=1 IS_DEV=1 installcheck -j $(nproc) || status=$? +fi +cd .. + +exit $status diff --git a/contrib/orioledb/ci/check_bench_cores.sh b/contrib/orioledb/ci/check_bench_cores.sh new file mode 100644 index 00000000000..9ece2809ed9 --- /dev/null +++ b/contrib/orioledb/ci/check_bench_cores.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -eu + +status=0 + +# check core dumps if any +cores=$(find /mnt/ -name '*.core' 2>/dev/null || true) + +if [ -n "$cores" ]; then + for corefile in $cores ; do + if [[ $corefile != *_3.core ]]; then + binary=$(gdb -quiet -core $corefile -batch -ex 'info auxv' | grep AT_EXECFN | perl -pe "s/^.*\"(.*)\"\$/\$1/g") + echo dumping $corefile for $binary + gdb --batch --quiet \ + -ex "thread apply all bt full" \ + -ex 'eval "p *((LWLockHandle (*) [%u]) held_lwlocks)", num_held_lwlocks' \ + -ex 'eval "p *((MyLockedPage (*) [%u]) myLockedPages)", numberOfMyLockedPages' \ + -ex "quit" \ + $binary $corefile + status=1 + fi + done +fi + +exit $status diff --git a/contrib/orioledb/ci/check_output.sh b/contrib/orioledb/ci/check_output.sh new file mode 100755 index 00000000000..12ba0cf203b --- /dev/null +++ b/contrib/orioledb/ci/check_output.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +set -eu + +status=0 + +[ -f /var/log/system.log ] && syslogfile=/var/log/system.log || syslogfile=/var/log/syslog +[ -f $syslogfile ] || { echo "Syslog file not found"; status=1; } +oomcount=$(cat $syslogfile | grep oom-kill | wc -l) +[ -f ./ooms.tmp ] && { oomsbefore=$(cat ./ooms.tmp); rm ./ooms.tmp; } || \ + { oomsbefore=0; echo "File ooms.tmp not found. 
check.sh should be run before check-output.sh"; status=1;} +if [ $oomcount != $oomsbefore ]; then + echo "======== OOM-killer came during the tests" + status=1 +fi + +# show diff if it exists +for f in ` find ./orioledb/test ./postgresql/src/test -type f \( -name 'regression.diffs' -o -name 'regress_filtered.diffs' -o -name 'isolation_filtered.diffs' \) ` ; do + echo "========= Contents of $f (first 500 lines)" + head -n 500 $f + line_count=$(wc -l < $f) + if [ $line_count -gt 500 ]; then + echo "... (truncated $(($line_count - 500)) lines)" + echo "Full log available as artifact" + fi + status=1 +done + +# show valgrind logs if needed +if [ $CHECK_TYPE = "valgrind_1" ] || [ $CHECK_TYPE = "valgrind_2" ]; then + for f in ` find . -name pid-*.log ` ; do + if grep -q 'Command: [^ ]*/postgres' $f && grep -E -q '(Process terminating|ERROR SUMMARY: [1-9])' $f; then + echo "========= Contents of $f" + cat $f + status=1 + fi + done +fi + +# check core dumps if any +if [ $CHECK_TYPE = "valgrind_1" ] || [ $CHECK_TYPE = "valgrind_2" ]; then + cores=$(find orioledb/ -name '*.core.*' 2>/dev/null) +else + cores=$(find /tmp/cores-$GITHUB_SHA-$TIMESTAMP/ -name '*.core' 2>/dev/null) +fi + +if [ -n "$cores" ]; then + for corefile in $cores ; do + if [[ $corefile != *_3.core ]]; then + if [ $CHECK_TYPE = "valgrind_1" ] || [ $CHECK_TYPE = "valgrind_2" ]; then + # Valgring core dumps have not auxiliary vector. We can't detect a binary file dynamically + # but this value is valid for most cases. + binary=tmp_install/usr/local/pgsql/bin/postgres + else + binary=$(gdb -quiet -core $corefile -batch -ex 'info auxv' | grep AT_EXECFN | perl -pe "s/^.*\"(.*)\"\$/\$1/g") + fi + echo dumping $corefile for $binary + gdb --batch --quiet -x ./orioledb/ci/cmds.gdb $binary $corefile + status=1 + fi + done + tar -czf /tmp/cores-$GITHUB_SHA-$TIMESTAMP.tar.gz . $cores +fi + +rm -rf /tmp/cores-$GITHUB_SHA-$TIMESTAMP + +for f in ` find . 
-name 'ubsan.log.*' ` ; do + echo "========= Contents of $f" + cat $f + status=1 +done + +for f in ` find . -name 'asan.log.*' ` ; do + echo "========= Contents of $f" + cat $f + status=1 +done + +exit $status diff --git a/contrib/orioledb/ci/cmds.gdb b/contrib/orioledb/ci/cmds.gdb new file mode 100644 index 00000000000..37feaff1f40 --- /dev/null +++ b/contrib/orioledb/ci/cmds.gdb @@ -0,0 +1,10 @@ +thread apply all bt full +eval "p *((LWLockHandle (*) [%u]) held_lwlocks)", num_held_lwlocks +eval "p *((MyLockedPage (*) [%u]) myLockedPages)", numberOfMyLockedPages +up 99999 +set $i=0 +set $end=argc +while ($i < $end) +p argv[$i++] +end +quit \ No newline at end of file diff --git a/contrib/orioledb/ci/codecov.sh b/contrib/orioledb/ci/codecov.sh new file mode 100644 index 00000000000..06af8fe6a75 --- /dev/null +++ b/contrib/orioledb/ci/codecov.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -eu + +cd orioledb +bash <(curl -s https://codecov.io/bash) -X gcov +cd .. diff --git a/contrib/orioledb/ci/cppcheck-suppress b/contrib/orioledb/ci/cppcheck-suppress new file mode 100644 index 00000000000..45728134708 --- /dev/null +++ b/contrib/orioledb/ci/cppcheck-suppress @@ -0,0 +1,5 @@ +redundantAssignment:* +uselessAssignmentPtrArg:* +incorrectStringBooleanError:* +nullPointerRedundantCheck:* +integerOverflowCond:*/atomics/generic.h diff --git a/contrib/orioledb/ci/docker_matrix.sh b/contrib/orioledb/ci/docker_matrix.sh new file mode 100755 index 00000000000..d832c798f9b --- /dev/null +++ b/contrib/orioledb/ci/docker_matrix.sh @@ -0,0 +1,296 @@ +#!/usr/bin/env bash +set -Eeo pipefail + +# Default values +BASE_MATRIX="ubuntu:24.04" +PG_MAJOR="17" +COMPILER="clang" +DEBUG="false" +DRY_RUN="false" + +# Define base lists +declare -A base_lists +base_lists[all-oldest]="ubuntu:22.04 alpine:3.18" +base_lists[all-latest]="ubuntu:25.04 alpine:3.21" +base_lists[all-dev]="ubuntu:devel alpine:edge" +base_lists[all-alpine]="alpine:edge alpine:3.21 alpine:3.20 alpine:3.19 alpine:3.18" 
+base_lists[all-debian]="ubuntu:devel ubuntu:25.04 ubuntu:24.10 ubuntu:24.04 ubuntu:22.04" +base_lists[all]="${base_lists[all-alpine]} ${base_lists[all-debian]}" + +# Valid Alpine, Ubuntu, PG and Compiler versions +VALID_ALPINE_VERSIONS="edge 3.21 3.20 3.19 3.18 latest" +VALID_UBUNTU_VERSIONS="devel 25.04 24.10 24.04 22.04 plucky oracular noble jammy latest rolling" +VALID_PG_MAJOR_VERSIONS="17 16" +VALID_COMPILERS="clang gcc" + + +# Function to display help message +show_help() { + cat << EOF +Usage: ./ci/docker_matrix.sh [options] +This script should be run from the project root directory! + +Experimental OrioleDB Docker matrix build command + for testing multiple: PostgreSQL versions, compilers, and base images. + +Options: + --base MATRIX|IMAGE Specify the base/matrix option or individual image + Matrix options: + all-alpine # [ ${base_lists[all-alpine]} ], + all-debian # [ ${base_lists[all-debian]} ], + all-oldest # [ ${base_lists[all-oldest]} ], + all-latest # [ ${base_lists[all-latest]} ], + all-dev # [ ${base_lists[all-dev]} ], + all, + Individual image examples: + alpine:* [ $VALID_ALPINE_VERSIONS ], + ubuntu:* [ $VALID_UBUNTU_VERSIONS ], + Default: $BASE_MATRIX + --pg-major VERSION Specify PostgreSQL major version + Valid options: [ all $VALID_PG_MAJOR_VERSIONS ] + Default: $PG_MAJOR + --compiler TYPE Specify compiler type + Valid options: [ all $VALID_COMPILERS ] + Default: $COMPILER + --debug BOOL Enable debug mode and preserve the build environments. + In this case, each image size exceeds +1GB + Valid options: [ all true false ] + Default: $DEBUG + --dry-run Only print commands without executing + Default: $([ "$DRY_RUN" = true ] && echo "enabled" || echo "disabled") + --clean Clean Docker images ( remove orioletest:* ) + [ --dry-run mode is not working with this option! 
] + --help Display this help message + +For the details: check the "--dry-run" and the Dockerfiles in the root directory + - alpine Dockerfile : ./docker/Dockerfile + - ubuntu Dockerfile : ./docker/Dockerfile.ubuntu + +The Docker build logs generated in the ./log_docker_build directory, +and you can check the build logs with: + grep -i -C 1 warning: ./log_docker_build/*/*.build.log + +Examples: + ./ci/docker_matrix.sh --base all-dev --pg-major all --compiler clang + ./ci/docker_matrix.sh --base alpine:3.21 --pg-major 17 --compiler gcc --debug true + ./ci/docker_matrix.sh --base ubuntu:noble --pg-major 16 --compiler all --debug false + +Default behavior: + ./ci/docker_matrix.sh --base $BASE_MATRIX --pg-major $PG_MAJOR --compiler $COMPILER --debug $DEBUG + +EOF +} + +# Function to check if the script is run from the correct directory +check_directory() { + if [[ "$(basename "$(pwd)")" == "ci" ]]; then + echo "Error: This script should be run from the project root directory, not the 'ci' directory." + echo "Please change to the project root directory and run: ./ci/docker_matrix.sh" + exit 1 + fi + + if [[ ! -f "./ci/docker_matrix.sh" ]]; then + echo "Error: This script should be run from the project root directory." + echo "Please change to the project root directory and run: ./ci/docker_matrix.sh" + exit 1 + fi +} + +# Function to check if Docker is installed +check_docker() { + if ! command -v docker &> /dev/null; then + echo "Warning: Docker is not installed or not in PATH. Please install Docker and try again." + exit 1 + fi +} + +# Function to clean Docker images +clean_docker_images() { + echo "Cleaning Docker images..." 
+ docker images | grep 'orioletest' | awk '{print $3}' | sort -u | xargs -r docker rmi -f +} + +## Function to format and optionally execute a command +execute_command() { + echo + echo "# -----------" + local cmd="$*" + + # Formatting: insert line breaks before certain options + # and at '|' and 'tee' for readability + local formatted_cmd=$(echo "$cmd" | sed -E ' + s/(\s)(\||tee|2>&1)/ \\\n \2/g; + s/ (\-[^ ]+)/ \\\n \1/g + ') + + # Process each line to replace multiple spaces before backslash at end of lines + formatted_cmd=$(echo "$formatted_cmd" | sed -E 's/[[:space:]]+\\$/ \\/') + + echo "$formatted_cmd" + + if [ "$DRY_RUN" = false ]; then + eval "$cmd" + fi +} + + + +# Function to validate and process the base parameter +process_base_parameter() { + local base="$1" + if [[ "${base_lists[$base]}" ]]; then + echo "${base_lists[$base]}" + elif [[ $base == alpine:* ]]; then + local version="${base#alpine:}" + if [[ " $VALID_ALPINE_VERSIONS " == *" $version "* ]]; then + echo "$base" + else + echo "Invalid Alpine version: $version" >&2 + exit 1 + fi + elif [[ $base == ubuntu:* ]]; then + local version="${base#ubuntu:}" + if [[ " $VALID_UBUNTU_VERSIONS " == *" $version "* ]]; then + echo "$base" + else + echo "Invalid Ubuntu version: $version" >&2 + exit 1 + fi + else + echo "Invalid base parameter: $base" >&2 + exit 1 + fi +} + +# Main build logic +main() { + local pg_major_list compiler_list base_list + + # Set up lists based on input parameters + [[ $PG_MAJOR == "all" ]] && pg_major_list=( $VALID_PG_MAJOR_VERSIONS ) || pg_major_list=( $PG_MAJOR ) + [[ $COMPILER == "all" ]] && compiler_list=( $VALID_COMPILERS ) || compiler_list=( $COMPILER ) + base_list=($(process_base_parameter "$BASE_MATRIX")) + + # Prepare log directory + local logpath="./log_docker_build/$(date +%Y-%m-%d-%H%M%S)-pid-$$" + execute_command "mkdir -p $logpath" + execute_command "rm -f ${logpath}/*.log" + + # Clone or update docker-library/official-images repository - for testing + local 
OFFIMG_LOCAL_CLONE="./log_docker_build/official-images" + local OFFIMG_REPO_URL="https://github.com/docker-library/official-images.git" + execute_command "mkdir -p $OFFIMG_LOCAL_CLONE" + if [ -d "$OFFIMG_LOCAL_CLONE/.git" ]; then + execute_command "pushd $OFFIMG_LOCAL_CLONE && git pull origin master && popd" + else + execute_command "git clone $OFFIMG_REPO_URL $OFFIMG_LOCAL_CLONE" + fi + + # Build and test loop + for pg_major in "${pg_major_list[@]}"; do + for compiler in "${compiler_list[@]}"; do + for base in "${base_list[@]}"; do + for debug in $([[ $DEBUG == "all" ]] && echo "true false" || echo "$DEBUG"); do + + local base_os="${base%%:*}" + local base_tag="${base##*:}" + local base_os_upper="${base_os^^}" + local dockerfile="./docker/Dockerfile" + [[ $base_os == "ubuntu" ]] && dockerfile="./docker/Dockerfile.ubuntu" + local docker_tag="${pg_major}-${compiler}-${base_os}-${base_tag}-debug-${debug}" + + echo "#------------ $docker_tag ------------------" + + # Build Docker image + execute_command "docker build --pull --network=host --progress=plain \ + --build-arg ${base_os_upper}_VERSION=\"$base_tag\" \ + --build-arg BUILD_CC_COMPILER=\"$compiler\" \ + --build-arg PG_MAJOR=\"$pg_major\" \ + --build-arg DEBUG_MODE=\"$debug\" \ + -f \"$dockerfile\" \ + -t \"orioletest:${docker_tag}\" . 
\ + 2>&1 | \ + tee \"${logpath}/${docker_tag}.build.log\"" + + # Run Docker tests + execute_command "\"${OFFIMG_LOCAL_CLONE}/test/run.sh\" \ + -c \"${OFFIMG_LOCAL_CLONE}/test/config.sh\" \ + -c \"docker/orioledb-config.sh\" \"orioletest:${docker_tag}\" \ + 2>&1 | \ + tee \"${logpath}/${docker_tag}.test.log\"" + + #TODO - add regression test - running inside in the docker container + + done + done + done + done + + execute_command "docker images orioletest:* | sort" +} + +# Run the main function if not sourced +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + # Check if the script is run from the correct directory + check_directory + + # Check if Docker is installed + check_docker + + # Parse command line arguments + if [ $# -eq 0 ]; then + show_help + exit 0 + fi + + while [[ $# -gt 0 ]]; do + case $1 in + --base) + BASE_MATRIX="$2" + shift 2 + ;; + --pg-major) + PG_MAJOR="$2" + shift 2 + ;; + --compiler) + COMPILER="$2" + shift 2 + ;; + --debug) + DEBUG="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=true + shift + ;; + --clean) + clean_docker_images + exit 0 + ;; + --help) + show_help + exit 0 + ;; + *) + echo "Unknown option: $1" + show_help + exit 1 + ;; + esac + done + + # Run the main function + main + + echo + echo "#----------------" + echo "# Build process completed. You can check the build logs with:" + echo "# grep -i -C 1 warning: ./log_docker_build/*/*.build.log" + echo + echo "# To remove test images, run:" + echo "# docker images | grep orioletest | awk '{print \$3}' | sort -u | xargs docker rmi -f" + echo "# ----------------------------------" + +fi diff --git a/contrib/orioledb/ci/dump_stuck_pages.py b/contrib/orioledb/ci/dump_stuck_pages.py new file mode 100644 index 00000000000..55b7581cde5 --- /dev/null +++ b/contrib/orioledb/ci/dump_stuck_pages.py @@ -0,0 +1,165 @@ +""" +gdb helper: when a backtrace contains one of the orioledb page-locking +functions, dump the BTreePageHeader of the page being waited on plus +the chain of waiters queued on it. 
def safe_eval(expr):
    """Evaluate a gdb expression without letting gdb errors propagate.

    Returns the value produced by gdb.parse_and_eval(expr) on success.
    On gdb.error returns a *string* describing the failure, so callers can
    both detect the failure with isinstance(result, str) and print a
    meaningful message.
    """
    try:
        return gdb.parse_and_eval(expr)
    except gdb.error as e:
        # Keep the gdb message: callers interpolate this value into their
        # "... failed" diagnostics.
        return f"<error: {e}>"


def page_address(blkno):
    """Return shared-buffers address for blkno, or None for local pages.

    None is also returned when the o_shared_buffers symbol cannot be
    evaluated in the current gdb context.
    """
    if (blkno >> 31) & 1:
        # Local page — its content is in this process's local pool only.
        return None
    sb = safe_eval("o_shared_buffers")
    if isinstance(sb, str):
        # safe_eval reports failures as strings.
        return None
    # Bit 31 is masked off; each block occupies ORIOLEDB_BLCKSZ bytes.
    return int(sb) + (blkno & O_BLKNO_MASK) * ORIOLEDB_BLCKSZ


def get_blkno_from_frame(frame, arg_expr):
    """Read the blkno argument from a frame.

    arg_expr is "blkno" for a direct OInMemoryBlkno argument or "*blkno"
    when the arg is a pointer (lock_page_with_tuple). Returns the block
    number as an int, or None when the frame has no block information,
    the argument is not found, or its value cannot be read (for example
    because it was optimised out).
    """
    try:
        block = frame.block()
    except RuntimeError:
        return None
    is_pointer = arg_expr.startswith("*")
    want = arg_expr.lstrip("*")
    for sym in block:
        if not sym.is_argument or sym.name != want:
            continue
        # Only the first matching argument is considered; a failed read
        # ends the search instead of trying later symbols.
        try:
            val = sym.value(frame)
            if is_pointer:
                val = val.dereference()
            return int(val)
        except (gdb.error, gdb.MemoryError):
            return None
    return None
def get_permutation_lines(filepath):
    """Return the 1-based line numbers of every permutation header.

    A permutation header is a line of `filepath` that starts with
    'starting permutation: '. The numbers are returned in file order;
    the list is empty when the file has no such line.
    """
    # The context manager closes the file even if reading fails.
    with open(filepath, 'r') as f:
        return [line_no
                for line_no, line in enumerate(f, start=1)
                if line.startswith('starting permutation: ')]
def is_allowed_line(testName, line):
    """Tell whether an added diff line is an expected difference.

    A line is allowed when it matches (re.match, i.e. anchored at the
    start) at least one pattern of allowedRegexes whose test list contains
    either testName or the wildcard '*'.
    """
    return any(
        '*' in tests or testName in tests
        for regex, tests in allowedRegexes.items()
        if re.match(regex, line))
patched_file: + for line in hunk: + if line.is_added: + while index < len(permutationLines) - 1 and line.target_line_no > permutationLines[index + 1]: + index = index + 1 + clean = True + if clean: + value = line.value + if value.startswith('step ') or value == "\n" or re.match(r"[a-z0-9]+: (NOTICE|WARNING|DETAIL): ", value): + pass + elif not is_allowed_line(testName, value): + lines.write(f"{value}") + clean = False + else: + clean = False + value = lines.getvalue() + if len(value) > 0: + print(patched_file.target_file) + print(value) + lines.close() diff --git a/contrib/orioledb/ci/filter_regression_diff.py b/contrib/orioledb/ci/filter_regression_diff.py new file mode 100644 index 00000000000..2ce135f839a --- /dev/null +++ b/contrib/orioledb/ci/filter_regression_diff.py @@ -0,0 +1,951 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# This script filters out for regression.diffs: +# - all known ERROR messages +# - all NOTICE|WARNING|DETAIL|INFO|HINT messages +# - all tables differences that only differ in order +# - all plans that basically equal for postgres and orioledb +# but have some different node name +# - known plan differences which are processed in compare_trees +# - also for now all differences for \d output are ignored, +# more smart filter should be added later + +from unidiff import PatchSet +import argparse +import sys +import io +import re +import os +from enum import Enum + +ap = argparse.ArgumentParser( + description= + "Filter out known expected error from pg_regress regression.diffs output") +ap.add_argument("--diff", "-d", help="path to regression.diffs") +args = ap.parse_args() + +if args.diff: + with open(args.diff, "r", encoding="utf-8", errors="replace") as f: + diff_text = f.read() +else: + diff_text = sys.stdin.read() + +patch_set = PatchSet(diff_text) + +knownErrors = { + r"ERROR: orioledb does not support SERIALIZABLE isolation level": ["*"], + r"ERROR: tuple to be locked has its primary key changed due to concurrent update": + ["*"], 
+ r"ERROR: Not implemented: orioledb_tuple_tid_valid": ["*"], + r"ERROR: REINDEX CONCURRENTLY is not supported for orioledb tables yet": + ["*"], + r"ERROR: orioledb tables does not support CLUSTER": ["*"], + r"ERROR: orioledb table \"[a-z0-9_]+\" does not support VACUUM FULL": + ["*"], + r"ERROR: cannot use PREPARE TRANSACTION in transaction that uses orioledb table": + ["*"], + r"ERROR: replica identity type INDEX is not supported for OrioleDB tables yet": + ["*"], + r"Options: index_bridging=true": + ["*"], + + # errors for tests with EXPLAIN usage + r"^[-]+$": ["*"], + r"\s+QUERY PLAN": ["*"], + + # alter_table specific errors + r"ERROR: unsupported alter table subcommand": ["alter_table"], + r"ERROR: table \"[a-z0-9_]+\" has different type for column \"[a-z0-9_]+\"": + ["alter_table"], + r"ERROR: typed tables cannot inherit": ["alter_table"], + r"ERROR: table is missing column \"[a-z0-9_]+\"": ["alter_table"], + r"ERROR: could not change table \"[a-z0-9_]+\" to logged because it references unlogged table \"[a-z0-9_]+\"": + ["alter_table"], + # this probably could be easily fixed + r"ERROR: check constraint \"atacc1_chk\" of relation \"atacc1\" is violated by some row": + ["alter_table"], + # some errors related to ALTER TABLE ... OF ... 
+ r"ERROR: table has column \"y\" where type requires \"x\"": + ["alter_table"], + r"ERROR: table has extra column \"z\"": ["alter_table"], + r"ERROR: could not change table \"logged1\" to unlogged because it references logged table \"logged2\"": + ["alter_table"], + + # create_index specific errors + # some errors related to not working CONCURRENT index build right now + r"ERROR: could not create unique index \"[a-z0-9_]+\"": ["create_index"], + r"ERROR: relation \"[a-z0-9_]+\" does not exist": ["create_index"], + r"ERROR: index \"[a-z0-9_]+\" does not exist": ["create_index"], + + # insert specific error + # error related to OrioleDB vs heap internal page size + r"ERROR: index row size \d+ exceeds orioledb maximum 2688 for table \"large_tuple_test\"" : ["insert"], + + # insert_conflict specific errors + # error does not appear, because serializable isolation level error aborts transaction that must emit it + r"ERROR: ON CONFLICT DO UPDATE command cannot affect row a second time": ["insert_conflict"], + + # psql specific errors + r"^\s*List of access methods": ["psql"], + r"Susie": ["psql"], + + # foreign_key specific errors + r"ERROR: duplicate key value violates unique constraint" : ["foreign_key"], + r"ERROR: (insert or update|update or delete) on table \"[a-z]+\" violates foreign key constraint": ["foreign_key"], + + # privileges specific errors + r"ERROR: function \"sro_ifun\" cannot be used here": ["privileges"], + r"ERROR: (index|relation) \"(sro_idx|sro_cluster_idx|sro_pidx)\" does not exist": ["privileges"], + r"ERROR: permission denied for (table|materialized view) (maintain_test|refresh_test)": ["privileges"], + + # union specific error + r"ERROR: function \"expensivefunc\" cannot be used here": ["union"], + + # collate.icu.utf8 specific error. o_find_collation_dependencies() + # stops at the first hit, so depending on pg_depend scan order the + # error names either the table or one of its indexes — match both. 
def _rule_applies(rules, testName, line):
    """Return True when `line` matches a pattern in `rules` that is enabled
    for `testName` (listed explicitly or through the '*' wildcard)."""
    return any(
        '*' in tests or testName in tests
        for regex, tests in rules.items()
        if re.match(regex, line))


def can_drop_hunk(testName, line):
    """Tell whether `line` marks its whole hunk as not comparable.

    The decision is driven by skip_hunk_errors: a pattern must match the
    start of the line and be registered for testName or for '*'.
    """
    return _rule_applies(skip_hunk_errors, testName, line)


def is_known_error(testName, line):
    """Tell whether `line` is an expected difference for `testName`.

    The decision is driven by knownErrors: a pattern must match the start
    of the line and be registered for testName or for '*'.
    """
    return _rule_applies(knownErrors, testName, line)
't', 't'], + ['fkdd2', '"RI_FKey_check_upd"', '17', 't', 't'], + ['fkdi2', '"RI_FKey_check_ins"', '5', 't', 'f'], + ['fkdi2', '"RI_FKey_check_upd"', '17', 't', 'f'], + ['fknd2', '"RI_FKey_check_ins"', '5', 'f', 'f'], + ['fknd2', '"RI_FKey_check_upd"', '17', 'f', 'f']], + [['fkdd2', '"RI_FKey_check_ins"', '5', 'f', 'f'], + ['fkdd2', '"RI_FKey_check_upd"', '17', 'f', 'f'], + ['fkdi2', '"RI_FKey_check_ins"', '5', 'f', 'f'], + ['fkdi2', '"RI_FKey_check_upd"', '17', 'f', 'f'], + ['fknd2', '"RI_FKey_check_ins"', '5', 't', 't'], + ['fknd2', '"RI_FKey_check_upd"', '17', 't', 't']]], + [[['f']], [['t']]], + [[], [['alterlock_pkey', 'AccessShareLock']]], + [[], [['alterlock_pkey', 'AccessShareLock']]], + [[['unlogged1', 'r', 'p'], ['unlogged1 toast index', 'i', 'p'], + ['unlogged1 toast table', 't', 'p'], ['unlogged1_f1_seq', 'S', 'p'], + ['unlogged1_pkey', 'i', 'p']], + [['unlogged1', 'r', 'u'], ['unlogged1 toast index', 'i', 'u'], + ['unlogged1 toast table', 't', 'u'], ['unlogged1_f1_seq', 'S', 'u'], + ['unlogged1_pkey', 'i', 'u']]], + [[['logged1', 'r', 'u'], ['logged1 toast index', 'i', 'u'], + ['logged1 toast table', 't', 'u'], ['logged1_f1_seq', 'S', 'u'], + ['logged1_pkey', 'i', 'u']], + [['logged1', 'r', 'p'], ['logged1 toast index', 'i', 'p'], + ['logged1 toast table', 't', 'p'], ['logged1_f1_seq', 'S', 'p'], + ['logged1_pkey', 'i', 'p']]], + ], + "insert_conflict" : [ + [[['3', '1']], []], + ], + "create_am" : [ + [[], [['orioledb', 'orioledb_tableam_handler', 't']]] + ], + "psql" : [ + [[['spgist', 'Index']], [['orioledb', 'Table'], ['spgist', 'Index']]], + [[['spgist', 'Index', 'spghandler', 'SP-GiST index access method']], + [['orioledb', 'Table', 'orioledb_tableam_handler', ''], + ['spgist', 'Index', 'spghandler', 'SP-GiST index access method']]] + ], + "explain" : [ + [[['Bitmap Heap Scan on tenk1 (cost=N.N..N.N rows=N width=N)']], + [['Bitmap heap scan'], ['Custom Scan (o_scan) on tenk1 (cost=N.N..N.N rows=N width=N)']]] + ], + "strings" : [ + [[['f']], 
# Debug helper: print the query plan subtree rooted at `node`.
# A node is a list [level, value, properties, children]; the node's level
# controls how far its line and its properties are indented, and children
# are printed recursively after the properties.
def dump_subtree(node: list):
    depth = node[0]
    indent = " " * depth
    print(f"{indent}[{depth}]{node[1]}")
    for line in (f"{indent} {prop}" for prop in node[2]):
        print(line)
    for child in node[3]:
        dump_subtree(child)


# Compare two plan condition lines, treating "x = y" and "y = x" as equal.
# Identical strings always match. Otherwise both must look like
# "<label>: (<lhs> = <rhs>)" with the same label part and swapped operands,
# e.g. "Hash Cond: (a.tenthous = i4.f1)" vs "Hash Cond: (i4.f1 = a.tenthous)".
def is_commutative_cond_eq(s1: str, s2: str) -> bool:
    if s1 == s2:
        return True
    pattern = r"^(.+:\s*\()(.+?)\s*=\s*(.+?)\)$"
    parsed = [re.match(pattern, text) for text in (s1, s2)]
    if not all(parsed):
        return False
    (head1, lhs1, rhs1), (head2, lhs2, rhs2) = (m.groups() for m in parsed)
    return (head1 == head2
            and lhs1.strip() == rhs2.strip()
            and rhs1.strip() == lhs2.strip())
+ if len(target_cur[2]): + target_stack[0][0][target_stack[0][1]][1] = target_cur[2][0] + else: + target_down = True + elif src_cur_value.startswith("Result"): + src_down = True + elif (test_name == 'select' + and src_cur_value == 'Bitmap Heap Scan on onek2' + and target_cur[1] == 'Index Scan using onek2_u2_prtl on onek2'): + src_up = True + target_up = True + elif (test_name == 'updatable_views' + and src_cur_value.startswith('Bitmap Heap Scan') + and target_cur[1].startswith('Custom Scan') + and target_cur[2][0].startswith('Forward index scan')): + src_up = True + target_up = True + elif (test_name == 'fast_default' + and src_cur_value == 'Bitmap Heap Scan on fast_default.t' + and target_cur[1] == 'Custom Scan (o_scan) on fast_default.t'): + src_up = True + target_up = True + elif src_cur_value.startswith('Bitmap Heap Scan'): + if target_cur[1].startswith('Custom Scan'): + if target_cur[2][0] != 'Bitmap heap scan': + equal = False + else: + src_down = True + target_down = True + else: + equal = False + elif is_commutative_cond_eq(src_cur_value, target_cur[1]): + src_down = True + target_down = True + elif (src_cur_value == 'Unique' + and target_cur[1] == 'HashAggregate'): + # As source output uses Unique, it may need to sort values after scan, + # but target output use HashAggregate directly on scanned value, so + # need to skip "Sort" level for source + if (len(src_cur[3]) > 0 + and src_cur[3][0][1] == 'Sort'): + if goto_down_level(src_stack, src_cur) == False: + raise RuntimeError("Failed to skip 'Sort' level in source query plan") + src_cur = src_stack[0][0][src_stack[0][1]] + + src_down = True + target_down = True + elif (src_cur_value == 'Merge Append' + and target_cur[1] == 'Append'): + src_down = True + target_down = True + elif src_cur_value == target_cur[1]: + src_down = True + target_down = True + else: + # processing known real plan differences, + # that are omitted for now but should be checked in the future + # put checks for certain files at the 
end, to process all other + # checks before + + if (src_cur_value.startswith('Index Only Scan') + and target_cur[1].startswith('Custom Scan') + and (target_cur[2][0] == 'Bitmap heap scan' + or (target_cur[2][0].startswith('Filter') + and target_cur[2][1] == 'Bitmap heap scan') + or (target_cur[2][0].startswith("Forward index only scan")))): + # sometimes we have bitmap heap scan instead index scan + src_up = True + target_up = True + elif (src_cur_value.startswith('Index Only Scan') + and target_cur[1].startswith('Seq Scan')): + # sometimes we have seq scan instead of index scan + src_up = True + target_up = True + elif (src_cur_value.startswith('Index Only Scan') + and target_cur[1].startswith('Sort')): + # Sometimes we have sort, because we use bitmap heap scan + # instead of index scan + target_down = True + # different processing of or clauses for our tables + elif (test_name == 'create_index' + and (src_cur_value == 'BitmapAnd' + and target_cur[1] == 'Bitmap Index Scan on tenk1_hundred')): + src_up = True + target_up = True + elif (src_cur_value.startswith('Index Scan') + and target_cur[1].startswith('Custom Scan') + and target_cur[2][0].startswith('Forward index scan')): + if (len(src_cur[2]) == 0 and len(target_cur[2]) == 1) or is_conds_eq(src_cur[2][0], target_cur[2][1]): + src_down = True + target_down = True + else: + equal = False + elif re.sub(r"_\d+$", "", src_cur_value) == re.sub(r"_\d+$", "", target_cur[1]): + # lines differ only by auto-generated alias suffix (e.g. 
tinner_2 vs tinner_1) + src_down = True + target_down = True + elif (src_cur_value == target_cur[1].replace('rowid', 'ctid')): + src_up = True + target_up = True + elif (src_cur_value == target_cur[1].replace('bytea', 'tid')): + src_down = True + target_down = True + elif (test_name in ['equivclass', 'inherit', 'aggregates'] + and (src_cur_value.startswith('Index Scan') + and target_cur[1].startswith('Index Only Scan'))): + src_up = True + target_up = True + elif (test_name in ['partition_prune', 'inherit', 'updatable_views'] + and (src_cur_value.startswith('Index Only Scan') + and target_cur[1].startswith('Custom Scan') + and target_cur[2][0].startswith('Forward index only scan'))): + if is_conds_eq(src_cur[2][0], target_cur[2][1]): + src_down = True + target_down = True + else: + equal = False + elif (test_name == 'partition_prune' + and re.sub(r"actual rows=\d+ loops=\d+\)$", "", src_cur_value) == re.sub(r"actual rows=\d+ loops=\d+\)$", "", target_cur[1])): + src_down = True + target_down = True + elif (test_name == 'memoize' + and src_cur_value.startswith('Finalize Aggregate') + and target_cur[1].startswith('Aggregate')): + # Need to skip report about parallel heap execution, because OrioleDB + # currently does not support parallel scans except sequential + for _ in range(2): + if goto_down_level(src_stack, src_cur) == False: + raise RuntimeError("Aggregate explain structure is invalid") + src_cur = src_stack[0][0][src_stack[0][1]] + src_down = True + target_down = True + elif test_name in ['subselect', 'window']: + # We have massive EXPLAIN diff, research it latter + src_up = True + target_up = True + elif (test_name == 'with' + and src_cur_value == 'Merge Semi Join' + and target_cur[1] == 'Hash Semi Join'): + src_up = True + target_up = True + elif (test_name == 'generated' + and src_cur_value.startswith('Index Scan') + and target_cur[1].startswith('Index Scan')): + src_up = True + target_up = True + elif (test_name == 'updatable_views' + and 
def find_table_lines(line_no, lines):
    """Locate the output table surrounding position `line_no` in `lines`.

    Returns (table_start, table_end): table_start is the index of the
    nearest separator at or before `line_no` (0 when none is found) and
    table_end is one past the index of the first terminator found after
    it (0 when none is found). `line_no` values past the last line are
    clamped to the last line; an empty `lines` yields (0, 0).
    """
    # Use r".*;$" as a separator in case of non-table output (e.g. COPY TO STDOUT)
    def is_table_start(line: str) -> bool:
        return bool(re.match(r"^[-+]+$", line) or re.match(r".*;$", line))

    def is_table_end(line: str) -> bool:
        return bool(re.match(r"\(\d+ rows?\)", line)
                    or re.match(r".*;$", line))

    if not lines:
        return 0, 0
    # Callers pass line numbers taken from the diff; a change on the very
    # last line would otherwise index one past the end of `lines`.
    line_no = min(line_no, len(lines) - 1)

    table_start = 0
    if is_table_start(lines[line_no]):
        table_start = line_no
    else:
        # Walk backwards to the closest separator. Index 0 is never tested
        # and doubles as the "not found" value.
        while table_start == 0 and line_no > 0:
            if is_table_start(lines[line_no]):
                table_start = line_no
            line_no -= 1

    table_end = 0
    while table_end == 0 and line_no < len(lines):
        if is_table_end(lines[line_no]):
            # The separator itself may also look like a terminator.
            if line_no != table_start:
                table_end = line_no + 1
        line_no += 1
    return table_start, table_end


def find_desc_lines(line_no, lines):
    """Locate the psql \\d description block around position `line_no`.

    Returns (desc_start, desc_end): desc_start is the index of the nearest
    line starting with a \\d command at or before `line_no` (0 when none
    is found) and desc_end is the index of the last line that still
    belongs to the description (0 when the block runs to the end of
    `lines`). `line_no` values past the last line are clamped.
    """
    desc_start = 0
    desc_end = 0
    # Same guard as in find_table_lines for a change on the last line.
    line_no = min(line_no, len(lines) - 1)
    while desc_start == 0 and line_no > 0:
        if re.match(r"^\\d", lines[line_no]):
            desc_start = line_no
        line_no -= 1

    in_column_table = False
    # Undo the final decrement of the backward scan.
    line_no += 1
    while desc_end == 0 and line_no < len(lines):
        line_type = type_of_line(lines[line_no])
        # The column listing is made of table-typed lines that still belong
        # to the description; it lasts until the next description line.
        if in_column_table and line_type == LineType.description:
            in_column_table = False
        if (not in_column_table and line_type == LineType.table
                and ([x.strip() for x in lines[line_no].split("|")]
                     == ["Column", "Type", "Collation", "Nullable", "Default"])):
            in_column_table = True
        if not in_column_table and line_type != LineType.description:
            desc_end = line_no - 1
        line_no += 1
    return (desc_start, desc_end)


class LineType(Enum):
    """Classification of a single output line, see type_of_line()."""
    error = 1
    description = 2
    table = 3
    info = 4
    stmt = 5


def type_of_line(line: str):
    """Classify one line of regression output as a LineType member.

    Patterns are tried in order and anchored at the start of the line;
    anything that matches none of them is treated as table content.
    """
    if re.match(r"ERROR: ", line):
        return LineType.error
    elif re.match(r"(NOTICE|WARNING|DETAIL|INFO|HINT): ", line):
        return LineType.info
    # TODO: Also add regexes for column types and other things
    elif (re.match(r"^\\d", line) or re.match(r"^\s+Table \".*\"", line)
          or re.match(r"Indexes:", line)
          or re.match(r".*[,\"] btree \(", line)):
        return LineType.description
    # TODO: Need to add support for multi-line stmts and other stmt types
    elif re.match(r"^(declare).*;$", line):
        return LineType.stmt
    else:
        return LineType.table


def query_plan_to_tree(table_lines: list) -> list:
    """Build a tree from the rows of an EXPLAIN output table.

    `table_lines` is a sequence of rows, each a sequence of cells; only
    the first cell of a row is used. Every node of the result is a list
    [level, value, properties, children]. A row one level deeper than the
    current node becomes its child when it starts with '->' and one of its
    properties otherwise; a row on the same level becomes a sibling. Any
    other level change starts a new top-level node.
    """
    tree = []
    stack = []
    for table_line in table_lines:
        text = table_line[0]
        stripped = text.strip()

        # Derive the nesting level from the indentation: the first level
        # is 2 columns wide, every further level 6 columns. The bound
        # check keeps short or space-only cells from raising IndexError.
        level = 0
        char_num = 1
        while char_num < len(text) and text[char_num] == ' ':
            char_num += 2 if level == 0 else 6
            level += 1

        is_node = stripped[0:2] == '->'
        # Drop the 4 leading characters of an arrow-prefixed node row.
        value = stripped[4:] if is_node else stripped

        if len(stack) == 0:
            # level, value, properties, children
            tree += [[level, value, [], []]]
            # level_list, level_index
            stack.insert(0, [tree, len(tree) - 1])
            continue

        current = stack[0][0][stack[0][1]]
        if level == current[0] + 1:
            if is_node:
                children = current[3]
                children += [[level, value, [], []]]
                stack.insert(0, [children, len(children) - 1])
            else:
                current[2].append(value)
        elif level == current[0]:
            if len(stack) == 1:
                siblings = tree
            else:
                siblings = stack[1][0][stack[1][1]][3]
            siblings += [[level, value, [], []]]
            stack.pop(0)
            stack.insert(0, [siblings, len(siblings) - 1])
        else:
            # Unreachable during normal execution
            stack.clear()
            # level, value, properties, children
            tree += [[level, value, [], []]]
            # level_list, level_index
            stack.insert(0, [tree, len(tree) - 1])

    return tree
finish_stmts = False + + index = 0 + testName = os.path.splitext(os.path.basename(patched_file.target_file))[0] + hunks = list(patched_file) + with open(patched_file.source_file) as sf: + source = sf.readlines() + with open(patched_file.target_file) as tf: + target = tf.readlines() + + # Merge hunks that represent sequential parts of output to handle the case + # when lines were removed and added in different hunks + hunk_num = 1 + while hunk_num < len(hunks): + cur_hunk_first_line = list(hunks[hunk_num])[0] + prev_hunk_last_line = list(hunks[hunk_num - 1])[-1] + + if (cur_hunk_first_line.source_line_no == prev_hunk_last_line.source_line_no + 1 + and not re.match(r".*;$", cur_hunk_first_line.value)): + hunks[hunk_num - 1] += hunks[hunk_num] + hunks.pop(hunk_num) + else: + hunk_num += 1 + + # Drop all hunks that contain "bad" lines that is difficult to compare with script currently + hunk_num = 0 + while hunk_num < len(hunks): + is_dropped = False + for line in hunks[hunk_num]: + if can_drop_hunk(testName, line.value): + patched_file.remove(hunks[hunk_num]) + hunks.pop(hunk_num) + is_dropped = True + break + if not is_dropped: + hunk_num += 1 + + for hunk_num in range(0, len(hunks)): + hunk = hunks[hunk_num] + lines = list(hunk) + # ignore all non significant log messages and know errors + for line_num in range(0, len(lines)): + line = lines[line_num] + if (line.value == "\n" + or (line.line_type != ' ' + and type_of_line(line.value) == LineType.info) + or is_known_error(testName, line.value)): + hunk.remove(line) + # Update lines according to the modified hunk + lines = list(hunk) + if hunk.added == 0 and hunk.removed == 0: + patched_file.remove(hunk) + else: + # here, we are finding all tables, query plans, \d outputs + # and then removing all tables that only with different row order, + # query plans with known path differences and known removed/existing + # columns or indices + for line_num in range(0, len(lines)): + line = lines[line_num] + line_type = 
type_of_line(line.value) + if line.is_removed: + # print(f"{line.line_type}:{patched_file.source_file}:{line.source_line_no}:{line_type}: {source[line.source_line_no - 1]}", end="") + if line_type == LineType.table: + if src_table_start == 0: + src_table_start, src_table_end = find_table_lines( + line.source_line_no, source) + # print(f"SRC TABLE {(src_table_start, src_table_end)}") + if (line.source_line_no <= src_table_end + and line.source_line_no > src_table_start + 1): + if line.source_line_no < src_table_end: + src_table_lines += [line.value.split("|")] + if hunk not in table_hunks: + table_hunks += [hunk] + elif line_type == LineType.description: + if src_desc_start == 0: + src_desc_start, src_desc_end = find_desc_lines( + line.source_line_no, source) + # print(f"SOURCE DESC: ({src_desc_start}, {src_desc_end})") + if (line.source_line_no <= src_desc_end + and line.source_line_no > src_desc_start + 1): + if hunk not in desc_hunks: + desc_hunks += [hunk] + elif line_type == LineType.stmt: + src_stmts += [line.value.strip()] + if hunk not in stmts_hunks: + stmts_hunks += [hunk] + + elif line.is_added: + # print(f"{line.line_type}:{patched_file.target_file}:{line.target_line_no}:{line_type}: {target[line.target_line_no - 1]}", end="") + if line_type == LineType.table: + if target_table_start == 0: + target_table_start, target_table_end = find_table_lines( + line.target_line_no, target) + # print(f"TARGET TABLE {(target_table_start, target_table_end)}") + if (line.target_line_no <= target_table_end + and line.target_line_no > target_table_start + 1): + if line.target_line_no < target_table_end: + target_table_lines += [line.value.split("|")] + if hunk not in table_hunks: + table_hunks += [hunk] + elif line_type == LineType.description: + if target_desc_start == 0: + target_desc_start, target_desc_end = find_desc_lines( + line.target_line_no, target) + if (line.target_line_no <= target_desc_end + and line.target_line_no > target_desc_start + 1): + if hunk not in 
desc_hunks: + desc_hunks += [hunk] + elif line_type == LineType.stmt: + target_stmts += [line.value.strip()] + if hunk not in stmts_hunks: + stmts_hunks += [hunk] + else: + if src_table_end != 0 and target_table_end != 0 and line.source_line_no > src_table_end and line.target_line_no > target_table_end: + finish_table = True + elif src_desc_end != 0 and line.source_line_no > src_desc_end: + finish_desc = True + elif (line.source_line_no == hunk.source_start + hunk.source_length - 1 + and line.target_line_no == hunk.target_start + hunk.target_length - 1): + if len(src_stmts) > 0 and len(target_stmts) > 0: + finish_stmts = True + if src_table_end != 0 and target_table_end != 0: + if (not finish_table and line_num == len(lines) - 1 + and hunk_num != len(hunks) - 1 + and hunks[hunk_num + 1].source_start > src_table_end): + finish_table = True + elif (line_num == len(lines) - 1 + and hunk_num == len(hunks) - 1): + finish_table = True + + + if finish_stmts: + # print("FINISH STMTS?") + + finish_stmts = False + stmts_remove = False + + # print(testName) + # print(src_stmts) + # print(target_stmts) + + if testName in known_stmt_diff: + test_stmts_diffs = known_stmt_diff[testName] + for test_stmts_diff in test_stmts_diffs: + # import ipdb; ipdb.set_trace() + if (test_stmts_diff[0] == src_stmts + and test_stmts_diff[1] + == target_stmts): + stmts_remove = True + + if stmts_remove: + for stmts_hunk in stmts_hunks: + stmts_lines = list(stmts_hunk) + for stmts_line in stmts_lines: + line_type = type_of_line(stmts_line.value) + if line_type == LineType.stmt: + if (stmts_line.is_removed + and stmts_line.value.strip() in src_stmts): + stmts_hunk.remove(stmts_line) + src_stmts.remove(stmts_line.value.strip()) + elif (stmts_line.is_added + and stmts_line.value.strip() in target_stmts): + stmts_hunk.remove(stmts_line) + target_stmts.remove(stmts_line.value.strip()) + + if stmts_hunk.added == 0 and stmts_hunk.removed == 0 and stmts_hunk in patched_file: + 
patched_file.remove(stmts_hunk) + + src_stmts = [] + target_stmts = [] + stmts_hunks = [] + + if finish_table: + # print("FINISH TABLE?") + finish_table = False + table_remove = False + table_columns = [ + x.strip() + for x in source[src_table_start - 1].split("|") + ] + if table_columns[0].strip() == 'QUERY PLAN': + src_tree = query_plan_to_tree(src_table_lines) + target_tree = query_plan_to_tree(target_table_lines) + # compare plan trees + equal = compare_trees(src_tree, target_tree, testName) + # sys.exit(0) + if equal: + table_remove = True + else: + src_table_lines = sorted( + [cell.strip() for cell in line] + for line in src_table_lines) + target_table_lines = sorted( + [cell.strip() for cell in line] + for line in target_table_lines) + + # print(f"src_table_lines = {src_table_lines}") + # print(f"target_table_lines = {target_table_lines}") + + # remove diffs of tables just with different order + if src_table_lines == target_table_lines: + # print("Table lines are equal") + table_remove = True + else: + while (len(src_table_lines) > 0 + and len(target_table_lines) > 0 + and src_table_lines[0] == target_table_lines[0]): + src_table_lines.pop(0) + target_table_lines.pop(0) + + # print(testName) + # print(src_table_lines) + # print(target_table_lines) + + if testName in known_table_diffs: + test_table_diffs = known_table_diffs[testName] + for test_table_diff in test_table_diffs: + if (test_table_diff[0] == src_table_lines + and test_table_diff[1] + == target_table_lines): + table_remove = True + break + + if table_remove: + for table_hunk in table_hunks: + table_lines = list(table_hunk) + for table_line in table_lines: + line_type = type_of_line(table_line.value) + if line_type == LineType.table: + if (table_line.is_removed + and table_line.source_line_no + >= src_table_start + and table_line.source_line_no + <= src_table_end): + table_hunk.remove(table_line) + elif (table_line.is_added + and table_line.target_line_no + >= target_table_start + and 
table_line.target_line_no + <= target_table_end): + table_hunk.remove(table_line) + + if table_hunk.added == 0 and table_hunk.removed == 0 and table_hunk in patched_file: + patched_file.remove(table_hunk) + src_table_start = 0 + src_table_end = 0 + src_table_lines = [] + target_table_start = 0 + target_table_end = 0 + target_table_lines = [] + table_hunks = [] + + if finish_desc: + # print("FINISH DESC?") + + in_column_table = False + for desc_hunk in desc_hunks: + desc_lines = list(desc_hunk) + for desc_line in desc_lines: + line_type = type_of_line(desc_line.value) + if in_column_table and line_type == LineType.description: + in_column_table = False + if (not in_column_table + and line_type == LineType.table and ([ + x.strip() + for x in desc_line.value.split("|") + ] == [ + "Column", "Type", "Collation", "Nullable", + "Default" + ])): + in_column_table = True + if (line_type == LineType.description + or (line_type == LineType.table + and in_column_table)): + if (desc_line.is_removed and + desc_line.source_line_no >= src_desc_start + and desc_line.source_line_no + <= src_desc_end + 1): + desc_hunk.remove(desc_line) + + if desc_hunk.added == 0 and desc_hunk.removed == 0 and desc_hunk in patched_file: + patched_file.remove(desc_hunk) + + finish_desc = False + src_desc_start = 0 + src_desc_end = 0 + target_desc_start = 0 + target_desc_end = 0 + desc_hunks = [] + if len(patched_file) != 0: + patch_set.append(patched_file) +print(patch_set, end='') diff --git a/contrib/orioledb/ci/filter_stream_regress_diff.py b/contrib/orioledb/ci/filter_stream_regress_diff.py new file mode 100644 index 00000000000..1dde4b5435b --- /dev/null +++ b/contrib/orioledb/ci/filter_stream_regress_diff.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# This script filters out diff output where both files contain the same lines +# but in a different order. 
This is used for filtering pg_dumpall diffs between +# primary and standby in streaming replication tests, where OrioleDB may +# produce the same data in a different order. +# +# Expects unified diff format (diff -u). + +from unidiff import PatchSet +import argparse +import sys + +ap = argparse.ArgumentParser( + description="Filter out order-only differences from diff output") +ap.add_argument("--diff", "-d", help="path to diff output file") +args = ap.parse_args() + +if args.diff: + with open(args.diff, "r", encoding="utf-8", errors="replace") as f: + diff_text = f.read() +else: + diff_text = sys.stdin.read() + +patch_set = PatchSet(diff_text) +result = PatchSet([]) + +removed = [] +added = [] + +for patched_file in list(patch_set): + for hunk in patched_file: + removed += [line.value for line in hunk if line.is_removed] + added += [line.value for line in hunk if line.is_added] + +if sorted(removed) != sorted(added): + sys.exit(1) +else: + sys.exit(0) diff --git a/contrib/orioledb/ci/lcov.sh b/contrib/orioledb/ci/lcov.sh new file mode 100644 index 00000000000..560ac6819e0 --- /dev/null +++ b/contrib/orioledb/ci/lcov.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -eu + +cd orioledb +if [ $COMPILER = "clang" ]; then + # trick with gcov-tool needed to not have gcov version mismatch: file.gcno:version '...*', prefer '...*' + lcov --gcov-tool "$PWD/ci/llvm-gcov.sh" --capture --directory . --no-external --rc geninfo_unexecuted_blocks=1 --ignore-errors negative --output-file coverage.info +else + lcov --capture --directory . --no-external --rc geninfo_unexecuted_blocks=1 --ignore-errors negative --output-file coverage.info +fi +cd .. 
\ No newline at end of file diff --git a/contrib/orioledb/ci/lcov_merge.sh b/contrib/orioledb/ci/lcov_merge.sh new file mode 100644 index 00000000000..3b5665e3968 --- /dev/null +++ b/contrib/orioledb/ci/lcov_merge.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -eu + +sudo apt-get update -qq +sudo apt-get install lcov +lcov $(ls -d1 *coverage.info/coverage.info | xargs -I{} echo "-a {}") -o ./orioledb/coverage.info \ No newline at end of file diff --git a/contrib/orioledb/ci/list_stuck.sh b/contrib/orioledb/ci/list_stuck.sh new file mode 100644 index 00000000000..5d7bac5693c --- /dev/null +++ b/contrib/orioledb/ci/list_stuck.sh @@ -0,0 +1,88 @@ +pgrep postgres | xargs -r ps +pgrep memcheck | xargs -r ps +pgrep python | xargs -r ps + +for process in $(pgrep postgres); do + psout=$(ps -o pid,command $process) + status=$? + if [ $status -eq 0 ]; then + psout=$(echo -ne "$psout" | tail +2) + echo ::group::Backtrace $psout + echo -e $psout + sudo gdb --batch --quiet \ + -ex "thread apply all bt full" \ + -ex 'eval "p *((LWLockHandle (*) [%u]) held_lwlocks)", num_held_lwlocks' \ + -ex 'eval "p *((MyLockedPage (*) [%u]) myLockedPages)", numberOfMyLockedPages' \ + -ex "source $(dirname "$0")/dump_stuck_pages.py" \ + -ex "quit" \ + -p $process + echo ::endgroup:: + echo $psout + if [[ "$psout" =~ ^.*\ -D\ /tmp/([a-z0-9_]+)/.*$ ]]; then + logfile="/tmp/${BASH_REMATCH[1]}/logs/postgresql.log" + echo ::group::tail -n 100 $logfile + tail -n 100 $logfile + echo ::endgroup:: + fi + fi +done + +for process in $(pgrep memcheck); do + psout=$(ps -o pid,command $process) + status=$? 
+ if [ $status -eq 0 ]; then + psout=$(echo -ne "$psout" | tail +2) + psout=$(echo $psout | sed 's/\([0-9]\+\).*initdb /\1 /') + if [[ $psout == *"/postgres"* ]]; then + echo $psout >command_$process.log + mkfifo vgdb-$process-input + tail -f vgdb-$process-input | gdb --quiet \ + -ex "target remote | vgdb --pid=$process" \ + -ex "thread apply all bt full" \ + -ex 'eval "p *((LWLockHandle (*) [%u]) held_lwlocks)", num_held_lwlocks' \ + -ex 'eval "p *((MyLockedPage (*) [%u]) myLockedPages)", numberOfMyLockedPages' \ + -ex "source $(dirname "$0")/dump_stuck_pages.py" \ + -ex "handle all nostop pass" \ + -ex "c" \ + $(which postgres) >vgdb_$process.log 2>&1 & + fi + fi +done + +for process in $(pgrep memcheck); do + psout=$(ps -o pid,command $process) + status=$? + if [ $status -eq 0 ]; then + psout=$(echo -ne "$psout" | tail +2) + psout=$(echo $psout | sed 's/\([0-9]\+\).*initdb /\1 /') + if [[ $psout == *"/postgres"* ]]; then + echo "quit" > vgdb-$process-input + fi + fi +done + +pkill -KILL tail +pkill -KILL gdb +pkill -KILL postgres +pkill -KILL memcheck +rm vgdb-*-input + +for vgdb_file in vgdb_*.log; do + if [ -e "${vgdb_file}" ]; then + echo $vgdb_file + pid=$(echo $vgdb_file | sed 's/vgdb_\(.*\)\.log/\1/') + echo ::group::{Backtrace VALGRIND $pid} + echo $vgdb_file + cat $vgdb_file | awk '/\(No debugging symbols.*|SIG.*|Reading symbols from|EXC_/ {} !/\(No debugging symbols.*|SIG.*|Reading symbols from|EXC_/ { print }' + # rm $vgdb_file + echo ::endgroup:: + command=$(cat command_$pid.log) + # rm command_$process.log + if [[ "$command" =~ ^.*\ -D\ /tmp/([a-z0-9_]+)/.*$ ]]; then + logfile="/tmp/${BASH_REMATCH[1]}/logs/postgresql.log" + echo ::group::tail -n 100 $logfile + tail -n 100 $logfile + echo ::endgroup:: + fi + fi +done \ No newline at end of file diff --git a/contrib/orioledb/ci/llvm-gcov.sh b/contrib/orioledb/ci/llvm-gcov.sh new file mode 100755 index 00000000000..a516e6fada5 --- /dev/null +++ b/contrib/orioledb/ci/llvm-gcov.sh @@ -0,0 +1,2 @@ 
+#!/bin/bash +exec llvm-cov-$LLVM_VER gcov "$@" \ No newline at end of file diff --git a/contrib/orioledb/ci/local_docker_matrix.sh b/contrib/orioledb/ci/local_docker_matrix.sh new file mode 100755 index 00000000000..7c28d3b9e2e --- /dev/null +++ b/contrib/orioledb/ci/local_docker_matrix.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +set -Eeo pipefail + +# Testing all possible docker builds on a local machine +# run from project root: ./ci/local_docker_matrix.sh +# and check the logs in ./log_docker_build/*.*.log + +# Full matrix of test builds 2x2x12 = 48 builds +pg_major_list=( 16 17) +compiler_list=( clang gcc ) +base_list=( + # alpine versions + alpine:3.21 + alpine:3.20 + alpine:3.19 + alpine:3.18 + + # ubuntu versions + ubuntu:25.04 + ubuntu:24.10 + ubuntu:24.04 + ubuntu:22.04 + + # developer versions + alpine:edge + ubuntu:devel + ) + + + +# set and prepare $logpath for build logs +mkdir -p ./log_docker_build +logpath=./log_docker_build/"$(date +%Y-%m-%d-%H%M%S)-pid-$$" +mkdir -p $logpath +rm -f ${logpath}/*.log + + +# Using official postgres docker test code +# from https://github.com/docker-library/postgres/blob/master/test +OFFIMG_LOCAL_CLONE=./log_docker_build/official-images +OFFIMG_REPO_URL=https://github.com/docker-library/official-images.git +# Check if the directory exists and contains a git repository +mkdir -p "$OFFIMG_LOCAL_CLONE" +if [ -d "$OFFIMG_LOCAL_CLONE/.git" ]; then + echo "::Updating official-images : $OFFIMG_LOCAL_CLONE" + pushd "$OFFIMG_LOCAL_CLONE" && git pull origin master && popd +else + echo "::Cloning official-images into $OFFIMG_LOCAL_CLONE" + git clone "$OFFIMG_REPO_URL" "$OFFIMG_LOCAL_CLONE" +fi + +for pg_major in "${pg_major_list[@]}" ; do + for compiler in "${compiler_list[@]}" ; do + for base in "${base_list[@]}" ; do + + base_os="${base%%:*}" + base_tag="${base##*:}" + base_os_upper="${base_os^^}" + + # Determine the Dockerfile based on base OS + if [ "$base_os" = "alpine" ]; then + dockerfile="docker/Dockerfile" + elif [ 
"$base_os" = "ubuntu" ]; then + dockerfile="docker/Dockerfile.ubuntu" + fi + + docker_tag="${pg_major}-${compiler}-${base_os}-${base_tag}" + echo "------------ $docker_tag ------------------" + + rm -f ${logpath}/"${docker_tag}".*.log + + time docker build --pull --network=host --progress=plain \ + -f $dockerfile \ + --build-arg "${base_os_upper}_VERSION=$base_tag" \ + --build-arg BUILD_CC_COMPILER="$compiler" \ + --build-arg PG_MAJOR="$pg_major" \ + -t orioletest:"${docker_tag}" . 2>&1 | tee ${logpath}/"${docker_tag}".build.log + + # Run docker test : oriole + postgres official test scripts + "${OFFIMG_LOCAL_CLONE}/test/run.sh" \ + -c "${OFFIMG_LOCAL_CLONE}/test/config.sh" \ + -c "docker/orioledb-config.sh" \ + "orioletest:${docker_tag}" 2>&1 | tee ${logpath}/"${docker_tag}".test.log + + done + done +done + +docker images orioletest:* | sort + +# You can check the build logs with: +# grep -i -C 1 warning: ./log_docker_build/*/*.build.log diff --git a/contrib/orioledb/ci/perf_build.sh b/contrib/orioledb/ci/perf_build.sh new file mode 100755 index 00000000000..f52f148e9a7 --- /dev/null +++ b/contrib/orioledb/ci/perf_build.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +set -eux + +# Local build cache for self-hosted runners. +# Avoids rebuilding PG+OrioleDB in each matrix job. 
+ORIOLEDB_SHA=$(cd orioledb && git rev-parse HEAD) +CACHE_KEY="${COMPILER}-${PGTAG}-${ORIOLEDB_SHA}" +CACHE_DIR="/tmp/perf-build-cache/${CACHE_KEY}" + +if [ -d "$CACHE_DIR/pgsql" ]; then + echo "=== Restoring build from local cache ===" + cp -a "$CACHE_DIR/pgsql" "$GITHUB_WORKSPACE/pgsql" + exit 0 +fi + +rm -rf /tmp/perf-build-cache/ + +echo "=== Building from scratch ===" + +if [ $COMPILER = "clang" ]; then + export CC=clang-$LLVM_VER +else + export CC=gcc +fi + +# configure & build PostgreSQL (debug symbols, no asserts) +CONFIG_ARGS="--enable-debug --disable-cassert --with-icu --prefix=$GITHUB_WORKSPACE/pgsql" + +cd postgresql +./configure $CONFIG_ARGS +if printf "%s\n" "$PGTAG" | grep -v -Fqe "patches$(sed -n "/PACKAGE_VERSION='\(.*\)'/ s//\1/ p" configure | cut -d'.' -f1 )_"; then \ + echo "ORIOLEDB_PATCHSET_VERSION = $PGTAG" >> src/Makefile.global; \ +fi ; +make -sj `nproc` +make -sj `nproc` install +make -C contrib -sj `nproc` +make -C contrib -sj `nproc` install +cd .. + +export PATH="$GITHUB_WORKSPACE/pgsql/bin:$PATH" + +# build OrioleDB (no coverage, no sanitizer, no -Werror) +cd orioledb +make -j `nproc` USE_PGXS=1 +make -j `nproc` USE_PGXS=1 install +cd .. 
+ +# Save to local cache, clean up old entries (keep last 4) +mkdir -p "$CACHE_DIR" +cp -a "$GITHUB_WORKSPACE/pgsql" "$CACHE_DIR/pgsql" +find /tmp/perf-build-cache/ -maxdepth 1 -mindepth 1 -type d \ + | sort | head -n -4 | xargs -r rm -rf diff --git a/contrib/orioledb/ci/perf_pg_start.sh b/contrib/orioledb/ci/perf_pg_start.sh new file mode 100644 index 00000000000..771f1ce3849 --- /dev/null +++ b/contrib/orioledb/ci/perf_pg_start.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -eux + +export PATH="$GITHUB_WORKSPACE/pgsql/bin:$PATH" + +PGDATA="$GITHUB_WORKSPACE/pgdata" + +# Initialize PostgreSQL +rm -rf "$PGDATA" +initdb -N --encoding=UTF-8 --locale=C -D "$PGDATA" + +# Configure for benchmarks +TOTAL_MEM_KB=$(grep MemTotal /proc/meminfo | awk '{print $2}') +SHARED_BUFFERS_MB=$(( TOTAL_MEM_KB / 4 / 1024 )) + +cat >> "$PGDATA/postgresql.conf" </dev/null || echo "Warning: no postgresql.log found" + +rm -rf "$PGDATA" diff --git a/contrib/orioledb/ci/pgbench.py b/contrib/orioledb/ci/pgbench.py new file mode 100755 index 00000000000..b7d15380ea1 --- /dev/null +++ b/contrib/orioledb/ci/pgbench.py @@ -0,0 +1,892 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +import testgres +import re +import argparse +import socket +import subprocess +import tarfile +import telegram +import tempfile +import os +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import time +import psutil +import shutil +import json + + +from testgres.utils import \ + get_bin_path, \ + execute_utility + +data_size_regex = r'\s*(\d+)\s*(kB|MB|GB|TB)\s*' + +def engineGetSchema(engine): + if engine == 'orioledb': + return 'orioledb' + else: + return 'public' + +class ReadOnlyTest(): + def needsStdTables(self): + return True + + def prepare(self, engine, node): + pass + + def prepareForRun(self, engine, node): + pass + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ("\\set aid random(1, 100000 * :scale)\n" + + "SELECT abalance FROM {0}.pgbench_accounts WHERE aid = 
:aid;\n").format(schema) + +class ReadOnlyZipfTest(): + def needsStdTables(self): + return True + + def prepare(self, engine, node): + pass + + def prepareForRun(self, engine, node): + pass + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ("\\set aid random_zipfian(1, 100000 * :scale, 1.5)\n" + + "SELECT abalance FROM {0}.pgbench_accounts WHERE aid = :aid;\n").format(schema) + +class ReadOnlyTest9(): + def needsStdTables(self): + return True + + def prepare(self, engine, node): + pass + + def prepareForRun(self, engine, node): + pass + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ("\\set aid1 random(1, 100000 * :scale)\n" + + "\\set aid2 random(1, 100000 * :scale)\n" + + "\\set aid3 random(1, 100000 * :scale)\n" + + "\\set aid4 random(1, 100000 * :scale)\n" + + "\\set aid5 random(1, 100000 * :scale)\n" + + "\\set aid6 random(1, 100000 * :scale)\n" + + "\\set aid7 random(1, 100000 * :scale)\n" + + "\\set aid8 random(1, 100000 * :scale)\n" + + "\\set aid9 random(1, 100000 * :scale)\n" + + "SELECT abalance FROM {0}.pgbench_accounts WHERE aid IN (:aid1,:aid2,:aid3,:aid4,:aid5,:aid6,:aid7,:aid8,:aid9);\n").format(schema) + +class ReadWriteTest(): + def needsStdTables(self): + return True + + def prepare(self, engine, node): + pass + + def prepareForRun(self, engine, node): + node.safe_psql('CHECKPOINT;') + schema = engineGetSchema(engine) + node.safe_psql('TRUNCATE {0}.pgbench_history;'.format(schema)) + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ("\\set aid random(1, 100000 * :scale)\n" + + "\\set bid random(1, 1 * :scale)\n" + + "\\set tid random(1, 10 * :scale)\n" + + "\\set delta random(-5000, 5000)\n" + + "BEGIN;\n" + + "UPDATE {0}.pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;\n" + + "SELECT abalance FROM {0}.pgbench_accounts WHERE aid = :aid;\n" + + "UPDATE {0}.pgbench_tellers SET tbalance = tbalance + :delta WHERE tid = :tid;\n" + + "UPDATE 
{0}.pgbench_branches SET bbalance = bbalance + :delta WHERE bid = :bid;\n" + + "INSERT INTO {0}.pgbench_history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, CURRENT_TIMESTAMP);\n" + + "END;").format(schema) + +class ReadWriteZipfTest(): + def needsStdTables(self): + return True + + def prepare(self, engine, node): + pass + + def prepareForRun(self, engine, node): + node.safe_psql('CHECKPOINT;') + schema = engineGetSchema(engine) + node.safe_psql('TRUNCATE {0}.pgbench_history;'.format(schema)) + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ("\\set aid random_zipfian(1, 100000 * :scale, 1.5)\n" + + "\\set bid random_zipfian(1, 1 * :scale, 1.5)\n" + + "\\set tid random_zipfian(1, 10 * :scale, 1.5)\n" + + "\\set delta random(-5000, 5000)\n" + + "BEGIN;\n" + + "UPDATE {0}.pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;\n" + + "SELECT abalance FROM {0}.pgbench_accounts WHERE aid = :aid;\n" + + "UPDATE {0}.pgbench_tellers SET tbalance = tbalance + :delta WHERE tid = :tid;\n" + + "UPDATE {0}.pgbench_branches SET bbalance = bbalance + :delta WHERE bid = :bid;\n" + + "INSERT INTO {0}.pgbench_history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, CURRENT_TIMESTAMP);\n" + + "END;").format(schema) + +class ReadWriteProcTest(): + def needsStdTables(self): + return True + + def prepare(self, engine, node): + schema = engineGetSchema(engine) + node.safe_psql(( + "CREATE OR REPLACE FUNCTION {0}.pgbench_transaction(_aid int, _bid int, _tid int, _delta int) RETURNS void AS $$\n" + + "BEGIN\n" + + "UPDATE {0}.pgbench_accounts SET abalance = abalance + _delta WHERE aid = _aid;\n" + + "PERFORM abalance FROM {0}.pgbench_accounts WHERE aid = _aid;\n" + + "UPDATE {0}.pgbench_tellers SET tbalance = tbalance + _delta WHERE tid = _tid;\n" + + "UPDATE {0}.pgbench_branches SET bbalance = bbalance + _delta WHERE bid = _bid;\n" + + "INSERT INTO {0}.pgbench_history (tid, bid, aid, delta, mtime) VALUES (_tid, 
_bid, _aid, _delta, CURRENT_TIMESTAMP);\n" + + "END;" + "$$ LANGUAGE plpgsql;").format(schema)) + + def prepareForRun(self, engine, node): + node.safe_psql('CHECKPOINT;') + schema = engineGetSchema(engine) + node.safe_psql('TRUNCATE {0}.pgbench_history;'.format(schema)) + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ("\\set aid random(1, 100000 * :scale)\n" + + "\\set bid random(1, 1 * :scale)\n" + + "\\set tid random(1, 10 * :scale)\n" + + "\\set delta random(-5000, 5000)\n" + + "SELECT {0}.pgbench_transaction(:aid, :bid, :tid, :delta);\n").format(schema) + +class OrderedInsertTest(): + def needsStdTables(self): + return False + + def prepare(self, engine, node): + if engine == 'orioledb': + node.safe_psql( + "CREATE TABLE orioledb.insert_test (\n" + + " ts timestamp NOT NULL,\n" + + " client_id int NOT NULL,\n" + + " PRIMARY KEY(ts, client_id)) USING orioledb;") + else: + node.safe_psql( + "CREATE TABLE public.insert_test (\n" + + " ts timestamp NOT NULL,\n" + + " client_id int NOT NULL,\n" + + " PRIMARY KEY(ts, client_id));") + + def prepareForRun(self, engine, node): + node.safe_psql('CHECKPOINT;') + schema = engineGetSchema(engine) + node.safe_psql('TRUNCATE {0}.insert_test;'.format(schema)) + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ("INSERT INTO {0}.insert_test VALUES (current_timestamp, :client_id);\n").format(schema) + +class BloatTest(): + def needsStdTables(self): + return False + + def prepare(self, engine, node): + if engine == 'orioledb': + node.safe_psql( + "CREATE TABLE orioledb.bloat_test (\n" + + " id integer primary key,\n" + + " value1 float8 not null,\n" + + " value2 float8 not null,\n" + + " value3 float8 not null,\n" + + " value4 float8 not null,\n" + + " ts timestamp not null\n" + + ") USING orioledb;\n" + + "CREATE INDEX bloat_test_value1_idx ON orioledb.bloat_test (value1);\n" + + "CREATE INDEX bloat_test_value2_idx ON orioledb.bloat_test (value2);\n" + + "CREATE INDEX 
bloat_test_value3_idx ON orioledb.bloat_test (value3);\n" + + "CREATE INDEX bloat_test_value4_idx ON orioledb.bloat_test (value4);\n" + + "CREATE INDEX bloat_test_ts_idx ON orioledb.bloat_test (ts);") + else: + node.safe_psql( + "CREATE TABLE public.bloat_test (\n" + + " id integer primary key,\n" + + " value1 float8 not null,\n" + + " value2 float8 not null,\n" + + " value3 float8 not null,\n" + + " value4 float8 not null,\n" + + " ts timestamp not null\n" + + ");\n" + + "CREATE INDEX bloat_test_value1_idx ON public.bloat_test (value1);\n" + + "CREATE INDEX bloat_test_value2_idx ON public.bloat_test (value2);\n" + + "CREATE INDEX bloat_test_value3_idx ON public.bloat_test (value3);\n" + + "CREATE INDEX bloat_test_value4_idx ON public.bloat_test (value4);\n" + + "CREATE INDEX bloat_test_ts_idx ON public.bloat_test (ts);") + + def prepareForRun(self, engine, node): + node.safe_psql('CHECKPOINT;') + schema = engineGetSchema(engine) + node.safe_psql('TRUNCATE {0}.bloat_test;'.format(schema)) + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ("\\set id random(1,100000 * :scale)\n" + + "INSERT INTO {0}.bloat_test VALUES(:id, random(), random(), random(), random(), now())\n" + + "ON CONFLICT (id) DO UPDATE SET ts = now();").format(schema) + +class WGTest(): + def needsStdTables(self): + return False + + def prepare(self, engine, node): + if engine == 'orioledb': + schema = 'orioledb' + tableAm = 'orioledb' + else: + schema = 'public' + tableAm = 'heap' + + node.safe_psql((''' +CREATE TYPE {0}.trx_type AS ENUM ('void','normal'); +CREATE TYPE {0}.trx_status AS ENUM ('finished','in_progress'); +CREATE TYPE {0}.trx_hold AS ENUM ('begin','commit', 'rollback'); +CREATE TYPE {0}.trx_origin AS ENUM ('receipt','invoice', 'game'); +CREATE TYPE {0}.trx_reason AS ENUM ('game','purchase', 'other'); +CREATE TYPE {0}.op_type AS ENUM ('grant','consume'); + +CREATE TABLE {0}.trx_pgbench_0 ( + id bigint NOT NULL, + root_player_id bigint NOT NULL, + emitter 
smallint NOT NULL, + ns_id integer NOT NULL, + idempotency_key uuid NOT NULL, + type {0}.trx_type NOT NULL, + origin {0}.trx_origin, + meta_data jsonb, + internal_meta_data jsonb, + status {0}.trx_status DEFAULT 'finished'::{0}.trx_status NOT NULL, + hold {0}.trx_hold, + reason {0}.trx_reason, + created timestamp without time zone DEFAULT timezone('UTC'::text, now()), + updated timestamp without time zone DEFAULT timezone('UTC'::text, now()) +) USING {1}; + +ALTER TABLE ONLY {0}.trx_pgbench_0 + ADD CONSTRAINT trx_pgbench_0_pkey PRIMARY KEY (id); +CREATE INDEX trx_pgbench_0_root_player_id_idx ON {0}.trx_pgbench_0 +USING btree (root_player_id); +CREATE INDEX trx_pgbench_0_status_holds_idx ON {0}.trx_pgbench_0 USING btree (status, hold); + + +CREATE TABLE {0}.balance_pgbench_0 ( + ns_id integer NOT NULL, + player_id bigint NOT NULL, + currency_id integer NOT NULL, + amount bigint, + expires_after timestamp without time zone, + priority_id integer, + created timestamp without time zone DEFAULT timezone('UTC'::text, now()), + updated timestamp without time zone DEFAULT timezone('UTC'::text, now()), + is_single boolean, + classifier_id smallint DEFAULT 0 NOT NULL, + CONSTRAINT balance_pgbench_0_amount_check CHECK ((amount >= 0)) +) USING {1}; + +ALTER TABLE ONLY {0}.balance_pgbench_0 + ADD CONSTRAINT balance_pgbench_0_pkey PRIMARY KEY (ns_id, player_id, currency_id); + +CREATE TABLE {0}.op_pgbench_0 ( + id bigint NOT NULL, + ns_id integer NOT NULL, + player_id bigint NOT NULL, + trx_id bigint NOT NULL, + currency_id integer, + amount bigint, + balance_id bigint, + created timestamp without time zone DEFAULT timezone('UTC'::text, now()), + type {0}.op_type NOT NULL, + "order" smallint, + CONSTRAINT op_pgbench_0_amount_check CHECK ((amount >= 0)) +) USING {1}; + +ALTER TABLE ONLY {0}.op_pgbench_0 + ADD CONSTRAINT op_pgbench_0_pkey PRIMARY KEY (id); +CREATE INDEX op_pgbench_0_player_id_idx ON {0}.op_pgbench_0 USING btree (player_id); +CREATE INDEX op_pgbench_0_ns_id_idx ON 
{0}.op_pgbench_0 USING btree (ns_id); +CREATE INDEX op_pgbench_0_balance_id_idx ON {0}.op_pgbench_0 USING btree (balance_id); +CREATE INDEX op_pgbench_0_trx_id_idx ON {0}.op_pgbench_0 USING btree (trx_id); + +CREATE TABLE {0}.balance_version_pgbench_0 ( + root_player_id bigint NOT NULL, + balance_version bigint NOT NULL +) USING {1}; + +ALTER TABLE ONLY {0}.balance_version_pgbench_0 + ADD CONSTRAINT balance_version_pgbench_0_pkey PRIMARY KEY (root_player_id); + +CREATE SEQUENCE IF NOT EXISTS {0}.trx_pgbench_seq_0; +CREATE SEQUENCE IF NOT EXISTS {0}.op_pgbench_seq_0; +CREATE SEQUENCE IF NOT EXISTS {0}.balance_version_pgbench_seq_0; +''').format(schema, tableAm)) + + def prepareForRun(self, engine, node): + node.safe_psql('CHECKPOINT;') + + def getScript(self, engine): + schema = engineGetSchema(engine) + return ((''' +\\set region_id 0 +\\set ns_id random_zipfian(1, 10, 1.1) +\\set emitter random_zipfian(1000, 3000, 1.1) +\\set root_player_id random(1, 100000*:scale) +\\set currency_id random_zipfian(1, 50, 1.1) +\\set amount random(1, 10000) +BEGIN; +INSERT INTO {0}.trx_pgbench_0 + (id, ns_id, idempotency_key, + origin, type, hold, status, meta_data, internal_meta_data, emitter, + root_player_id, reason) +VALUES + (nextval('{0}.trx_pgbench_seq_0'), + :ns_id, gen_random_uuid(), NULL, 'normal', NULL, 'finished', + '{{"reason": 0, "eventID": null}}', NULL, :emitter, :root_player_id, 'game'); + +INSERT INTO {0}.op_pgbench_0 + (id, ns_id, player_id, trx_id, currency_id, amount, balance_id, type) +VALUES + (nextval('{0}.op_pgbench_seq_0'), + :ns_id, :root_player_id, currval('{0}.trx_pgbench_seq_0'), + :currency_id, :amount, NULL, 'grant'); + + +INSERT INTO {0}.balance_version_pgbench_0 + (root_player_id, balance_version) +VALUES + (:root_player_id, nextval('{0}.balance_version_pgbench_seq_0')) +ON CONFLICT (root_player_id) DO UPDATE + SET balance_version = excluded.balance_version; + +INSERT INTO {0}.balance_pgbench_0 + (ns_id, player_id, currency_id, amount, 
classifier_id) +VALUES + (:ns_id, :root_player_id, :currency_id, :amount, 0) +ON CONFLICT (ns_id, player_id, currency_id) DO UPDATE + SET amount = balance_pgbench_0.amount + :amount; +END; +''').format(schema)) + + +test_classes = { + 'read-write' : ReadWriteTest, + 'read-write-proc' : ReadWriteProcTest, + 'read-write-zipf' : ReadWriteZipfTest, + 'read-only' : ReadOnlyTest, + 'read-only-9' : ReadOnlyTest9, + 'read-only-zipf' : ReadOnlyZipfTest, + 'ordered-insert' : OrderedInsertTest, + 'bloat' : BloatTest, + 'wg' : WGTest, +} + +def parse_data_size(value): + match = re.match(data_size_regex, value) + if not match: + raise argparse.ArgumentTypeError("%s is an invalid data size value" % value) + grp = match.groups() + return grp[0] + grp[1] + +def parse_clinets(value): + result = [] + for c in value.split(','): + c = int(c) + if c <= 0: + raise argparse.ArgumentTypeError("%s is an invalid positive int value" % c) + result.append(c) + return result + +def parse_engines(value): + result = [] + for c in value.split(','): + if c == 'builtin' or c == 'orioledb': + result.append(c) + else: + raise argparse.ArgumentTypeError("%s is unknown engine" % c) + return result + +def parse_tests(value): + result = [] + for c in value.split(','): + if c in test_classes: + result.append(c) + else: + raise argparse.ArgumentTypeError("%s is unknown test" % c) + return result + +def parse_on_off(value): + if value in ['on', 'off']: + return value + raise argparse.ArgumentTypeError("%s is unknown on/off value" % value) + +def parse_on_off_bool(value): + if value == 'on': + return True + elif value == 'off': + return False + raise argparse.ArgumentTypeError("%s is unknown on/off value" % value) + +def check_positive(value): + ivalue = int(value) + if ivalue <= 0: + raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value) + return ivalue + +instance_type_regex = r'^instance-type: (.*)$' + +def get_machine_name(): + name = socket.gethostname() + try: + result = 
tps_regex = r'^tps = (\d+\.\d+) '

# Read pgbench output and find TPS
def get_tps(fname):
    """Return the TPS figure from a pgbench log, rounded to the nearest int.

    Every line of ``fname`` is matched against ``tps_regex``; if several
    lines match, the last one is used.  Returns None when the file cannot
    be read or decoded, or when no line matches.
    """
    tps = None
    try:
        with open(fname, 'rt') as fh:
            for line in fh:
                match = re.search(tps_regex, line)
                if match:
                    tps = int(round(float(match.group(1))))
    except (OSError, ValueError):
        # Unreadable or undecodable log: report "no result" instead of
        # failing.  Narrower than a bare except so that Ctrl-C and
        # SystemExit still propagate.
        tps = None
    return tps
parser.add_argument('--chat_id', + default=os.getenv('TELEGRAM_CHAT_ID')) + parser.add_argument('--fsync', + type=parse_on_off, default='on') + parser.add_argument('--synchronous_commit', + type=parse_on_off, default='off') + parser.add_argument('--rate', type=check_positive, + default=None) + parser.add_argument('--checkpoint_timeout', type=check_positive, + default=300) + parser.add_argument('--max_io_concurrency', type=int, default=0) + parser.add_argument('--initdb', + type=parse_on_off_bool, default='on') + parser.add_argument('--device_filename', default=None) + parser.add_argument('--device_length', type=parse_data_size, + dest='device_length', default='1GB') + parser.add_argument('--use_mmap', + type=parse_on_off, default='off') + parser.add_argument('--results_dir', default=None) + + self.args = parser.parse_args() + + def report_progress(self, msg): + args = self.args + if self.bot: + try: + self.bot.send_message(chat_id=args.chat_id, text=msg) + except: + pass + + def report_file(self, filename): + if self.bot: + for i in range(0, 10): + try: + self.bot.send_document(chat_id = self.args.chat_id, + document = open(filename, 'rb')) + return + except: + time.sleep(1) + + def report_image(self, filename): + if self.bot: + for i in range(0, 10): + try: + self.bot.send_photo(chat_id = self.args.chat_id, + photo = open(filename, 'rb')) + return + except: + time.sleep(1) + + def prepare(self): + args = self.args + if args.bot_token and args.chat_id: + self.bot = telegram.Bot(args.bot_token) + else: + self.bot = None + node = testgres.get_new_node('test', + base_dir = args.base_dir, + port = args.port) + self.node = node + if args.results_dir: + self.results_dir = args.results_dir + os.makedirs(self.results_dir, exist_ok=True) + else: + self.results_dir = tempfile.mkdtemp(prefix='benchmark_') + + if args.initdb: + node.init(["--no-locale", "--encoding=UTF8"]) # run initdb + + if args.wal_dir: + shutil.move(os.path.join(node.data_dir, 'pg_wal'), + args.wal_dir) + 
os.symlink(os.path.join(args.wal_dir, 'pg_wal'), + os.path.join(node.data_dir, 'pg_wal')) + + node.append_conf('postgresql.conf', + "fsync = %s\n" + "log_statement = 'none'\n" + "max_connections = %s\n" + "synchronous_commit = %s\n" + "checkpoint_timeout = %s\n" + "checkpoint_flush_after = %s\n" + "max_wal_size = %s\n" + "checkpoint_timeout = %s\n"% + (args.fsync, + str(args.max_connections), + args.synchronous_commit, + str(args.checkpoint_timeout), + args.checkpoint_flush_after, + str(args.max_wal_size), + str(args.checkpoint_timeout))) + + if 'builtin' in args.engines: + node.append_conf("shared_buffers = %s\n" % + (args.shared_buffers)) + + if 'orioledb' in args.engines: + node.append_conf("shared_preload_libraries = 'orioledb'\n" + "orioledb.main_buffers = %s\n" + "orioledb.undo_buffers = %s\n" + "orioledb.checkpoint_completion_ratio = 1.0\n" + "orioledb.max_io_concurrency = %s\n" % + (args.shared_buffers, + args.undo_buffers, + args.max_io_concurrency)) + + if args.device_filename: + node.append_conf("orioledb.use_mmap = %s\n" + "orioledb.device_filename = '%s'\n" + "orioledb.device_length = '%s'\n" % + (args.use_mmap, + args.device_filename, + args.device_length)) + + stdTablesNeeded = False + tests = {} + for test_name in args.tests: + testInstance = test_classes[test_name]() + tests[test_name] = testInstance + if testInstance.needsStdTables(): + stdTablesNeeded = True + self.tests = tests + + node.start() # start PostgreSQL + + if args.initdb: + if 'orioledb' in args.engines: + node.safe_psql('postgres', + "CREATE EXTENSION orioledb;\n" + "CREATE SCHEMA orioledb;") + + if stdTablesNeeded: + if 'builtin' in args.engines: + node.safe_psql('postgres', + "CREATE TABLE public.pgbench_accounts (\n" + " aid integer NOT NULL,\n" + " bid integer,\n" + " abalance integer,\n" + " filler character(84)\n" + " );\n" + "CREATE TABLE public.pgbench_branches (\n" + " bid integer NOT NULL,\n" + " bbalance integer,\n" + " filler character(88)\n" + ");\n" + "CREATE TABLE 
public.pgbench_tellers (\n" + " tid integer NOT NULL,\n" + " bid integer,\n" + " tbalance integer,\n" + " filler character(84)\n" + ");\n" + "CREATE TABLE public.pgbench_history\n" + "(\n" + " tid integer NOT NULL,\n" + " bid integer NOT NULL,\n" + " aid integer NOT NULL,\n" + " delta integer NOT NULL,\n" + " mtime timestamp NOT NULL,\n" + " filler character(22)\n" + ");\n") + + if 'orioledb' in args.engines: + node.safe_psql('postgres', + "CREATE TABLE orioledb.pgbench_accounts (\n" + " aid integer NOT NULL PRIMARY KEY,\n" + " bid integer,\n" + " abalance integer,\n" + " filler character(84)\n" + ") USING orioledb;\n" + "CREATE TABLE orioledb.pgbench_branches (\n" + " bid integer NOT NULL PRIMARY KEY,\n" + " bbalance integer,\n" + " filler character(88)\n" + ") USING orioledb;\n" + "CREATE TABLE orioledb.pgbench_tellers (\n" + " tid integer NOT NULL PRIMARY KEY,\n" + " bid integer,\n" + " tbalance integer,\n" + " filler character(84)\n" + ") USING orioledb;\n" + "CREATE TABLE orioledb.pgbench_history\n" + "(\n" + " tid integer NOT NULL,\n" + " bid integer NOT NULL,\n" + " aid integer NOT NULL,\n" + " delta integer NOT NULL,\n" + " mtime timestamp NOT NULL,\n" + " filler character(22),\n" + " PRIMARY KEY(bid, mtime, tid, aid, delta)\n" + ") USING orioledb;\n") + + for engine in args.engines: + schema = engineGetSchema(engine) + for i in range(0, args.scale): + node.safe_psql('postgres', + "INSERT INTO %s.pgbench_branches (bid, bbalance)\n" + " (SELECT i, 0\n" + " FROM generate_series(%s, %s) i);\n" % + (schema, i * 1 + 1, (i + 1) * 1)) + node.safe_psql('postgres', + "INSERT INTO %s.pgbench_tellers (tid, bid, tbalance)\n" + " (SELECT i, (i - 1) / 10 + 1, 0\n" + " FROM generate_series(%s, %s) i);\n" % + (schema, i * 10 + 1, (i + 1) * 10)) + node.safe_psql('postgres', + "INSERT INTO %s.pgbench_accounts (aid, bid, abalance, filler)\n" + " (SELECT i, (i - 1) / 100000 + 1, 0, ''\n" + " FROM generate_series(%s, %s) i);\n" % + (schema, i * 100000 + 1, (i + 1) * 100000)) + 
+ if stdTablesNeeded and 'builtin' in args.engines: + node.safe_psql('postgres', + "ALTER TABLE public.pgbench_branches ADD PRIMARY KEY (bid);\n" + "ALTER TABLE public.pgbench_tellers ADD PRIMARY KEY (tid);\n" + "ALTER TABLE public.pgbench_accounts ADD PRIMARY KEY (aid);\n") + + node.safe_psql('postgres', 'VACUUM ANALYZE public.pgbench_accounts;') + node.safe_psql('postgres', 'VACUUM ANALYZE public.pgbench_branches;') + node.safe_psql('postgres', 'VACUUM ANALYZE public.pgbench_tellers;') + node.safe_psql('postgres', 'VACUUM ANALYZE public.pgbench_history;') + + for engine in args.engines: + for test_name in args.tests: + self.tests[test_name].prepare(engine, node) + + node.safe_psql('postgres', 'CHECKPOINT;') + + self.report_progress('initilization completed') + + def run_pgbench(self, args, run_name): + output_filename = '%s/%s.log' % ( + self.results_dir, + run_name) + resources_filename = '%s/%s-resources.log' % ( + self.results_dir, + run_name) + con = self.node.connect() + output_file = open(output_filename, 'w') + resources_file = open(resources_filename, 'w') + process = subprocess.Popen( + args, + stdout=output_file, + stderr=subprocess.STDOUT) + + prev_cpu_times = psutil.cpu_times() + prev_disk_usage = psutil.disk_io_counters() + cpu_count = psutil.cpu_count() + t = time.time() + i = 0 + mount_point = self.node.base_dir + + while process.poll() is None: + i = i + 1 + time.sleep(max(t + i - time.time(), 0.0)) + cpu_times = psutil.cpu_times() + disk_usage = psutil.disk_io_counters() + try: + disk_space_used = shutil.disk_usage(mount_point).used + except: + disk_space_used = None + (waits, lsn) = con.execute(""" + SELECT jsonb_object_agg(k, v)::text waits, + pg_current_wal_lsn() lsn + FROM (SELECT coalesce(wait_event, 'CPU') k, count(*) v + FROM pg_stat_activity + GROUP BY wait_event) x + """)[0] + con.commit() + delta = {'time': i, + 'disk_used': disk_space_used, + 'system': (cpu_times.system - prev_cpu_times.system) / cpu_count * 100.0, + 'user': 
(cpu_times.user - prev_cpu_times.user) / cpu_count * 100.0, + 'idle': (cpu_times.idle - prev_cpu_times.idle) / cpu_count * 100.0, + 'read_count': disk_usage.read_count - prev_disk_usage.read_count, + 'write_count': disk_usage.write_count - prev_disk_usage.write_count, + 'read_bytes': disk_usage.read_bytes - prev_disk_usage.read_bytes, + 'write_bytes': disk_usage.write_bytes - prev_disk_usage.write_bytes, + 'waits': json.loads(waits), + 'lsn': lsn} + prev_cpu_times = cpu_times + prev_disk_usage = disk_usage + resources_file.write(json.dumps(delta) + "\n") + resources_file.flush() + + output_file.close() + resources_file.close() + con.close() + + tps = get_tps(output_filename) + return tps + + def benchmark(self): + args = self.args + node = self.node + self.results = {} + for engine in args.engines: + for test_name in args.tests: + test_file = tempfile.mktemp(prefix='benchmark_') + with open(test_file, 'wt') as f: + f.write(self.tests[test_name].getScript(engine)) + serie = [] + for c in args.clients: + measures = [] + for num in range(0, args.ntries): + self.tests[test_name].prepareForRun(engine, node) + params = [ + get_bin_path("pgbench"), + '-s', str(args.scale), + "-p", str(node.port), + "-h", node.host, + '-c', str(c), + '-j', str(c), + '-M', 'prepared', + '-f', test_file, + '-T', str(args.time), + '-P', '1' + ] + if args.rate: + params.append('-R') + params.append(str(args.rate)) + params.append('postgres') + run_name = '%s-%s-scale-%s-%s-%s' % ( + engine, + test_name, + str(args.scale), + str(c), + str(num)) + tps = self.run_pgbench(params, run_name) + measures.append(tps) + self.report_progress('%s: %s' % (run_name, str(tps))) + measures.sort() + if len(measures) % 2 == 1: + serie.append(measures[len(measures) // 2]) + else: + serie.append((measures[len(measures) // 2] + measures[len(measures) // 2 - 1]) / 2.0) + self.results[engine + '-' + test_name] = serie + self.report_progress('benchmark completed') + + def draw_graph(self): + args = self.args + colors 
    def tear_down(self):
        # self.node is only assigned inside prepare(); the hasattr() guard
        # keeps tear_down() safe when an earlier stage failed before that.
        if hasattr(self, 'node'):
            self.node.stop() # stop PostgreSQL

    def run(self):
        # Whole pipeline: parse the command line, set up the node and data,
        # run the pgbench series, then draw the graph.  The finally clause
        # runs tear_down() even when one of the stages raises.
        try:
            self.parse_args()
            self.prepare()
            self.benchmark()
            self.draw_graph()
        finally:
            self.tear_down()
#!/bin/bash

# Install the Python packages (and, for test jobs, wal2json) needed after
# PostgreSQL has been built.  Expects GITHUB_WORKSPACE and GITHUB_JOB to be
# set; "set -u" aborts the script if either is missing.

set -eu

python3 -m venv "$GITHUB_WORKSPACE/python3-venv"

export PATH="$GITHUB_WORKSPACE/pgsql/bin:$GITHUB_WORKSPACE/python3-venv/bin:$PATH"

# install required packages

# psycopg2 depends on existing postgres installation
# Variables inside [ ... ] are quoted so that an empty or space-containing
# value cannot turn the test into a syntax error.
if [ "$GITHUB_JOB" = "run-benchmark" ]; then
    pip_packages="psycopg2-binary six testgres==1.11.0 unidiff python-telegram-bot matplotlib"
    # $pip_packages is deliberately unquoted: it is a space-separated list.
    sudo env "PATH=$PATH" pip3 install --upgrade $pip_packages
elif [ "$GITHUB_JOB" = "pgindent" ]; then
    sudo env "PATH=$PATH" pip3 install --upgrade yapf
else
    sudo env "PATH=$PATH" pip3 install --upgrade -r orioledb/requirements.txt
fi

# wal2json is only needed by the test jobs.
if [ "$GITHUB_JOB" != "run-benchmark" ] && [ "$GITHUB_JOB" != "pgindent" ]; then
    wget https://codeload.github.com/eulerto/wal2json/tar.gz/refs/tags/wal2json_2_6
    tar -zxf wal2json_2_6
    rm wal2json_2_6
    cd wal2json-wal2json_2_6
    make
    make install
fi
Wait for locks to be released +while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do + echo "apt is busy, waiting..." + sleep 3 +done +while sudo fuser /var/lib/dpkg/lock >/dev/null 2>&1; do + echo "dpkg is busy, waiting..." + sleep 3 +done + +sudo apt-get -y install -qq wget ca-certificates + +sudo apt-get update -qq + +apt_packages="build-essential flex bison pkg-config libreadline-dev make gdb libipc-run-perl libicu-dev python3-full python3-pip python3-setuptools python3-testresources libzstd1 libzstd-dev libcurl4-openssl-dev libssl-dev lcov" + +if [ $COMPILER = "clang" ]; then + apt_packages="$apt_packages llvm-$LLVM_VER clang-$LLVM_VER clang-tools-$LLVM_VER" +fi + +if [ $CHECK_TYPE = "static" ] || [ $COMPILER = "gcc" ]; then + apt_packages="$apt_packages cppcheck" +fi + +if [ $CHECK_TYPE = "valgrind_1" ] || [ $CHECK_TYPE = "valgrind_2" ]; then + apt_packages="$apt_packages valgrind" +fi + +if [ $CHECK_TYPE = "dm_log_writes" ]; then + apt_packages="$apt_packages e2fsprogs" +fi + +# install required packages +sudo apt-get -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" -y install -qq $apt_packages + +if [ $CHECK_TYPE = "dm_log_writes" ]; then + # dm-log-writes is built into the CI runner kernels; abort early if a + # future kernel update drops it instead of silently skipping. + if ! sudo dmsetup targets 2>/dev/null | grep -q "log-writes"; then + echo "ERROR: dm-log-writes DM target is not available in kernel $(uname -r)." + exit 1 + fi + + # Fetch and build replay-log from josefbacik/log-writes pinned to a + # known-good revision. The repo is tiny (3 files) so we wget them + # directly rather than cloning. 
def main():
    """Write a row-sorted copy of every table data file of a pg_dump directory.

    Command line: sort_dump.py <dump_dir> <out_dir>

    The TOC printed by ``pg_restore -l <dump_dir>`` is scanned for
    "TABLE DATA" entries.  For each entry whose ``<id>.dat`` file exists in
    ``dump_dir``, the rows are sorted and written to
    ``<out_dir>/<schema>.<table>.dat``.

    Exits with an error message on a wrong argument count and raises
    subprocess.CalledProcessError when pg_restore fails, so that a broken
    dump cannot silently produce an empty output directory.
    """
    if len(sys.argv) != 3:
        sys.exit("Usage: sort_dump.py <dump_dir> <out_dir>")
    dump_dir, out_dir = sys.argv[1], sys.argv[2]

    os.makedirs(out_dir, exist_ok=True)

    result = subprocess.run(['pg_restore', '-l', dump_dir],
                            capture_output=True, text=True, check=True)

    for line in result.stdout.splitlines():
        line = line.strip()
        # Skip blank lines, ';' comment lines and non-data TOC entries.
        if not line or line.startswith(';') or 'TABLE DATA' not in line:
            continue

        # Format: "ID; OFFSET OID TABLE DATA schema table owner"
        parts = line.split()
        if 'TABLE' not in parts:
            continue
        td_idx = parts.index('TABLE')
        if len(parts) < td_idx + 4:
            # Too short to carry both a schema and a table name.
            continue
        file_id = parts[0].rstrip(';')
        schema = parts[td_idx + 2]
        table = parts[td_idx + 3]

        dat_file = os.path.join(dump_dir, f"{file_id}.dat")
        if not os.path.exists(dat_file):
            continue

        with open(dat_file, 'r') as f:
            rows = sorted(f.readlines())

        out_file = os.path.join(out_dir, f"{schema}.{table}.dat")
        with open(out_file, 'w') as f:
            f.writelines(rows)
#!/bin/bash

# Static analysis of the orioledb sources: scan-build when COMPILER=clang,
# cppcheck when COMPILER=gcc.  Exits non-zero when the analyser fails or
# reports anything.

set -eu
export PATH="$GITHUB_WORKSPACE/pgsql/bin:$PATH"

status=0

cd orioledb
if [ "$COMPILER" = "clang" ]; then
    "scan-build-$LLVM_VER" --status-bugs \
        -disable-checker deadcode.DeadStores \
        make USE_PGXS=1 IS_DEV=1 USE_ASSERT_CHECKING=1 || status=$?

elif [ "$COMPILER" = "gcc" ]; then
    # Collect all orioledb's include paths recursively
    ORIOLEDB_INCLUDE_DIRS=$(find include -type d)

    # Collect all PostgreSQL include paths recursively
    PG_INCLUDE_DIRS=$(find "$(pg_config --includedir-server)" -type d)

    # Combine and convert to -I flags
    INCLUDE_FLAGS=$(for dir in $ORIOLEDB_INCLUDE_DIRS $PG_INCLUDE_DIRS; do echo -n "-I$dir "; done)

    # "|| status=$?" keeps "set -e" from aborting here, so that the log
    # collected so far is still printed when cppcheck itself fails.
    # $INCLUDE_FLAGS is deliberately unquoted: it holds one flag per word.
    cppcheck \
        --enable=warning,portability,performance \
        --suppressions-list=ci/cppcheck-suppress \
        --std=c99 --inline-suppr --verbose \
        -D__GNUC__ \
        -D__x86_64__ \
        -D__aarch64__ \
        -D__arm \
        -D__arm__ \
        -DUSE_ASSERT_CHECKING \
        $INCLUDE_FLAGS \
        src/*.c src/*/*.c include/*.h include/*/*.h 2> cppcheck.log || status=$?

    if [ -s cppcheck.log ]; then
        echo "cppcheck report:"
        cat cppcheck.log
        status=1 # error
    fi
fi
cd ..

exit $status
+ +OrioleDB keeps as many undo log records as are needed to fulfill the following requirements: + +- roll back any in-progress transactions; +- serve any existing snapshots; +- roll back any transactions during checkpointing. + +The picture below illustrates the filling of the undo circular buffer. The retain location is the minimal position where we must keep undo records to fulfill the requirements. The arrows between the undo records illustrate the insertion order, not the actual links. + +![Undo circular buffer](../images/undo_buffer_1.svg) + +Once we reach the end of the circular buffer, we start from the beginning if the retain location allows us to do this without overwriting required undo records. Unless we need to retain too many undo records, we may store all of them in the circular buffer. + +![Undo circular buffer overflow](../images/undo_buffer_2.svg) + +When we need to add a new undo record, but the corresponding space is still occupied by retained undo records, we need to write some undo records to undo files. The picture below shows the undo log split between the circular buffer and undo files. The records before the written location are kept in undo files, and the records after the written location are kept in the circular buffer. If we don't need to retain much of the undo log (for instance, some long-running transaction has finished), we may switch back to storing the whole undo log in the circular buffer. + +![Undo log split](../images/undo_buffer_3.svg) + +The two pictures below explain how we write a chunk of undo records to undo files. At first, we set the "write in-progress" location. This means that undo records within the range [written location; write in-progress location) are about to be written to undo files. During this period, undo records from this range can still be read from the circular buffer but can't be written (in some situations, we need to update existing undo records).
+ +![Undo log write in-progress](../images/undo_buffer_4.svg) + +After the write is finished, the written location is advanced, and the whole undo range is available for reading and writing. + +![Undo log written](../images/undo_buffer_5.svg) diff --git a/contrib/orioledb/doc/architecture/checkpoints.mdx b/contrib/orioledb/doc/architecture/checkpoints.mdx new file mode 100644 index 00000000000..7a0f5fb2e9e --- /dev/null +++ b/contrib/orioledb/doc/architecture/checkpoints.mdx @@ -0,0 +1,137 @@ +--- +id: checkpoints +sidebar_label: Checkpoints +--- + +# Checkpoints + +## Tree walk order + +The checkpointer walks OrioleDB trees in LNR-order. The walk is divided into steps. The result of each step is a message, which determines the next step. The possible messages are given below. + +1. `WalkDownwards` – the last step found an in-memory downlink within an internal page. The next step should be to visit a page on the lower level and start processing it. +2. `WalkUpwards` – the last step finished processing the page. The next step should continue processing the parent. +3. `WalkContinue` – continue working with the current page after releasing a lock. That happens when the checkpointer has to wait for a concurrent operation. + +The picture below is an example of the checkpointer walking an OrioleDB B-tree. + +![Checkpoint walk the tree](../images/checkpoint_walk.svg) + +The checkpointer comes to the root page `n1` with the `WalkDownwards` message (step 1), then checks the first downlink `l1`. Since `l1` is an in-memory downlink, the checkpointer moves to the leaf page `n2` with the `WalkDownwards` message (step 2). After flushing `n2`, the checkpointer comes back to `n1` with the `WalkUpwards` message (step 3) and continues iteration over `n1` downlinks. Similarly, it walks down to `n3` and back via the `l2` downlink (steps 4 and 5).
`l3` turns out to be an IO in-progress downlink, so the checkpointer has to unlock the `n1` page, wait until the IO is completed, and continue with the `WalkContinue` message (step 6). After relocking `n1`, the checkpointer finds `l3` to be an on-disk downlink and copies it "as is". Finally, the checkpointer walks down to `n4` and back via the `l4` downlink (steps 7 and 8). Once `n1` is done, the checkpointer finishes the walk with the `WalkUpwards` message. + +## Checkpoint state + +While the checkpointer is writing the children of a non-leaf page, concurrent splits and merges could happen. Therefore, the checkpoint state contains images of the non-leaf pages under checkpointing, reconstructed as the checkpointer visits the downlinks. If there are no concurrent changes to the non-leaf page, the reconstructed state finally matches the page state. Otherwise, the reconstructed page state might not match the page state at any single moment in time, but it always matches the history of the checkpointer's tree walk. + +Note that the reconstructed state does not contain in-memory downlinks. In-memory downlinks are replaced with on-disk downlinks as we write the child pages. + +The picture below represents an example of a checkpoint state. + +![Checkpoint state 1](../images/checkpoint_state_1.svg) + +The root page `n1`, internal page `n3`, and leaf page `n8` are currently under checkpointing. The downlink `l2` is written to the reconstructed state. The downlink `l3` is the `next downlink` in the reconstructed state. Its key is known, but the link is not because the corresponding children were not written yet. Similarly, downlink `l6` is written, downlink `l7` is the `next downlink`, and downlink `l8` is not processed yet. + +Let us imagine that the following events happened: + +1. Page `n7` was written by the checkpointer. +2. Page `n8` was split into `n8` and `n9`. +3. Pages `n6` and `n7` were merged. The result is marked as `n67`. +4.
The checkpointer has written page `n8` and started processing page `n9`. + +The resulting state is given in the picture below. Note that the reconstructed page image contains links `l6` and `l7` (as we visited them before the merge) but contains `l8` and the `next downlink` corresponding to `l9` (as we visited those downlinks after the split). + +![Checkpoint state 2](../images/checkpoint_state_2.svg) + +## Autonomous non-leaf pages + +If the non-leaf page under checkpointing gets modified concurrently, it becomes an "autonomous" non-leaf page. Autonomous pages work with the rules below. + +1. If the page is marked as "autonomous", all its parents to the root are also marked as "autonomous". +2. If the page has an associated on-disk location, this association is cleared. The corresponding location is marked as free space at the current checkpoint. +3. The autonomous page will be processed until its hikey is met, regardless of how many pages must be visited to meet this target (due to concurrent insertion, it could be many pages). +4. Even if the initial page corresponding to the autonomous page has been split, the page holding the initial hikey is tracked. A merge that would remove that hikey is prevented. +5. If the autonomous page is full, but the corresponding hikey is not yet met, current contents are flushed to the disk (and the parent gets the corresponding downlink with `WalkUpwards`), but processing of the autonomous page continues till the hikey is met. +6. When flushing an autonomous page, the corresponding "on-disk" location is marked as free for the future checkpoint. + +## Checkpointer messages + +Consider more details regarding the checkpointer messages we enumerated above. + +### WalkDownwards + +This message has the parameters below. + +- The number and change count of the in-memory page to be visited. +- Low key ("lokey"). The lokey is from the parent page downlink, or it is the parent page lokey if the downlink is the first on the page.
+ +The checkpointer has to process the referenced page. After the page is processed, the `WalkUpwards` message must be returned. If the referenced page is non-leaf, more messages will be issued during its processing, but finally, there must be `WalkUpwards` of the referenced page. + +There might be a failure due to concurrent operations: the in-memory page might have a different change count. In this case, the corresponding `WalkUpwards` should return the invalid downlink. Also, in a failure case, `WalkUpwards` message should go just after `WalkDownwards`: once we start processing the non-leaf page, we must finish it. + +### WalkUpwards + +This message has the parameters below. + +- On-disk downlink. This link might be invalid, as described above. +- Next key. That is actually a hikey of the page written. It might not match the subsequent downlink of the parent page due to concurrent splits and merges. On mismatch, the parent page must be marked "autonomous". +- Flag indicating that parent page must be marked as "dirty". This flag is set when the page has been written to the new place after the previous checkpoint. This flag is not set if the page and its children will not be modified then. The parent must be marked as "dirty" to be written and reflect the new on-disk downlink. +- The flag indicates that we must save the existing `next downlink` on the parent page. That happens to the autonomous page when the current reconstructed image is finished: we have the page written and need to insert a new downlink to the parent, but we still need to visit the same `next downlink`. + +This message indicates that the child page has been processed, and the parent needs to add the downlink. If there is no parent, we have processed the root and now have a pointer to the new root on-disk location. + +### WalkContinue + +This message has no parameters. It just indicates that checkpointer must continue processing the same page with the same `next downlink`. 
That happens when the checkpointer has to wait for a concurrent operation, such as when it meets an IO in-progress downlink and has to release the lock and wait until the IO is finished. + +## Sequential buffers + +### What are sequential buffers? + +Sequential buffers (*seq bufs*) are lightweight, file-backed streaming I/O abstractions used by the checkpointer and by ordinary backends that write B-tree pages. Instead of holding all checkpoint metadata in shared memory, seq bufs stream data to and from on-disk files using two in-memory OrioleDB pages as a double-buffer. While one page is being filled (or drained) by the caller, the other can be flushed to disk or pre-fetched in the background, giving sequential throughput without occupying large amounts of shared memory. + +Each seq buf is identified by a `SeqBufTag` – a `(datoid, relnode, checkpointNumber, type)` tuple. The `type` field distinguishes the two kinds of files: + +- **`'m'` (map file)** – the checkpoint map that records on-disk page locations. +- **`'t'` (temporary file)** – the temporary tracking file used during the checkpoint walk. + +The in-memory state that must be shared between the checkpointer and writer backends lives in `SeqBufDescShared`, which is embedded directly in the B-tree meta page (`BTreeMetaPage`). Per-backend state such as the open file descriptor lives in `SeqBufDescPrivate`, which is stored in the tree descriptor (`BTreeDescr`) and is private to each backend. + +### Why are sequential buffers used? + +Each checkpointable B-tree keeps three groups of seq bufs: + +1. **`freeBuf`** – On entry to a new checkpoint the checkpointer opens this buffer to *read* the list of free disk extents recorded by the *previous* checkpoint. As the current checkpoint writes pages it can reuse those extents, avoiding unnecessary file growth.
There is exactly one `freeBuf` per tree (no dual array) because it is replaced atomically at the start of each checkpoint: the new file is put in place before the old one is removed. + +2. **`nextChkp[2]`** – The checkpointer opens one of these two slots to *write* the checkpoint map file for the current checkpoint. Every time a B-tree page is flushed to disk, its location is appended to the map. The next checkpoint will read this map via a `freeBuf` so that it knows where each page lives without having to walk the whole tree again. + +3. **`tmpBuf[2]`** – The checkpointer opens one of these two slots to *write* a temporary file that tracks every page written during the checkpoint walk. After the walk finishes, this file is sorted, duplicates are removed, and it drives the hole-punching pass that reclaims unused space inside the data file. The file is deleted once the checkpoint is complete. + +### Why are there two slots (the `[2]` arrays)? + +OrioleDB checkpoints run concurrently with normal DML. To avoid serialising checkpoint N against the initialisation of checkpoint N+1, each of the arrays (`nextChkp`, `tmpBuf`, `datafileLength`, `partsInfo`) is indexed by `checkpointNumber % 2`. + +At any moment exactly one slot is "active" – the slot currently being written by the in-progress checkpoint – while the other slot either still holds data from the previous checkpoint (needed for recovery until the new checkpoint is verified) or is idle. + +``` +Checkpoint N → uses slot N % 2 +Checkpoint N+1 → uses slot (N+1) % 2 (the other slot) +``` + +This ping-pong scheme means checkpoint N+1 can begin allocating pages and initialising its seq buf files while checkpoint N is still finalising, without any shared state collision. + +The two dirty flags on the meta page (`dirtyFlag1`, `dirtyFlag2`) support the same scheme. `dirtyFlag1` is cleared at the start of the checkpoint. 
`dirtyFlag2` provides an extra generation so that a modification racing the clear of `dirtyFlag1` is never silently lost. If both flags are false when the checkpointer is about to start processing a tree, the tree has not changed since the last checkpoint; the freshly initialised seq buf files are closed and removed immediately without writing any data. + +### When are sequential buffers removed? + +Seq buf **in-memory pages** are returned to the page pool as soon as the buffer is finalised. This happens in one of the following situations: + +- **Checkpoint completes successfully** – `tmpBuf` pages are freed after post-processing; `nextChkp` pages are freed after the map file header is written and the file is renamed. +- **Tree descriptor is evicted** – when a tree descriptor is reclaimed from the descriptor cache, `btree_finalize_private_seq_bufs()` flushes and frees all in-memory pages belonging to that descriptor's active seq bufs. +- **Tree is dropped** – `checkpointable_tree_free()` closes all seq buf file descriptors, freeing the OS resources. + +Seq buf **on-disk files** have a longer lifetime than the in-memory pages: + +- The **tmp file** (`'t'`) is deleted once the checkpoint that created it has finished post-processing. +- The **map file** (`'m'`) from checkpoint N is kept until checkpoint N+1 has been completed and its own map file is in place. This ensures that point-in-time recovery can always find the latest clean checkpoint. +- The **free-extent file** is replaced atomically: the new file is written and renamed into place before the old one is unlinked. +- If a tree was **not modified** between two checkpoints (both dirty flags are false), the seq buf files initialised for that checkpoint are closed immediately and removed without writing any data. 
\ No newline at end of file diff --git a/contrib/orioledb/doc/architecture/concurrency.mdx b/contrib/orioledb/doc/architecture/concurrency.mdx new file mode 100644 index 00000000000..d1b0f224aa9 --- /dev/null +++ b/contrib/orioledb/doc/architecture/concurrency.mdx @@ -0,0 +1,94 @@ +--- +id: concurrency +sidebar_label: Concurrency +--- + +# Concurrency algorithms in OrioleDB B-tree + +## Page change count + +In OrioleDB B-tree, we do not lock pages unless we need to modify them. Therefore, after traversing the downlink, we need to check whether we reached the target page (because it could be concurrently evicted, merged, etc.). In order to cope with that, each OrioleDB in-memory page has an `OrioleDBPageHeader.pageChangeCount` field, which gets incremented every time the in-memory page changes its identity. The in-memory page identity means a particular tree, level, and lokey (left bound of the page key range). Although the page lokey is not stored on the page, it is structurally determined. Therefore, the left page of a split and the left page of a merge keep their identities. Thanks to the change count mechanism, the process traversing the downlink can detect a concurrent change of the page identity. + +## Rightlinks + +In OrioleDB, in-memory downlinks can be changed to on-disk downlinks and vice versa on page eviction and load, respectively. Therefore, OrioleDB pages do not normally have rightlinks or leftlinks. Rightlinks temporarily exist during split to prevent blocking tree navigation. Consider the page split process in the pictures below. + +Step 1 depicts the initial state of part of the tree comprising parent page 1 and child page 2. + +![Split step 1](../images/split_step_1.svg) + +Step 2 depicts the split of page 2, creating the new page 3. At this point, page 2 has a rightlink to page 3. At this point, if a concurrent process is looking for the key located on page 3, it will check the hikey of page 2 and traverse to page 3 via rightlink. 
The keys located on page 2 could be found as usual. + +![Split step 2](../images/split_step_2.svg) + +Step 3 depicts the insertion of the new downlink to page 1. The rightlink between pages 2 and 3 still persists. At this point, the concurrent process can locate page 3 via downlink. If the concurrent process came to page 2 before downlink insertion, it still could use the rightlink. + +![Split step 3](../images/split_step_3.svg) + +Step 4 depicts the removal of the rightlink from page 2 to page 3. If a concurrent process, which came to page 2 before downlink insertion, is looking for a key located on page 3, then it has to check the rightlink and restart from page 1. + +![Split step 4](../images/split_step_4.svg) + +## Page eviction + +Page eviction is forbidden for the page with a rightlink or without a downlink from the parent. Generally, eviction is forbidden for both the source and the target of a rightlink. Therefore, page eviction does not have to deal with rightlinks. Rightlinks can connect only in-memory pages. Consider the page eviction process in the pictures below. + +Step 1 depicts the initial state of part of the tree comprising parent page 1 and child page 2. Page 2 is locked and should be evicted. At this point, the evicting process should find and lock the parent page 1 using the page 2 hikey to find it. + +![Evict step 1](../images/evict_step_1.svg) + +Step 2 depicts both page 1 and page 2 locked. At this point, the evicting process replaces the in-memory downlink with an IO downlink and increases the page change count. The IO downlink prevents concurrent processes from using it, making them wait till IO is completed. The increased change count prevents all the concurrent processes, which managed to use the in-memory downlink, from using page 2. + +![Evict step 2](../images/evict_step_2.svg) + +Step 3 depicts page 1 disconnected from its child with IO downlink. + +![Evict step 3](../images/evict_step_3.svg) + +Finally, the evicting process writes the on-disk downlink to page 1 (step 4). 
+ +![Evict step 4](../images/evict_step_4.svg) + +## Page load + +When Postgres backend needs a page referenced by the on-disk downlink on OrioleDB non-leaf page, it has to load that page. + +Step 1 depicts the initial state of the non-leaf page 1 before loading its child. Page 1 is locked and has an on-disk downlink. At this point, the process needs to replace the downlink with IO in-progress one, unlock page 1 and start the IO. + +![Load step 1](../images/load_step_1.svg) + +Step 2 depicts the page 1 state while the IO is in progress. All the concurrent processes dealing with that downlink must wait until IO is completed. When the IO is completed, our process needs to relock page 1. + +![Load step 2](../images/load_step_2.svg) + +Step 3 depicts the state when page 1 is relocked. At this point, our process needs to add child page 2 and make the downlink on page 1 point to page 2. + +![Load step 3](../images/load_step_3.svg) + +Step 4 represents the final state. Page 2 is loaded, and page 1 contains the in-memory downlink to page 2. + +![Load step 4](../images/load_step_4.svg) + +## Page merge + +When OrioleDB has a page candidate for eviction, and that page is too sparse (with less than 30% of space busy), it considers page merging. Page merging will also free an in-memory page but does not need an IO, even for a dirty page. + +Step 1 depicts the initial state of part of the tree comprising parent page 1, child page to be merged 3 (locked), left sibling 2, and right sibling 4. At this point, we need to lock the parent page 1. We release child page 3 lock first. + +![Merge step 1](../images/merge_step_1.svg) + +Step 2 depicts the state where parent page 1 is locked, but the child is not. At this point, we need to relock the child page 3 to be merged. If this page is gone (due to concurrent eviction or another merge), then give up with merging. We also check that parent is not under checkpoint and give up otherwise. 
+ +![Merge step 2](../images/merge_step_2.svg) + +Step 3 depicts both parent page 1 and child page 3 locked. Here we need to select the way to merge. We check that child page 3 is not under checkpoint and does not have a rightlink. Give up otherwise. + +![Merge step 3](../images/merge_step_3.svg) + +Step 4 depicts two possible ways to merge: with right sibling 4 (upper picture) or with left sibling 2 (lower picture). Note that the left page always keeps its identity, whichever direction we choose. So if we merge to the left, page 3 will be removed. We cannot merge with a sibling that is under checkpoint or has a rightlink. + +![Merge step 4](../images/merge_step_4.svg) + +Step 5 depicts the merge result. In this example, we merged to the right. All the tuples on page 4 were merged into page 3. The result page is marked as page 34. Page 34 also has a hikey of page 4. Also, we remove page 4 and the corresponding downlink on page 1. + +![Merge step 5](../images/merge_step_5.svg) diff --git a/contrib/orioledb/doc/architecture/fsm.mdx b/contrib/orioledb/doc/architecture/fsm.mdx new file mode 100644 index 00000000000..254a448ecc6 --- /dev/null +++ b/contrib/orioledb/doc/architecture/fsm.mdx @@ -0,0 +1,86 @@ +--- +id: fsm +sidebar_label: Free space management +--- + +# Free space management + +OrioleDB manages free space for regular and compressed trees using a combination of metadata files, checkpoints, and system trees to efficiently allocate and track free blocks and extents. + +## Regular trees + +Each OrioleDB tree has an associated data file. The length of that datafile is kept in the `BTreeMetaPage.datafileLength`. It does not necessarily match the actual file length. The data file might be shorter than `BTreeMetaPage.datafileLength`: it means that some pages have their offsets virtually allocated but have not been written there yet. 
Also, the data file might be longer than `BTreeMetaPage.datafileLength`: some pages were written beyond it during a previous database run that ended in a crash, and those places should now be considered free space. + +Each tree checkpoint has two associated files for free space management. + +- `*.tmp` file contains block numbers, which have been freed from previous checkpoint completion to the current checkpoint completion. This file is optional and might not exist if no blocks were freed in that period. +- `*.map` file, which contains: + - link to the tree root, + - the data file length during the checkpoint completion, + - the array of free block numbers in this checkpoint. + +Therefore, root and all the blocks directly or indirectly referenced by root are considered busy in this checkpoint. All other blocks within the datafile length are considered free and should be listed in the array of free blocks. + +When OrioleDB needs a block to write the page, it gets it in the following order: + +- from the array of free block numbers in the `*.map` file of the checkpoint where the database instance was started, +- from the `*.tmp` files of further completed checkpoints, if any, +- by increasing the data file length (atomic increment of `BTreeMetaPage.datafileLength`). + +The picture below illustrates the page writing of the tree when there is no concurrent checkpointing. + +![Free space management 1](../images/fsm_1.svg) + +If the page to be written already has an associated block in the data file, that block number is written to the `*.tmp` and `*.map` files of the next checkpoint. On the picture, the previous block number is written to the `*.tmp` and `*.map` files of checkpoint 2. + +This page's new block number is acquired according to the rules described above. On the picture, we get it from the `*.tmp` file of checkpoint 1. + +The picture below illustrates the page writing of the tree when there is concurrent checkpointing. 
+ +![Free space management 2](../images/fsm_2.svg) + +In the picture above, checkpoint 1 is completed, while checkpoint 2 is in-progress. Checkpointer processes trees in some deterministic order. + +For the trees passed by checkpointer, the in-progress checkpoint is considered completed. In the picture, tree 1 is passed by checkpointer. Therefore, checkpoint 2 is considered completed and checkpoint 3 as future. + +For the trees not yet reached by checkpointer, the in-progress checkpoint is the same as the non-started. In the picture, tree 3 is not yet reached by checkpointer. Therefore, checkpoint 1 is considered completed and checkpoint 2 as future. + +The tree under checkpointer is the most complicated case. The checkpointer has already written some parts of the tree. When we need to write that part of a tree, we cannot write it in-place because it would violate the copy-on-write principle. Instead, we write the pages to the new place. That place is the checkpoint next to the in-progress one. Thus, when the checkpointer walks the tree from left to right, the particular logic depends on whether the checkpointer already passed the particular page. + +- When the page to be written is already passed by checkpointer and has an associated block in the data file, that block is written to the `*.tmp` and `*.map` files of the checkpoint next to the in-progress one. On the picture, we write the previous block number of page 5 of tree 2 to the `*.tmp` and `*.map` files of checkpoint 3. + +- When the page to be written is already passed by checkpointer, the new block number is acquired according to the rules described above. The in-progress checkpoint `*.tmp` file still cannot be the source for new blocks because it is not completed for this tree. However, when we get the new block number, we have to also write it to the `*.map` file of the in-progress checkpoint because that block belongs to the next checkpoint and is free for the in-progress checkpoint. 
On the picture, we get a new block number for page 5 of tree 2 from the `*.tmp` file of checkpoint 1 and also write it to the `*.map` file of checkpoint 2. + +- When the page to be written is not yet passed by the checkpointer, it works similarly to the trees not yet reached by the checkpointer. The same rules apply to the checkpointer when it writes the pages itself. If there is an associated block in the data file, that block number is written to the `*.tmp` and `*.map` files of the in-progress checkpoint. On the picture, the previous block number of page 6 of tree 2 is written to the `*.tmp` and `*.map` files of checkpoint 2. The page's new block number is acquired according to the rules described above. On the picture, we get a new block number for page 6 of tree 2 from the `*.tmp` file of checkpoint 1. + +When checkpointer finishes checkpointing the particular tree, it needs to finish `*.map` file. In particular, it adds all the free blocks available in the `*.map` and `*.tmp` files (according to the rules) to the `*.map` file and writes the current value of `BTreeMetaPage.datafileLength` as the file length. + +## Compressed trees + +OrioleDB implements page-level compression. Pages have fixed size in-memory but variable size in the data file. Therefore, the free space management described above needs some advancements. + +At first, `*.tmp` and `*.map` files contain free extents rather than free block numbers. An extent comprises an offset and a length. + +Since we are dealing with extents, we cannot just read `*.tmp` and `*.map` files sequentially because we may meet extents that don't match the required length. In order to deal with free extents, we also have system trees `SYS_TREES_EXTENTS_OFF_LEN` and `SYS_TREES_EXTENTS_LEN_OFF`. The `SYS_TREES_EXTENTS_LEN_OFF` is ordered by the extent length, and we use it to find the extent which is the best fit for our needs. The `SYS_TREES_EXTENTS_OFF_LEN` is ordered by the extent offset, and we use it to join adjacent extents. 
+ +`SYS_TREES_EXTENTS_OFF_LEN` and `SYS_TREES_EXTENTS_LEN_OFF` trees are temporary. Their content does not survive server restart: we always start with these trees empty. Once we load some tree, we also load the content of its `*.map` file to `SYS_TREES_EXTENTS_OFF_LEN` and `SYS_TREES_EXTENTS_LEN_OFF`. When the checkpoint of a tree is completed, we read the content of `*.tmp` file of the previous checkpoint to these trees. When the checkpoint is completed, we add extents from those trees to the `*.map` file. + +Multiple processes could be concurrently looking for the free extent, and one process inserting a new free extent. Correct handling the concurrency problem is a challenge. The algorithms are considered below. + +The algorithm for getting a free extent for writing the page is given below. + +1. Find the shortest fitting extent in the `SYS_TREES_EXTENTS_LEN_OFF` tree and delete it. If not found, increase `BTreeMetaPage.datafileLength` by the required length and return the corresponding extent. +2. If there is a remaining part of the selected extent, insert it into the `SYS_TREES_EXTENTS_OFF_LEN` tree. +3. Delete the selected extent from the `SYS_TREES_EXTENTS_OFF_LEN` tree. +4. If there is a remaining part of the selected extent, insert it into the `SYS_TREES_EXTENTS_LEN_OFF` tree. + +Please, note that if the concurrent process needs to merge the remaining part, it will always find some extent for merge. In `SYS_TREES_EXTENTS_OFF_LEN`, we insert first (step 2) and only then delete (step 3). So, there is no intermediate state with no extent to merge. + +The algorithm for insertion of a new free extent is given below. + +1. In the `SYS_TREES_EXTENTS_OFF_LEN` tree, find the left and right siblings of the new extent. Check if they are adjacent to a new extent; thus, we need to merge them. +2. If we need to merge the left sibling, delete it from the `SYS_TREES_EXTENTS_LEN_OFF` tree. On failure, re-try from step 1. +3. 
If we need to merge the right sibling, delete it from the `SYS_TREES_EXTENTS_LEN_OFF` tree. On failure, re-insert the left sibling if it was deleted, and re-try from step 1. +4. At this point concurrent process cannot use either left or right siblings because they were deleted from the `SYS_TREES_EXTENTS_LEN_OFF` tree. +5. Delete siblings to be merged from the `SYS_TREES_EXTENTS_OFF_LEN` tree. +6. Insert new extent (with siblings merged) into the `SYS_TREES_EXTENTS_OFF_LEN` tree, then the `SYS_TREES_EXTENTS_LEN_OFF` tree. diff --git a/contrib/orioledb/doc/architecture/overview.mdx b/contrib/orioledb/doc/architecture/overview.mdx new file mode 100644 index 00000000000..c3e7f5d6f2e --- /dev/null +++ b/contrib/orioledb/doc/architecture/overview.mdx @@ -0,0 +1,167 @@ +--- +id: overview +sidebar_label: Overview +--- + +# OrioleDB Architecture Overview + +OrioleDB structures data in index-organized tables, employs dual pointers for efficient memory management, and utilizes a novel page structure, copy-on-write checkpoints, an undo log, and row-level WAL for robust transaction management and recovery, while also supporting system catalog and data compression for optimized operations. + +## Indexes + +OrioleDB stored data in index-organized tables. Consider the example of the `vegetables` table, which has a primary index on `id` and a secondary index on `count`. + +| Id | Name | Count | +| --- | -------- | ----- | +| 1 | Tomato | 10 | +| 2 | Cucumber | 3 | +| 3 | Cabbage | 7 | +| 4 | Melon | 6 | + +The data structure behind the `vegetables` table is given below. Both primary and secondary indexes comprise variations of B+-tree. Leaf tuples are marked blue, non-leaf tuples are marked green, high-keys are marked yellow. The primary index's non-leaf tuples and high keys consist of the primary key columns; leaf tuples are table rows themselves. Non-leaf and leaf tuples, high keys of the secondary index consists of secondary index columns and primary index columns. 
+ +![Indexes of vegetables table](../images/vegetables_indexes.svg) + +The primary index and secondary index are not tied together; they are only connected by logical values persisting in table rows. OrioleDB automatically creates a primary index on the virtual `ctid` column when no primary index is given. + +## Dual pointers + +OrioleDB uses a so-called `dual pointers` scheme to avoid buffer mapping and corresponding bottlenecks. The idea of `dual pointers` is that downlinks in non-leaf in-memory pages could point to either in-memory or storage pages. Therefore, one could navigate in-memory pages using direct links without any buffer table. The storage page must be loaded into the main memory if one needs to traverse downlink, which points to storage. The corresponding downlink had to be replaced with an in-memory downlink. + +The diagram below shows the simplified structure of the OrioleDB B-tree. This diagram does not detail the page contents: main memory pages are numbered the same as their storage prototypes. Arrows depict downlinks. + +![Dual pointers scheme](../images/dual_pointers.svg) + +The main memory page could refer to both main memory and storage pages. However, storage pages can refer to storage pages only. Therefore, besides main memory and storage pages `1`, `2` and `3` are marked the same as storage pages `1`, `2` and `3`, their contents are different in the above matter. + +In order to implement this scheme, we have to sacrifice rightlinks. That would be too complex (and slow?) to toggle downlinks and rightlinks between in-memory and storage pointers. + +Technically, OrioleDB B-trees still contain rightlinks, but they have only temporary usage during page splits. Rightlink exists only between splitting a new page and insertion downlink to the parent. Therefore, if the completed split happens concurrently with locating a tree page, one must retry from the parent (see find_page() function). 
Stepping tree pages right and left becomes more complex too. Instead of using rightlinks (and leftlinks) one has to find siblings from parent (see find_right_page() and find_left_page()). However, this complexity is more than justified by better vertical scalability. + +See [concurrency algorithms in OrioleDB B-tree](concurrency.mdx) for details. + +## Page structure + +OrioleDB implements a novel page structure optimized for modern multi-core machines. + +There is a `state` atomic variable in a page header. This atomic variable provides the following functionality: + +1. Exclusive lock on a page. There could be only one exclusive page locker at a time. The exclusive lock itself does not give the right to modify page contents or prevent other processes from reading the page. +2. Exclusive locker may upgrade his lock to block page readers. Once readers are blocked, the locker may start modification of page contents. +3. Tracking change count for page contents. Thanks to that, one may copy part of the page (or do some computations) and check that page was not changed concurrently (and retry if it was). + +See `src/btree/page_state.c` for details. + +The tuples on the page are split into chunks. There is also an area of high keys, where each chunk has an associated high key. The high key of the last chunk is simultaneously the high key of the page. Thanks to this, if one needs a particular tuple on the page, he does not need to copy the whole page. It is enough to copy the high keys area, find the appropriate page chunk, and copy it. `PartialPageState` structure is responsible for tracking partially read pages. + +![Page structure](../images/page_structure.svg) + +In the future, we plan to get rid of copying in the majority of page access patterns and implement vectorization for faster search within the page. + +## Copy-on-write checkpoints + +OrioleDB utilizes copy-on-write checkpoints and row-level WAL. 
The example below illustrates the big picture of copy-on-write checkpoints. + +For example, page number `7` was modified. It was marked as `7*`. + +![Copy-on-write checkpoint 1](../images/cow_1.svg) + +Checkpoint has written `7*` to the storage. It has written to the free space according to the copy-on-write principle. When checkpoint considers writing a non-leaf page, it replaces in-memory downlinks with storage ones. Therefore, page `3` is also considered modified because we need to reference the new `7*` from the storage page. So, page `3*` is also written to the free storage space. Similar to `1*`. + +![Copy-on-write checkpoint 2](../images/cow_2.svg) + +Once the checkpoint is completed, old storage page images `1`, `3`, and `7` are marked as free space. + +![Copy-on-write checkpoint 3](../images/cow_3.svg) + +Therefore, a consistent tree image exists in storage every moment. + +OrioleDB supports fuzzy checkpointing. That is, we allow tree modification concurrent to checkpointing. That is essential because too fast or frequent checkpoints could cause a write flood. + +Consider the following example. The tree contains pages `1 – 7`. Pages `1 – 6` are present in both main memory and storage (checkpoint 1), while page `7` is present in storage only. Page `4` was modified (`4*`). + +![Concurrent checkpoint 1](../images/checkpoint_concurrent_1.svg) + +Checkpointing was started to traverse the tree from left to right, and it passed the subtree of pages `2`, `4` and `5`. Page images `4*` and `2*` were written to checkpoint 2. + +![Concurrent checkpoint 2](../images/checkpoint_concurrent_2.svg) + +Concurrently, page `5` was modified (`5*`). Background writer wrote page image `5*`. Page `5` belongs to the tree part, which is already processed by checkpointer. That is why we cannot write it to checkpoint 2, because it can affect its consistency. + +Page number `7` was also concurrently modified but was not written by a background writer. 
+ +![Concurrent checkpoint 3](../images/checkpoint_concurrent_3.svg) + +Then, checkpointer finished writing page images `7*`, `3*` and `1*` to checkpoint 2. + +![Concurrent checkpoint 4](../images/checkpoint_concurrent_4.svg) + +In general, checkpointing of non-leaf pages is more tricky than described above. While the checkpointer is writing children of non-leaf page, concurrent splits and merges could happen. In such cases, we have to reconstruct non-leaf pages based on the state of its children as we met them. Therefore, we might write to the storage a non-leaf page image, which never existed in the main memory. Furthermore, we could even write multiple storage pages corresponding to a single main memory page (imagine merges happen above the checkpoint boundary, while splits happen below the checkpoint boundary). Finally, that is OK, because it reflects how checkpointer wrote the children. + +At any moment, there could be multiple checkpoints which use different but overlapping sets of blocks. Therefore, [free space management](fsm.mdx) becomes a nontrivial task. + +See [the detailed description of checkpointing algorithm](checkpoints.mdx). + +## Undo log + +OrioleDB implements transactions and MVCC using UNDO log. The row-level undo records comprise chains of row versions. Particularly, tuple headers are connected in lists, where the head is located on the data page, and the rest of the elements are located in undo log. See the diagram below. + +![Row level undo](../images/row_level_undo.svg) + +Some undo records include both tuple header and tuple body (update record), while some do not alter tuple body and contain just tuple header (delete and row-level lock records). + +Besides a presence in row versions chains, undo records are also present in transaction chains. If the transaction aborts, the corresponding chain is traversed to replay all the undo records. + +The snapshot of undo log is written out during checkpointing. 
We need this because, during recovery, we might need to rollback some of the transactions that were in progress during checkpointing. Besides checkpointing, we do not have to write the undo log to the storage except when it does not fit to corresponding shared memory. + +Sometimes we need to replay only part of the transaction chain. For instance, OrioleDB's implementation of `ROLLBACK TO SAVEPOINT` replays the transaction's undo log chain to the given point. Aborting speculative insertions during `INSERT ... ON CONFLICT ...` works the same way. However, once we replay some part of the transaction undo the chain, we might still need it during recovery because corresponding data could be (partially) checkpointed before replay. In order to track this, we add special `branch` undo records, which gives recovery a possibility to walk already replayed branches. + +Some undo records also require some action on commit. For instance, our implementation of `TRUNCATE` issues an undo record, which deletes old relfilenodes on transaction commit and deletes new relfilenodes on a transaction abort. We track the separate list for undo records requiring on-commit actions. + +OrioleDB also supports block-level undo records. The block-level undo records are the changes applied to the whole page. + +The diagram below gives an example of a `compact` block-level undo record. Here the data pages contains tuples `t1`, `t2` and `t5`. However, a page image in the undo log contains tuples `t1`, `t2`, `t3`, and `t4`. That means, when tuples `t3` and `t4` were deleted, we lacked space for a new tuple `t5`. In order to do this, we made a `compaction` first. Therefore, we issue a page-level undo record and erase tuples `t3` and `t4` to fit `t5`. + +![Block level undo](../images/block_level_undo.svg) + +OrioleDB has three types of block-level undo records: + +1. Compact undo record: one data page references one undo page image, +2. 
Split undo record: two data pages reference one undo page image, +3. Merge undo record: one data page references two undo page images. + +OrioleDB uses both circular buffers and block buffers for accessing the undo log. See [undo log storage](buffering.mdx) for details. + +## Row-level WAL + +OrioleDB implements a row-level write-ahead log (WAL) used for both recovery and replication. Row-level WAL requires structurally consistent checkpoints described above. Row-level WAL records include transaction start, row insert, row update, row delete, transaction commit/abort etc. The complete list of OrioleDB's WAL records is given in `include/recovery/wal.h` header. + +OrioleDB table has the following B-trees for its data: + +1. TOAST tree, +2. Primary key tree, +3. Secondary keys trees. + +TOAST and primary key trees are subjects of WAL-logging, while secondary keys trees aren't. Secondary keys are recovering based on changes in TOAST and primary key. + +Since OrioleDB implements fuzzy checkpointing, we require idempotency property here. Checkpointer visits TOAST and primary key before secondary keys. Therefore, secondary keys might have a newer state than TOAST and primary keys in the checkpoint. Idempotency guarantees that if some changes are applied to secondary keys twice or more, it does not affect the final state. + +OrioleDB implements parallel application of WAL records. It launches `orioledb.recovery_pool_size` number of workers. Each worker is responsible for its own set of primary key values (according to hash value). The startup process distributes row-level WAL records to the queues connected to workers. + +![Block level undo](../images/recovery_wokers.svg) + +Queues might be processed at different paces. In order to avoid MVCC anomalies, we assume the transaction to be committed and visible for readers only once all the workers have completed all the pieces of work associated with that transaction. 
+ +See [the details about OrioleDB's recovery](recovery.mdx). + +## System catalog + +In OrioleDB, checkpointing, page writing/eviction, and recovery manipulate trees logically. In order to do so, they need to be able to compare tuple keys, calculate tuple lengths, etc. One who performs the routines above needs access to some meta-information. However, we need to do these routines without a fully initialized connection to PostgreSQL database. + +In order to resolve this problem, OrioleDB implements "system trees" -- OrioleDB's analog of system catalog with minimal information required to perform the routines. System trees contain information about tables, associated trees, data types etc. See `src/catalog/sys_trees.c` for details. + +## Data compression + +OrioleDB supports data compression at the block level. In compressed trees, storage pages are compressed with the `zstd` algorithm. That is, storage pages have variable lengths. Because of this, we have to maintain more complex mechanisms for managing free space in compressed trees. + +OrioleDB implements this free space management using two system trees: `SYS_TREES_EXTENTS_OFF_LEN`, `SYS_TREES_EXTENTS_LEN_OFF`. Both these trees contain information about free space extents. The first maintains extents sorted by offset, while the second one sorts extents by length then offset. The second tree allows searching for the most appropriate free extent for the new storage block. The first tree is used to find adjacent extents and initiate their join. 
diff --git a/contrib/orioledb/doc/architecture/recovery.mdx b/contrib/orioledb/doc/architecture/recovery.mdx new file mode 100644 index 00000000000..8d50b240b7f --- /dev/null +++ b/contrib/orioledb/doc/architecture/recovery.mdx @@ -0,0 +1,67 @@ +--- +id: recovery +sidebar_position: 1 +sidebar_label: Recovery & replication +--- + +# Recovery & replication + +OrioleDB leverages a distributed recovery mechanism, assigning each worker process its own set of primary key values to manage, facilitating scalable and efficient recovery and replication by splitting large transactions across multiple workers. + +## Splitting work between multiple processes + +OrioleDB implements multiprocess recovery and replication (technically, the replication in PostgreSQL is network-based recovery allowing concurrent read-only queries). The main recovery process reads the WAL stream and distributes the messages to recovery workers via queues. + +Unlike other solutions, we do not distribute work between workers transaction-wise. Instead, each worker is responsible for its own set of keys (values of the primary key for each table). Therefore, the large transaction will be split into chunks for each worker. The essential advantage of this approach is the ability to scale the recovery and replication independently of the degree of transaction parallelism. + +The picture below illustrates the recovery scheme involving the main recovery process and four recovery workers. The DMLs in the example are related to some single table, in which the primary key is the `id` column of integer type. The WAL stream contains transaction 1, comprising an insert with `id = 1` and an update with `id = 2`, and transaction 2, comprising a delete with `id = 2` and an insert with `id = 3`. The main recovery process distributes these operations to the queues based on the hash of the `id` column (`id = 1` to queue 1, `id = 2` to queue 2, `id = 3` to queue 3). 
+ +![Distribution of messages to recovery queues](../images/recovery_distribute.svg) + +Note that the main process does not distribute the transaction begin message because it does not know which workers will be involved in the transaction. Instead, it attaches the transaction id to the row modification messages. Also, note that OrioleDB transactions are not necessarily continuous chunks in WAL. They could be interleaved. + +The main recovery process tracks which worker participates in which transaction. Once the transaction is committed or aborted in the WAL stream, the main process spreads this message to the participating workers. + +Recovery workers are not synchronized on each transaction finish. So, worker #2 can process the `delete` message before worker #1 completes the `commit` message. It is possible because each worker has its own notion of finished transactions. + +Recovery processes store the recovery transaction statuses in `recovery_xid_state_hash`. When transaction status needs to be clarified, the recovery process first checks `recovery_xid_state_hash` and only then the shared memory. The main recovery process updates transaction status in the shared memory only when all the worker processes have already processed the transaction finish message. Consequently, once the transaction status is updated in shared memory, the worker process can remove its entry from the hash. + +Given that transaction is effectively split between the main process and multiple workers during recovery, the corresponding undo log is also split. The picture below illustrates this. The transaction may have the undo chain in the main process. That chain reflects transaction undo records that existed during checkpointing as well as undo of actions replayed by the main recovery process (such as DDL). Simultaneously, the transaction may have one or more undo chains in the recovery workers. 
+ +![Undo chains during recovery](../images/recovery_undo.svg) + +## Primary keys, TOAST, and secondary indexes + +Recovery must bring all the table trees into a consistent state: primary key, TOAST, and secondary indexes. Primary key and TOAST are the primary information, while secondary indexes could be derived from them. That is why the OrioleDB WAL logs only changes in primary keys and TOAST trees. + +The key of OrioleDB TOAST tree contains: + +1. Value of primary key, +2. Attribute number for TOASTed value, +3. Offset within the TOASTed value. + +Therefore the single value is represented by one or more leaf tuples in TOAST trees with different offsets (starting from zero). + +There could be multiple versions of the same tuple in the same transaction. Correspondingly there could be multiple versions of the TOASTed values (if they got updated). Therefore, we need to correctly match the version of a primary key tuple to that of a TOAST tuple. We handle this by attaching the version number to the tuple, as depicted below. + +![Versions for toast tuples](../images/toast_version.svg) + +The version number is transaction-wise. Thus, in each new transaction, the version number starts from zero. Zero version number is the default. If the tuple does not contain the version number, then the version is zero. When the primary key tuple belonging to the in-progress transaction gets updated within the same transaction, its version increases. The TOASTed fields get updated, and TOAST tuples get the same version as the new primary key tuple. Therefore when we need to find the TOAST tuple corresponding to the given primary key tuple, we should find the tuple with the greatest version less than or equal to the primary key tuple's version. + +OrioleDB needs to recover secondary indexes from the TOAST trees and primary key trees. Secondary indexes might be built on the TOASTed values, which complicates things. + +Therefore, OrioleDB writes checkpoints in the following order: + +1. 
TOAST trees, +2. Primary key trees, +3. Secondary index trees. + +Also, we are writing to the WAL TOAST tuples first and then primary key tuples. + +When the checkpointing of the primary key trees is finished, we mark the current WAL position as the "toast consistency point". See the picture below. + +![Recovery of secondary indexes](../images/recovery_secondary_indexes.svg) + +We only apply WAL records to TOAST and primary key trees during recovery before the toast consistency point. We cannot "lose" any secondary index changes in that period because secondary index trees were checkpointed later. Thus, secondary indexes already contain all the changes made before the toast consistency point. + +After the toast consistency point, we start to apply changes to the secondary indexes. Since TOAST WAL records are going first, we can fetch all the TOASTed values we need (if any) and apply the changes to the secondary indexes while applying the primary key WAL record. diff --git a/contrib/orioledb/doc/architecture/row-level-concurrency.mdx b/contrib/orioledb/doc/architecture/row-level-concurrency.mdx new file mode 100644 index 00000000000..51e4ed4ab15 --- /dev/null +++ b/contrib/orioledb/doc/architecture/row-level-concurrency.mdx @@ -0,0 +1,86 @@ +--- +id: row-level-concurrency +sidebar_label: Row-level concurrency +--- + +# Row-level concurrency + +Row-level concurrency in OrioleDB is as close as possible to regular PostgreSQL tables. However, there are still some differences because tables in OrioleDB are index-organized. + +## Update of the primary key + +An update of the primary key is typically a rare and unusual situation in an index-organized table. Unlike a regular update, an update of the primary key is implemented as a sequence of delete and insert. A concurrent update or delete can't follow this update and may result in an error. See the example below. Session 2 gets an error due to a concurrent primary key update in session 1. 
+ +```sql +CREATE TABLE tbl +( + id int4 primary key, + value numeric NOT NULL +) USING orioledb; + +INSERT INTO tbl VALUES (1, 0.0); +``` + +```bash title="Session 1" +> BEGIN; +> UPDATE tbl SET id = 2 WHERE id = 1; +UPDATE 1 +``` + +```bash title="Session 2" +> BEGIN; +> UPDATE tbl SET value = value + 1 WHERE id = 1; +(waiting) +> COMMIT; +COMMIT +ERROR: tuple to be locked has its primary key changed due to concurrent update +> ROLLBACK; +ROLLBACK +``` + +## Following the update chain + +If a row is deleted and then a new row with the same primary key value is immediately inserted, then a concurrent update or delete may consider the new row as a new version of the old row. See the example below. Session 1 deletes a row and then inserts a row with the same primary key value. Session 2 was intended to update the initial row, but ends up updating the newly inserted row. + +```sql +CREATE TABLE tbl +( + id int4 primary key, + value numeric NOT NULL +) USING orioledb; + +INSERT INTO tbl VALUES (1, 0.0); +``` + +```bash title="Session 1" +> BEGIN; +> DELETE FROM tbl WHERE id = 1; +DELETE 1 +> INSERT INTO tbl VALUES (1, 0.0); +INSERT 0 1 +``` + +```bash title="Session 2" +> BEGIN; +> UPDATE tbl SET value = value + 1 WHERE id = 1; +(waiting) +> COMMIT; +COMMIT +UPDATE 1 +> COMMIT; +COMMIT +``` + +## Transaction ID Allocation and Heap Relations + +OrioleDB optimizes concurrency by utilizing virtual transaction IDs (Virtual XIDs) for operations confined exclusively to OrioleDB tables. This bypasses the overhead of allocating a full PostgreSQL Transaction ID (XID) and the associated Write-Ahead Logging (WAL) required for heap relations. + +Modifying any PostgreSQL `heap` relation within an OrioleDB transaction forces the allocation of a full XID. This alters the concurrency path and negates the Virtual XID optimization. + +### Sequence Generation Overhead + +PostgreSQL sequences are backed by `heap` relations. 
Using `nextval()` modifies the sequence relation, triggering a full XID allocation. To preserve OrioleDB's optimized transaction mechanics: + +* **Enable Sequence Caching:** Configure sequences with a `CACHE` directive (e.g., `CREATE SEQUENCE my_seq CACHE 100;`). +* **Mechanism:** Caching restricts heap modification (and full XID allocation) to the single `nextval()` call that fetches the cache block. Subsequent calls retrieve values from session memory. +* **Workload Impact:** This optimization can be critical for workloads with high volumes of small transactions. In large transactions, the performance penalty of a single XID allocation is heavily amortized. \ No newline at end of file diff --git a/contrib/orioledb/doc/contributing/docker-builds.mdx b/contrib/orioledb/doc/contributing/docker-builds.mdx new file mode 100644 index 00000000000..7e23b730b6c --- /dev/null +++ b/contrib/orioledb/doc/contributing/docker-builds.mdx @@ -0,0 +1,318 @@ +--- +id: docker-builds +sidebar_label: Docker Builds +--- + +# Building Docker images + +This document provides instructions on how to build Docker images for OrioleDB, and how to test them. + +#### Prerequisites + +Before you begin, make sure you have Docker installed on your local machine. If not, you can download and install it from the Docker official website. 
https://docs.docker.com/get-docker/ + +- `docker -v` + +## Quickstart + +Open a terminal and navigate to the OrioleDB project directory, if you are not already in it: + +- `cd path/to/orioledb` + +Build (Alpine) PostgreSQL 17 + OrioleDB extension: + +- `docker build -t orioletest:17 -f docker/Dockerfile --pull --network=host --progress=plain --build-arg PG_MAJOR="17" .` + +Start server: + +- `docker run --name orioletest17 -v orioletest17data:/var/lib/postgresql/data -e POSTGRES_PASSWORD=oriole123 -d orioletest:17` + +Connect to the server via psql: + +- `docker exec -ti orioletest17 psql -U postgres` + +You should expect a similar psql message: + +``` +psql (17.7 OrioleDB pre-2 beta 16 PGTAG=patches17_18 alpine:3.21+clang build:2025-10-25T19:54:25+00:00 17.7) +Type "help" for help. +postgres=# +``` + +Enable orioledb extension: + +- `create extension if not exists orioledb;` + +Test some commands: + +``` +postgres=# select orioledb_version(); + orioledb_version +------------------------- + OrioleDB pre-2 beta 16 +(1 row) + +postgres=# CREATE TABLE oriole_test (a int) USING orioledb; +CREATE TABLE +postgres=# INSERT INTO oriole_test VALUES (1), (2); +INSERT 0 2 +postgres=# SELECT * FROM oriole_test; + a +--- + 1 + 2 +(2 rows) + +postgres=# VACUUM ANALYZE oriole_test; +VACUUM + +postgres=# \d+ oriole_test + Table "public.oriole_test" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+---------+-----------+----------+---------+---------+-------------+--------------+------------- + a | integer | | | | plain | | | +Access method: orioledb + + +postgres=# \d+ + List of relations + Schema | Name | Type | Owner | Persistence | Access method | Size | Description +--------+----------------------+-------+----------+-------------+---------------+------------+------------- + public | oriole_test | table | postgres | permanent | orioledb | 8192 bytes | + public | orioledb_index | view | postgres | permanent | | 0 
bytes | + public | orioledb_index_descr | view | postgres | permanent | | 0 bytes | + public | orioledb_table | view | postgres | permanent | | 0 bytes | + public | orioledb_table_descr | view | postgres | permanent | | 0 bytes | +(5 rows) + +postgres=# \dx + List of installed extensions + Name | Version | Schema | Description +----------+---------+------------+------------------------------------------------------ + orioledb | 1.2 | public | OrioleDB -- the next generation transactional engine + plpgsql | 1.0 | pg_catalog | PL/pgSQL procedural language +(2 rows) + +postgres=# \dx+ orioledb + Objects in extension "orioledb" + Object description +------------------------------------------------------------------------- + access method orioledb + function orioledb_commit_hash() + function orioledb_compression_max_level() + function orioledb_evict_pages(oid,integer) + function orioledb_get_evicted_trees() + function orioledb_get_index_descrs() + function orioledb_get_table_descrs() + function orioledb_has_retained_undo() + function orioledb_idx_structure(oid,text,character varying,integer) + function orioledb_index_description(oid,oid,oid,text) + function orioledb_index_oids() + function orioledb_index_rows(oid) + function orioledb_page_stats() + function orioledb_parallel_debug_start() + function orioledb_parallel_debug_stop() + function orioledb_recovery_synchronized() + function orioledb_relation_size(oid) + function orioledb_sys_tree_check(integer,boolean) + function orioledb_sys_tree_rows(integer) + function orioledb_sys_tree_structure(integer,character varying,integer) + function orioledb_table_description(oid) + function orioledb_table_description(oid,oid,oid) + function orioledb_table_oids() + function orioledb_table_pages(oid) + function orioledb_tableam_handler(internal) + function orioledb_tbl_are_indices_equal(regclass,regclass) + function orioledb_tbl_bin_structure(oid,boolean,integer) + function orioledb_tbl_check(oid,boolean) + function 
orioledb_tbl_compression_check(bigint,oid,integer[]) + function orioledb_tbl_indices(oid) + function orioledb_tbl_structure(oid,character varying,integer) + function orioledb_ucm_check() + function orioledb_version() + function orioledb_write_pages(oid) + function pg_stopevent_reset(text) + function pg_stopevent_set(text,jsonpath) + function pg_stopevents() + function s3_get(text) + function s3_put(text,text) + type orioledb_index + type orioledb_index[] + type orioledb_index_descr + type orioledb_index_descr[] + type orioledb_table + type orioledb_table[] + type orioledb_table_descr + type orioledb_table_descr[] + view orioledb_index + view orioledb_index_descr + view orioledb_table + view orioledb_table_descr +(51 rows) +``` + +Quit from the database: `\q` + +Stop the server: + +- `docker stop orioletest17` + +Remove container: + +- `docker container rm orioletest17` + +Remove docker image: + +- `docker rmi orioletest:17` + +Remove the data volume: + +- `docker volume rm orioletest17data` + +## Building Docker Images + +To build a Docker image, use one of the following commands: + +#### To build PostgreSQL 17 + OrioleDB extension + +``` +docker build -t orioletest:17 -f docker/Dockerfile --pull --network=host --progress=plain --build-arg PG_MAJOR="17" . +``` + +#### To build PostgreSQL 16 + OrioleDB extension + +``` +docker build -t orioletest:16 -f docker/Dockerfile --pull --network=host --progress=plain --build-arg PG_MAJOR="16" . 
+``` + +## Supported environment variables + +This project aims to maintain compatibility with the Docker Official PostgreSQL image, and therefore, it also supports the environmental variables found there: + +- `POSTGRES_PASSWORD` +- `POSTGRES_USER` +- `POSTGRES_DB` +- `POSTGRES_INITDB_ARGS` +- `POSTGRES_INITDB_WALDIR` +- `POSTGRES_HOST_AUTH_METHOD` +- `PGDATA` + +Read more: https://github.com/docker-library/docs/blob/master/postgres/README.md + +## Available Docker build args + +Please check the Dockerfiles for the full list of build args! +- Alpine Linux: `./Dockerfile` + - supported [ `edge 3.21 3.20 3.19 3.18` ] + - example: `--build-arg ALPINE_VERSION="3.21" -f docker/Dockerfile ` +- Ubuntu Linux: `./Dockerfile.ubuntu` + - supported [ `devel 25.04 24.10 24.04 22.04 plucky oracular noble jammy` ] + - example: `--build-arg UBUNTU_VERSION="24.04" -f docker/Dockerfile.ubuntu ` + +Other important build args: +- `--build-arg PG_MAJOR="17"` + - Choose the main version of PostgreSQL. Default is `17`. + - You can choose from `16`, `17`. +- `--build-arg BUILD_CC_COMPILER="gcc"` + - Choose the C compiler. Default is `clang`. + - You can choose either `clang` or `gcc`. + +For example, to build an image using Alpine version `3.21`, the `gcc` compiler and PostgreSQL version `16`, use the following command: + +```bash +docker build --pull --network=host --progress=plain \ + --build-arg ALPINE_VERSION="3.21" \ + --build-arg BUILD_CC_COMPILER="gcc" \ + --build-arg PG_MAJOR="16" \ + -f docker/Dockerfile \ + -t orioletest:16-gcc-alpine3.21 . +``` + +To build an image using Ubuntu version `devel`, the `clang` compiler and PostgreSQL version `17`, use the following command: + +```bash +docker build --pull --network=host --progress=plain \ + --build-arg UBUNTU_VERSION="devel" \ + --build-arg BUILD_CC_COMPILER="clang" \ + --build-arg PG_MAJOR="17" \ + -f docker/Dockerfile.ubuntu \ + -t orioletest:17-clang-ubuntu-devel . 
+``` +The "devel" version is the latest development Ubuntu version, so it might not be stable. + +## Experimental OrioleDB + PostGIS Extension build + +Known limitations: +- OrioleDB `gist`, `sp-gist`, and other related indexes are not yet supported. + +#### Step 1: create image: `orioletest:17-gcc-alpine3.21` + +```bash +docker build --pull --network=host --progress=plain \ + --build-arg ALPINE_VERSION="3.21" \ + --build-arg BUILD_CC_COMPILER="gcc" \ + --build-arg PG_MAJOR="17" \ + -f docker/Dockerfile \ + -t orioletest:17-gcc-alpine3.21 . +``` + +#### Step 2: Build the `oriolegis:17-3.5-alpine` image. + +In a new directory, run these commands: + +```bash +git clone --depth=1 https://github.com/postgis/docker-postgis.git +cd ./docker-postgis/17-3.5/alpine +docker build --network=host --progress=plain \ + --build-arg BASE_IMAGE=orioletest:17-gcc-alpine3.21 \ + -t oriolegis:17-3.5-alpine . +``` + +## Developer notes: + +To build all Docker image variations on a local machine, run the following command: +- `./ci/local_docker_matrix.sh` +- or (experimental) `./ci/docker_matrix.sh --help` + +##### Ubuntu +- Supported base images (with security updates): + - https://hub.docker.com/_/ubuntu +- Supported base image architectures: + - [ amd64, arm32v7, arm64v8, ppc64le, riscv64, s390x] + +##### Alpine +- Supported base images (with security updates): + - https://hub.docker.com/_/alpine +- Supported base image architectures: + - [ amd64, arm32v6, arm32v7, arm64v8, i386, ppc64le, riscv64, s390x ] + +#### macOS +On macOS you might need to install `bash` and `gnu-getopt` from Homebrew. 
To install them run the command: + +```bash +brew install bash gnu-getopt +``` + +Update your `/etc/shells`: + +```bash +echo /opt/homebrew/bin/bash >> /etc/shells +``` + +You may need to update your `PATH` variable in the `.bashrc` or `.zshrc` file: + +``` +PATH=/opt/homebrew/bin:/opt/homebrew/opt/gnu-getopt/bin:$PATH +``` + +##### Other: + +- Testing: If you can test on architectures other than `amd64`, please let us know! + +- Some QEMU versions can't emulate PostgreSQL JIT. In this case, use `jit=off`. + +- Security: Note that ports which are not bound to the host (i.e., `-p 5432:5432` instead of `-p 127.0.0.1:5432:5432`) will be accessible from the outside. This also applies if you configured UFW to block this specific port, as Docker manages its own iptables rules. ( [Read More](https://docs.docker.com/network/iptables/) ). With a simple password and open ports, you can be infected by [crypto miners]( https://github.com/docker-library/postgres/issues/770#issuecomment-704460980 ) ! + +- Windows: If you encounter any problems, please use Windows Subsystem for Linux (WSL2). + +- Extending the current OrioleDB Docker images is not easy; you can't use Ubuntu PostgreSQL packages (like: `postgresql-16-mobilitydb`) - you need to build from source. diff --git a/contrib/orioledb/doc/contributing/local-builds.mdx b/contrib/orioledb/doc/contributing/local-builds.mdx new file mode 100644 index 00000000000..a18d799ba9a --- /dev/null +++ b/contrib/orioledb/doc/contributing/local-builds.mdx @@ -0,0 +1,338 @@ +--- +id: local-builds +sidebar_label: Building from source +--- + +# OrioleDB development quickstart + +This guide will help you to build and run OrioleDB on your local machine from the source code. 
+ +## Linux + +### Install prerequisites + +```bash +sudo apt-get update +sudo apt install git build-essential flex bison pkg-config libreadline-dev make gdb libipc-run-perl libicu-dev python3 python3-dev python3-pip python3-setuptools python3-testresources libzstd1 libzstd-dev valgrind libssl-dev libcurl4-openssl-dev wget +``` + +### Download and install PostgreSQL 17 with patches + +```bash +git clone https://github.com/orioledb/postgres.git --branch patches17 --single-branch postgres-patches17 +cd postgres-patches17/ +``` + +### Checkout to required patch tag: + +Check required postgres patch version in [.pgtags](https://github.com/orioledb/orioledb/blob/main/.pgtags) or [README.md](https://github.com/orioledb/orioledb?tab=readme-ov-file#build-from-source) files. Because documentation can become outdated. + +```bash +git checkout patches17_6 +``` + +### Enable Valgrind support in PostgreSQL code (optional) + +```bash +sed -i.bak "s/\/\* #define USE_VALGRIND \*\//#define USE_VALGRIND/g" src/include/pg_config_manual.h +``` + +### Configure and build + +```bash +PG_PREFIX=$HOME/pg17 +./configure --enable-debug --enable-cassert --enable-tap-tests --with-icu --prefix=$PG_PREFIX +make -j$(nproc) +make -j$(nproc) install +make -C contrib -j$(nproc) +make -C contrib -j$(nproc) install +echo "export PATH=\"$PG_PREFIX/bin:\$PATH\"" >> ~/.bashrc +source ~/.bashrc +``` + +### Install python requirements + +```bash +# create venv if needed +python -m venv venv +# activate venv if needed +source ./venv/bin/activate +pip3 install -r requirements.txt +sudo pip3 install compiledb +``` + +### Download and build the OrioleDB extension + +```bash +cd .. 
+git clone https://github.com/orioledb/orioledb.git +cd orioledb +# Build with compiledb, because it creates compile_commands.json needed for VSCode C/C++ extension +compiledb make USE_PGXS=1 IS_DEV=1 +# Exclude compile_commands.json from the Git tracking +echo "compile_commands.json" >> .git/info/exclude +``` + +### Download and install Visual Studio Code + +```bash +cd .. +wget --content-disposition "https://code.visualstudio.com/sha/download?build=stable&os=linux-deb-x64" +sudo apt install ./code_*.deb +# Install Python and C++ extension +code --install-extension ms-python.python +code --install-extension ms-vscode.cpptools +code orioledb +``` + +### Check installation + +#### Run automated tests + +```bash +cd orioledb +make USE_PGXS=1 IS_DEV=1 installcheck +``` + +#### Manual installation and running + +```bash +cd orioledb +make USE_PGXS=1 IS_DEV=1 install +initdb --no-locale -D $HOME/pgdata +sed -i 's/#shared_preload_libraries = '\'''\''/shared_preload_libraries = '\''orioledb'\''/' $HOME/pgdata/postgresql.conf +pg_ctl -D $HOME/pgdata/ start -l $HOME/log +psql -c "CREATE EXTENSION IF NOT EXISTS orioledb; SELECT orioledb_commit_hash();" -d postgres +``` + +# MacOS + +### Disable System Integrity Protection + +Follow [the instruction to disable System Integrity Protection](http://osxdaily.com/2015/10/05/disable-rootless-system-integrity-protection-mac-os-x/). 
+ +### Install Homebrew + +``` +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" +``` + +### Install prerequisites + +```bash +brew install python zstd pkg-config icu4c openssl wget gnu-sed +sudo cpan IPC::Run +echo "export PKG_CONFIG_PATH=\"\$PKG_CONFIG_PATH:$(brew --prefix icu4c)/lib/pkgconfig\"" >> ~/.zshrc + +# For Intel chip devices +echo "export CFLAGS=\"$CFLAGS -I/usr/local/include\"" >> ~/.zshrc +echo "export LDFLAGS=\"$LDFLAGS -L/usr/local/lib\"" >> ~/.zshrc +# + +# For Apple M chip devices +echo "export CFLAGS=\"$CFLAGS -I/opt/homebrew/include\"" >> ~/.zshrc +echo "export LDFLAGS=\"$LDFLAGS -L/opt/homebrew/lib\"" >> ~/.zshrc +# + +exec zsh -l +``` + +### Download and install PostgreSQL 17 with patches + +```bash +git clone https://github.com/orioledb/postgres.git --branch patches17 --single-branch postgres-patches17 +cd postgres-patches17/ +``` + +### Checkout to required patch tag: + +Check required postgres patch version in [.pgtags](https://github.com/orioledb/orioledb/blob/main/.pgtags) or [README.md](https://github.com/orioledb/orioledb?tab=readme-ov-file#build-from-source) files. Because documentation can become outdated. + +```bash +git checkout patches17_6 +``` + +### Configure and build + +```bash +PG_PREFIX=$HOME/pg17 +./configure --enable-debug --enable-cassert --enable-tap-tests --with-icu --prefix=$PG_PREFIX +make -j$(nproc) +make -j$(nproc) install +make -C contrib -j$(nproc) +make -C contrib -j$(nproc) install +echo "export PATH=\"$PG_PREFIX/bin:\$PATH\"" >> ~/.zshrc +exec zsh -l +``` + +### Install python requirements + +```bash +# create venv if needed +python -m venv venv +# activate venv if needed +source ./venv/bin/activate +pip3 install -r requirements.txt +sudo pip3 install compiledb +``` + +### Download and build the OrioleDB extension + +```bash +cd .. 
+git clone https://github.com/orioledb/orioledb.git +cd orioledb +# Build with compiledb, because it creates compile_commands.json needed for VSCode C/C++ extension +compiledb make USE_PGXS=1 IS_DEV=1 +# Exclude compile_commands.json from the Git tracking +echo "compile_commands.json" >> .git/info/exclude +``` + +### Download and install Visual Studio Code + +```bash +cd .. +brew install --cask visual-studio-code +exec zsh -l +# Install Python and C++ extension +code --install-extension ms-python.python +code --install-extension ms-vscode.cpptools +code orioledb +``` + +### Check installation + +#### Run automated tests + +```bash +cd orioledb +make USE_PGXS=1 IS_DEV=1 installcheck +``` + +#### Manual installation and running + +```bash +cd orioledb +make USE_PGXS=1 IS_DEV=1 install +initdb --no-locale -D $HOME/pgdata +gsed -i 's/#shared_preload_libraries = '\'''\''/shared_preload_libraries = '\''orioledb'\''/' $HOME/pgdata/postgresql.conf +pg_ctl -D $HOME/pgdata/ start -l $HOME/log +psql -c "CREATE EXTENSION IF NOT EXISTS orioledb; SELECT orioledb_commit_hash();" -d postgres +``` + +## Windows + +### Install ubuntu in WSL + +```bat +wsl --install -d Ubuntu +``` + +Then reboot, start Ubuntu from start menu, and choose login/password. + +```bat +wsl --shutdown +``` + +Start Ubuntu from start menu again. 
+ +### Install prerequisites + +```bash +sudo hwclock --hctosys +sudo apt-get update +sudo apt install git build-essential flex bison pkg-config libreadline-dev make gdb libipc-run-perl libicu-dev python3 python3-dev python3-pip python3-setuptools python3-testresources libzstd1 libzstd-dev valgrind libssl-dev libcurl4-openssl-dev wget +``` + +### Download and install PostgreSQL 17 with patches + +```bash +git clone https://github.com/orioledb/postgres.git --branch patches17 --single-branch postgres-patches17 +cd postgres-patches17/ +``` + +### Checkout to required patch tag: + +Check required postgres patch version in [.pgtags](https://github.com/orioledb/orioledb/blob/main/.pgtags) or [README.md](https://github.com/orioledb/orioledb?tab=readme-ov-file#build-from-source) files. Because documentation can become outdated. + +```bash +git checkout patches17_6 +``` + +### Enable Valgrind support in PostgreSQL code (optional) + +```bash +sed -i.bak "s/\/\* #define USE_VALGRIND \*\//#define USE_VALGRIND/g" src/include/pg_config_manual.h +``` + +### Configure and build + +```bash +PG_PREFIX=$HOME/pg17 +./configure --enable-debug --enable-cassert --enable-tap-tests --with-icu --prefix=$PG_PREFIX +make -j$(nproc) +make -j$(nproc) install +make -C contrib -j$(nproc) +make -C contrib -j$(nproc) install +echo "export PATH=\"$PG_PREFIX/bin:\$PATH\"" >> ~/.bashrc +source ~/.bashrc +``` + +### Install python requirements + +```bash +# create venv if needed +python -m venv venv +# activate venv if needed +source ./venv/bin/activate +pip3 install -r requirements.txt +sudo pip3 install compiledb +``` + +### Download and build the OrioleDB extension + +```bash +cd .. 
+git clone https://github.com/orioledb/orioledb.git +cd orioledb +# Build with compiledb, because it creates compile_commands.json needed for VSCode C/C++ extension +compiledb make USE_PGXS=1 IS_DEV=1 +# Exclude compile_commands.json from the Git tracking +echo "compile_commands.json" >> .git/info/exclude +``` + +### Download and install Visual Studio Code + +https://code.visualstudio.com/sha/download?build=stable&os=win32-x64-user + +### Install Python and C++ VSCode extensions + +```bat +code --install-extension ms-vscode-remote.remote-wsl +code --remote wsl+ubuntu /home/USERNAME/orioledb +``` + +In VSCode terminal: + +```bash +code --install-extension ms-python.python +code --install-extension ms-vscode.cpptools +``` + +### Check installation + +#### Run automated tests + +```bash +make USE_PGXS=1 IS_DEV=1 installcheck +``` + +#### Manual installation and running + +```bash +make USE_PGXS=1 IS_DEV=1 install +initdb --no-locale -D $HOME/pgdata +sed -i 's/#shared_preload_libraries = '\'''\''/shared_preload_libraries = '\''orioledb'\''/' $HOME/pgdata/postgresql.conf +pg_ctl -D $HOME/pgdata/ start -l $HOME/log +psql -c "CREATE EXTENSION IF NOT EXISTS orioledb; SELECT orioledb_commit_hash();" -d postgres +``` diff --git a/contrib/orioledb/doc/contributing/structure.mdx b/contrib/orioledb/doc/contributing/structure.mdx new file mode 100644 index 00000000000..e5dcf8b9072 --- /dev/null +++ b/contrib/orioledb/doc/contributing/structure.mdx @@ -0,0 +1,240 @@ +--- +id: code-structure +sidebar_label: Code Structure +--- + +# OrioleDB Source Code Structure + +The OrioleDB source code structure comprises various components, including +workflows for CI/CD, documentation, tests, and C-source files. It is structured +to facilitate development, testing (including under Valgrind), and deployment of +the OrioleDB extension for PostgreSQL. A heavy focus is placed on concurrency +testing and performance analysis through stop events and CI workflows. 
+ +## File structure + +```sql +orioledb +|- .github +| |- workflows +| | |- check.yml -- build & test each commit +| | |- docker.yml -- build docker images for each release +| | |- dockertest.yml -- tests the docker images +| | |- pgindent.yml -- checks C-source code formatting +| | |- rpm.yml -- build CentOS 7 packages on demand +| | |- static.yml -- runs static analysis +| |- FUNDING.yml +| +|- ci -- scripts for building, testing, and CI automation +|- doc -- documentation +|- docker +| |- tests -- Docker images tests +| |- Dockerfile -- Alpine based Dockerfile used within docker.yml and dockertest.yml workflows +| |- Dockerfile.ubuntu -- Ubuntu based Dockerfile used within docker.yml and dockertest.yml workflows +| |- README.md -- Docker images tests documentation +| |- docker-entrypoint.sh -- entrypoint file for Docker images +| |- orioledb-config.sh -- Docker images tests configuration +| |- postgresql.docker.conf -- configuration file used by Docker images +|- include -- C-headers of extension +|- sql -- installation scripts with definitions of the extension's SQL-level objects +|- src -- C-sources of extension +|- test +| |- expected -- expected output for regression and isolation tests +| |- specs -- isolation tests +| |- sql -- regression tests +| |- t -- python tests +| |- orioledb_isolation.conf -- configuration file used during isolation tests +| |- orioledb_regression.conf -- configuration file used during regression tests +|- LICENSE -- defines PostgreSQL-licence for the project +|- Makefile -- defines make targets +|- README.md -- main documentation entrypoint +|- orioledb.control -- extension control file +|- stopevents.txt -- list of "stop event" +|- stopevents_gen.py -- generates include/utils/stopevents_(defs|data).h from stopevents.txt +|- typedefs_gen.py -- generates list of C-symbols in orioledb.so to orioledb.typedefs +|- valgrind.supp -- suppression rules for valgrind checks + +``` + +## Makefile targets + +All test-related targets accept 
the `VALGRIND=1` argument, which runs the tests +under [valgrind](https://valgrind.org/). Valgrind makes tests about ~100 times +slower but catches uninitialized memory access. Another positive side-effect of +Valgrind's super-slow test runs is the altered timing, which can expose various +types of concurrency errors. + +- `regresscheck` -- run SQL-tests (see below). +- `isolationcheck` -- run isolation tests (see below). +- `testgrescheck` -- run testgres tests (see below). +- `testgrescheck_part_1` -- first half of testgres checks. Testgres tests are + split into two nearly equal halves to avoid overly long individual CI runs. +- `testgrescheck_part_2` -- second half of testgres checks. +- `installcheck` -- run all types of tests when installed using the PostgreSQL + extension system (`USE_PGXS=1`). +- `check` -- run all types of tests when installed from the `contrib` folder of + the PostgreSQL source code. +- `pgindent` -- automatically indents OrioleDB sources. The + [pgindent](https://github.com/postgres/postgres/blob/master/src/tools/pgindent/pgindent) + tool should be available in `$PATH`. Note that you need to install + [pg_bsd_indent](https://github.com/postgres/postgres/blob/master/src/tools/pg_bsd_indent/README#L17) + first. Also, you need GNU Objdump available in `$PATH` as `objdump` or + `gobjdump`, or specified via the `OBJDUMP` environment variable. + +## Extension SQL scripts + +OrioleDB extension files are organized using a base version installation script, +followed by upgrade scripts for respective minor versions. For instance, when +installing version `1.7`, PostgreSQL first applies the `1.0` script and then +sequentially runs all upgrade scripts from `1.0` up to `1.7`. + +To bump the extension version (for example, bumping from `1.6` to `1.7`), follow +these steps: + +1. Edit the `default_version` parameter in the `orioledb.control` file to match + the new version. +2. 
Manually create development and production upgrade scripts in the `sql` + directory: + - `./sql/orioledb--1.6--1.7_prod.sql` + - `./sql/orioledb--1.6--1.7_dev.sql` +3. Add the corresponding headers to the scripts and populate the changes: + - Changes intended **only** for the development environment (such as test + functions that should not be exposed on production systems) go into + `_dev.sql`. + - All standard changes go into the `_prod.sql` file. + +:::note +The build system (`make`) utilizes both files to automatically generate the +final `./sql/orioledb--1.6--1.7.sql` script. Therefore, you do not need to +create this final file manually. The contents of `_dev.sql` will only be +included when the `IS_DEV` flag is specified during the build. There is no need +to duplicate changes between the two files. +::: +4. Add the generated file (`./sql/orioledb--1.6--1.7.sql`) to both `.gitignore` + and `.dockerignore`. + +## Tests + +OrioleDB has 3 groups of tests described below. + +- SQL tests are located in the `test/sql` folder. These are the simplest types of + tests. The SQL file is passed to `psql` and the result is compared to the + reference output in the `test/expected` folder. Note that there might be multiple + reference outputs for one input file. For instance, `collate.sql` contains + collation-aware tests. The result may match `collate.out`, `collate_1.out`, or + `collate_2.out` depending on database encoding and the presence of libicu. + +- Isolation tests are located in the `specs` folder. These tests simulate + multiple connections running simultaneously. See the + [README](https://github.com/postgres/postgres/blob/master/src/test/isolation/README) + in the PostgreSQL source tree. These tests are especially powerful in + conjunction with stop events. + +- Python [testgres](https://pypi.org/project/testgres/) tests are located in + `t`. These are the most powerful and complex tests. 
In addition to the + ability to simulate multiple simultaneous connections, they can perform + actions with the whole PostgreSQL instance such as start, stop, backup, + replication, etc. See the [testgres + docs](https://postgrespro.github.io/testgres/) for details. + +:::note +For proper integration into the test suite, the test file names should +end with `_test.py`. +::: + +## CI + +OrioleDB uses GitHub CI. The CI workflows are described below. + +### `check.yml` + +This workflow runs the following tests for each of the two compilers (gcc and +clang) and each of the supported PostgreSQL major versions (16 and 17). + +- `normal` -- run tests without asserts and without debug symbols. +- `debug` -- run tests with asserts and with debug symbols. +- `sanitize` -- run tests with asserts, with debug symbols, with alignment, and + other sanitizers. This replaces running tests on strict alignment + architectures, providing even somewhat stricter checks (for instance, it traps + you on accessing a properly-aligned member of an improperly aligned structure, + which real hardware wouldn't do). +- `check_page` -- runs tests with asserts, with debug symbols, and with the + `CHECK_PAGE_STRUCT` macro enabled. This macro provides the page structure + check on every page unlock. +- `valgrind_1` -- runs `regresscheck`, `isolationcheck`, and + `testgrescheck_part_1` under valgrind with asserts and with debug symbols. +- `valgrind_2` -- runs `testgrescheck_part_2` under Valgrind with asserts and + with debug symbols. +- `static` -- runs `clang-analyzer` or `cppcheck` over sources. + +### `docker.yml` + +This workflow builds docker images for amd64 and arm64v8 architectures under +Alpine and Ubuntu Linux. This Dockerfile is a slightly adjusted [PostgreSQL +Dockerfile](https://github.com/docker-library/postgres). See [our +dockerhub](https://hub.docker.com/r/orioledb/orioledb) for details. + +### `dockertest.yml` + +This workflow tests the Docker images. 
It runs on every push and pull request. + +### `pgindent.yml` + +This workflow checks the code formatting using `pgindent`. It runs on every push +and pull request. + +### `rpm.yml` + +This workflow builds RPM packages for CentOS 7 using the specification from the +[orioledb/pgrpms](https://github.com/orioledb/pgrpms) repository, a fork of the +[pgrpms](https://git.postgresql.org/gitweb/?p=pgrpms.git;a=summary) repository. + +### `static.yml` + +This workflow runs static analysis on the code. It runs on every push and pull +request. + +## Stop events + +Stop events are special places in the code where execution can be paused based +on specific conditions. Stop events are used for the reliable reproduction of +concurrency issues. OrioleDB isolation and testgres tests use stop events. + +A stop event exposes a set of parameters, encapsulated into a jsonb value. The +conditions on stop event parameters are defined using the SQL/JSON path +language. + +The SQL-level functions and variables for stop events manipulation are listed +below. + +- `orioledb.enable_stopevents` -- enables stop events checking for the process. + Stop events checking is expensive and significantly affects performance. This + is why stop events are disabled by default. + +- `orioledb.trace_stopevents` -- enables logging of all stop events. Disabled by + default. + +- `pg_stopevent_set(eventname text, condition jsonpath) RETURNS void` -- set the + condition for the stop event. Once the function is executed, all processes + that run a given stop event with parameters satisfying the given jsonpath + condition will be stopped. + +- `pg_stopevent_reset(eventname text) RETURNS bool` -- reset the stop event. All + processes previously stopped on the given stop event will continue execution. + +- `pg_stopevents(OUT stopevent text, OUT condition jsonpath, OUT waiter_pids int[]) RETURNS SETOF record` -- + returns all the stop events currently set with their conditions and + waiter process PIDs. 
+ +At the C level, the following macros are provided for managing stop events. + +- `STOPEVENTS_ENABLED()` -- checks if stop events are enabled. +- `STOPEVENT(event_id, params)` -- raises the given stop event with the given + jsonb parameters. +- `STOPEVENT_CONDITION(event_id, params)` -- checks the stop event condition + without stopping the execution. Used for error simulation. + +The list of stop events is defined in the `stopevents.txt` file. The +`stopevents_gen.py` script generates `include/utils/stopevents_defs.h` (macros) +and `include/utils/stopevents_data.h` (name strings) files with C definitions of +the stop events list. diff --git a/contrib/orioledb/doc/images/block_level_undo.svg b/contrib/orioledb/doc/images/block_level_undo.svg new file mode 100644 index 00000000000..132fdf65d5c --- /dev/null +++ b/contrib/orioledb/doc/images/block_level_undo.svg @@ -0,0 +1,3 @@ + + +
t1
t1
t2
t2
t5
t5
Data page
Data page
t1
t1
t2
t2
t3
t3
t4
t4
Undo log
Undo log
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/checkpoint_concurrent_1.svg b/contrib/orioledb/doc/images/checkpoint_concurrent_1.svg new file mode 100644 index 00000000000..ee8a648745f --- /dev/null +++ b/contrib/orioledb/doc/images/checkpoint_concurrent_1.svg @@ -0,0 +1,3 @@ + + +
1
1
2
2
3
3
4*
4*
5
5
6
6
7
7
checkpoint 1
checkpoint 1
Main memory
Main memory
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/checkpoint_concurrent_2.svg b/contrib/orioledb/doc/images/checkpoint_concurrent_2.svg new file mode 100644 index 00000000000..776d2213358 --- /dev/null +++ b/contrib/orioledb/doc/images/checkpoint_concurrent_2.svg @@ -0,0 +1,3 @@ + + +
1
1
2*
2*
3
3
4*
4*
5
5
6
6
7
7
checkpoint 1
checkpoint 1
checkpoint 2
checkpoint 2
checkpoint boundary
checkpoint boundary
4*
4*
2*
2*
Main memory
Main memory
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/checkpoint_concurrent_3.svg b/contrib/orioledb/doc/images/checkpoint_concurrent_3.svg new file mode 100644 index 00000000000..e321104a47a --- /dev/null +++ b/contrib/orioledb/doc/images/checkpoint_concurrent_3.svg @@ -0,0 +1,3 @@ + + +
1
1
2*
2*
3
3
5*
5*
6
6
7
7
checkpoint 1
checkpoint 1
checkpoint 2
checkpoint 2
4*
4*
2*
2*
7*
7*
checkpoint 3
checkpoint 3
checkpoint boundary
checkpoint boundary
5*
5*
Main memory
Main memory
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/checkpoint_concurrent_4.svg b/contrib/orioledb/doc/images/checkpoint_concurrent_4.svg new file mode 100644 index 00000000000..f2467a7f421 --- /dev/null +++ b/contrib/orioledb/doc/images/checkpoint_concurrent_4.svg @@ -0,0 +1,3 @@ + + +
1*
1*
2*
2*
3*
3*
5*
5*
6
6
7
7
checkpoint 1
checkpoint 1
checkpoint 2
checkpoint 2
4*
4*
2*
2*
7*
7*
checkpoint 3
checkpoint 3
checkpoint boundary
checkpoint boundary
5*
5*
1*
1*
3*
3*
7*
7*
Main memory
Main memory
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/checkpoint_state_1.svg b/contrib/orioledb/doc/images/checkpoint_state_1.svg new file mode 100644 index 00000000000..54372459c7e --- /dev/null +++ b/contrib/orioledb/doc/images/checkpoint_state_1.svg @@ -0,0 +1,4 @@ + + + +
n4
n4
hi
hi
n5
n5
hi
hi
n6
n6
hi
hi
 n7*
 n7*
hi
hi
 n8
 n8
hi
hi
l2
l2
l3
l3
n1
n1
hi
hi
l4
l4
l5
l5
hi
hi
n2
n2
l6
l6
l7
l7
l8
l8
hi
hi
n3
n3
Checkpoint state
Checkpoint state
l2
l2
nl
nl
l6
l6
nl
nl
hi
hi
hi
hi
1
1
2
2
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/checkpoint_state_2.svg b/contrib/orioledb/doc/images/checkpoint_state_2.svg new file mode 100644 index 00000000000..e9781c7a51c --- /dev/null +++ b/contrib/orioledb/doc/images/checkpoint_state_2.svg @@ -0,0 +1,4 @@ + + + +
n4
n4
hi
hi
n5
n5
hi
hi
 n67
 n67
hi
hi
 n8
 n8
hi
hi
 n9*
 n9*
hi
hi
l2
l2
l3
l3
n1
n1
hi
hi
l4
l4
l5
l5
hi
hi
n2
n2
l67
l67
l8
l8
l9
l9
hi
hi
n3
n3
Checkpoint state
Checkpoint state
l2
l2
nl
nl
l6
l6
l7
l7
hi
hi
hi
hi
1
1
2
2
l8
l8
nl
nl
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/checkpoint_walk.svg b/contrib/orioledb/doc/images/checkpoint_walk.svg new file mode 100644 index 00000000000..e98b97361c3 --- /dev/null +++ b/contrib/orioledb/doc/images/checkpoint_walk.svg @@ -0,0 +1,4 @@ + + + +
l1
l1
l2
l2
l3
l3
l4
l4
n2
n2
n3
n3
n4
n4
2. Down
2. Down
3. Up
3. Up
4. Down
4. Down
5. Up
5. Up
6. Continue
6. Contin...
7. Down
7. Down
8. Up
8. Up
n1
n1
1. Down
1. Down
9. Up
9. Up
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/cow_1.svg b/contrib/orioledb/doc/images/cow_1.svg new file mode 100644 index 00000000000..b7807196c55 --- /dev/null +++ b/contrib/orioledb/doc/images/cow_1.svg @@ -0,0 +1,3 @@ + + +
1
1
2
2
3
3
5
5
7*
7*
1
1
2
2
3
3
4
4
5
5
6
6
7
7
Main memory
Main memory
Storage
Storage
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/cow_2.svg b/contrib/orioledb/doc/images/cow_2.svg new file mode 100644 index 00000000000..76233d9de5a --- /dev/null +++ b/contrib/orioledb/doc/images/cow_2.svg @@ -0,0 +1,3 @@ + + +
1*
1*
2
2
3*
3*
5
5
7*
7*
1
1
2
2
3
3
4
4
5
5
6
6
7
7
7*
7*
3*
3*
1*
1*
Main memory
Main memory
Storage
Storage
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/cow_3.svg b/contrib/orioledb/doc/images/cow_3.svg new file mode 100644 index 00000000000..41b1fb36571 --- /dev/null +++ b/contrib/orioledb/doc/images/cow_3.svg @@ -0,0 +1,3 @@ + + +
1*
1*
2
2
3*
3*
5
5
7*
7*
2
2
4
4
5
5
6
6
7*
7*
3*
3*
1*
1*
Main memory
Main memory
Storage
Storage
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/dual_pointers.svg b/contrib/orioledb/doc/images/dual_pointers.svg new file mode 100644 index 00000000000..36c3c946728 --- /dev/null +++ b/contrib/orioledb/doc/images/dual_pointers.svg @@ -0,0 +1,4 @@ + + + +
1
1
2
2
3
3
5
5
7
7
1
1
2
2
3
3
4
4
5
5
6
6
7
7
Storage
Storage
Main memory
Main memory
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/evict_step_1.svg b/contrib/orioledb/doc/images/evict_step_1.svg new file mode 100644 index 00000000000..e9552718f28 --- /dev/null +++ b/contrib/orioledb/doc/images/evict_step_1.svg @@ -0,0 +1,4 @@ + + + +
m
m
1
1
2 (L)
2 (L)
Step 1
Step 1
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/evict_step_2.svg b/contrib/orioledb/doc/images/evict_step_2.svg new file mode 100644 index 00000000000..631fda7256a --- /dev/null +++ b/contrib/orioledb/doc/images/evict_step_2.svg @@ -0,0 +1,4 @@ + + + +
m
m
1 (L)
1 (L)
2 (L)
2 (L)
Step 2
Step 2
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/evict_step_3.svg b/contrib/orioledb/doc/images/evict_step_3.svg new file mode 100644 index 00000000000..0ba800d88bf --- /dev/null +++ b/contrib/orioledb/doc/images/evict_step_3.svg @@ -0,0 +1,4 @@ + + + +
io
io
1
1
Step 3
Step 3
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/evict_step_4.svg b/contrib/orioledb/doc/images/evict_step_4.svg new file mode 100644 index 00000000000..e3c924986df --- /dev/null +++ b/contrib/orioledb/doc/images/evict_step_4.svg @@ -0,0 +1,4 @@ + + + +
d
d
1
1
Step 4
Step 4
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/fsm_1.svg b/contrib/orioledb/doc/images/fsm_1.svg new file mode 100644 index 00000000000..242a5acf241 --- /dev/null +++ b/contrib/orioledb/doc/images/fsm_1.svg @@ -0,0 +1,4 @@ + + + +
1
1
2
2
3
3
4
4
5
5
6
6
7
7
Tree 1
Tree 1
chkp1
chkp1
chkp2
chkp2
tmp
tmp
map
map
tmp
tmp
map
map
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/fsm_2.svg b/contrib/orioledb/doc/images/fsm_2.svg new file mode 100644 index 00000000000..9acf35b2ea8 --- /dev/null +++ b/contrib/orioledb/doc/images/fsm_2.svg @@ -0,0 +1,4 @@ + + + +
1
1
2
2
3
3
4
4
5
5
6
6
7
7
1
1
2
2
3
3
4
4
5
5
6
6
7
7
1
1
2
2
3
3
4
4
5
5
6
6
7
7
Tree 1
Tree 1
Tree 2
Tree 2
Tree 3
Tree 3
chkp1
chkp1
chkp2
chkp2
chkp3
chkp3
tmp
tmp
map
map
tmp
tmp
map
map
tmp
tmp
map
map
tmp
tmp
map
map
tmp
tmp
map
map
tmp
tmp
map
map
tmp
tmp
map
map
tmp
tmp
map
map
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/load_step_1.svg b/contrib/orioledb/doc/images/load_step_1.svg new file mode 100644 index 00000000000..74a0fd08a63 --- /dev/null +++ b/contrib/orioledb/doc/images/load_step_1.svg @@ -0,0 +1,4 @@ + + + +
d
d
1 (L)
1 (L)
Step 1
Step 1
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/load_step_2.svg b/contrib/orioledb/doc/images/load_step_2.svg new file mode 100644 index 00000000000..c5c0ee4cd9d --- /dev/null +++ b/contrib/orioledb/doc/images/load_step_2.svg @@ -0,0 +1,4 @@ + + + +
io
io
1
1
Step 2
Step 2
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/load_step_3.svg b/contrib/orioledb/doc/images/load_step_3.svg new file mode 100644 index 00000000000..0cdcb25fea1 --- /dev/null +++ b/contrib/orioledb/doc/images/load_step_3.svg @@ -0,0 +1,4 @@ + + + +
io
io
1 (L)
1 (L)
Step 3
Step 3
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/load_step_4.svg b/contrib/orioledb/doc/images/load_step_4.svg new file mode 100644 index 00000000000..48cde8490ee --- /dev/null +++ b/contrib/orioledb/doc/images/load_step_4.svg @@ -0,0 +1,4 @@ + + + +
m
m
1
1
2
2
Step 4
Step 4
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/merge_step_1.svg b/contrib/orioledb/doc/images/merge_step_1.svg new file mode 100644 index 00000000000..d93efcca327 --- /dev/null +++ b/contrib/orioledb/doc/images/merge_step_1.svg @@ -0,0 +1,4 @@ + + + +
m
m
3 (L)
3 (L)
Step 1
Step 1
m
m
m
m
2
2
4
4
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/merge_step_2.svg b/contrib/orioledb/doc/images/merge_step_2.svg new file mode 100644 index 00000000000..68d13745647 --- /dev/null +++ b/contrib/orioledb/doc/images/merge_step_2.svg @@ -0,0 +1,4 @@ + + + +
m
m
1 (L)
1 (L)
3
3
Step 2
Step 2
m
m
m
m
2
2
4
4
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/merge_step_3.svg b/contrib/orioledb/doc/images/merge_step_3.svg new file mode 100644 index 00000000000..7b9d3118fee --- /dev/null +++ b/contrib/orioledb/doc/images/merge_step_3.svg @@ -0,0 +1,4 @@ + + + +
m
m
1 (L)
1 (L)
3 (L)
3 (L)
Step 3
Step 3
m
m
m
m
2
2
4
4
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/merge_step_4.svg b/contrib/orioledb/doc/images/merge_step_4.svg new file mode 100644 index 00000000000..5f1a7ad2bf1 --- /dev/null +++ b/contrib/orioledb/doc/images/merge_step_4.svg @@ -0,0 +1,4 @@ + + + +
m
m
1 (L)
1 (L)
3 (L)
3 (L)
Step 4
Step 4
m
m
m
m
2
2
4 (L)
4 (L)
m
m
1 (L)
1 (L)
3 (L)
3 (L)
m
m
m
m
2 (L)
2 (L)
4
4
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/merge_step_5.svg b/contrib/orioledb/doc/images/merge_step_5.svg new file mode 100644 index 00000000000..d0710dd61c2 --- /dev/null +++ b/contrib/orioledb/doc/images/merge_step_5.svg @@ -0,0 +1,4 @@ + + + +
m
m
1
1
34 
34 
Step 5
Step 5
m
m
2
2
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/page_structure.svg b/contrib/orioledb/doc/images/page_structure.svg new file mode 100644 index 00000000000..560c6b063fb --- /dev/null +++ b/contrib/orioledb/doc/images/page_structure.svg @@ -0,0 +1,3 @@ + + +
Header
Header
State
State
High keys
High keys
Chunk 1
Chunk 1
Chunk 2
Chunk 2
Chunk 3
Chunk 3
t1
t1
t2
t2
t3
t3
t4
t4
t5
t5
t1
t1
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/recovery_distribute.svg b/contrib/orioledb/doc/images/recovery_distribute.svg new file mode 100644 index 00000000000..3eb6c996815 --- /dev/null +++ b/contrib/orioledb/doc/images/recovery_distribute.svg @@ -0,0 +1,4 @@ + + + +
WAL stream
WAL stream
BEGIN
TX = 1
BEGIN...
INSERT
ID = 1
INSERT...
UPDATE
ID = 2
UPDATE...
COMMIT
TX = 1
COMMIT...
Recovery main
Recovery main
INSERT
ID = 1
TX = 1
INSERT...
UPDATE
ID = 2
TX = 1
UPDATE...
COMMIT
TX = 1
COMMIT...
COMMIT
TX = 1
COMMIT...
Queue #1
Queue #1
Queue #2
Queue #2
Queue #3
Queue #3
Queue #4
Queue #4
Recovery
worker #1
Recovery...
Recovery
worker #2
Recovery...
Recovery
worker #3
Recovery...
Recovery
worker #4
Recovery...
BEGIN
TX = 2
BEGIN...
DELETE
ID = 2
DELETE...
INSERT
ID = 3
INSERT...
ABORT
TX = 2
ABORT...
INSERT
ID = 3
TX = 2
INSERT...
ABORT
TX = 2
ABORT...
DELETE
ID = 2
TX = 2
DELETE...
ABORT
TX = 2
ABORT...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/recovery_secondary_indexes.svg b/contrib/orioledb/doc/images/recovery_secondary_indexes.svg new file mode 100644 index 00000000000..2a486fb142c --- /dev/null +++ b/contrib/orioledb/doc/images/recovery_secondary_indexes.svg @@ -0,0 +1,4 @@ + + + +
TOAST
TOAST
Primary key
Primary key
Secondary key
Secondary key
Toast consistency point
Toast consistency point
new tuple
new tuple
old tuple
old tuple
update
update
delete
delete
insert
insert
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/recovery_undo.svg b/contrib/orioledb/doc/images/recovery_undo.svg new file mode 100644 index 00000000000..8f7a7de2078 --- /dev/null +++ b/contrib/orioledb/doc/images/recovery_undo.svg @@ -0,0 +1,4 @@ + + + +
Recovery main
Recovery main
Recovery
worker #1
Recovery...
Recovery
worker #2
Recovery...
Recovery
worker #3
Recovery...
Recovery
worker #4
Recovery...
TX: 1
TX: 1
Undo
Undo
TX: 1
TX: 1
TX: 2
TX: 2
TX: 2
TX: 2
TX: 1
TX: 1
TX: 2
TX: 2
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/recovery_wokers.svg b/contrib/orioledb/doc/images/recovery_wokers.svg new file mode 100644 index 00000000000..ce3c2a0d242 --- /dev/null +++ b/contrib/orioledb/doc/images/recovery_wokers.svg @@ -0,0 +1,3 @@ + + +
WAL replay
WAL replay
Recovery
worker #1
Recovery...
Recovery
worker #2
Recovery...
Recovery
worker #N
Recovery...
.......
.......
hash(pk)
hash(pk)
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/row_level_undo.svg b/contrib/orioledb/doc/images/row_level_undo.svg new file mode 100644 index 00000000000..9c4e00529f7 --- /dev/null +++ b/contrib/orioledb/doc/images/row_level_undo.svg @@ -0,0 +1,3 @@ + + +
th1
th1
t1
t1
th1.2
th1.2
th1.3
th1.3
t1.2
t1.2
th2
th2
t2
t2
th2.2
th2.2
th2.3
th2.3
t2.2
t2.2
th2.4
th2.4
Data page
Data page
Undo log
Undo log
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/split_step_1.svg b/contrib/orioledb/doc/images/split_step_1.svg new file mode 100644 index 00000000000..ee5731f53a4 --- /dev/null +++ b/contrib/orioledb/doc/images/split_step_1.svg @@ -0,0 +1,4 @@ + + + +
1
1
2
2
Step 1
Step 1
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/split_step_2.svg b/contrib/orioledb/doc/images/split_step_2.svg new file mode 100644 index 00000000000..d62ba7e2e32 --- /dev/null +++ b/contrib/orioledb/doc/images/split_step_2.svg @@ -0,0 +1,4 @@ + + + +
1
1
2
2
Step 2
Step 2
3
3
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/split_step_3.svg b/contrib/orioledb/doc/images/split_step_3.svg new file mode 100644 index 00000000000..171c35ffab6 --- /dev/null +++ b/contrib/orioledb/doc/images/split_step_3.svg @@ -0,0 +1,4 @@ + + + +
1
1
2
2
Step 3
Step 3
3
3
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/split_step_4.svg b/contrib/orioledb/doc/images/split_step_4.svg new file mode 100644 index 00000000000..49d76d7fc45 --- /dev/null +++ b/contrib/orioledb/doc/images/split_step_4.svg @@ -0,0 +1,4 @@ + + + +
1
1
2
2
Step 4
Step 4
3
3
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/toast_version.svg b/contrib/orioledb/doc/images/toast_version.svg new file mode 100644 index 00000000000..8ff3360dc15 --- /dev/null +++ b/contrib/orioledb/doc/images/toast_version.svg @@ -0,0 +1,4 @@ + + + +
Undo
Undo
id: 1, TX: 10, V: 2
id: 1, TX: 10, V...
id: 1, TX: 10, V: 1
id: 1, TX: 10, V...
id: 1, TX: 10 (V: 0)
id: 1, TX: 10 (V...
id: 1, TX: 9
id: 1, TX: 9
id: 1, att: 1, off: 0, TX: 10, V: 2
id: 1, att: 1, off: 0, TX: 10,...
id: 1, att: 1, off: 0, TX: 10, (V: 0)
id: 1, att: 1, off: 0, TX: 10,...
id: 1, att: 1, off: 0, TX: 9
id: 1, att: 1, off: 0, TX: 9
PK
PK
Toast
Toast
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/undo_buffer_1.svg b/contrib/orioledb/doc/images/undo_buffer_1.svg new file mode 100644 index 00000000000..3271fc2a875 --- /dev/null +++ b/contrib/orioledb/doc/images/undo_buffer_1.svg @@ -0,0 +1,4 @@ + + + +
Retain
location
Retain...
Insert
location
Insert...
Circular buffer
Circular buffer
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/undo_buffer_2.svg b/contrib/orioledb/doc/images/undo_buffer_2.svg new file mode 100644 index 00000000000..22f554e8509 --- /dev/null +++ b/contrib/orioledb/doc/images/undo_buffer_2.svg @@ -0,0 +1,4 @@ + + + +
Retain
location
Retain...
Insert
location
Insert...
Circular buffer
Circular buffer
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/undo_buffer_3.svg b/contrib/orioledb/doc/images/undo_buffer_3.svg new file mode 100644 index 00000000000..c499b129f43 --- /dev/null +++ b/contrib/orioledb/doc/images/undo_buffer_3.svg @@ -0,0 +1,4 @@ + + + +
Written
location
Written...
Insert
location
Insert...
Circular buffer
Circular buffer
Undo files
Undo files
Written
location
Written...
Retain
location
Retain...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/undo_buffer_4.svg b/contrib/orioledb/doc/images/undo_buffer_4.svg new file mode 100644 index 00000000000..61e0e7ded55 --- /dev/null +++ b/contrib/orioledb/doc/images/undo_buffer_4.svg @@ -0,0 +1,4 @@ + + + +
Written
location
Written...
Insert
location
Insert...
Circular buffer
Circular buffer
Undo files
Undo files
Written
location
Written...
Retain
location
Retain...
Write
in-progress
location
Write...
Write
in-progress
location
Write...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/undo_buffer_5.svg b/contrib/orioledb/doc/images/undo_buffer_5.svg new file mode 100644 index 00000000000..1cdd51e7ce2 --- /dev/null +++ b/contrib/orioledb/doc/images/undo_buffer_5.svg @@ -0,0 +1,4 @@ + + + +
Insert
location
Insert...
Circular buffer
Circular buffer
Undo files
Undo files
Retain
location
Retain...
Written
location
Written...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/contrib/orioledb/doc/images/vegetables_indexes.svg b/contrib/orioledb/doc/images/vegetables_indexes.svg new file mode 100644 index 00000000000..7e6dbfefe12 --- /dev/null +++ b/contrib/orioledb/doc/images/vegetables_indexes.svg @@ -0,0 +1,3 @@ + + +
(1; Tomato; 10)
(1; Tomato; 10)
(2; Cucumber; 3)
(2; Cucumber; 3)
(3; Cabbage; 7)
(3; Cabbage; 7)
(4; Melon; 6)
(4; Melon; 6)
3
3
Primary key index on id
Primary key index on id
(3; 2)
(3; 2)
(6; 4)
(6; 4)
(7; 3)
(7; 3)
(10; 1)
(10; 1)
(7; 3)
(7; 3)
Secondary key index on count
Secondary key index on count
3
3
(7; 3)
(7; 3)
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/contrib/orioledb/doc/intro.mdx b/contrib/orioledb/doc/intro.mdx new file mode 100644 index 00000000000..314c52ba2e4 --- /dev/null +++ b/contrib/orioledb/doc/intro.mdx @@ -0,0 +1,171 @@ +--- +id: docs-intro +slug: / +sidebar_position: 1 +sidebar_label: Introduction +--- + +# OrioleDB: The next-generation storage engine for PostgreSQL + +OrioleDB is a **storage extension for PostgreSQL** which uses PostgreSQL's pluggable storage system. + +It is designed to be a drop-in replacement for PostgreSQL's existing storage engine. OrioleDB is built to take advantage of modern hardware and cloud infrastructure, providing better performance and scalability for PostgreSQL workloads. + +## Example + +OrioleDB uses Postgres Table Access Method (TAM) to provide a pluggable storage engine for PostgreSQL. Here is an example of how you can create a table using OrioleDB: + +```sql +-- Enable the OrioleDB extension +CREATE EXTENSION orioledb; + +CREATE TABLE blog_post +( + id int8 NOT NULL, + title text NOT NULL, + body text NOT NULL, + PRIMARY KEY(id) +) USING orioledb; -- Use the OrioleDB storage engine +``` + +## Pluggable Storage in PostgreSQL + +Pluggable Storage gives developers the ability to use different storage engines for different tables within the same database. Developers will be able to choose a storage method that is optimized for their specific needs: some tables could be configured for high transactional loads, others for analytics workloads, and still others for archiving. + +
+See examples + +```sql +create table analytics_data +( + id int8, + created_at timestamptz, + event text +) using parquet; -- Store data in an analytics-optimized storage engine + +create table timeseries_data +( + id int8, + created_at timestamptz, + event text +) using timeseries; -- Store data in a time-series optimized storage engine +``` + +
+ +Something like this is already available in MySQL, which has used `InnoDB` as the default storage engine since MySQL 5.5 (replacing `MyISAM`). Read more about the history of pluggable storage [here](https://supabase.com/blog/postgres-pluggable-strorage). + +## Using OrioleDB with existing PostgreSQL installations + +OrioleDB currently requires a set of patches to PostgreSQL to enhance the pluggable storage API and other PostgreSQL subsystems. All of these patches have been submitted to the PostgreSQL community and are under review. + +The important property of this set of patches is that it preserves binary compatibility. That is, you can switch to the patched PostgreSQL binary while keeping the same data directory. The existing tables will continue working with the default `heap` engine until you switch them to use `orioledb`. Moreover, it's possible to switch back to using unpatched PostgreSQL binaries. You would just need to convert your `orioledb` tables back to `heap` first. + +The goal is to upstream everything: once these patches are accepted, OrioleDB will be able to run on any PostgreSQL installation without any modifications. This will also enable the entire PostgreSQL community to create their own pluggable storage engines. + +Until then, you can use our [pre-built Docker image](https://hub.docker.com/r/orioledb/orioledb) to try out OrioleDB. The Docker image includes a patched version of PostgreSQL with OrioleDB pre-installed. Follow the [Getting started](usage/getting-started.mdx) guide to get started. + +## Patch set + +You can get the full set of patches [here](https://github.com/orioledb/postgres/commits/patches16/). +The following patches have been submitted to the PostgreSQL community to enhance the TAM interface and other subsystems. 
+ +| | Name | Link | Version | +| ------------------ | ----------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | ---------------- | +| :white_check_mark: | Add missing inequality searches to rbtree | [Link](https://github.com/postgres/postgres/commit/e57519a4637a8d88ae993ac1273d2b59d03a0f75) | PostgreSQL 16 | +| :white_check_mark: | Document the ability to specify TableAM for pgbench | [Link](https://github.com/postgres/postgres/commit/f77ff083350eb5a2625a4dbfca61d15b66c4918b) | PostgreSQL 16 | +| :white_check_mark: | Remove Tuplesortstate.copytup function | [Link](https://github.com/postgres/postgres/commit/d47da3162b4d77c888c895dd36e4ef9cb92fcf19) | PostgreSQL 16 | +| :white_check_mark: | Add new Tuplesortstate.removeabbrev function | [Link](https://github.com/postgres/postgres/commit/cadfdd1edff103d696bbfcdd126c2dee516aed9b) | PostgreSQL 16 | +| :white_check_mark: | Put abbreviation logic into puttuple_common() | [Link](https://github.com/postgres/postgres/commit/033dd02db2248ed792332b950431ced4771b8304) | PostgreSQL 16 | +| :white_check_mark: | Move memory management away from writetup() and tuplesort_put*() | [Link](https://github.com/postgres/postgres/commit/097366c45f5dfe142eb232dc6d348ca0705a63a9) | PostgreSQL 16 | +| :white_check_mark: | Split TuplesortPublic from Tuplesortstate | [Link](https://github.com/postgres/postgres/commit/ec92fe98356a8a36427fe9ef52873b50fe17852e) | PostgreSQL 16 | +| :white_check_mark: | Split tuplesortvariants.c from tuplesort.c | [Link](https://github.com/postgres/postgres/commit/d0b193c0fad13cf35122b0d3dc805c76e323e8bf) | PostgreSQL 16 | +| :white_check_mark: | Fix typo in comment for writetuple() function | [Link](https://github.com/postgres/postgres/commit/924954c670355f2a0ca1dd4173574a28fc0eedec) | PostgreSQL 16 | +| :white_check_mark: | Support for custom slots in the custom executor nodes | 
[Link](https://github.com/postgres/postgres/commit/cee120951427fe39a54ab800abfa2834d85b8771) | PostgreSQL 16 | +| :email: | Allow table AM to store complex data structures in rd_amcache | [Link](https://commitfest.postgresql.org/48/4958/) | PostgreSQL 18 | +| :email: | Allow table AM tuple_insert() method to return the different slot | [Link](https://commitfest.postgresql.org/48/4958/) | PostgreSQL 18 | +| :email: | Add TupleTableSlotOps.is_current_xact_tuple() method | [Link](https://commitfest.postgresql.org/48/4958/) | PostgreSQL 18 | +| :email: | Allow locking updated tuples in tuple_update() and tuple_delete() | [Link](https://commitfest.postgresql.org/48/4958/) | PostgreSQL 18 | +| :email: | Add EvalPlanQual delete returning isolation test | [Link](https://commitfest.postgresql.org/48/4958/) | PostgreSQL 18 | +| :email: | Generalize relation analyze in table AM interface | [Link](https://commitfest.postgresql.org/48/4958/) | PostgreSQL 18 | +| :email: | Custom reloptions for table AM | [Link](https://commitfest.postgresql.org/48/4958/) | PostgreSQL 18 | +| :email: | Let table AM insertion methods control index insertion | [Link](https://commitfest.postgresql.org/48/4958/) | PostgreSQL 18 | + +##### Legend + +:white_check_mark: - Patch has been accepted.
+:email: - Patch is submitted and under review by the PostgreSQL community.
+:pencil2: - Patch is being worked on.
+ +## Features + +OrioleDB opens the door to a future of more powerful storage models that are optimized for cloud and modern hardware architectures. + +### Open source + +OrioleDB is distributed under the standard PostgreSQL license. The goal is to upstream all the patches required to run OrioleDB on any PostgreSQL installation without any modifications. + +### Designed for modern hardware + +OrioleDB's design avoids legacy CPU bottlenecks on modern servers containing dozens or hundreds of CPU cores, providing optimized usage of modern storage technologies such as SSD and NVRAM. + +### Reduced maintenance needs + +OrioleDB implements the concepts of undo log and page merging, eliminating the need for dedicated garbage collection processes. Additionally, OrioleDB implements default 64-bit transaction identifiers, thus eliminating the well-known and painful wraparound problem. + +### Designed to be distributed + +OrioleDB implements a row-level write-ahead log with support for parallel apply. This log architecture is optimized for Raft consensus-based replication allowing the implementation of active-active multimaster. + +## Differentiators + +The key technical differentiations of OrioleDB are as follows: + +### No buffer mapping and lock-less page reading + +In-memory pages in OrioleDB are connected with direct links to the storage pages. This eliminates the need for in-buffer mapping along with its related bottlenecks. Additionally, in OrioleDB in-memory page reading doesn't involve atomic operations. Together, these design decisions bring vertical scalability for Postgres to a whole new level. + +### MVCC is based on the UNDO log concept + +In OrioleDB, old versions of tuples do not cause bloat in the main storage system, but are evicted into the undo log comprising undo chains. Page-level undo records allow the system to easily reclaim space occupied by deleted tuples as soon as possible. 
Together with page merging, these mechanisms eliminate bloat in the majority of cases. Dedicated VACUUMing of tables is not needed either, removing a significant and common cause of system performance deterioration and database outages. + +### Copy-on-write checkpoints and row-level WAL + +OrioleDB utilizes copy-on-write checkpoints, which provide a structurally consistent snapshot of data at every moment in time. This is friendly for modern SSDs and allows row-level WAL logging. In turn, row-level WAL logging is easy to parallelize (done), compact and suitable for active-active multimaster (planned). + +## Resources + +### Hacker news + +- [OrioleDB - solving some PostgreSQL wicked problems](https://news.ycombinator.com/item?id=30462695) +- [PostgreSQL: No More Vacuum, No More Bloat](https://news.ycombinator.com/item?id=36740921) +- [OrioleDB Reached Beta](https://news.ycombinator.com/item?id=36392765) + +### Solving PostgreSQL Wicked Problems + +This talk covers how the new engine is integrated with PostgreSQL Core and solves the wicked PostgreSQL problems. + +
+ +
diff --git a/contrib/orioledb/doc/usage/configuration.mdx b/contrib/orioledb/doc/usage/configuration.mdx new file mode 100644 index 00000000000..bedddccba88 --- /dev/null +++ b/contrib/orioledb/doc/usage/configuration.mdx @@ -0,0 +1,343 @@ +--- +id: configuration +sidebar_label: Configuration +--- + +# Configuration + +## Main parameters + +All the GUC parameters require a postmaster restart. + +### `orioledb.main_buffers` + +| | | +| ----------- | ----- | +| **Default** | 64 MB | + +The size of shared memory, where hot data pages of OrioleDB tables are cached. This parameter is analogous to the built-in `shared_buffers` GUC parameter. A good starting point for this parameter if only OrioleDB tables are used is 1/4 of RAM and setting `shared_buffers` to default value `128 MB`. If OrioleDB and heap tables are used equally, then 1/8 of RAM for this parameter and 1/8 of RAM for `shared_buffers`. + +### `orioledb.undo_buffers` + +| | | +| ----------- | ---- | +| **Default** | 1 MB | + +The shared memory ring buffer size for older versions of rows and pages. + +### `orioledb.recovery_pool_size` + +| | | +| ----------- | --- | +| **Default** | 3 | + +The number of recovery workers for row-level WAL based recovery. + +Increasing the sum value of `orioledb.recovery_pool_size` and `orioledb.recovery_idx_pool_size` to 50-100% of the number of +available CPU cores speeds up the recovery process for the cluster. You need to set `max_worker_processes` to more than +this sum, otherwise OrioleDB will fall back to single-process index build in recovery or to single-process recovery. + +### `orioledb.recovery_idx_pool_size` + +| | | +| ----------- | --- | +| **Default** | 3 | + +The number of recovery parallel index build workers. + +Increasing the sum value of `orioledb.recovery_pool_size` and `orioledb.recovery_idx_pool_size` to 50-100% of the number of +available CPU cores speeds up the recovery process for the cluster. 
You need to set `max_worker_processes` to more than +this sum, otherwise OrioleDB will fall back to single-process index build in recovery or to single-process recovery. + +### `orioledb.recovery_queue_size` + +| | | +| ----------- | ---- | +| **Default** | 8 MB | + +The size of shared memory for message queues related to recovery workers. + +### `orioledb.checkpoint_completion_ratio` + +| | | +| ----------- | --- | +| **Default** | 0.5 | + +The fraction of OrioleDB tables checkpoint time within the whole checkpoint time. We recommend setting this value to `1.0` if only OrioleDB tables are used. + +### `orioledb.serializable` + +| | | +| ----------- | ------------ | +| **Default** | `table_lock` | + +How OrioleDB handles a client request for SERIALIZABLE isolation. OrioleDB does not implement true SSI predicate locking; this enum selects one of three behaviors: + +- `table_lock` (default): take a heavyweight `ExclusiveLock` on every relation a SERIALIZABLE transaction touches. Two SERIALIZABLE transactions on the same table serialize via the lock manager, and SERIALIZABLE writers block concurrent non-SERIALIZABLE writers (`RowExclusiveLock`) on that table; non-SERIALIZABLE readers (`AccessShareLock`) are unaffected. Correct, but coarse — concurrent non-conflicting workloads degrade to serial execution. +- `error`: reject SERIALIZABLE transactions with `ERRCODE_FEATURE_NOT_SUPPORTED` (the legacy behavior). +- `repeatable_read`: treat SERIALIZABLE as `REPEATABLE READ` for OrioleDB tables only — no extra locks are taken, since OrioleDB's CSN-based snapshot already provides per-transaction stable reads. Heap tables in the same transaction continue to use PG's SSI as usual. + +## Advanced config options + +### `orioledb.free_tree_buffers` + +| | | +| ----------- | ---- | +| **Default** | 8 MB | + +Shared memory size for metadata of block allocators for compressed tables. We recommend increasing the value of this parameter to work with large compressed tables. 
+ +### `orioledb.catalog_buffers` + +| | | +| ----------- | ---- | +| **Default** | 8 MB | + +Shared memory size of table metadata. We recommend increasing the value of this parameter to work with a large number of tables. + +### `orioledb.system_undo_circular_buffer_fraction` + +| | | +| ----------- | --- | +| **Default** | 0.1 | + +Fraction of `orioledb.undo_buffers` for older versions of rows and pages in system tables. We recommend increasing the +value of this parameter for DDL-intensive workloads. This replaces the deprecated parameter `orioledb.undo_system_buffers` which +did the same in absolute memory size. + +The remaining `orioledb.undo_buffers` (except the fraction specified by sum of `orioledb.system_undo_circular_buffer_fraction` and `orioledb.regular_block_undo_circular_buffer_fraction`) are reserved for row-level undo logs for regular tables. + +### `orioledb.regular_block_undo_circular_buffer_fraction` + +| | | +| ----------- | ---- | +| **Default** | 0.45 | + +Fraction of `orioledb.undo_buffers` for block-level undo logs for regular tables. + +The remaining `orioledb.undo_buffers` (except the fraction specified by sum of `orioledb.system_undo_circular_buffer_fraction` and `orioledb.regular_block_undo_circular_buffer_fraction`) are reserved for row-level undo logs for regular tables. + +### `orioledb.xid_buffers` + +| | | +| ----------- | --- | +| **Default** | 1MB | + +Size of OrioleDB in-memory xid buffers. Each MB can accommodate 65 thousand xid items corresponding to open +transactions. Consider increasing this parameter if you have a very high transaction rate. + +### `orioledb.bgwriter_num_workers` + +| | | +| ----------- | --- | +| **Default** | 1 | + +The number of background writer processes, which flush dirty pages of OrioleDB tables in the background. We recommend setting values greater than `1` for systems with a large number of CPU cores. 
+ +### `orioledb.max_io_concurrency` + +| | | +| ----------- | ------- | +| **Default** | 0 (off) | + +Maximum number of concurrent IO operations issued by OrioleDB in parallel. We recommend setting this parameter when the OS kernel becomes a bottleneck for high concurrent IO. + +### `orioledb.device_filename` + +| | | +| ----------- | ------- | +| **Default** | Not set | + +Path to the block device for block device mode. + +### `orioledb.device_length` + +| | | +| ----------- | ---- | +| **Default** | 1 GB | + +The length of the block device. + +### `orioledb.use_mmap` + +| | | +| ----------- | --- | +| **Default** | off | + +Specify whether to use `mmap` to work with the block device. We recommend setting `on` value for NVRAM. + +### `orioledb.default_compress` + +| | | +| ----------- | ------------------- | +| **Default** | -1 (no compression) | + +Default block-level compression level for tables' data structures. + +### `orioledb.default_primary_compress` + +| | | +| ----------- | ------------------- | +| **Default** | -1 (no compression) | + +Default block-level compression level for tables' primary keys. + +### `orioledb.default_toast_compress` + +| | | +| ----------- | ------------------- | +| **Default** | -1 (no compression) | + +Default block-level compression level for tables' TOASTed values. + +### `orioledb.table_description_compress` + +| | | +| ----------- | --- | +| **Default** | off | + +Display compression column in orioledb_table_description. + +### `orioledb.use_sparse_files` + +| | | +| ----------- | --- | +| **Default** | off | + +Try to allocate files as `sparse` at filesystem level. Saves occupied space on disk by excluding non-allocated +regions in file from occupying disk space. It's an experimental option. + + +## Options for undo-based rewind (experimental) + +### `orioledb.enable_rewind` + +| | | +| ----------- |---- | +| **Default** | off | + +Enable undo-based rewind. 
+ +### `orioledb.rewind_max_time` + +| | | +| ----------- | --- | +| **Default** | 500 | + +Maximum age in seconds from now of a transaction after which it is completed and removed from the buffer. It's a time limit for a rewind. + +### `orioledb.rewind_max_transactions` + +| | | +| ----------- | ----- | +| **Default** | 86400 | + +Maximum number of rewind transaction items that are stored for a rewind. Older rewind transaction items are completed and removed from the buffer so this specifies the maximum rewind age. If there are subtransactions they also occupy rewind items in a rewind buffer. + +`orioledb.rewind_max_transactions` and `orioledb.rewind_max_time` work together so if a transaction is older than any one of these thresholds it is past the rewind capability and cannot be rewound to. + +### `orioledb.rewind_buffers` + +| | | +| ----------- | --- | +| **Default** | 1MB | + +Size of OrioleDB in-memory rewind buffers. Each MB can accommodate around 8000 rewind transaction items. If you have +enough memory, set it near the value (`orioledb.rewind_max_transactions` / 8000) MB to avoid writing rewind info to disk. + +### `orioledb.logical_xid_buffers` + +| | | +| ----------- | ----- | +| **Default** | 512KB | + +Size of OrioleDB in-memory buffers for logical transaction IDs to be assigned to running subtransactions. Each MB +can accommodate 8 million running subtransactions. So the default value corresponds to 4 million subtransactions. + +## Debugging options + +### `orioledb.debug_disable_pools_limit` + +| | | +| ----------- | --- | +| **Default** | off | + +Disable minimal limit for `orioledb.main_buffers`, `orioledb.free_tree_buffers`, `orioledb.catalog_buffers` for debug. + +### `orioledb.enable_stopevents` + +| | | +| ----------- | --- | +| **Default** | off | + +Enable stop events. + +### `orioledb.trace_stopevents` + +| | | +| ----------- | --- | +| **Default** | off | + +Trace all the stop events to the system log. 
+ +### `orioledb.debug_disable_bgwriter` + +| | | +| ----------- | --- | +| **Default** | off | + +Disable bgwriter for debug. + +### `orioledb.debug_checkpoint_timeout` + +| | | +| ----------- | -------------------- | +| **Default** | `checkpoint_timeout` | + +Sets the maximum time between automatic WAL checkpoints. Setting this value to a lower value than `checkpoint_timeout` can make OrioleDB checkpoints more often for testing. + +### `orioledb.remove_old_checkpoint_files` + +| | | +| ----------- | -- | +| **Default** | on | + +Remove temporary \*.tmp and \*.map files after checkpoint. + +### `orioledb.skip_unmodified_trees` + +| | | +| ----------- | -- | +| **Default** | on | + +Skip reading of unmodified trees during checkpointing. + +### `orioledb.debug_max_bridge_ctid_blkno` + +| | | +| ----------- | --- | +| **Default** | NA | + +Sets maximum value for bridge ctid for its overflow testing. + +### `orioledb.replay_until_lsn` + +| | | +| ----------- | --- | +| **Default** | NA | + +Specifies the cutoff LSN at or after which OrioleDB will permanently stop +applying its WAL records during recovery. Replay stops when the WAL read +position reaches this LSN. `replay_until_lsn` must not be earlier than the LSN +of the latest completed OrioleDB checkpoint. Specifying `replay_until_lsn` +earlier than the LSN of the latest completed OrioleDB checkpoint will have no +effect (a warning will be logged at next startup). + +:::warning[Data consistency] +This is a last-resort disaster recovery mechanism strictly comparable to +`pg_resetwal`. It intentionally induces a split-brain state: PostgreSQL core +catalogs and standard heap tables will continue replaying WAL, while OrioleDB +tables remain at the specified LSN. 
+::: + +[Additional GUC's for experimental s3 mode](decoupled-storage.mdx) diff --git a/contrib/orioledb/doc/usage/decoupled-storage.mdx b/contrib/orioledb/doc/usage/decoupled-storage.mdx new file mode 100644 index 00000000000..8a2a3ccff2b --- /dev/null +++ b/contrib/orioledb/doc/usage/decoupled-storage.mdx @@ -0,0 +1,84 @@ +--- +id: decoupled-storage +sidebar_label: Decoupled storage +--- + +# Decoupled storage and compute + +One of the features that OrioleDB provides is the ability to decouple storage and compute. This is achieved by storing the data in a separate storage layer, such as S3, and running the compute layer on a separate instance. This allows for better scalability and flexibility in managing the data and compute resources. + +:::warning[Experimental feature] + +This feature is currently experimental and should be used with caution. + +S3 storage is not required to use OrioleDB. It is an optional feature that can be enabled by setting the appropriate configuration parameters. + +::: + +## S3 database storage + +OrioleDB has experimental support for the storage of all tables and materialized views data in the S3 bucket. It is useful for splitting compute and data storage instances, for increasing data safety, and for scaling and changing the architecture of compute instances preserving all data. + +Local storage implements caching of the data most often accessed. Also, it ensures that adding and updating data will be done at the speed of writing to local storage, rather than the S3 transfer rate. Data are synced with S3 asynchronously. However, all requirements of data integrity are ensured for all the data on S3 storage as well. So you can re-connect to the S3 bucket by another empty PostgreSQL instance (initialized with the utility described below) with the OrioleDB extension configured to use S3 with this bucket and get back all the data from S3 in the state of the last PostgreSQL checkpoint. 
+ +To use S3 functionality, the following parameters should be set before creating orioledb tables and materialized views: + +- `orioledb.s3_mode` -- whether to use S3 mode. It could be `on` and `off`. The default is `off` +- `archive_library = 'orioledb'` -- set it to use s3 mode +- `archive_mode = on` -- set it to use S3 mode +- `orioledb.s3_region` -- specify S3 region, where the S3 bucket is created. +- `orioledb.s3_host` -- access endpoint address for S3 bucket (without `https://` prefix). E.g. mybucket.s3-accelerate.amazonaws.com +- `orioledb.s3_prefix` -- Prefix to prepend to S3 object name (may contain bucket name if it is not in endpoint) +- `orioledb.s3_use_https` -- Use https for S3 connections (or http otherwise). The default is `on`. (Make sure that it matches server, especially for localhost connections) +- `orioledb.s3_accesskey` -- specify AWS access key to authenticate the bucket. +- `orioledb.s3_secretkey` -- specify AWS secret key to authenticate the bucket. +- `orioledb.s3_num_workers` -- specify the number of AWS workers syncing data to S3 bucket. More workers could make sync faster. 20 - is a recommended value that is enough in most cases. +- `orioledb.s3_desired_size` -- This parameter defines the total desired size of OrioleDB tables on the local storage. Once this limit is exceeded, OrioleDB's background workers will begin evicting local data to the S3 bucket. This mechanism ensures efficient use of local storage and seamless data transfer to S3. Effective support for this limit requires a filesystem that supports sparse files. +- `max_worker_processes` -- PostgreSQL limit for maximum number of workers. Should be set to accommodate extra `orioledb.s3_num_workers` and all other Postgres workers. To start set it to `orioledb.s3_num_workers` plus the previous `max_worker_processes` value. + +After setting the GUC parameters above restart the postmaster. 
Then all tables and materialized views created `using orioledb` will be synced with the S3 bucket. + +```sql +CREATE TABLE s3_test +( + id int8 NOT NULL, + value1 float8 NOT NULL, + value2 text NOT NULL, + PRIMARY KEY(id) +) USING orioledb +``` + +In S3 mode, all tables and materialized views are incrementally synchronized with S3, meaning only modified blocks are uploaded to the S3 bucket. However, for tables and materialized views not created with `using orioledb`, OrioleDB’s background workers will compute file checksums during each checkpoint. Therefore, it is recommended to use S3 mode when storing the majority of your data with the OrioleDB engine. + +For best results, it's recommended to turn on `Transfer acceleration` in **General** AWS S3 bucket settings (endpoint address will be given with `s3-accelerate.amazonaws.com` suffix) and have the bucket and compute instance within the same AWS region. Even better is to use **Directory** AWS bucket within the same AWS region and sub-region as the compute instance. + +Only one database instance can connect to the same S3 bucket. During startup a database instance checks if another instance already is connected to the S3 bucket and if the bucket is compatible. Otherwise the instance will fail to start. + +As mentioned above S3 mode is currently experimental. The major limitations of this mode are the following. + +1. While OrioleDB tables and materialized views are stored incrementally in the S3 bucket, the history is kept forever. There is currently no mechanism to safely remove the old data. +2. In the primary/replica setup, each should have a separate S3 bucket. + +All of the limitations above are temporary and will be removed in further releases. + +## S3 loader utility + +The S3 loader utility allows getting data from the S3 bucket to any local machine into the specified directory. 
+ +To use it you need to install `boto3` and `testgres` into your Python environment: + +`pip install boto3 testgres` + +Run the script with the same parameters as from your S3 Postgres cluster config: + +- `AWS_ACCESS_KEY_ID` - same as `orioledb.s3_accesskey` +- `AWS_SECRET_ACCESS_KEY` - same as `orioledb.s3_secretkey` +- `AWS_DEFAULT_REGION` - same as `orioledb.s3_region` +- `--endpoint` - same as `orioledb.s3_host` (full URL with `https://` or `http://` prefix) E.g. `--endpoint=https://mybucket.s3-accelerate.amazonaws.com` or `--endpoint=https://mybucket.s3.amazonaws.com` or for local instance `--endpoint=http://localhost:PORT` +- `--prefix` - optional prefix to prepend to object paths (May contain bucket name if it is not in endpoint) +`--endpoint=https://mybucket.s3-accelerate.amazonaws.com` or `--endpoint=https://mybucket.s3.amazonaws.com` +- `--bucket-name` - S3 bucket name from `orioledb.s3_host` E.g. `--bucket-name=mybucket` +- `--data-dir` - destination directory on the local machine you want to write data to. E.g. `--data-dir=mydata/` +- `--verbose` - optionally print extended info. + +`AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY='' AWS_DEFAULT_REGION= python orioledb_s3_loader.py --endpoint=https:// --data-dir='orioledb_data' --verbose` diff --git a/contrib/orioledb/doc/usage/getting-started.mdx b/contrib/orioledb/doc/usage/getting-started.mdx new file mode 100644 index 00000000000..7266c544c2c --- /dev/null +++ b/contrib/orioledb/doc/usage/getting-started.mdx @@ -0,0 +1,286 @@ +--- +id: getting-started +sidebar_label: Getting Started +--- + +# Usage + +OrioleDB uses PostgreSQL's built-in Table Access Method API. When you create a table you can specify `USING orioledb;`. + +## Quick start + +### Start PostgreSQL + +The OrioleDB extension requires PostgreSQL pluggable Storage. 
Until the PostgreSQL community merges the [required patches](../intro.mdx#patch-set), you can use the OrioleDB docker image to start PostgreSQL on your machine: + +```bash title="bash" +docker run -d --name orioledb -p 5432:5432 orioledb/orioledb +``` + +### Enable the extension + +You can enable the OrioleDB extension by running the following command: + +```sql title="psql" +CREATE EXTENSION orioledb; +``` + +### Create tables + +Let's define a `blog_post` table, which stores blog posts and has two indices: primary key on the `id` column and secondary key by `published_at` column. + +```sql title="psql" +-- Create a table +CREATE TABLE blog_post +( + id int8 NOT NULL, + title text NOT NULL, + body text NOT NULL, + author text NOT NULL, + published_at timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, + views bigint NOT NULL, + PRIMARY KEY(id) +) USING orioledb; -- Define the storage engine + +-- Create an index +CREATE INDEX blog_post_published_at ON blog_post(published_at); +``` + +OrioleDB uses index-organized tables. So, the selection of the primary key is a very critical decision affecting performance. If you do not specify a primary key, a hidden surrogate primary key will be created over the virtual `ctid` column. + +### Query tables + +Query your tables using regular DML queries, including `SELECT`, `INSERT`, `UPDATE`, `DELETE` and `INSERT ON CONFLICT`. + +For example: + +```sql title="psql" +INSERT INTO blog_post (id, title, body, author, views) +VALUES (1, 'Hello, World!', 'This is my first blog post.', 'John Doe', 1000); + +SELECT * FROM blog_post ORDER BY published_at DESC LIMIT 10; + +``` + +### View query plans + +Plans of queries involving OrioleDB tables could be viewed using `EXPLAIN` clause as usual. 
+ +```sql +EXPLAIN SELECT * FROM blog_post ORDER BY published_at DESC LIMIT 10; +``` + +```bash title="Result" + QUERY PLAN +------------------------------------------------------------------------------------------------------------ + Limit (cost=0.15..1.67 rows=10 width=120) + -> Index Scan Backward using blog_post_published_at on blog_post (cost=0.15..48.95 rows=320 width=120) +(2 rows) +``` + +```sql +EXPLAIN SELECT * FROM blog_post WHERE id = 1; +``` + +```bash title="Result" + QUERY PLAN +---------------------------------------------------------------------------------- + Index Scan using blog_post_pkey on blog_post (cost=0.15..8.17 rows=1 width=120) + Index Cond: (id = 1) +(2 rows) +``` + +`EXPLAIN (ANALYZE, BUFFERS)` clause allows to view page access statistics. + +```sql +# EXPLAIN (ANALYZE, BUFFERS) + SELECT * FROM blog_post ORDER BY published_at DESC LIMIT 10; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.29..0.54 rows=10 width=42) (actual time=0.100..0.210 rows=10 loops=1) + -> Index Scan Backward using blog_post_published_at on blog_post (cost=0.29..251.27 rows=9999 width=42) (actual time=0.097..0.202 rows=10 loops=1) + Planning: + Buffers: shared hit=35 + Planning Time: 1.147 ms + Execution Time: 0.284 ms +(6 rows) +``` + +## Advanced usage + +### Use OrioleDB tables by default + +If you want all your created tables use `orioledb` access method without explicitly specifying this each time at `CREATE +TABLE`, add in your PostgreSQL config file: + +`default_table_access_method = 'orioledb'` + +NB: This doesn't affect system catalogs that can be only `heap`. Tables created before setting this parameter also +retain their previous access method. + +### Collations + +OrioleDB tables support only ICU, C, and POSIX collations. 
So, make sure the cluster or database is set up with default collations that fall under those options, otherwise you have to write COLLATE for every "text" field of the table. + +ALTER COLLATION REFRESH VERSION is also disabled for collations that are used for fields and indexes of orioledb tables. + +### Block-level data compression + +OrioleDB implements block-level compression. Compression levels are integer values from `-1` to `22`. Value of `-1` means no compression (default), values between `0` and `22` specify compression levels of the zstd library. + +The following options control the compression level of a table: + +- `compress` – compression level for all table data structures (value of `-1` disables compression for the table; if not specified, `orioledb.default_compress` is used), +- `primary_compress` – compression level for the table primary key (in case of `-1` it's inherited from `compress` value for the table if it's positive, otherwise `orioledb.default_primary_compress` is used), +- `toast_compress` – compression level for the table TOASTed values. (in case of `-1` it's inherited from `compress` value for the table if it's positive, otherwise `orioledb.default_toast_compress` is used) + +Individual indexes also have the `compress` option, which controls the compression level of a particular index, overriding the value of the table `compress` option. 
+ +```sql +CREATE TABLE compression_test +( + id int8 NOT NULL, + value1 float8 NOT NULL, + value2 text NOT NULL, + PRIMARY KEY(id) +) USING orioledb + WITH (compress = 5, toast_compress = 10, primary_compress = -1); + +CREATE INDEX compression_test_value1_idx ON compression_test(value1) + WITH (compress = 22); +CREATE INDEX compression_test_value2_idx ON compression_test(value2); +``` + +In this example primary key of `compression_test` table uses compression as specified by `orioledb.default_primary_compress` value, TOAST values are compressed with level of `10`, `compression_test_value1_idx` index is compressed with level of `22`, index `compression_test_value2_idx` is compressed with level of `5`. + +### Fillfactor + +OrioleDB tables and indices support `fillfactor` option similar to Postgres `heap` tables and indices ([docs for +heap](https://www.postgresql.org/docs/current/sql-createtable.html#RELOPTION-FILLFACTOR), [docs for +index](https://www.postgresql.org/docs/current/sql-createindex.html#INDEX-RELOPTION-FILLFACTOR)). Reasonably low +fillfactor speeds up table data modifications at the cost of reserving some extra space on disk. Setting it is recommended for tables where modification rate is expected to be significant. + +```sql +CREATE TABLE o_test_fillfactor +( + f1 text, + f2 varchar, + f3 integer, + PRIMARY KEY(f1) +) USING orioledb WITH (fillfactor = 60); + +CREATE INDEX o_test_fillfactor_ix1 ON o_test_fillfactor(f2) WITH (fillfactor = 80); +``` + +In `heap` tables lower `fillfactor` mainly speeds up updates by allowing space for HOT-updated tuples and decreasing +page locks at concurrent modifications by spreading them over a bigger number of pages. In `orioledb` tables pages don't +store old tuple versions and pages are divided into chunks so they already don't have these limitations. But with tables +being index-organized OrioleDB inserts tuples into specific pages according to `btree` structure. 
In OrioleDB `fillfactor` speeds up both inserts and updates by decreasing the number of page-splits when a leaf page has no place to accommodate new tuples. It works in the same manner for the index and for the table. + +Fillfactor could be modified at any point of time + +```sql +ALTER TABLE o_test_fillfactor SET (fillfactor = 20); +ALTER INDEX o_test_fillfactor_ix1 SET (fillfactor = 50); +``` + +### Sequence caching + +OrioleDB optimizes performance by bypassing PostgreSQL's native transaction ID (XID) allocation and heap write-ahead logging (WAL) when operations are strictly confined to OrioleDB tables. However, because PostgreSQL sequences are implemented as `heap` relations, invoking `nextval()` modifies heap structures. This forces the assignment of a full PostgreSQL XID, negating OrioleDB's transaction optimizations. + +To prevent unnecessary XID allocations when using sequence-backed columns, configure sequences with a `CACHE` value: + +```sql +CREATE SEQUENCE my_seq CACHE 100; +``` + +With caching enabled, only the initial `nextval()` call requires `heap` modification and XID allocation to fetch a block of values. Subsequent calls within the same session retrieve values from memory, preserving OrioleDB's optimized transaction handling. + +:::note +This optimization is critical for workloads characterized by high volumes of small transactions. For large transactions, the performance penalty is negligible as the cost of XID allocation is heavily amortized. +::: + +### Undo log size calculation + +OrioleDB stores some of the modifications data in temporary Undo logs. While this storage separate from relation +files is more convenient and not prone to bloating, table and database size reported by Postgres functions won't +count Undo logs size. Use special function to count Undo-related temporary data: + +```sql +SELECT * FROM orioledb_undo_size(); +``` + +Undo data is counted in three categories: system undo, user page-level undo and user row-level undo. 
+ +### Data deletion + +OrioleDB automatically merges sparse pages. Therefore, when many rows are deleted, data pages are freed and available for future usage. Data files aren't currently shrunk in such a situation, but that will be implemented soon. + +### Checkpoints, WAL & recovery + +OrioleDB has its own recovery mechanism: copy-on-write checkpoints and row-level WAL. However, both OrioleDB's checkpoints and WAL are integrated into PostgreSQL. PostgreSQL checkpointer process handles OrioleDB's tables as well. PostgreSQL WAL stream contains both WAL-records of built-in PostgreSQL tables and row-level WAL-records of OrioleDB's tables. + +Recovery using row-level WAL records might require significant CPU resources. Therefore parallel recovery of OrioleDB's tables is implemented. OrioleDB launches its own pool of recovery workers, each of them responsible for replaying a particular part of WAL records. + +OrioleDB has its own pool of background writer processes (the `orioledb.bgwriter_num_workers` GUC parameter defines the pool size). Usage of multiple background writers increases the effectiveness of IO-utilization on modern hardware. + +### Experimental support of the block devices + +OrioleDB implements experimental support for a mode of direct interaction with block devices. This mode removes the overhead of the filesystem. + +In this mode, the main part of table data is stored on the block device, but small metadata is still stored in the data directory. + +The current implementation of block devices support contains memory leaks, resulting in the error message `device file overflow` even if the actual data size is much less than block device size. In this case, only the re-initialization of the data directory could help. + +We plan to fix memory leaks soon and develop tools for monitoring free block device space. + +In order to activate block device mode, one should specify `orioledb.device_filename` and `orioledb.device_length` GUC parameters. 
When the `orioledb.use_mmap` GUC parameter is enabled, the block device is connected using `mmap`. This mode is optimal for NVRAM, which directly connects to the data bus. `mmap` mode is not recommended for regular devices because the current `mmap` implementation in Linux has very bad concurrency. + +### Experimental support of indexes other than btree + +OrioleDB has experimental support for indexes other than btree. It is implemented by an internal "bridge index" between non-btree index and OrioleDB table. Bridge index is automatically added when the first non-btree index is built. + +```sql +CREATE INDEX blog_post_title_gin_idx ON blog_post USING GIN (title); +``` + +Manual build of a bridge index for a table is not necessary but possible: +```sql +ALTER TABLE blog_post SET (index_bridging); +``` + +If all existing bridged indexes for a table were removed, the "bridge index" would not be removed automatically. If you don't plan to add non-btree indexes anymore you can delete unnecessary "bridge index" for this table: +```sql +ALTER TABLE blog_post RESET (index_bridging); +``` + +Note: btree index could also be built as a bridged index (use only for testing purposes, not recommended) +```sql +CREATE INDEX blog_post_title_idx ON blog_post USING btree(title) with (orioledb_index = off); +``` + +## Current limitations + +OrioleDB is currently in the development stage. Therefore it has the following temporary limitations. + +1. `pg_rewind` copies OrioleDB tables completely. Shortly OrioleDB will implement incremental copying of OrioleDB tables using `pg_rewind`. +2. OrioleDB supports parallel sequential scan, but not other types of parallel scan. +3. OrioleDB doesn't support prepared transactions. +4. OrioleDB support of non-btree indexes is still experimental. +5. OrioleDB supports bitmap scan only for int4, int8 and ctid primary keys. +6. Row-level concurrency in OrioleDB has some [differences](../architecture/row-level-concurrency.mdx). +7. 
OrioleDB doesn't support `CLUSTER` and `VACUUM FULL` commands yet, because we don't implement rewrite of the tables for these commands. And also `CLUSTER` doesn't really make much sense for index-organized tables. +8. `REINDEX CONCURRENTLY` is not currently supported. +9. OrioleDB tables don't support `Sample Scans` yet. +10. OrioleDB does not implement true SERIALIZABLE Snapshot Isolation. The `orioledb.serializable` GUC controls how SERIALIZABLE transactions are handled: + - `table_lock` (default): every relation a SERIALIZABLE transaction touches is protected by a coarse heavyweight `ExclusiveLock`. Correct, but pessimistic — concurrent non-conflicting workloads degrade to serial execution. + - `error`: reject SERIALIZABLE transactions with `ERRCODE_FEATURE_NOT_SUPPORTED` (the legacy behavior). + - `repeatable_read`: treat SERIALIZABLE as `REPEATABLE READ` for OrioleDB tables only — no extra locks are taken, OrioleDB's CSN snapshot already provides per-transaction stable reads. Heap tables in the same transaction continue to use PG's SSI as usual. +11. Backward fetches from a cursor are supported only when the cursor is declared with `SCROLL`. +12. OrioleDB doesn't use checksums on data pages and initializing a database +cluster with `-k` (`--data-checksums`) has no effect on OrioleDB tables. +13. PostgreSQL sequences are backed by heap relations. Accessing them (e.g., via `nextval()`) forces full PostgreSQL Transaction ID (XID) allocation, bypassing OrioleDB's optimized transaction mechanics unless sequence caching is utilized. 
+ +## Links + +See [description of OrioleDB's settings](configuration.mdx), [experimental decoupled storage and compute mode](decoupled-storage.mdx), and [experimental undo-based rewind](rewind.mdx) diff --git a/contrib/orioledb/doc/usage/rewind.mdx b/contrib/orioledb/doc/usage/rewind.mdx new file mode 100644 index 00000000000..eefae2c1b2c --- /dev/null +++ b/contrib/orioledb/doc/usage/rewind.mdx @@ -0,0 +1,140 @@ +--- +id: rewind +sidebar_label: Undo-based rewind +--- + +# Undo-based Rewind + +OrioleDB provides an undo-based rewind capability that allows a database cluster to be reverted to a consistent previous state. Unlike Point-in-Time Recovery (PITR), which relies on Write-Ahead Log (WAL) replay, OrioleDB rewind utilizes undo logs to roll back changes. This mechanism is generally faster than WAL-based recovery for reverting to recent database states, because it directly reverses recent changes using the undo chain. + +For `orioledb` tables, rewind uses the engine's native undo logs. For standard PostgreSQL `heap` tables, rewind functionality is supported by delaying vacuuming; older tuple versions are retained in the heap until they fall outside the configured rewind retention window. + +:::warning[Experimental Feature] +This feature is experimental and imposes a significant performance penalty. It is disabled by default and should be used with caution. +::: + +## Configuration + +The following parameters control the rewind subsystem. These must be set in `postgresql.conf` or via `ALTER SYSTEM`. + +| Parameter | Type | Default | Range | Description | +| :--- | :--- | :--- | :--- | :--- | +| `orioledb.enable_rewind` | `boolean` | `off` | on/off | Enables the collection of rewind data and starts the Rewind Worker. | +| `orioledb.rewind_max_time` | `integer` | `3600` | 1 - 86400 | Maximum age (in seconds) a transaction record is retained for rewind. 
| +| `orioledb.rewind_max_transactions` | `integer` | `100000` | 1 - `INT_MAX` | Maximum number of transactions to retain in the rewind queue. | +| `orioledb.rewind_buffers` | `integer` | `1024` | 6 - `INT_MAX` | Number of shared memory buffers for rewind metadata. | + +:::note +Enabling rewind increases the number of background processes. Ensure that `max_worker_processes` is configured with sufficient overhead to accommodate the Rewind Worker. +::: + +## The Rewind Worker + +When `orioledb.enable_rewind` is set to `on`, OrioleDB launches a background **Rewind Worker**. This process is responsible for managing the lifecycle of transaction history and undo logs. + +### Key Responsibilities +* The worker monitors the rewind queue. Once a transaction is older than the retention threshold, the worker marks the item as completed, allowing the system to safely delete old undo files and vacuum dead `heap` tuples. +* It maintains the "Rewind Horizon" — the furthest point in time to which the database can be safely reverted. +* The worker continues to process the queue at a regular interval even during periods of database inactivity to prevent storage bloat. + +:::warning[Rewind Worker] +While the Rewind Worker manages its "to-do list" in a fixed-size circular buffer, its failure to progress has two major side effects on system health: + +- If the worker lags, OrioleDB cannot reclaim space in the physical Undo Logs. These logs will spill from memory to the orioledb_undo directory on disk, causing it to grow indefinitely until the worker catches up. + +- The worker is responsible for advancing the global xmin horizon. If the worker is stuck on an old transaction, PostgreSQL and OrioleDB will consider all subsequent row versions as "potentially needed." This prevents VACUUM from removing dead tuples, leading to significant table bloat and degraded query performance. 
+::: + +## Rewind Functions + +### `orioledb_rewind_by_time(seconds int4[, attempt_restart bool])` +Rewinds the cluster state by the specified number of seconds from the current time. + +* **Parameters:** + * `seconds`: The number of seconds to rewind the cluster state. + * `attempt_restart`: If `true`, the database system will automatically restart after the rewind. If `false`, the database will only be shut down. Default is `false`. +* **Examples:** + * `SELECT orioledb_rewind_by_time(600);` — Rewinds by 10 minutes and shuts down the database without restarting. + * `SELECT orioledb_rewind_by_time(600, true);` — Rewinds by 10 minutes and restarts the database. + +### `orioledb_rewind_to_timestamp(target_time timestamptz[, attempt_restart bool])` +Rewinds the cluster to a specific point in time. + +* **Parameters:** + * `target_time`: The exact timestamp to which the cluster state will be rewound. + * `attempt_restart`: If `true`, the database system will automatically restart after the rewind. If `false`, the database will only be shut down. Default is `false`. +* **Examples:** + * `SELECT orioledb_rewind_to_timestamp('2025-01-01 12:00:00 UTC', true);` — Rewinds to January 1, 2025, at 12:00:00 UTC, and restarts the database. + * `SELECT orioledb_rewind_to_timestamp('2025-01-01 12:00:00 UTC');` — Rewinds to the specified timestamp and shuts down the database without restarting. + +--- + +### `orioledb_rewind_to_transaction(xid int4, oxid int8[, attempt_restart bool])` +Rewinds the cluster to a state **before** a specific transaction pair identified by the PostgreSQL Transaction ID (`xid`) and the OrioleDB Transaction ID (`oxid`). + +* **Parameters:** + * `xid`: The PostgreSQL Transaction ID. + * `oxid`: The OrioleDB Transaction ID. + * `attempt_restart`: If `true`, the database system will automatically restart after the rewind. If `false`, the database will only be shut down. Default is `false`. 
+* **Examples:** + * `SELECT orioledb_rewind_to_transaction(1750, 555, true);` — Rewinds the cluster state to just before transaction pair `1750`/`555` and restarts the database. + * `SELECT orioledb_rewind_to_transaction(1750, 555);` — Rewinds the cluster state to just before the specified transaction pair and shuts down the database without restarting. + +### Examining the Rewind Horizon +To check the available rewind range, use: +* `orioledb_get_current_oxid()`: Returns the current OrioleDB transaction's ID. It will assign a new one if the current transaction does not have one already. +* `orioledb_get_complete_xid()`: Returns the oldest PostgreSQL `xid` available for rewind. +* `orioledb_get_complete_oxid()`: Returns the oldest OrioleDB `oxid` available for rewind. + +## Operational Workflow + +When a rewind function is invoked, the system executes the following steps: + +1. Validates that the requested target is within the retention window (`rewind_max_time`) and that `orioledb.enable_rewind` is active. +2. Signals the Rewind Worker to stop adding new transactions to the buffer. +3. Signals all other active backends to terminate. The process waits up to 100 seconds for backends to exit. +4. Reverts the data pages to the requested state. +5. Once the rewind is complete, depending on the value of `attempt_restart` argument, the function either shuts down the database or attempts to restart the PostgreSQL instance to finalize the state change. + +:::warning[Restart Reliability] +The automatic restart is a single, best-effort attempt. If the restart fails (e.g., due to configuration errors), manual intervention is required to bring the server back online. Ensure you have system-level access to the server to manually start the service, as the SQL connection will be terminated immediately upon completion of the rewind. 
+::: + +## Examples + +To rewind the database to a specific transaction state, you must first record the transaction identifiers at your desired "recovery point." + +1. **Identify the recovery point:** +```sql +-- Record these values +SELECT pg_current_xact_id(), orioledb_get_current_oxid(); + + pg_current_xact_id | orioledb_get_current_oxid +--------------------+--------------------------- + 1750 | 555 +``` + +2. **Perform modifications:** +```sql +-- Accidental data loss or undesired changes occur here +DROP TABLE important_data; +``` + +3. **Perform the rewind:** +```sql +-- Revert to the IDs recorded in step 1 +SELECT orioledb_rewind_to_transaction(1750, 555); +``` +*The server will log "Rewind complete" and shut down.* + +:::note[Data consistency] +For applications requiring strong consistency guarantees, it is recommended to explicitly acquire locks on relevant tables within the transaction intended as the rewind target. (Step 1 in the example above) +::: + +## Limitations and Caveats + +* The rewind buffer is stored in shared memory and is not persistent across restarts. You cannot rewind to a point in time prior to the current cluster start time. +* Rewind is destructive to all data modifications occurring after the target point. It is recommended to take a backup before initiating a rewind. +* Because rewind requires retaining old versions in `heap` tables, standard vacuuming is inhibited for data within the rewind window. High write volume to `heap` tables may lead to significant bloat. +* Rewind is currently incompatible with physical replication. Standby servers do not reflect the rewind operation and will become inconsistent with the primary. +* If the system crashes or is interrupted during the rewind phase, the database may be left in an inconsistent state. 
\ No newline at end of file diff --git a/contrib/orioledb/docker/Dockerfile b/contrib/orioledb/docker/Dockerfile new file mode 100644 index 00000000000..e8ad0e1bcb5 --- /dev/null +++ b/contrib/orioledb/docker/Dockerfile @@ -0,0 +1,290 @@ +# This is slightly adjusted Dockerfile from +# https://github.com/docker-library/postgres + +# set ALPINE_VERSION= [ edge 3.21 3.20 3.19 3.18 ] +ARG ALPINE_VERSION=3.21 +FROM alpine:${ALPINE_VERSION} + +ARG ALPINE_VERSION + +# Set PG_MAJOR = [ 17 16 ] +ARG PG_MAJOR=17 +ENV PG_MAJOR=${PG_MAJOR} + +# set compiler: [ clang gcc ] +ARG BUILD_CC_COMPILER=clang +ENV BUILD_CC_COMPILER=${BUILD_CC_COMPILER} + +# Enable debug mode and preserve the build environments for debugging. +# In this case, each image size exceeds 1GB +ARG DEBUG_MODE=false +ENV DEBUG_MODE=${DEBUG_MODE} + +# Define build dependencies for LLVM [ llvm-dev clang ] +# These include the specific versions of 'llvm-dev' and 'clang' suitable for the current version of PostgreSQL. +# They are useful for building downstream extensions using the same LLVM, like PostGIS alpine https://github.com/postgis/docker-postgis +# Note: Some older PostgreSQL version does not support LLVM 16. Therefore, for Alpine >=3.18, please use "llvm15-dev clang15". 
+# Reference: https://github.com/docker-library/postgres/pull/1077 +ARG DOCKER_PG_LLVM_DEPS="llvm-dev clang" +ENV DOCKER_PG_LLVM_DEPS=${DOCKER_PG_LLVM_DEPS} + +# 70 is the standard uid/gid for "postgres" in Alpine +# https://git.alpinelinux.org/aports/tree/main/postgresql/postgresql.pre-install?h=3.12-stable +RUN set -eux; \ + addgroup -g 70 -S postgres; \ + adduser -u 70 -S -D -G postgres -H -h /var/lib/postgresql -s /bin/sh postgres; \ + mkdir -p /var/lib/postgresql; \ + chown -R postgres:postgres /var/lib/postgresql + +# su-exec (gosu-compatible) is installed further down + +# make the "en_US.UTF-8" locale so postgres will be utf-8 enabled by default +# alpine doesn't require explicit locale-file generation +ENV LANG=en_US.utf8 + +RUN mkdir -p /usr/src/postgresql/contrib/orioledb + +COPY . /usr/src/postgresql/contrib/orioledb + +RUN mkdir /docker-entrypoint-initdb.d /docker-default-initdb.d + +RUN mkdir -p /var/run/postgresql && chown -R postgres:postgres /var/run/postgresql && chmod 2777 /var/run/postgresql + +RUN set -eux; \ + \ + PGTAG=$(grep "^$PG_MAJOR: " /usr/src/postgresql/contrib/orioledb/.pgtags | cut -d' ' -f2-) ; \ + ORIOLEDB_VERSION=$(grep "^#define ORIOLEDB_VERSION" /usr/src/postgresql/contrib/orioledb/include/orioledb.h | cut -d'"' -f2) ; \ + ORIOLEDB_BUILDTIME=$(date -Iseconds) ; \ + ALPINE_VERSION=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f 2 | cut -d . 
-f 1,2 | cut -d _ -f 1) ; \ + \ + # To get support for all locales: IF >=Alpine3.16 THEN install icu-data-full + # https://wiki.alpinelinux.org/wiki/Release_Notes_for_Alpine_3.16.0#ICU_data_split + # https://github.com/docker-library/postgres/issues/327#issuecomment-1201582069 + case "$ALPINE_VERSION" in 3.13 | 3.14 | 3.15 ) EXTRA_ICU_PACKAGES='' ;; \ + 3.16 | 3.17 | 3.18 | 3.19 | 3.20 | 3.21* ) EXTRA_ICU_PACKAGES=icu-data-full ;; \ + *) : ;; \ + esac ; \ + \ + echo "PG_MAJOR=$PG_MAJOR" ; \ + echo "PGTAG=$PGTAG" ; \ + echo "BUILD_CC_COMPILER=$BUILD_CC_COMPILER" ; \ + echo "ORIOLEDB_VERSION=$ORIOLEDB_VERSION" ; \ + echo "ORIOLEDB_BUILDTIME=$ORIOLEDB_BUILDTIME" ; \ + echo "ALPINE_VERSION=$ALPINE_VERSION" ; \ + echo "EXTRA_ICU_PACKAGES=$EXTRA_ICU_PACKAGES" ; \ + echo "DOCKER_PG_LLVM_DEPS=$DOCKER_PG_LLVM_DEPS" ; \ + echo "DEBUG_MODE=$DEBUG_MODE" ; \ + \ + # check if the custom llvm version is set, and if so, set the LLVM_CONFIG and CLANG variables + CUSTOM_LLVM_VERSION=$(echo "$DOCKER_PG_LLVM_DEPS" | sed -n 's/.*llvm\([0-9]*\).*/\1/p') ; \ + if [ ! 
-z "${CUSTOM_LLVM_VERSION}" ]; then \ + echo "CUSTOM_LLVM_VERSION=$CUSTOM_LLVM_VERSION" ; \ + export LLVM_CONFIG="/usr/lib/llvm${CUSTOM_LLVM_VERSION}/bin/llvm-config" ; \ + export CLANG=clang-${CUSTOM_LLVM_VERSION} ; \ + if [[ "${BUILD_CC_COMPILER}" == "clang" ]]; then \ + export BUILD_CC_COMPILER=clang-${CUSTOM_LLVM_VERSION} ; \ + echo "fix: BUILD_CC_COMPILER=clang-${CUSTOM_LLVM_VERSION}" ; \ + fi ; \ + fi ; \ + \ + apk add --no-cache --virtual .build-deps \ + ${DOCKER_PG_LLVM_DEPS} \ + bison \ + coreutils \ + curl \ + dpkg-dev dpkg \ + flex \ + g++ \ + gcc \ + krb5-dev \ + libc-dev \ + libedit-dev \ + libxml2-dev \ + libxslt-dev \ + linux-headers \ +# needed for s3 support + curl-dev \ + make \ + openldap-dev \ + openssl-dev \ +# configure: error: prove not found + perl-utils \ +# configure: error: Perl module IPC::Run is required to run TAP tests + perl-ipc-run \ + perl-dev \ + python3 \ + python3-dev \ + tcl-dev \ + util-linux-dev \ + zlib-dev \ + zstd-dev \ +# https://www.postgresql.org/docs/10/static/release-10.html#id-1.11.6.9.5.13 + icu-dev \ +# https://www.postgresql.org/docs/14/release-14.html#id-1.11.6.5.5.3.7 + lz4-dev \ + ; \ + \ + curl -o postgresql.tar.gz \ + --header "Accept: application/vnd.github.v3.raw" \ + --remote-name \ + --location https://github.com/orioledb/postgres/tarball/$PGTAG; \ + mkdir -p /usr/src/postgresql; \ + tar \ + --extract \ + --file postgresql.tar.gz \ + --directory /usr/src/postgresql \ + --strip-components 1 \ + ; \ + rm postgresql.tar.gz; \ + \ + cd /usr/src/postgresql; \ + \ + POSTGRESQL_VERSION=$(grep "PACKAGE_VERSION=" ./configure | cut -d"'" -f2) ; \ + echo "POSTGRESQL_VERSION=$POSTGRESQL_VERSION" ; \ + \ +# update "DEFAULT_PGSOCKET_DIR" to "/var/run/postgresql" (matching Debian) +# see https://anonscm.debian.org/git/pkg-postgresql/postgresql.git/tree/debian/patches/51-default-sockets-in-var.patch?id=8b539fcb3e093a521c095e70bdfa76887217b89f + awk '$1 == "#define" && $2 == "DEFAULT_PGSOCKET_DIR" && $3 == "\"/tmp\"" { $3 
= "\"/var/run/postgresql\""; print; next } { print }' src/include/pg_config_manual.h > src/include/pg_config_manual.h.new; \ + grep '/var/run/postgresql' src/include/pg_config_manual.h.new; \ + mv src/include/pg_config_manual.h.new src/include/pg_config_manual.h; \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ +# explicitly update autoconf config.guess and config.sub so they support more arches/libcs + cp /usr/src/postgresql/contrib/orioledb/docker/config.guess config/config.guess; \ + cp /usr/src/postgresql/contrib/orioledb/docker/config.sub config/config.sub; \ +# configure options taken from: +# https://anonscm.debian.org/cgit/pkg-postgresql/postgresql.git/tree/debian/rules?h=9.5 + ( CC=${BUILD_CC_COMPILER} ./configure \ + --build="$gnuArch" \ +# "/usr/src/postgresql/src/backend/access/common/tupconvert.c:105: undefined reference to `libintl_gettext'" +# --enable-nls \ + --enable-integer-datetimes \ + --enable-thread-safety \ + --enable-tap-tests \ +# skip debugging info -- we want tiny size instead +# --enable-debug \ + --disable-rpath \ + --with-uuid=e2fs \ + --with-gnu-ld \ + --with-pgport=5432 \ + --with-system-tzdata=/usr/share/zoneinfo \ + --prefix=/usr/local \ + --with-includes=/usr/local/include \ + --with-libraries=/usr/local/lib \ + --with-gssapi \ + --with-ldap \ + --with-tcl \ + --with-perl \ + --with-python \ +# --with-pam \ + --with-openssl \ + --with-libxml \ + --with-libxslt \ + --with-icu \ + --with-llvm \ + --with-lz4 \ + --with-zstd \ + # The "testgres" package expects the PostgreSQL version as the last word. + # Therefore, the extra ${POSTGRESQL_VERSION} is added as a workaround. 
+ --with-extra-version=" ${ORIOLEDB_VERSION} PGTAG=${PGTAG} alpine:${ALPINE_VERSION}+${BUILD_CC_COMPILER} build:${ORIOLEDB_BUILDTIME} ${POSTGRESQL_VERSION}" \ + || cat config.log ); \ + if printf "%s\n" "$PGTAG" | grep -Fqe "patches${PG_MAJOR}_"; then \ + echo "ORIOLEDB_PATCHSET_VERSION = `echo $PGTAG | cut -d'_' -f2`" >> src/Makefile.global; \ + else \ + echo "ORIOLEDB_PATCHSET_VERSION = $PGTAG" >> src/Makefile.global; \ + fi ; \ + # install postgresql + make -j "$(nproc)"; \ + make -C contrib -j "$(nproc)"; \ + make install; \ + make -C contrib install; \ + # install orioledb extension + cd /usr/src/postgresql/contrib/orioledb; \ + # Remove any stale generated SQL files that may have leaked into the + # build context, so the Makefile regenerates them with IS_DEV=1. + make USE_PGXS=1 clean; \ + make USE_PGXS=1 IS_DEV=1 -j "$(nproc)"; \ + make USE_PGXS=1 IS_DEV=1 install; \ + \ + # Clean up only if not in debug mode + if [ "$DEBUG_MODE" != "true" ]; then \ + runDeps="$( \ + scanelf --needed --nobanner --format '%n#p' --recursive /usr/local \ + | tr ',' '\n' \ + | sort -u \ + | awk 'system("[ -e /usr/local/lib/" $1 " ]") == 0 { next } { print "so:" $1 }' \ + # Remove plperl, plpython and pltcl dependencies by default to save image size + # To use the pl extensions, those have to be installed in a derived image + | grep -v -e perl -e python -e tcl \ + )"; \ + apk add --no-cache --virtual .postgresql-rundeps \ + $runDeps \ + ; \ + apk del --no-network .build-deps; \ + rm -rf \ + /usr/src/postgresql \ + /usr/local/share/doc \ + /usr/local/share/man \ + /tmp/* \ + ; \ + fi ; \ + \ + apk add --no-cache \ + bash \ + su-exec \ + # tzdata is optional, but only adds around 1Mb to image size and is recommended by Django documentation: + # https://docs.djangoproject.com/en/1.10/ref/databases/#optimizing-postgresql-s-configuration + tzdata \ + # install extra icu packages ( >=Alpine3.16 ) + $EXTRA_ICU_PACKAGES \ + ; \ + cd / ; \ + postgres --version ; \ + initdb --version + 
+ENV PGDATA=/var/lib/postgresql/data +# this 777 will be replaced by 700 at runtime (allows semi-arbitrary "--user" values) +RUN mkdir -p "$PGDATA" && chown -R postgres:postgres "$PGDATA" && chmod 777 "$PGDATA" +VOLUME /var/lib/postgresql/data + +RUN mkdir -p /etc/postgresql && chown -R postgres:postgres /etc/postgresql && chmod 700 /etc/postgresql +COPY --chown=postgres:postgres docker/init/postgresql.docker.conf /etc/postgresql/postgresql.conf +ENV PG_CONF=/etc/postgresql/postgresql.conf + +ENV POSTGRES_INITDB_ARGS="--locale-provider=icu --icu-locale=en" + +COPY docker/init/docker-entrypoint.sh /usr/local/bin/ +COPY docker/init/default-orioledb.sh /docker-default-initdb.d/ +ENTRYPOINT ["docker-entrypoint.sh"] + +# We set the default STOPSIGNAL to SIGINT, which corresponds to what PostgreSQL +# calls "Fast Shutdown mode" wherein new connections are disallowed and any +# in-progress transactions are aborted, allowing PostgreSQL to stop cleanly and +# flush tables to disk, which is the best compromise available to avoid data +# corruption. +# +# Users who know their applications do not keep open long-lived idle connections +# may way to use a value of SIGTERM instead, which corresponds to "Smart +# Shutdown mode" in which any existing sessions are allowed to finish and the +# server stops when all sessions are terminated. +# +# See https://www.postgresql.org/docs/12/server-shutdown.html for more details +# about available PostgreSQL server shutdown signals. +# +# See also https://www.postgresql.org/docs/12/server-start.html for further +# justification of this as the default value, namely that the example (and +# shipped) systemd service files use the "Fast Shutdown mode" for service +# termination. 
+# +STOPSIGNAL SIGINT +# +# An additional setting that is recommended for all users regardless of this +# value is the runtime "--stop-timeout" (or your orchestrator/runtime's +# equivalent) for controlling how long to wait between sending the defined +# STOPSIGNAL and sending SIGKILL (which is likely to cause data corruption). +# +# The default in most runtimes (such as Docker) is 10 seconds, and the +# documentation at https://www.postgresql.org/docs/12/server-start.html notes +# that even 90 seconds may not be long enough in many instances. + +EXPOSE 5432 +CMD ["postgres", "-D", "/etc/postgresql"] diff --git a/contrib/orioledb/docker/Dockerfile.ubuntu b/contrib/orioledb/docker/Dockerfile.ubuntu new file mode 100644 index 00000000000..9075044ec57 --- /dev/null +++ b/contrib/orioledb/docker/Dockerfile.ubuntu @@ -0,0 +1,318 @@ +# This is modified Dockerfile from 16/bookworm in +# https://github.com/docker-library/postgres + +# Set UBUNTU_VERSION = [ devel 25.04 24.10 24.04 22.04 ] +# or [ devel plucky oracular noble jammy ] + +ARG UBUNTU_VERSION=noble +FROM ubuntu:${UBUNTU_VERSION} + +ARG UBUNTU_VERSION + +# Set PG_MAJOR = [ 17 16 ] +ARG PG_MAJOR=17 +ENV PG_MAJOR=${PG_MAJOR} + +# set compiler: [ clang gcc ] +ARG BUILD_CC_COMPILER=clang +ENV BUILD_CC_COMPILER=${BUILD_CC_COMPILER} + +# Enable debug mode and preserve the build environments for debugging. +# In this case, each image size exceeds ~1GB +ARG DEBUG_MODE=false +ENV DEBUG_MODE=${DEBUG_MODE} + +# Define build dependencies for LLVM [ llvm-dev clang ] +# These include the specific versions of 'llvm-dev' and 'clang' suitable for the current version of PostgreSQL. 
+# Reference: https://github.com/docker-library/postgres/pull/1077 +ARG DOCKER_PG_LLVM_DEPS="llvm-dev clang" +ENV DOCKER_PG_LLVM_DEPS=${DOCKER_PG_LLVM_DEPS} + +# explicitly set user/group IDs +RUN set -eux; \ + groupadd -r postgres --gid=999; \ +# https://salsa.debian.org/postgresql/postgresql-common/blob/997d842ee744687d99a2b2d95c1083a2615c79e8/debian/postgresql-common.postinst#L32-35 + useradd -r -g postgres --uid=999 --home-dir=/var/lib/postgresql --shell=/bin/bash postgres; \ +# also create the postgres user's home directory with appropriate permissions +# see https://github.com/docker-library/postgres/issues/274 + mkdir -p /var/lib/postgresql; \ + chown -R postgres:postgres /var/lib/postgresql + +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get full-upgrade -y; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + gnupg \ + locales \ + tzdata \ + \ + # Ensures compatibility with the official PostgreSQL Docker image + libnss-wrapper \ + xz-utils \ + zstd \ + ; \ + echo 'en_US.UTF-8 UTF-8' >> /etc/locale.gen; \ + locale-gen; \ + locale -a | grep 'en_US.utf8' ; \ + \ + rm -rf /var/lib/apt/lists/* ; \ + apt-get clean + +# make the "en_US.UTF-8" locale so postgres will be utf-8 enabled by default +ENV LANG=en_US.utf8 + +# grab gosu for easy step-down from root +# https://github.com/tianon/gosu/releases +ENV GOSU_VERSION=1.19 +RUN set -eux; \ + savedAptMark="$(apt-mark showmanual)"; \ + apt-get update; \ + apt-get install -y --no-install-recommends ca-certificates wget; \ + rm -rf /var/lib/apt/lists/*; \ + dpkgArch="$(dpkg --print-architecture | awk -F- '{ print $NF }')"; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$dpkgArch"; \ + wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$dpkgArch.asc"; \ + export GNUPGHOME="$(mktemp -d)"; \ + gpg --batch --keyserver hkps://keys.openpgp.org 
--recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4; \ + gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" /usr/local/bin/gosu.asc; \ + apt-mark auto '.*' > /dev/null; \ + [ -z "$savedAptMark" ] || apt-mark manual $savedAptMark > /dev/null; \ + apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false; \ + chmod +x /usr/local/bin/gosu; \ + gosu --version; \ + gosu nobody true + +RUN mkdir -p /usr/src/postgresql/contrib/orioledb + +COPY . /usr/src/postgresql/contrib/orioledb + +RUN mkdir /docker-entrypoint-initdb.d /docker-default-initdb.d + +RUN mkdir -p /var/run/postgresql && chown -R postgres:postgres /var/run/postgresql && chmod 2777 /var/run/postgresql + +ENV PATH=$PATH:/usr/lib/postgresql/$PG_MAJOR/bin +RUN set -eux; \ + \ + PGTAG=$(grep "^$PG_MAJOR: " /usr/src/postgresql/contrib/orioledb/.pgtags | cut -d' ' -f2-) ; \ + ORIOLEDB_VERSION=$(grep "^#define ORIOLEDB_VERSION" /usr/src/postgresql/contrib/orioledb/include/orioledb.h | cut -d'"' -f2) ; \ + ORIOLEDB_BUILDTIME=$(date -Iseconds) ; \ + \ + echo "PG_MAJOR=$PG_MAJOR" ; \ + echo "PGTAG=$PGTAG" ; \ + echo "BUILD_CC_COMPILER=$BUILD_CC_COMPILER" ; \ + echo "ORIOLEDB_VERSION=$ORIOLEDB_VERSION" ; \ + echo "ORIOLEDB_BUILDTIME=$ORIOLEDB_BUILDTIME" ; \ + echo "DOCKER_PG_LLVM_DEPS=$DOCKER_PG_LLVM_DEPS" ; \ + echo "DEBUG_MODE=$DEBUG_MODE" ; \ + \ + LLVM_RUNTIME_DEPS=$(echo "$DOCKER_PG_LLVM_DEPS" | grep -o 'llvm[0-9]*') ; \ + echo "LLVM_RUNTIME_DEPS=$LLVM_RUNTIME_DEPS" ; \ + \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get -y install --no-install-recommends \ + build-essential \ + ${LLVM_RUNTIME_DEPS} \ + ${DOCKER_PG_LLVM_DEPS} \ + bison \ + curl \ + flex \ + gdb \ + git \ + libcurl4-openssl-dev \ + libicu-dev \ + libipc-run-perl \ + libkrb5-dev \ + libldap-dev \ + liblz4-1 \ + liblz4-dev \ + libperl-dev \ + libssl-dev \ + libreadline-dev \ + libuuid1 \ + libxml2 \ + libxml2-dev \ + libxslt1.1 \ + libxslt1-dev \ + libzstd1 
\ + libzstd-dev \ + make \ + pkg-config \ + python3 \ + python3-dev \ + python3-pip \ + python3-setuptools \ + python3-testresources \ + tcl-dev \ + uuid-dev \ + wget \ + ; \ + \ + curl -o postgresql.tar.gz \ + --header "Accept: application/vnd.github.v3.raw" \ + --remote-name \ + --location https://github.com/orioledb/postgres/tarball/$PGTAG; \ + mkdir -p /usr/src/postgresql; \ + tar \ + --extract \ + --file postgresql.tar.gz \ + --directory /usr/src/postgresql \ + --strip-components 1 \ + ; \ + rm postgresql.tar.gz; \ + \ + cd /usr/src/postgresql; \ + \ + POSTGRESQL_VERSION=$(grep "PACKAGE_VERSION=" ./configure | cut -d"'" -f2) ; \ + echo "POSTGRESQL_VERSION=$POSTGRESQL_VERSION" ; \ + \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ +# explicitly update autoconf config.guess and config.sub so they support more arches/libcs + cp /usr/src/postgresql/contrib/orioledb/docker/config.guess config/config.guess; \ + cp /usr/src/postgresql/contrib/orioledb/docker/config.sub config/config.sub; \ +# configure options taken from: +# https://anonscm.debian.org/cgit/pkg-postgresql/postgresql.git/tree/debian/rules?h=9.5 + ( CC=${BUILD_CC_COMPILER} ./configure \ + --build="$gnuArch" \ +# "/usr/src/postgresql/src/backend/access/common/tupconvert.c:105: undefined reference to `libintl_gettext'" +# --enable-nls \ + --enable-integer-datetimes \ + --enable-thread-safety \ + --enable-tap-tests \ +# skip debugging info -- we want tiny size instead +# --enable-debug \ + --disable-rpath \ + --with-uuid=e2fs \ + --with-gnu-ld \ + --with-pgport=5432 \ + --with-system-tzdata=/usr/share/zoneinfo \ + --prefix=/usr/local \ + --with-includes=/usr/local/include \ + --with-libraries=/usr/local/lib \ + --with-krb5 \ + --with-gssapi \ + --with-ldap \ + --with-tcl \ + --with-perl \ + --with-python \ +# --with-pam \ + --with-openssl \ + --with-libxml \ + --with-libxslt \ + --with-icu \ + --with-llvm \ + --with-lz4 \ + --with-zstd \ + # The "testgres" package expects the PostgreSQL 
version as the last word. + # Therefore, the extra ${POSTGRESQL_VERSION} is added as a workaround. + --with-extra-version=" ${ORIOLEDB_VERSION} PGTAG=${PGTAG} ubuntu:${UBUNTU_VERSION}+${BUILD_CC_COMPILER} build:${ORIOLEDB_BUILDTIME} ${POSTGRESQL_VERSION}" \ + || cat config.log ); \ + if printf "%s\n" "$PGTAG" | grep -Fqe "patches${PG_MAJOR}_"; then \ + echo "ORIOLEDB_PATCHSET_VERSION = `echo $PGTAG | cut -d'_' -f2`" >> src/Makefile.global; \ + else \ + echo "ORIOLEDB_PATCHSET_VERSION = $PGTAG" >> src/Makefile.global; \ + fi ; \ + # install postgresql + make -j "$(nproc)"; \ + make -C contrib -j "$(nproc)"; \ + make install; \ + make -C contrib install; \ + # install orioledb extension + cd /usr/src/postgresql/contrib/orioledb; \ + # Remove any stale generated SQL files that may have leaked into the + # build context, so the Makefile regenerates them with IS_DEV=1. + make USE_PGXS=1 clean; \ + make USE_PGXS=1 IS_DEV=1 -j "$(nproc)"; \ + make USE_PGXS=1 IS_DEV=1 install; \ + \ + # Clean up only if not in debug mode + if [ "$DEBUG_MODE" != "true" ]; then \ + apt-get -y remove \ + ${DOCKER_PG_LLVM_DEPS} \ + bison \ + build-essential \ + curl \ + flex \ + gdb \ + git \ + libicu-dev \ + libipc-run-perl \ + liblz4-dev \ + libreadline-dev \ + libxml2-dev \ + libxslt1-dev \ + libzstd-dev \ + make \ + pkg-config \ + python3-dev \ + python3-pip \ + uuid-dev \ + wget \ + ; \ + apt-get -y autoremove; \ + rm -rf /var/cache/apt/archives /var/lib/apt/lists/*; \ + apt-get clean; \ + rm -rf \ + /usr/src/postgresql \ + /usr/local/share/doc \ + /usr/local/share/man \ + /tmp/* \ + ; \ + fi ; \ + \ + cd /; \ + # Verify PostgreSQL installation + ldconfig ; \ + postgres --version ; \ + initdb --version + +ENV PGDATA=/var/lib/postgresql/data +# this 777 will be replaced by 700 at runtime (allows semi-arbitrary "--user" values) +RUN mkdir -p "$PGDATA" && chown -R postgres:postgres "$PGDATA" && chmod 777 "$PGDATA" +VOLUME /var/lib/postgresql/data + +RUN mkdir -p /etc/postgresql && chown -R 
postgres:postgres /etc/postgresql && chmod 700 /etc/postgresql +COPY --chown=postgres:postgres docker/init/postgresql.docker.conf /etc/postgresql/postgresql.conf +ENV PG_CONF=/etc/postgresql/postgresql.conf + +ENV POSTGRES_INITDB_ARGS="--locale-provider=icu --icu-locale=en" + +COPY docker/init/docker-entrypoint.sh /usr/local/bin/ +COPY docker/init/default-orioledb.sh /docker-default-initdb.d/ +RUN sed -i -e 's/su-exec/gosu/g' "/usr/local/bin/docker-entrypoint.sh" +ENTRYPOINT ["docker-entrypoint.sh"] + +# We set the default STOPSIGNAL to SIGINT, which corresponds to what PostgreSQL +# calls "Fast Shutdown mode" wherein new connections are disallowed and any +# in-progress transactions are aborted, allowing PostgreSQL to stop cleanly and +# flush tables to disk, which is the best compromise available to avoid data +# corruption. +# +# Users who know their applications do not keep open long-lived idle connections +# may want to use a value of SIGTERM instead, which corresponds to "Smart +# Shutdown mode" in which any existing sessions are allowed to finish and the +# server stops when all sessions are terminated. +# +# See https://www.postgresql.org/docs/12/server-shutdown.html for more details +# about available PostgreSQL server shutdown signals. +# +# See also https://www.postgresql.org/docs/12/server-start.html for further +# justification of this as the default value, namely that the example (and +# shipped) systemd service files use the "Fast Shutdown mode" for service +# termination. +# +STOPSIGNAL SIGINT +# +# An additional setting that is recommended for all users regardless of this +# value is the runtime "--stop-timeout" (or your orchestrator/runtime's +# equivalent) for controlling how long to wait between sending the defined +# STOPSIGNAL and sending SIGKILL (which is likely to cause data corruption).
+# +# The default in most runtimes (such as Docker) is 10 seconds, and the +# documentation at https://www.postgresql.org/docs/12/server-start.html notes +# that even 90 seconds may not be long enough in many instances. + +EXPOSE 5432 +CMD ["postgres", "-D", "/etc/postgresql"] diff --git a/contrib/orioledb/docker/README.md b/contrib/orioledb/docker/README.md new file mode 100644 index 00000000000..b4870841266 --- /dev/null +++ b/contrib/orioledb/docker/README.md @@ -0,0 +1,50 @@ +# Testing OrioleDB Docker images + +Running the Docker Official Images test suite against OrioleDB images; +see "Docker Official Images Test Suite": +* https://github.com/docker-library/official-images/tree/master/test + +Used by: +* `./ci/docker_matrix.sh` +* `./.github/workflows/dockertest.yml` +* TODO: add to `./.github/workflows/docker.yml` + +## Running docker test suite + +```bash +# clone official-images test suite +OFFIMG_LOCAL_CLONE=./log_docker_build/official-images +OFFIMG_REPO_URL=https://github.com/docker-library/official-images.git +mkdir -p "$OFFIMG_LOCAL_CLONE" +git clone "$OFFIMG_REPO_URL" "$OFFIMG_LOCAL_CLONE" + +"${OFFIMG_LOCAL_CLONE}/test/run.sh" \ + -c "${OFFIMG_LOCAL_CLONE}/test/config.sh" \ + -c "docker/orioledb-config.sh" \ + "orioletest:17-gcc-ubuntu-24.04" +``` + +If all tests pass, the output looks like this: + +```bash +testing orioletest:17-gcc-ubuntu-24.04 + 'utc' [1/6]...passed + 'no-hard-coded-passwords' [2/6]...passed + 'override-cmd' [3/6]...passed + 'postgres-basics' [4/6]....passed + 'postgres-initdb' [5/6]....passed + 'orioledb-basics' [6/6]...passed +``` + +## test: postgres-basics + +https://github.com/docker-library/official-images/blob/master/test/tests/postgres-basics/run.sh + +## test: postgres-initdb + +https://github.com/docker-library/official-images/blob/master/test/tests/postgres-initdb/run.sh +https://github.com/docker-library/official-images/blob/master/test/tests/postgres-initdb/initdb.sql + +## test: orioledb-basics + +* `./tests/orioledb-basics/run.sh` diff
--git a/contrib/orioledb/docker/config.guess b/contrib/orioledb/docker/config.guess new file mode 100644 index 00000000000..48a684601bd --- /dev/null +++ b/contrib/orioledb/docker/config.guess @@ -0,0 +1,1815 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright 1992-2024 Free Software Foundation, Inc. + +# shellcheck disable=SC2006,SC2268 # see below for rationale + +timestamp='2024-07-27' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <https://www.gnu.org/licenses/>. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). +# +# Originally written by Per Bothner; maintained since 2000 by Ben Elliston. +# +# You can get the latest version of this script from: +# https://git.savannah.gnu.org/cgit/config.git/plain/config.guess +# +# Please send patches to <config-patches@gnu.org>. + + +# The "shellcheck disable" line above the timestamp inhibits complaints +# about features and limitations of the classic Bourne shell that were +# superseded or lifted in POSIX.
However, this script identifies a wide +# variety of pre-POSIX systems that do not have POSIX shells at all, and +# even some reasonably current systems (Solaris 10 as case-in-point) still +# have a pre-POSIX /bin/sh. + + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system '$me' is run on. + +Options: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to <config-patches@gnu.org>." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright 1992-2024 Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try '$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +# Just in case it came from the environment. +GUESS= + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, 'CC_FOR_BUILD' used to be named 'HOST_CC'. We still +# use 'HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team.
+ +tmp= +# shellcheck disable=SC2172 +trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15 + +set_cc_for_build() { + # prevent multiple calls if $tmp is already set + test "$tmp" && return 0 + : "${TMPDIR=/tmp}" + # shellcheck disable=SC2039,SC3028 + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } + dummy=$tmp/dummy + case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in + ,,) echo "int x;" > "$dummy.c" + for driver in cc gcc c17 c99 c89 ; do + if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then + CC_FOR_BUILD=$driver + break + fi + done + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; + esac +} + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. +# (ghazi@noc.rutgers.edu 1994-08-24) +if test -f /.attbin/uname ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +case $UNAME_SYSTEM in +Linux|GNU|GNU/*) + LIBC=unknown + + set_cc_for_build + cat <<-EOF > "$dummy.c" + #if defined(__ANDROID__) + LIBC=android + #else + #include <features.h> + #if defined(__UCLIBC__) + LIBC=uclibc + #elif defined(__dietlibc__) + LIBC=dietlibc + #elif defined(__GLIBC__) + LIBC=gnu + #elif defined(__LLVM_LIBC__) + LIBC=llvm + #else + #include <stdarg.h> + /* First heuristic to detect musl libc.
*/ + #ifdef __DEFINED_va_list + LIBC=musl + #endif + #endif + #endif + EOF + cc_set_libc=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` + eval "$cc_set_libc" + + # Second heuristic to detect musl libc. + if [ "$LIBC" = unknown ] && + command -v ldd >/dev/null && + ldd --version 2>&1 | grep -q ^musl; then + LIBC=musl + fi + + # If the system lacks a compiler, then just pick glibc. + # We could probably try harder. + if [ "$LIBC" = unknown ]; then + LIBC=gnu + fi + ;; +esac + +# Note: order is significant - the case branches are not exclusive. + +case $UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \ + /sbin/sysctl -n hw.machine_arch 2>/dev/null || \ + /usr/sbin/sysctl -n hw.machine_arch 2>/dev/null || \ + echo unknown)` + case $UNAME_MACHINE_ARCH in + aarch64eb) machine=aarch64_be-unknown ;; + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + sh5el) machine=sh5le-unknown ;; + earmv*) + arch=`echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,'` + endian=`echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p'` + machine=${arch}${endian}-unknown + ;; + *) machine=$UNAME_MACHINE_ARCH-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently (or will in the future) and ABI. 
+ case $UNAME_MACHINE_ARCH in + earm*) + os=netbsdelf + ;; + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ELF__ + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? + os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # Determine ABI tags. + case $UNAME_MACHINE_ARCH in + earm*) + expr='s/^earmv[0-9]/-eabi/;s/eb$//' + abi=`echo "$UNAME_MACHINE_ARCH" | sed -e "$expr"` + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case $UNAME_VERSION in + Debian*) + release='-gnu' + ;; + *) + release=`echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
+ GUESS=$machine-${os}${release}${abi-} + ;; + *:Bitrig:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` + GUESS=$UNAME_MACHINE_ARCH-unknown-bitrig$UNAME_RELEASE + ;; + *:OpenBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` + GUESS=$UNAME_MACHINE_ARCH-unknown-openbsd$UNAME_RELEASE + ;; + *:SecBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/SecBSD.//'` + GUESS=$UNAME_MACHINE_ARCH-unknown-secbsd$UNAME_RELEASE + ;; + *:LibertyBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` + GUESS=$UNAME_MACHINE_ARCH-unknown-libertybsd$UNAME_RELEASE + ;; + *:MidnightBSD:*:*) + GUESS=$UNAME_MACHINE-unknown-midnightbsd$UNAME_RELEASE + ;; + *:ekkoBSD:*:*) + GUESS=$UNAME_MACHINE-unknown-ekkobsd$UNAME_RELEASE + ;; + *:SolidBSD:*:*) + GUESS=$UNAME_MACHINE-unknown-solidbsd$UNAME_RELEASE + ;; + *:OS108:*:*) + GUESS=$UNAME_MACHINE-unknown-os108_$UNAME_RELEASE + ;; + macppc:MirBSD:*:*) + GUESS=powerpc-unknown-mirbsd$UNAME_RELEASE + ;; + *:MirBSD:*:*) + GUESS=$UNAME_MACHINE-unknown-mirbsd$UNAME_RELEASE + ;; + *:Sortix:*:*) + GUESS=$UNAME_MACHINE-unknown-sortix + ;; + *:Twizzler:*:*) + GUESS=$UNAME_MACHINE-unknown-twizzler + ;; + *:Redox:*:*) + GUESS=$UNAME_MACHINE-unknown-redox + ;; + mips:OSF1:*.*) + GUESS=mips-dec-osf1 + ;; + alpha:OSF1:*:*) + # Reset EXIT trap before exiting to avoid spurious non-zero exit code. + trap '' 0 + case $UNAME_RELEASE in + *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + ;; + *5.*) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + ;; + esac + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. 
+ ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case $ALPHA_CPU_TYPE in + "EV4 (21064)") + UNAME_MACHINE=alpha ;; + "EV4.5 (21064)") + UNAME_MACHINE=alpha ;; + "LCA4 (21066/21068)") + UNAME_MACHINE=alpha ;; + "EV5 (21164)") + UNAME_MACHINE=alphaev5 ;; + "EV5.6 (21164A)") + UNAME_MACHINE=alphaev56 ;; + "EV5.6 (21164PC)") + UNAME_MACHINE=alphapca56 ;; + "EV5.7 (21164PC)") + UNAME_MACHINE=alphapca57 ;; + "EV6 (21264)") + UNAME_MACHINE=alphaev6 ;; + "EV6.7 (21264A)") + UNAME_MACHINE=alphaev67 ;; + "EV6.8CB (21264C)") + UNAME_MACHINE=alphaev68 ;; + "EV6.8AL (21264B)") + UNAME_MACHINE=alphaev68 ;; + "EV6.8CX (21264D)") + UNAME_MACHINE=alphaev68 ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE=alphaev69 ;; + "EV7 (21364)") + UNAME_MACHINE=alphaev7 ;; + "EV7.9 (21364A)") + UNAME_MACHINE=alphaev79 ;; + esac + # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + OSF_REL=`echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + GUESS=$UNAME_MACHINE-dec-osf$OSF_REL + ;; + Amiga*:UNIX_System_V:4.0:*) + GUESS=m68k-unknown-sysv4 + ;; + *:[Aa]miga[Oo][Ss]:*:*) + GUESS=$UNAME_MACHINE-unknown-amigaos + ;; + *:[Mm]orph[Oo][Ss]:*:*) + GUESS=$UNAME_MACHINE-unknown-morphos + ;; + *:OS/390:*:*) + GUESS=i370-ibm-openedition + ;; + *:z/VM:*:*) + GUESS=s390-ibm-zvmoe + ;; + *:OS400:*:*) + GUESS=powerpc-ibm-os400 + ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + GUESS=arm-acorn-riscix$UNAME_RELEASE + ;; + arm*:riscos:*:*|arm*:RISCOS:*:*) + GUESS=arm-unknown-riscos + ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + GUESS=hppa1.1-hitachi-hiuxmpp + ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. 
+ case `(/bin/universe) 2>/dev/null` in + att) GUESS=pyramid-pyramid-sysv3 ;; + *) GUESS=pyramid-pyramid-bsd ;; + esac + ;; + NILE*:*:*:dcosx) + GUESS=pyramid-pyramid-svr4 + ;; + DRS?6000:unix:4.0:6*) + GUESS=sparc-icl-nx6 + ;; + DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) GUESS=sparc-icl-nx7 ;; + esac + ;; + s390x:SunOS:*:*) + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=$UNAME_MACHINE-ibm-solaris2$SUN_REL + ;; + sun4H:SunOS:5.*:*) + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=sparc-hal-solaris2$SUN_REL + ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=sparc-sun-solaris2$SUN_REL + ;; + i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) + GUESS=i386-pc-auroraux$UNAME_RELEASE + ;; + i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) + set_cc_for_build + SUN_ARCH=i386 + # If there is a compiler, see if it is configured for 64-bit objects. + # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. + # This test works for both compilers. + if test "$CC_FOR_BUILD" != no_compiler_found; then + if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -m64 -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + SUN_ARCH=x86_64 + fi + fi + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=$SUN_ARCH-pc-solaris2$SUN_REL + ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=sparc-sun-solaris3$SUN_REL + ;; + sun4*:SunOS:*:*) + case `/usr/bin/arch -k` in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like '4.1.3-JL'. 
+ SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/'` + GUESS=sparc-sun-sunos$SUN_REL + ;; + sun3*:SunOS:*:*) + GUESS=m68k-sun-sunos$UNAME_RELEASE + ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3 + case `/bin/arch` in + sun3) + GUESS=m68k-sun-sunos$UNAME_RELEASE + ;; + sun4) + GUESS=sparc-sun-sunos$UNAME_RELEASE + ;; + esac + ;; + aushp:SunOS:*:*) + GUESS=sparc-auspex-sunos$UNAME_RELEASE + ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + GUESS=m68k-atari-mint$UNAME_RELEASE + ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + GUESS=m68k-atari-mint$UNAME_RELEASE + ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + GUESS=m68k-atari-mint$UNAME_RELEASE + ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + GUESS=m68k-milan-mint$UNAME_RELEASE + ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + GUESS=m68k-hades-mint$UNAME_RELEASE + ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + GUESS=m68k-unknown-mint$UNAME_RELEASE + ;; + m68k:machten:*:*) + GUESS=m68k-apple-machten$UNAME_RELEASE + ;; + powerpc:machten:*:*) + GUESS=powerpc-apple-machten$UNAME_RELEASE + ;; + RISC*:Mach:*:*) + GUESS=mips-dec-mach_bsd4.3 + ;; + RISC*:ULTRIX:*:*) + GUESS=mips-dec-ultrix$UNAME_RELEASE + ;; + VAX*:ULTRIX*:*:*) + GUESS=vax-dec-ultrix$UNAME_RELEASE + ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + GUESS=clipper-intergraph-clix$UNAME_RELEASE + ;; + mips:*:*:UMIPS | 
mips:*:*:RISCos) + set_cc_for_build + sed 's/^ //' << EOF > "$dummy.c" +#ifdef __cplusplus +#include /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o "$dummy" "$dummy.c" && + dummyarg=`echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p'` && + SYSTEM_NAME=`"$dummy" "$dummyarg"` && + { echo "$SYSTEM_NAME"; exit; } + GUESS=mips-mips-riscos$UNAME_RELEASE + ;; + Motorola:PowerMAX_OS:*:*) + GUESS=powerpc-motorola-powermax + ;; + Motorola:*:4.3:PL8-*) + GUESS=powerpc-harris-powermax + ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + GUESS=powerpc-harris-powermax + ;; + Night_Hawk:Power_UNIX:*:*) + GUESS=powerpc-harris-powerunix + ;; + m88k:CX/UX:7*:*) + GUESS=m88k-harris-cxux7 + ;; + m88k:*:4*:R4*) + GUESS=m88k-motorola-sysv4 + ;; + m88k:*:3*:R3*) + GUESS=m88k-motorola-sysv3 + ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if test "$UNAME_PROCESSOR" = mc88100 || test "$UNAME_PROCESSOR" = mc88110 + then + if test "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx || \ + test "$TARGET_BINARY_INTERFACE"x = x + then + GUESS=m88k-dg-dgux$UNAME_RELEASE + else + GUESS=m88k-dg-dguxbcs$UNAME_RELEASE + fi + else + GUESS=i586-dg-dgux$UNAME_RELEASE + fi + ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + GUESS=m88k-dolphin-sysv3 + ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + GUESS=m88k-motorola-sysv3 + ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + GUESS=m88k-tektronix-sysv3 + ;; + Tek43[0-9][0-9]:UTek:*:*) # 
Tektronix 4300 system running UTek (BSD) + GUESS=m68k-tektronix-bsd + ;; + *:IRIX*:*:*) + IRIX_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/g'` + GUESS=mips-sgi-irix$IRIX_REL + ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + GUESS=romp-ibm-aix # uname -m gives an 8 hex-code CPU id + ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + GUESS=i386-ibm-aix + ;; + ia64:AIX:*:*) + if test -x /usr/bin/oslevel ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=$UNAME_VERSION.$UNAME_RELEASE + fi + GUESS=$UNAME_MACHINE-ibm-aix$IBM_REV + ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + set_cc_for_build + sed 's/^ //' << EOF > "$dummy.c" + #include + + int + main () + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` + then + GUESS=$SYSTEM_NAME + else + GUESS=rs6000-ibm-aix3.2.5 + fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + GUESS=rs6000-ibm-aix3.2.4 + else + GUESS=rs6000-ibm-aix3.2 + fi + ;; + *:AIX:*:[4567]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if test -x /usr/bin/lslpp ; then + IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | \ + awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` + else + IBM_REV=$UNAME_VERSION.$UNAME_RELEASE + fi + GUESS=$IBM_ARCH-ibm-aix$IBM_REV + ;; + *:AIX:*:*) + GUESS=rs6000-ibm-aix + ;; + ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*) + GUESS=romp-ibm-bsd4.4 + ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + GUESS=romp-ibm-bsd$UNAME_RELEASE # 4.3 with uname added to + ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + GUESS=rs6000-bull-bosx + ;; + DPX/2?00:B.O.S.:*:*) + GUESS=m68k-bull-sysv3 + ;; + 9000/[34]??:4.3bsd:1.*:*) + GUESS=m68k-hp-bsd + ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + 
GUESS=m68k-hp-bsd4.4 + ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'` + case $UNAME_MACHINE in + 9000/31?) HP_ARCH=m68000 ;; + 9000/[34]??) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if test -x /usr/bin/getconf; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case $sc_cpu_version in + 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0 + 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case $sc_kernel_bits in + 32) HP_ARCH=hppa2.0n ;; + 64) HP_ARCH=hppa2.0w ;; + '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20 + esac ;; + esac + fi + if test "$HP_ARCH" = ""; then + set_cc_for_build + sed 's/^ //' << EOF > "$dummy.c" + + #define _HPUX_SOURCE + #include + #include + + int + main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=`"$dummy"` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if test "$HP_ARCH" = hppa2.0w + then + set_cc_for_build + + # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating + # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler + # generating 64-bit code. 
GNU and HP use different nomenclature: + # + # $ CC_FOR_BUILD=cc ./config.guess + # => hppa2.0w-hp-hpux11.23 + # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess + # => hppa64-hp-hpux11.23 + + if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | + grep -q __LP64__ + then + HP_ARCH=hppa2.0w + else + HP_ARCH=hppa64 + fi + fi + GUESS=$HP_ARCH-hp-hpux$HPUX_REV + ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'` + GUESS=ia64-hp-hpux$HPUX_REV + ;; + 3050*:HI-UX:*:*) + set_cc_for_build + sed 's/^ //' << EOF > "$dummy.c" + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. */ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` && + { echo "$SYSTEM_NAME"; exit; } + GUESS=unknown-hitachi-hiuxwe2 + ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*) + GUESS=hppa1.1-hp-bsd + ;; + 9000/8??:4.3bsd:*:*) + GUESS=hppa1.0-hp-bsd + ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + GUESS=hppa1.0-hp-mpeix + ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*) + GUESS=hppa1.1-hp-osf + ;; + hp8??:OSF1:*:*) + GUESS=hppa1.0-hp-osf + ;; + i*86:OSF1:*:*) + if test -x /usr/sbin/sysversion ; then + GUESS=$UNAME_MACHINE-unknown-osf1mk + else + GUESS=$UNAME_MACHINE-unknown-osf1 + fi + ;; + parisc*:Lites*:*:*) + GUESS=hppa1.1-hp-lites + ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + GUESS=c1-convex-bsd + ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc 
+ then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + GUESS=c34-convex-bsd + ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + GUESS=c38-convex-bsd + ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + GUESS=c4-convex-bsd + ;; + CRAY*Y-MP:*:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=ymp-cray-unicos$CRAY_REL + ;; + CRAY*[A-Z]90:*:*:*) + echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*TS:*:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=t90-cray-unicos$CRAY_REL + ;; + CRAY*T3E:*:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=alphaev5-cray-unicosmk$CRAY_REL + ;; + CRAY*SV1:*:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=sv1-cray-unicos$CRAY_REL + ;; + *:UNICOS/mp:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=craynv-cray-unicosmp$CRAY_REL + ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo "$UNAME_RELEASE" | sed -e 's/ /_/'` + GUESS=${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL} + ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'` + GUESS=sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL} + ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + GUESS=$UNAME_MACHINE-pc-bsdi$UNAME_RELEASE + ;; + sparc*:BSD/OS:*:*) + GUESS=sparc-unknown-bsdi$UNAME_RELEASE + ;; + *:BSD/OS:*:*) + 
GUESS=$UNAME_MACHINE-unknown-bsdi$UNAME_RELEASE + ;; + arm:FreeBSD:*:*) + UNAME_PROCESSOR=`uname -p` + set_cc_for_build + if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_PCS_VFP + then + FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabi + else + FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabihf + fi + ;; + *:FreeBSD:*:*) + UNAME_PROCESSOR=`uname -p` + case $UNAME_PROCESSOR in + amd64) + UNAME_PROCESSOR=x86_64 ;; + i386) + UNAME_PROCESSOR=i586 ;; + esac + FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL + ;; + i*:CYGWIN*:*) + GUESS=$UNAME_MACHINE-pc-cygwin + ;; + *:MINGW64*:*) + GUESS=$UNAME_MACHINE-pc-mingw64 + ;; + *:MINGW*:*) + GUESS=$UNAME_MACHINE-pc-mingw32 + ;; + *:MSYS*:*) + GUESS=$UNAME_MACHINE-pc-msys + ;; + i*:PW*:*) + GUESS=$UNAME_MACHINE-pc-pw32 + ;; + *:SerenityOS:*:*) + GUESS=$UNAME_MACHINE-pc-serenity + ;; + *:Interix*:*) + case $UNAME_MACHINE in + x86) + GUESS=i586-pc-interix$UNAME_RELEASE + ;; + authenticamd | genuineintel | EM64T) + GUESS=x86_64-unknown-interix$UNAME_RELEASE + ;; + IA64) + GUESS=ia64-unknown-interix$UNAME_RELEASE + ;; + esac ;; + i*:UWIN*:*) + GUESS=$UNAME_MACHINE-pc-uwin + ;; + amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) + GUESS=x86_64-pc-cygwin + ;; + prep*:SunOS:5.*:*) + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=powerpcle-unknown-solaris2$SUN_REL + ;; + *:GNU:*:*) + # the GNU system + GNU_ARCH=`echo "$UNAME_MACHINE" | sed -e 's,[-/].*$,,'` + GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's,/.*$,,'` + GUESS=$GNU_ARCH-unknown-$LIBC$GNU_REL + ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + GNU_SYS=`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"` + GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_MACHINE-unknown-$GNU_SYS$GNU_REL-$LIBC + ;; 
+ x86_64:[Mm]anagarm:*:*|i?86:[Mm]anagarm:*:*) + GUESS="$UNAME_MACHINE-pc-managarm-mlibc" + ;; + *:[Mm]anagarm:*:*) + GUESS="$UNAME_MACHINE-unknown-managarm-mlibc" + ;; + *:Minix:*:*) + GUESS=$UNAME_MACHINE-unknown-minix + ;; + aarch64:Linux:*:*) + set_cc_for_build + CPU=$UNAME_MACHINE + LIBCABI=$LIBC + if test "$CC_FOR_BUILD" != no_compiler_found; then + ABI=64 + sed 's/^ //' << EOF > "$dummy.c" + #ifdef __ARM_EABI__ + #ifdef __ARM_PCS_VFP + ABI=eabihf + #else + ABI=eabi + #endif + #endif +EOF + cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'` + eval "$cc_set_abi" + case $ABI in + eabi | eabihf) CPU=armv8l; LIBCABI=$LIBC$ABI ;; + esac + fi + GUESS=$CPU-unknown-linux-$LIBCABI + ;; + aarch64_be:Linux:*:*) + UNAME_MACHINE=aarch64_be + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep -q ld.so.1 + if test "$?" 
= 0 ; then LIBC=gnulibc1 ; fi + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + arc:Linux:*:* | arceb:Linux:*:* | arc32:Linux:*:* | arc64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + arm*:Linux:*:*) + set_cc_for_build + if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_EABI__ + then + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + else + if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_PCS_VFP + then + GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabi + else + GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabihf + fi + fi + ;; + avr32*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + cris:Linux:*:*) + GUESS=$UNAME_MACHINE-axis-linux-$LIBC + ;; + crisv32:Linux:*:*) + GUESS=$UNAME_MACHINE-axis-linux-$LIBC + ;; + e2k:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + frv:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + hexagon:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + i*86:Linux:*:*) + GUESS=$UNAME_MACHINE-pc-linux-$LIBC + ;; + ia64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + k1om:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + kvx:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + kvx:cos:*:*) + GUESS=$UNAME_MACHINE-unknown-cos + ;; + kvx:mbr:*:*) + GUESS=$UNAME_MACHINE-unknown-mbr + ;; + loongarch32:Linux:*:* | loongarch64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + m32r*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + m68*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + mips:Linux:*:* | mips64:Linux:*:*) + set_cc_for_build + IS_GLIBC=0 + test x"${LIBC}" = xgnu && IS_GLIBC=1 + sed 's/^ //' << EOF > "$dummy.c" + #undef CPU + #undef mips + #undef mipsel + #undef mips64 + #undef mips64el + #if ${IS_GLIBC} && defined(_ABI64) + LIBCABI=gnuabi64 + #else + #if ${IS_GLIBC} && defined(_ABIN32) + LIBCABI=gnuabin32 + #else + LIBCABI=${LIBC} + #endif + #endif + + #if ${IS_GLIBC} && 
defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6 + CPU=mipsisa64r6 + #else + #if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6 + CPU=mipsisa32r6 + #else + #if defined(__mips64) + CPU=mips64 + #else + CPU=mips + #endif + #endif + #endif + + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + MIPS_ENDIAN=el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + MIPS_ENDIAN= + #else + MIPS_ENDIAN= + #endif + #endif +EOF + cc_set_vars=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI'` + eval "$cc_set_vars" + test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; } + ;; + mips64el:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + openrisc*:Linux:*:*) + GUESS=or1k-unknown-linux-$LIBC + ;; + or32:Linux:*:* | or1k*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + padre:Linux:*:*) + GUESS=sparc-unknown-linux-$LIBC + ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + GUESS=hppa64-unknown-linux-$LIBC + ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) GUESS=hppa1.1-unknown-linux-$LIBC ;; + PA8*) GUESS=hppa2.0-unknown-linux-$LIBC ;; + *) GUESS=hppa-unknown-linux-$LIBC ;; + esac + ;; + ppc64:Linux:*:*) + GUESS=powerpc64-unknown-linux-$LIBC + ;; + ppc:Linux:*:*) + GUESS=powerpc-unknown-linux-$LIBC + ;; + ppc64le:Linux:*:*) + GUESS=powerpc64le-unknown-linux-$LIBC + ;; + ppcle:Linux:*:*) + GUESS=powerpcle-unknown-linux-$LIBC + ;; + riscv32:Linux:*:* | riscv32be:Linux:*:* | riscv64:Linux:*:* | riscv64be:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + s390:Linux:*:* | s390x:Linux:*:*) + GUESS=$UNAME_MACHINE-ibm-linux-$LIBC + ;; + sh64*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + sh*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + sparc:Linux:*:* | 
sparc64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + tile*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + vax:Linux:*:*) + GUESS=$UNAME_MACHINE-dec-linux-$LIBC + ;; + x86_64:Linux:*:*) + set_cc_for_build + CPU=$UNAME_MACHINE + LIBCABI=$LIBC + if test "$CC_FOR_BUILD" != no_compiler_found; then + ABI=64 + sed 's/^ //' << EOF > "$dummy.c" + #ifdef __i386__ + ABI=x86 + #else + #ifdef __ILP32__ + ABI=x32 + #endif + #endif +EOF + cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'` + eval "$cc_set_abi" + case $ABI in + x86) CPU=i686 ;; + x32) LIBCABI=${LIBC}x32 ;; + esac + fi + GUESS=$CPU-pc-linux-$LIBCABI + ;; + xtensa*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + GUESS=i386-sequent-sysv4 + ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. + GUESS=$UNAME_MACHINE-pc-sysv4.2uw$UNAME_VERSION + ;; + i*86:OS/2:*:*) + # If we were able to find 'uname', then EMX Unix compatibility + # is probably installed. 
+ GUESS=$UNAME_MACHINE-pc-os2-emx + ;; + i*86:XTS-300:*:STOP) + GUESS=$UNAME_MACHINE-unknown-stop + ;; + i*86:atheos:*:*) + GUESS=$UNAME_MACHINE-unknown-atheos + ;; + i*86:syllable:*:*) + GUESS=$UNAME_MACHINE-pc-syllable + ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) + GUESS=i386-unknown-lynxos$UNAME_RELEASE + ;; + i*86:*DOS:*:*) + GUESS=$UNAME_MACHINE-pc-msdosdjgpp + ;; + i*86:*:4.*:*) + UNAME_REL=`echo "$UNAME_RELEASE" | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + GUESS=$UNAME_MACHINE-univel-sysv$UNAME_REL + else + GUESS=$UNAME_MACHINE-pc-sysv$UNAME_REL + fi + ;; + i*86:*:5:[678]*) + # UnixWare 7.x, OpenUNIX and OpenServer 6. + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + GUESS=$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + GUESS=$UNAME_MACHINE-pc-sco$UNAME_REL + else + GUESS=$UNAME_MACHINE-pc-sysv32 + fi + ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i586. + # Note: whatever this is, it MUST be the same as what config.sub + # prints for the "djgpp" host, or else GDB configure will decide that + # this is a cross-build. 
+ GUESS=i586-pc-msdosdjgpp + ;; + Intel:Mach:3*:*) + GUESS=i386-pc-mach3 + ;; + paragon:*:*:*) + GUESS=i860-intel-osf1 + ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + GUESS=i860-stardent-sysv$UNAME_RELEASE # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + GUESS=i860-unknown-sysv$UNAME_RELEASE # Unknown i860-SVR4 + fi + ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + GUESS=m68010-convergent-sysv + ;; + mc68k:UNIX:SYSTEM5:3.51m) + GUESS=m68k-convergent-sysv + ;; + M680?0:D-NIX:5.3:*) + GUESS=m68k-diab-dnix + ;; + M68*:*:R3V[5678]*:*) + test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + NCR*:*:4.2:* | MPRAS*:*:4.2:*) + OS_REL='.3' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } + /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + GUESS=m68k-unknown-lynxos$UNAME_RELEASE + ;; + mc68030:UNIX_System_V:4.*:*) + GUESS=m68k-atari-sysv4 + ;; + TSUNAMI:LynxOS:2.*:*) + 
GUESS=sparc-unknown-lynxos$UNAME_RELEASE + ;; + rs6000:LynxOS:2.*:*) + GUESS=rs6000-unknown-lynxos$UNAME_RELEASE + ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) + GUESS=powerpc-unknown-lynxos$UNAME_RELEASE + ;; + SM[BE]S:UNIX_SV:*:*) + GUESS=mips-dde-sysv$UNAME_RELEASE + ;; + RM*:ReliantUNIX-*:*:*) + GUESS=mips-sni-sysv4 + ;; + RM*:SINIX-*:*:*) + GUESS=mips-sni-sysv4 + ;; + *:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + GUESS=$UNAME_MACHINE-sni-sysv4 + else + GUESS=ns32k-sni-sysv + fi + ;; + PENTIUM:*:4.0*:*) # Unisys 'ClearPath HMP IX 4000' SVR4/MP effort + # says + GUESS=i586-unisys-sysv4 + ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes . + # How about differentiating between stratus architectures? -djm + GUESS=hppa1.1-stratus-sysv4 + ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + GUESS=i860-stratus-sysv4 + ;; + i*86:VOS:*:*) + # From Paul.Green@stratus.com. + GUESS=$UNAME_MACHINE-stratus-vos + ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + GUESS=hppa1.1-stratus-vos + ;; + mc68*:A/UX:*:*) + GUESS=m68k-apple-aux$UNAME_RELEASE + ;; + news*:NEWS-OS:6*:*) + GUESS=mips-sony-newsos6 + ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if test -d /usr/nec; then + GUESS=mips-nec-sysv$UNAME_RELEASE + else + GUESS=mips-unknown-sysv$UNAME_RELEASE + fi + ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + GUESS=powerpc-be-beos + ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + GUESS=powerpc-apple-beos + ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. + GUESS=i586-pc-beos + ;; + BePC:Haiku:*:*) # Haiku running on Intel PC compatible. 
+ GUESS=i586-pc-haiku + ;; + ppc:Haiku:*:*) # Haiku running on Apple PowerPC + GUESS=powerpc-apple-haiku + ;; + *:Haiku:*:*) # Haiku modern gcc (not bound by BeOS compat) + GUESS=$UNAME_MACHINE-unknown-haiku + ;; + SX-4:SUPER-UX:*:*) + GUESS=sx4-nec-superux$UNAME_RELEASE + ;; + SX-5:SUPER-UX:*:*) + GUESS=sx5-nec-superux$UNAME_RELEASE + ;; + SX-6:SUPER-UX:*:*) + GUESS=sx6-nec-superux$UNAME_RELEASE + ;; + SX-7:SUPER-UX:*:*) + GUESS=sx7-nec-superux$UNAME_RELEASE + ;; + SX-8:SUPER-UX:*:*) + GUESS=sx8-nec-superux$UNAME_RELEASE + ;; + SX-8R:SUPER-UX:*:*) + GUESS=sx8r-nec-superux$UNAME_RELEASE + ;; + SX-ACE:SUPER-UX:*:*) + GUESS=sxace-nec-superux$UNAME_RELEASE + ;; + Power*:Rhapsody:*:*) + GUESS=powerpc-apple-rhapsody$UNAME_RELEASE + ;; + *:Rhapsody:*:*) + GUESS=$UNAME_MACHINE-apple-rhapsody$UNAME_RELEASE + ;; + arm64:Darwin:*:*) + GUESS=aarch64-apple-darwin$UNAME_RELEASE + ;; + *:Darwin:*:*) + UNAME_PROCESSOR=`uname -p` + case $UNAME_PROCESSOR in + unknown) UNAME_PROCESSOR=powerpc ;; + esac + if command -v xcode-select > /dev/null 2> /dev/null && \ + ! xcode-select --print-path > /dev/null 2> /dev/null ; then + # Avoid executing cc if there is no toolchain installed as + # cc will be a stub that puts up a graphical alert + # prompting the user to install developer tools. 
+ CC_FOR_BUILD=no_compiler_found + else + set_cc_for_build + fi + if test "$CC_FOR_BUILD" != no_compiler_found; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + case $UNAME_PROCESSOR in + i386) UNAME_PROCESSOR=x86_64 ;; + powerpc) UNAME_PROCESSOR=powerpc64 ;; + esac + fi + # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc + if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_PPC >/dev/null + then + UNAME_PROCESSOR=powerpc + fi + elif test "$UNAME_PROCESSOR" = i386 ; then + # uname -m returns i386 or x86_64 + UNAME_PROCESSOR=$UNAME_MACHINE + fi + GUESS=$UNAME_PROCESSOR-apple-darwin$UNAME_RELEASE + ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = x86; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + GUESS=$UNAME_PROCESSOR-$UNAME_MACHINE-nto-qnx$UNAME_RELEASE + ;; + *:QNX:*:4*) + GUESS=i386-pc-qnx + ;; + NEO-*:NONSTOP_KERNEL:*:*) + GUESS=neo-tandem-nsk$UNAME_RELEASE + ;; + NSE-*:NONSTOP_KERNEL:*:*) + GUESS=nse-tandem-nsk$UNAME_RELEASE + ;; + NSR-*:NONSTOP_KERNEL:*:*) + GUESS=nsr-tandem-nsk$UNAME_RELEASE + ;; + NSV-*:NONSTOP_KERNEL:*:*) + GUESS=nsv-tandem-nsk$UNAME_RELEASE + ;; + NSX-*:NONSTOP_KERNEL:*:*) + GUESS=nsx-tandem-nsk$UNAME_RELEASE + ;; + *:NonStop-UX:*:*) + GUESS=mips-compaq-nonstopux + ;; + BS2000:POSIX*:*:*) + GUESS=bs2000-siemens-sysv + ;; + DS/*:UNIX_System_V:*:*) + GUESS=$UNAME_MACHINE-$UNAME_SYSTEM-$UNAME_RELEASE + ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. 
+ if test "${cputype-}" = 386; then + UNAME_MACHINE=i386 + elif test "x${cputype-}" != x; then + UNAME_MACHINE=$cputype + fi + GUESS=$UNAME_MACHINE-unknown-plan9 + ;; + *:TOPS-10:*:*) + GUESS=pdp10-unknown-tops10 + ;; + *:TENEX:*:*) + GUESS=pdp10-unknown-tenex + ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + GUESS=pdp10-dec-tops20 + ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + GUESS=pdp10-xkl-tops20 + ;; + *:TOPS-20:*:*) + GUESS=pdp10-unknown-tops20 + ;; + *:ITS:*:*) + GUESS=pdp10-unknown-its + ;; + SEI:*:*:SEIUX) + GUESS=mips-sei-seiux$UNAME_RELEASE + ;; + *:DragonFly:*:*) + DRAGONFLY_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_MACHINE-unknown-dragonfly$DRAGONFLY_REL + ;; + *:*VMS:*:*) + UNAME_MACHINE=`(uname -p) 2>/dev/null` + case $UNAME_MACHINE in + A*) GUESS=alpha-dec-vms ;; + I*) GUESS=ia64-dec-vms ;; + V*) GUESS=vax-dec-vms ;; + esac ;; + *:XENIX:*:SysV) + GUESS=i386-pc-xenix + ;; + i*86:skyos:*:*) + SKYOS_REL=`echo "$UNAME_RELEASE" | sed -e 's/ .*$//'` + GUESS=$UNAME_MACHINE-pc-skyos$SKYOS_REL + ;; + i*86:rdos:*:*) + GUESS=$UNAME_MACHINE-pc-rdos + ;; + i*86:Fiwix:*:*) + GUESS=$UNAME_MACHINE-pc-fiwix + ;; + *:AROS:*:*) + GUESS=$UNAME_MACHINE-unknown-aros + ;; + x86_64:VMkernel:*:*) + GUESS=$UNAME_MACHINE-unknown-esx + ;; + amd64:Isilon\ OneFS:*:*) + GUESS=x86_64-unknown-onefs + ;; + *:Unleashed:*:*) + GUESS=$UNAME_MACHINE-unknown-unleashed$UNAME_RELEASE + ;; + *:Ironclad:*:*) + GUESS=$UNAME_MACHINE-unknown-ironclad + ;; +esac + +# Do we have a guess based on uname results? +if test "x$GUESS" != x; then + echo "$GUESS" + exit +fi + +# No uname command or uname output not recognized. 
+set_cc_for_build +cat > "$dummy.c" < +#include +#endif +#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__) +#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__) +#include +#if defined(_SIZE_T_) || defined(SIGLOST) +#include +#endif +#endif +#endif +int +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... */ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); +#endif + +#if defined (vax) +#if !defined (ultrix) +#include +#if defined (BSD) +#if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +#else +#if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +#else + printf ("vax-dec-bsd\n"); exit (0); +#endif +#endif +#else + printf ("vax-dec-bsd\n"); exit (0); +#endif +#else +#if defined(_SIZE_T_) || defined(SIGLOST) + struct utsname un; + uname (&un); + printf ("vax-dec-ultrix%s\n", un.release); exit (0); +#else + printf ("vax-dec-ultrix\n"); exit (0); +#endif +#endif +#endif +#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__) +#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__) +#if defined(_SIZE_T_) || defined(SIGLOST) + struct utsname *un; + uname (&un); + printf ("mips-dec-ultrix%s\n", un.release); exit (0); +#else + printf ("mips-dec-ultrix\n"); exit (0); +#endif +#endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=`"$dummy"` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. +test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; } + +echo "$0: unable to guess system type" >&2 + +case $UNAME_MACHINE:$UNAME_SYSTEM in + mips:Linux | mips64:Linux) + # If we got here on MIPS GNU/Linux, output extra information. 
+ cat >&2 <&2 <&2 </dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = "$UNAME_MACHINE" +UNAME_RELEASE = "$UNAME_RELEASE" +UNAME_SYSTEM = "$UNAME_SYSTEM" +UNAME_VERSION = "$UNAME_VERSION" +EOF +fi + +exit 1 + +# Local variables: +# eval: (add-hook 'before-save-hook 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/contrib/orioledb/docker/config.sub b/contrib/orioledb/docker/config.sub new file mode 100644 index 00000000000..4aaae46f6f7 --- /dev/null +++ b/contrib/orioledb/docker/config.sub @@ -0,0 +1,2354 @@ +#! /bin/sh +# Configuration validation subroutine script. +# Copyright 1992-2024 Free Software Foundation, Inc. + +# shellcheck disable=SC2006,SC2268,SC2162 # see below for rationale + +timestamp='2024-05-27' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . 
+# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). + + +# Please send patches to <config-patches@gnu.org>. +# +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# You can get the latest version of this script from: +# https://git.savannah.gnu.org/cgit/config.git/plain/config.sub + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software. +# Each package is responsible for reporting which valid configurations +# it does not support. The user should be able to distinguish +# a failure to support a valid configuration from a meaningless +# configuration. + +# The goal of this file is to map all the various variations of a given +# machine specification into a single specification in the form: +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or in some cases, the newer four-part form: +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# It is wrong to echo any other type of specification. + +# The "shellcheck disable" line above the timestamp inhibits complaints +# about features and limitations of the classic Bourne shell that were +# superseded or lifted in POSIX. However, this script identifies a wide +# variety of pre-POSIX systems that do not have POSIX shells at all, and +# even some reasonably current systems (Solaris 10 as case-in-point) still +# have a pre-POSIX /bin/sh.
+ +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS + +Canonicalize a configuration name. + +Options: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.sub ($timestamp) + +Copyright 1992-2024 Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try '$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + + *local*) + # First pass through any local machine types. 
+ echo "$1" + exit ;; + + * ) + break ;; + esac +done + +case $# in + 0) echo "$me: missing argument$help" >&2 + exit 1;; + 1) ;; + *) echo "$me: too many arguments$help" >&2 + exit 1;; +esac + +# Split fields of configuration type +saved_IFS=$IFS +IFS="-" read field1 field2 field3 field4 <&2 + exit 1 + ;; + *-*-*-*) + basic_machine=$field1-$field2 + basic_os=$field3-$field4 + ;; + *-*-*) + # Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two + # parts + maybe_os=$field2-$field3 + case $maybe_os in + cloudabi*-eabi* \ + | kfreebsd*-gnu* \ + | knetbsd*-gnu* \ + | kopensolaris*-gnu* \ + | linux-* \ + | managarm-* \ + | netbsd*-eabi* \ + | netbsd*-gnu* \ + | nto-qnx* \ + | os2-emx* \ + | rtmk-nova* \ + | storm-chaos* \ + | uclinux-gnu* \ + | uclinux-uclibc* \ + | windows-* ) + basic_machine=$field1 + basic_os=$maybe_os + ;; + android-linux) + basic_machine=$field1-unknown + basic_os=linux-android + ;; + *) + basic_machine=$field1-$field2 + basic_os=$field3 + ;; + esac + ;; + *-*) + case $field1-$field2 in + # Shorthands that happen to contain a single dash + convex-c[12] | convex-c3[248]) + basic_machine=$field2-convex + basic_os= + ;; + decstation-3100) + basic_machine=mips-dec + basic_os= + ;; + *-*) + # Second component is usually, but not always the OS + case $field2 in + # Do not treat sunos as a manufacturer + sun*os*) + basic_machine=$field1 + basic_os=$field2 + ;; + # Manufacturers + 3100* \ + | 32* \ + | 3300* \ + | 3600* \ + | 7300* \ + | acorn \ + | altos* \ + | apollo \ + | apple \ + | atari \ + | att* \ + | axis \ + | be \ + | bull \ + | cbm \ + | ccur \ + | cisco \ + | commodore \ + | convergent* \ + | convex* \ + | cray \ + | crds \ + | dec* \ + | delta* \ + | dg \ + | digital \ + | dolphin \ + | encore* \ + | gould \ + | harris \ + | highlevel \ + | hitachi* \ + | hp \ + | ibm* \ + | intergraph \ + | isi* \ + | knuth \ + | masscomp \ + | microblaze* \ + | mips* \ + | motorola* \ + | ncr* \ + | news \ + | next \ + | ns \ + | oki \ + | 
omron* \ + | pc533* \ + | rebel \ + | rom68k \ + | rombug \ + | semi \ + | sequent* \ + | siemens \ + | sgi* \ + | siemens \ + | sim \ + | sni \ + | sony* \ + | stratus \ + | sun \ + | sun[234]* \ + | tektronix \ + | tti* \ + | ultra \ + | unicom* \ + | wec \ + | winbond \ + | wrs) + basic_machine=$field1-$field2 + basic_os= + ;; + zephyr*) + basic_machine=$field1-unknown + basic_os=$field2 + ;; + *) + basic_machine=$field1 + basic_os=$field2 + ;; + esac + ;; + esac + ;; + *) + # Convert single-component short-hands not valid as part of + # multi-component configurations. + case $field1 in + 386bsd) + basic_machine=i386-pc + basic_os=bsd + ;; + a29khif) + basic_machine=a29k-amd + basic_os=udi + ;; + adobe68k) + basic_machine=m68010-adobe + basic_os=scout + ;; + alliant) + basic_machine=fx80-alliant + basic_os= + ;; + altos | altos3068) + basic_machine=m68k-altos + basic_os= + ;; + am29k) + basic_machine=a29k-none + basic_os=bsd + ;; + amdahl) + basic_machine=580-amdahl + basic_os=sysv + ;; + amiga) + basic_machine=m68k-unknown + basic_os= + ;; + amigaos | amigados) + basic_machine=m68k-unknown + basic_os=amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + basic_os=sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + basic_os=sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + basic_os=bsd + ;; + aros) + basic_machine=i386-pc + basic_os=aros + ;; + aux) + basic_machine=m68k-apple + basic_os=aux + ;; + balance) + basic_machine=ns32k-sequent + basic_os=dynix + ;; + blackfin) + basic_machine=bfin-unknown + basic_os=linux + ;; + cegcc) + basic_machine=arm-unknown + basic_os=cegcc + ;; + cray) + basic_machine=j90-cray + basic_os=unicos + ;; + crds | unos) + basic_machine=m68k-crds + basic_os= + ;; + da30) + basic_machine=m68k-da30 + basic_os= + ;; + decstation | pmax | pmin | dec3100 | decstatn) + basic_machine=mips-dec + basic_os= + ;; + delta88) + basic_machine=m88k-motorola + basic_os=sysv3 + ;; + dicos) + basic_machine=i686-pc + basic_os=dicos + ;; + 
djgpp) + basic_machine=i586-pc + basic_os=msdosdjgpp + ;; + ebmon29k) + basic_machine=a29k-amd + basic_os=ebmon + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + basic_os=ose + ;; + gmicro) + basic_machine=tron-gmicro + basic_os=sysv + ;; + go32) + basic_machine=i386-pc + basic_os=go32 + ;; + h8300hms) + basic_machine=h8300-hitachi + basic_os=hms + ;; + h8300xray) + basic_machine=h8300-hitachi + basic_os=xray + ;; + h8500hms) + basic_machine=h8500-hitachi + basic_os=hms + ;; + harris) + basic_machine=m88k-harris + basic_os=sysv3 + ;; + hp300 | hp300hpux) + basic_machine=m68k-hp + basic_os=hpux + ;; + hp300bsd) + basic_machine=m68k-hp + basic_os=bsd + ;; + hppaosf) + basic_machine=hppa1.1-hp + basic_os=osf + ;; + hppro) + basic_machine=hppa1.1-hp + basic_os=proelf + ;; + i386mach) + basic_machine=i386-mach + basic_os=mach + ;; + isi68 | isi) + basic_machine=m68k-isi + basic_os=sysv + ;; + m68knommu) + basic_machine=m68k-unknown + basic_os=linux + ;; + magnum | m3230) + basic_machine=mips-mips + basic_os=sysv + ;; + merlin) + basic_machine=ns32k-utek + basic_os=sysv + ;; + mingw64) + basic_machine=x86_64-pc + basic_os=mingw64 + ;; + mingw32) + basic_machine=i686-pc + basic_os=mingw32 + ;; + mingw32ce) + basic_machine=arm-unknown + basic_os=mingw32ce + ;; + monitor) + basic_machine=m68k-rom68k + basic_os=coff + ;; + morphos) + basic_machine=powerpc-unknown + basic_os=morphos + ;; + moxiebox) + basic_machine=moxie-unknown + basic_os=moxiebox + ;; + msdos) + basic_machine=i386-pc + basic_os=msdos + ;; + msys) + basic_machine=i686-pc + basic_os=msys + ;; + mvs) + basic_machine=i370-ibm + basic_os=mvs + ;; + nacl) + basic_machine=le32-unknown + basic_os=nacl + ;; + ncr3000) + basic_machine=i486-ncr + basic_os=sysv4 + ;; + netbsd386) + basic_machine=i386-pc + basic_os=netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + basic_os=linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + basic_os=newsos + ;; + news1000) + 
basic_machine=m68030-sony + basic_os=newsos + ;; + necv70) + basic_machine=v70-nec + basic_os=sysv + ;; + nh3000) + basic_machine=m68k-harris + basic_os=cxux + ;; + nh[45]000) + basic_machine=m88k-harris + basic_os=cxux + ;; + nindy960) + basic_machine=i960-intel + basic_os=nindy + ;; + mon960) + basic_machine=i960-intel + basic_os=mon960 + ;; + nonstopux) + basic_machine=mips-compaq + basic_os=nonstopux + ;; + os400) + basic_machine=powerpc-ibm + basic_os=os400 + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + basic_os=ose + ;; + os68k) + basic_machine=m68k-none + basic_os=os68k + ;; + paragon) + basic_machine=i860-intel + basic_os=osf + ;; + parisc) + basic_machine=hppa-unknown + basic_os=linux + ;; + psp) + basic_machine=mipsallegrexel-sony + basic_os=psp + ;; + pw32) + basic_machine=i586-unknown + basic_os=pw32 + ;; + rdos | rdos64) + basic_machine=x86_64-pc + basic_os=rdos + ;; + rdos32) + basic_machine=i386-pc + basic_os=rdos + ;; + rom68k) + basic_machine=m68k-rom68k + basic_os=coff + ;; + sa29200) + basic_machine=a29k-amd + basic_os=udi + ;; + sei) + basic_machine=mips-sei + basic_os=seiux + ;; + sequent) + basic_machine=i386-sequent + basic_os= + ;; + sps7) + basic_machine=m68k-bull + basic_os=sysv2 + ;; + st2000) + basic_machine=m68k-tandem + basic_os= + ;; + stratus) + basic_machine=i860-stratus + basic_os=sysv4 + ;; + sun2) + basic_machine=m68000-sun + basic_os= + ;; + sun2os3) + basic_machine=m68000-sun + basic_os=sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + basic_os=sunos4 + ;; + sun3) + basic_machine=m68k-sun + basic_os= + ;; + sun3os3) + basic_machine=m68k-sun + basic_os=sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + basic_os=sunos4 + ;; + sun4) + basic_machine=sparc-sun + basic_os= + ;; + sun4os3) + basic_machine=sparc-sun + basic_os=sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + basic_os=sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + basic_os=solaris2 + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + 
basic_os= + ;; + sv1) + basic_machine=sv1-cray + basic_os=unicos + ;; + symmetry) + basic_machine=i386-sequent + basic_os=dynix + ;; + t3e) + basic_machine=alphaev5-cray + basic_os=unicos + ;; + t90) + basic_machine=t90-cray + basic_os=unicos + ;; + toad1) + basic_machine=pdp10-xkl + basic_os=tops20 + ;; + tpf) + basic_machine=s390x-ibm + basic_os=tpf + ;; + udi29k) + basic_machine=a29k-amd + basic_os=udi + ;; + ultra3) + basic_machine=a29k-nyu + basic_os=sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + basic_os=none + ;; + vaxv) + basic_machine=vax-dec + basic_os=sysv + ;; + vms) + basic_machine=vax-dec + basic_os=vms + ;; + vsta) + basic_machine=i386-pc + basic_os=vsta + ;; + vxworks960) + basic_machine=i960-wrs + basic_os=vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + basic_os=vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + basic_os=vxworks + ;; + xbox) + basic_machine=i686-pc + basic_os=mingw32 + ;; + ymp) + basic_machine=ymp-cray + basic_os=unicos + ;; + *) + basic_machine=$1 + basic_os= + ;; + esac + ;; +esac + +# Decode 1-component or ad-hoc basic machines +case $basic_machine in + # Here we handle the default manufacturer of certain CPU types. It is in + # some cases the only manufacturer, in others, it is the most popular. + w89k) + cpu=hppa1.1 + vendor=winbond + ;; + op50n) + cpu=hppa1.1 + vendor=oki + ;; + op60c) + cpu=hppa1.1 + vendor=oki + ;; + ibm*) + cpu=i370 + vendor=ibm + ;; + orion105) + cpu=clipper + vendor=highlevel + ;; + mac | mpw | mac-mpw) + cpu=m68k + vendor=apple + ;; + pmac | pmac-mpw) + cpu=powerpc + vendor=apple + ;; + + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. 
+ 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + cpu=m68000 + vendor=att + ;; + 3b*) + cpu=we32k + vendor=att + ;; + bluegene*) + cpu=powerpc + vendor=ibm + basic_os=cnk + ;; + decsystem10* | dec10*) + cpu=pdp10 + vendor=dec + basic_os=tops10 + ;; + decsystem20* | dec20*) + cpu=pdp10 + vendor=dec + basic_os=tops20 + ;; + delta | 3300 | delta-motorola | 3300-motorola | motorola-delta | motorola-3300) + cpu=m68k + vendor=motorola + ;; + # This used to be dpx2*, but that gets the RS6000-based + # DPX/20 and the x86-based DPX/2-100 wrong. See + # https://oldskool.silicium.org/stations/bull_dpx20.htm + # https://www.feb-patrimoine.com/english/bull_dpx2.htm + # https://www.feb-patrimoine.com/english/unix_and_bull.htm + dpx2 | dpx2[23]00 | dpx2[23]xx) + cpu=m68k + vendor=bull + ;; + dpx2100 | dpx21xx) + cpu=i386 + vendor=bull + ;; + dpx20) + cpu=rs6000 + vendor=bull + ;; + encore | umax | mmax) + cpu=ns32k + vendor=encore + ;; + elxsi) + cpu=elxsi + vendor=elxsi + basic_os=${basic_os:-bsd} + ;; + fx2800) + cpu=i860 + vendor=alliant + ;; + genix) + cpu=ns32k + vendor=ns + ;; + h3050r* | hiux*) + cpu=hppa1.1 + vendor=hitachi + basic_os=hiuxwe2 + ;; + hp3k9[0-9][0-9] | hp9[0-9][0-9]) + cpu=hppa1.0 + vendor=hp + ;; + hp9k2[0-9][0-9] | hp9k31[0-9]) + cpu=m68000 + vendor=hp + ;; + hp9k3[2-9][0-9]) + cpu=m68k + vendor=hp + ;; + hp9k6[0-9][0-9] | hp6[0-9][0-9]) + cpu=hppa1.0 + vendor=hp + ;; + hp9k7[0-79][0-9] | hp7[0-79][0-9]) + cpu=hppa1.1 + vendor=hp + ;; + hp9k78[0-9] | hp78[0-9]) + # FIXME: really hppa2.0-hp + cpu=hppa1.1 + vendor=hp + ;; + hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) + # FIXME: really hppa2.0-hp + cpu=hppa1.1 + vendor=hp + ;; + hp9k8[0-9][13679] | hp8[0-9][13679]) + cpu=hppa1.1 + vendor=hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + cpu=hppa1.0 + vendor=hp + ;; + i*86v32) + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=sysv32 + ;; + i*86v4*) + cpu=`echo "$1" | sed -e 's/86.*/86/'` + 
vendor=pc + basic_os=sysv4 + ;; + i*86v) + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=sysv + ;; + i*86sol2) + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=solaris2 + ;; + j90 | j90-cray) + cpu=j90 + vendor=cray + basic_os=${basic_os:-unicos} + ;; + iris | iris4d) + cpu=mips + vendor=sgi + case $basic_os in + irix*) + ;; + *) + basic_os=irix4 + ;; + esac + ;; + miniframe) + cpu=m68000 + vendor=convergent + ;; + *mint | mint[0-9]* | *MiNT | *MiNT[0-9]*) + cpu=m68k + vendor=atari + basic_os=mint + ;; + news-3600 | risc-news) + cpu=mips + vendor=sony + basic_os=newsos + ;; + next | m*-next) + cpu=m68k + vendor=next + ;; + np1) + cpu=np1 + vendor=gould + ;; + op50n-* | op60c-*) + cpu=hppa1.1 + vendor=oki + basic_os=proelf + ;; + pa-hitachi) + cpu=hppa1.1 + vendor=hitachi + basic_os=hiuxwe2 + ;; + pbd) + cpu=sparc + vendor=tti + ;; + pbb) + cpu=m68k + vendor=tti + ;; + pc532) + cpu=ns32k + vendor=pc532 + ;; + pn) + cpu=pn + vendor=gould + ;; + power) + cpu=power + vendor=ibm + ;; + ps2) + cpu=i386 + vendor=ibm + ;; + rm[46]00) + cpu=mips + vendor=siemens + ;; + rtpc | rtpc-*) + cpu=romp + vendor=ibm + ;; + sde) + cpu=mipsisa32 + vendor=sde + basic_os=${basic_os:-elf} + ;; + simso-wrs) + cpu=sparclite + vendor=wrs + basic_os=vxworks + ;; + tower | tower-32) + cpu=m68k + vendor=ncr + ;; + vpp*|vx|vx-*) + cpu=f301 + vendor=fujitsu + ;; + w65) + cpu=w65 + vendor=wdc + ;; + w89k-*) + cpu=hppa1.1 + vendor=winbond + basic_os=proelf + ;; + none) + cpu=none + vendor=none + ;; + leon|leon[3-9]) + cpu=sparc + vendor=$basic_machine + ;; + leon-*|leon[3-9]-*) + cpu=sparc + vendor=`echo "$basic_machine" | sed 's/-.*//'` + ;; + + *-*) + saved_IFS=$IFS + IFS="-" read cpu vendor <&2 + exit 1 + ;; + esac + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. +case $vendor in + digital*) + vendor=dec + ;; + commodore*) + vendor=cbm + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. 
+ +if test x"$basic_os" != x +then + +# First recognize some ad-hoc cases, or perhaps split kernel-os, or else just +# set os. +obj= +case $basic_os in + gnu/linux*) + kernel=linux + os=`echo "$basic_os" | sed -e 's|gnu/linux|gnu|'` + ;; + os2-emx) + kernel=os2 + os=`echo "$basic_os" | sed -e 's|os2-emx|emx|'` + ;; + nto-qnx*) + kernel=nto + os=`echo "$basic_os" | sed -e 's|nto-qnx|qnx|'` + ;; + *-*) + saved_IFS=$IFS + IFS="-" read kernel os <&2 + fi + ;; + *) + echo "Invalid configuration '$1': OS '$os' not recognized" 1>&2 + exit 1 + ;; +esac + +case $obj in + aout* | coff* | elf* | pe*) + ;; + '') + # empty is fine + ;; + *) + echo "Invalid configuration '$1': Machine code format '$obj' not recognized" 1>&2 + exit 1 + ;; +esac + +# Here we handle the constraint that a (synthetic) cpu and os are +# valid only in combination with each other and nowhere else. +case $cpu-$os in + # The "javascript-unknown-ghcjs" triple is used by GHC; we + # accept it here in order to tolerate that, but reject any + # variations. + javascript-ghcjs) + ;; + javascript-* | *-ghcjs) + echo "Invalid configuration '$1': cpu '$cpu' is not valid with os '$os$obj'" 1>&2 + exit 1 + ;; +esac + +# As a final step for OS-related things, validate the OS-kernel combination +# (given a valid OS), if there is a kernel. +case $kernel-$os-$obj in + linux-gnu*- | linux-android*- | linux-dietlibc*- | linux-llvm*- \ + | linux-mlibc*- | linux-musl*- | linux-newlib*- \ + | linux-relibc*- | linux-uclibc*- | linux-ohos*- ) + ;; + uclinux-uclibc*- | uclinux-gnu*- ) + ;; + managarm-mlibc*- | managarm-kernel*- ) + ;; + windows*-msvc*-) + ;; + -dietlibc*- | -llvm*- | -mlibc*- | -musl*- | -newlib*- | -relibc*- \ + | -uclibc*- ) + # These are just libc implementations, not actual OSes, and thus + # require a kernel. + echo "Invalid configuration '$1': libc '$os' needs explicit kernel." 1>&2 + exit 1 + ;; + -kernel*- ) + echo "Invalid configuration '$1': '$os' needs explicit kernel." 
1>&2 + exit 1 + ;; + *-kernel*- ) + echo "Invalid configuration '$1': '$kernel' does not support '$os'." 1>&2 + exit 1 + ;; + *-msvc*- ) + echo "Invalid configuration '$1': '$os' needs 'windows'." 1>&2 + exit 1 + ;; + kfreebsd*-gnu*- | knetbsd*-gnu*- | netbsd*-gnu*- | kopensolaris*-gnu*-) + ;; + vxworks-simlinux- | vxworks-simwindows- | vxworks-spe-) + ;; + nto-qnx*-) + ;; + os2-emx-) + ;; + rtmk-nova-) + ;; + *-eabi*- | *-gnueabi*-) + ;; + none--*) + # None (no kernel, i.e. freestanding / bare metal), + # can be paired with a machine code file format + ;; + -*-) + # Blank kernel with real OS is always fine. + ;; + --*) + # Blank kernel and OS with real machine code file format is always fine. + ;; + *-*-*) + echo "Invalid configuration '$1': Kernel '$kernel' not known to work with OS '$os'." 1>&2 + exit 1 + ;; +esac + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. +case $vendor in + unknown) + case $cpu-$os in + *-riscix*) + vendor=acorn + ;; + *-sunos* | *-solaris*) + vendor=sun + ;; + *-cnk* | *-aix*) + vendor=ibm + ;; + *-beos*) + vendor=be + ;; + *-hpux*) + vendor=hp + ;; + *-mpeix*) + vendor=hp + ;; + *-hiux*) + vendor=hitachi + ;; + *-unos*) + vendor=crds + ;; + *-dgux*) + vendor=dg + ;; + *-luna*) + vendor=omron + ;; + *-genix*) + vendor=ns + ;; + *-clix*) + vendor=intergraph + ;; + *-mvs* | *-opened*) + vendor=ibm + ;; + *-os400*) + vendor=ibm + ;; + s390-* | s390x-*) + vendor=ibm + ;; + *-ptx*) + vendor=sequent + ;; + *-tpf*) + vendor=ibm + ;; + *-vxsim* | *-vxworks* | *-windiss*) + vendor=wrs + ;; + *-aux*) + vendor=apple + ;; + *-hms*) + vendor=hitachi + ;; + *-mpw* | *-macos*) + vendor=apple + ;; + *-*mint | *-mint[0-9]* | *-*MiNT | *-MiNT[0-9]*) + vendor=atari + ;; + *-vos*) + vendor=stratus + ;; + esac + ;; +esac + +echo "$cpu-$vendor${kernel:+-$kernel}${os:+-$os}${obj:+-$obj}" +exit + +# Local variables: +# eval: (add-hook 'before-save-hook 'time-stamp) +# 
time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/contrib/orioledb/docker/init/default-orioledb.sh b/contrib/orioledb/docker/init/default-orioledb.sh new file mode 100644 index 00000000000..f14134b6c5a --- /dev/null +++ b/contrib/orioledb/docker/init/default-orioledb.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "template1" <<-EOSQL +CREATE EXTENSION orioledb; +EOSQL + +psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL +CREATE EXTENSION orioledb; +EOSQL diff --git a/contrib/orioledb/docker/init/docker-entrypoint.sh b/contrib/orioledb/docker/init/docker-entrypoint.sh new file mode 100755 index 00000000000..ad355b64824 --- /dev/null +++ b/contrib/orioledb/docker/init/docker-entrypoint.sh @@ -0,0 +1,347 @@ +#!/usr/bin/env bash +set -Eeo pipefail +# TODO swap to -Eeuo pipefail above (after handling all potentially-unset variables) + +# usage: file_env VAR [DEFAULT] +# ie: file_env 'XYZ_DB_PASSWORD' 'example' +# (will allow for "$XYZ_DB_PASSWORD_FILE" to fill in the value of +# "$XYZ_DB_PASSWORD" from a file, especially for Docker's secrets feature) +file_env() { + local var="$1" + local fileVar="${var}_FILE" + local def="${2:-}" + if [ "${!var:-}" ] && [ "${!fileVar:-}" ]; then + echo >&2 "error: both $var and $fileVar are set (but are exclusive)" + exit 1 + fi + local val="$def" + if [ "${!var:-}" ]; then + val="${!var}" + elif [ "${!fileVar:-}" ]; then + val="$(< "${!fileVar}")" + fi + export "$var"="$val" + unset "$fileVar" +} + +# check to see if this file is being run or sourced from another script +_is_sourced() { + # https://unix.stackexchange.com/a/215279 + [ "${#FUNCNAME[@]}" -ge 2 ] \ + && [ "${FUNCNAME[0]}" = '_is_sourced' ] \ + && [ "${FUNCNAME[1]}" = 'source' ] +} + +# used to create initial postgres directories and if run as root, ensure ownership to the "postgres" user 
+docker_create_db_directories() { + local user; user="$(id -u)" + + mkdir -p "$PGDATA" + # ignore failure since there are cases where we can't chmod (and PostgreSQL might fail later anyhow - it's picky about permissions of this directory) + chmod 700 "$PGDATA" || : + + # ignore failure since it will be fine when using the image provided directory; see also https://github.com/docker-library/postgres/pull/289 + mkdir -p /var/run/postgresql || : + chmod 775 /var/run/postgresql || : + + # Create the transaction log directory before initdb is run so the directory is owned by the correct user + if [ -n "$POSTGRES_INITDB_WALDIR" ]; then + mkdir -p "$POSTGRES_INITDB_WALDIR" + if [ "$user" = '0' ]; then + find "$POSTGRES_INITDB_WALDIR" \! -user postgres -exec chown postgres '{}' + + fi + chmod 700 "$POSTGRES_INITDB_WALDIR" + fi + + # allow the container to be started with `--user` + if [ "$user" = '0' ]; then + find "$PGDATA" \! -user postgres -exec chown postgres '{}' + + find /var/run/postgresql \! -user postgres -exec chown postgres '{}' + + fi +} + +# initialize empty PGDATA directory with new database via 'initdb' +# arguments to `initdb` can be passed via POSTGRES_INITDB_ARGS or as arguments to this function +# `initdb` automatically creates the "postgres", "template0", and "template1" dbnames +# this is also where the database user is created, specified by `POSTGRES_USER` env +docker_init_database_dir() { + # "initdb" is particular about the current user existing in "/etc/passwd", so we use "nss_wrapper" to fake that if necessary + # see https://github.com/docker-library/postgres/pull/253, https://github.com/docker-library/postgres/issues/359, https://cwrap.org/nss_wrapper.html + local uid; uid="$(id -u)" + if ! 
getent passwd "$uid" &> /dev/null; then + # see if we can find a suitable "libnss_wrapper.so" (https://salsa.debian.org/sssd-team/nss-wrapper/-/commit/b9925a653a54e24d09d9b498a2d913729f7abb15) + local wrapper + for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do + if [ -s "$wrapper" ]; then + NSS_WRAPPER_PASSWD="$(mktemp)" + NSS_WRAPPER_GROUP="$(mktemp)" + export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP + local gid; gid="$(id -g)" + echo "postgres:x:$uid:$gid:PostgreSQL:$PGDATA:/bin/false" > "$NSS_WRAPPER_PASSWD" + echo "postgres:x:$gid:" > "$NSS_WRAPPER_GROUP" + break + fi + done + fi + + if [ -n "$POSTGRES_INITDB_WALDIR" ]; then + set -- --waldir "$POSTGRES_INITDB_WALDIR" "$@" + fi + + eval 'initdb --username="$POSTGRES_USER" --pwfile=<(echo "$POSTGRES_PASSWORD") '"$POSTGRES_INITDB_ARGS"' "$@"' + + # unset/cleanup "nss_wrapper" bits + if [[ "${LD_PRELOAD:-}" == */libnss_wrapper.so ]]; then + rm -f "$NSS_WRAPPER_PASSWD" "$NSS_WRAPPER_GROUP" + unset LD_PRELOAD NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP + fi +} + +# print large warning if POSTGRES_PASSWORD is long +# error if both POSTGRES_PASSWORD is empty and POSTGRES_HOST_AUTH_METHOD is not 'trust' +# print large warning if POSTGRES_HOST_AUTH_METHOD is set to 'trust' +# assumes database is not set up, ie: [ -z "$DATABASE_ALREADY_EXISTS" ] +docker_verify_minimum_env() { + # check password first so we can output the warning before postgres + # messes it up + if [ "${#POSTGRES_PASSWORD}" -ge 100 ]; then + cat >&2 <<-'EOWARN' + + WARNING: The supplied POSTGRES_PASSWORD is 100+ characters. + + This will not work if used via PGPASSWORD with "psql". + + https://www.postgresql.org/message-id/flat/E1Rqxp2-0004Qt-PL%40wrigleys.postgresql.org (BUG #6412) + https://github.com/docker-library/postgres/issues/507 + + EOWARN + fi + if [ -z "$POSTGRES_PASSWORD" ] && [ 'trust' != "$POSTGRES_HOST_AUTH_METHOD" ]; then + # The - option suppresses leading tabs but *not* spaces. 
:) + cat >&2 <<-'EOE' + Error: Database is uninitialized and superuser password is not specified. + You must specify POSTGRES_PASSWORD to a non-empty value for the + superuser. For example, "-e POSTGRES_PASSWORD=password" on "docker run". + + You may also use "POSTGRES_HOST_AUTH_METHOD=trust" to allow all + connections without a password. This is *not* recommended. + + See PostgreSQL documentation about "trust": + https://www.postgresql.org/docs/current/auth-trust.html + EOE + exit 1 + fi + if [ 'trust' = "$POSTGRES_HOST_AUTH_METHOD" ]; then + cat >&2 <<-'EOWARN' + ******************************************************************************** + WARNING: POSTGRES_HOST_AUTH_METHOD has been set to "trust". This will allow + anyone with access to the Postgres port to access your database without + a password, even if POSTGRES_PASSWORD is set. See PostgreSQL + documentation about "trust": + https://www.postgresql.org/docs/current/auth-trust.html + In Docker's default configuration, this is effectively any other + container on the same system. + + It is not recommended to use POSTGRES_HOST_AUTH_METHOD=trust. Replace + it with "-e POSTGRES_PASSWORD=password" instead to set a password in + "docker run". + ******************************************************************************** + EOWARN + fi +} + +# usage: docker_process_init_files [file [file [...]]] +# ie: docker_process_init_files /always-initdb.d/* +# process initializer files, based on file extensions and permissions +docker_process_init_files() { + # psql here for backwards compatibility "${psql[@]}" + psql=( docker_process_sql ) + + echo + local f + for f; do + case "$f" in + *.sh) + # https://github.com/docker-library/postgres/issues/450#issuecomment-393167936 + # https://github.com/docker-library/postgres/pull/452 + if [ -x "$f" ]; then + echo "$0: running $f" + "$f" + else + echo "$0: sourcing $f" + . 
"$f" + fi + ;; + *.sql) echo "$0: running $f"; docker_process_sql -f "$f"; echo ;; + *.sql.gz) echo "$0: running $f"; gunzip -c "$f" | docker_process_sql; echo ;; + *.sql.xz) echo "$0: running $f"; xzcat "$f" | docker_process_sql; echo ;; + *) echo "$0: ignoring $f" ;; + esac + echo + done +} + +# Execute sql script, passed via stdin (or -f flag of psql) +# usage: docker_process_sql [psql-cli-args] +# ie: docker_process_sql --dbname=mydb <<<'INSERT ...' +# ie: docker_process_sql -f my-file.sql +# ie: docker_process_sql > "$PGDATA/pg_hba.conf" +} + +# start socket-only postgresql server for setting up or running scripts +# all arguments will be passed along as arguments to `postgres` (via pg_ctl) +docker_temp_server_start() { + if [ "$1" = 'postgres' ]; then + shift + fi + + # internal start of server in order to allow setup using psql client + # does not listen on external TCP/IP and waits until start finishes + set -- "$@" -c listen_addresses='' -p "${PGPORT:-5432}" + + PGUSER="${PGUSER:-$POSTGRES_USER}" \ + pg_ctl -D "$PGDATA" \ + -o "$(printf '%q ' "$@")" \ + -w start +} + +# stop postgresql server after done setting up user and running scripts +docker_temp_server_stop() { + PGUSER="${PGUSER:-postgres}" \ + pg_ctl -D "$PGDATA" -m fast -w stop +} + +# check arguments for an option that would cause postgres to stop +# return true if there is one +_pg_want_help() { + local arg + for arg; do + case "$arg" in + # postgres --help | grep 'then exit' + # leaving out -C on purpose since it always fails and is unhelpful: + # postgres: could not access the server configuration file "/var/lib/postgresql/data/postgresql.conf": No such file or directory + -'?'|--help|--describe-config|-V|--version) + return 0 + ;; + esac + done + return 1 +} + +_main() { + # if first arg looks like a flag, assume we want to run postgres server + if [ "${1:0:1}" = '-' ]; then + set -- postgres "$@" + fi + + if [ "$1" = 'postgres' ] && ! 
_pg_want_help "$@"; then + docker_setup_env + # setup data directories and permissions (when run as root) + docker_create_db_directories + if [ "$(id -u)" = '0' ]; then + # then restart script as postgres user + exec su-exec postgres "$BASH_SOURCE" "$@" + fi + + # only run initialization on an empty data directory + if [ -z "$DATABASE_ALREADY_EXISTS" ]; then + docker_verify_minimum_env + + # check dir permissions to reduce likelihood of half-initialized database + ls /docker-entrypoint-initdb.d/ > /dev/null + ls /docker-default-initdb.d/ > /dev/null + + docker_init_database_dir + pg_setup_hba_conf "$@" + + # PGPASSWORD is required for psql when authentication is required for 'local' connections via pg_hba.conf and is otherwise harmless + # e.g. when '--auth=md5' or '--auth-local=md5' is used in POSTGRES_INITDB_ARGS + export PGPASSWORD="${PGPASSWORD:-$POSTGRES_PASSWORD}" + docker_temp_server_start "$@" + + docker_setup_db + docker_process_init_files /docker-default-initdb.d/* + docker_process_init_files /docker-entrypoint-initdb.d/* + + docker_temp_server_stop + unset PGPASSWORD + + echo + echo 'PostgreSQL init process complete; ready for start up.' + echo + else + echo + echo 'PostgreSQL Database directory appears to contain a database; Skipping initialization' + echo + fi + fi + + exec "$@" +} + +if ! 
_is_sourced; then + _main "$@" +fi diff --git a/contrib/orioledb/docker/init/postgresql.docker.conf b/contrib/orioledb/docker/init/postgresql.docker.conf new file mode 100644 index 00000000000..f8fdcfaf16b --- /dev/null +++ b/contrib/orioledb/docker/init/postgresql.docker.conf @@ -0,0 +1,8 @@ +data_directory = '/var/lib/postgresql/data' +hba_file = '/var/lib/postgresql/data/pg_hba.conf' +listen_addresses = '*' +shared_preload_libraries = 'orioledb' +default_table_access_method = 'orioledb' +orioledb.main_buffers = 512MB +orioledb.undo_buffers = 256MB +max_wal_size = 8GB diff --git a/contrib/orioledb/docker/orioledb-config.sh b/contrib/orioledb/docker/orioledb-config.sh new file mode 100644 index 00000000000..57bc0b7fc37 --- /dev/null +++ b/contrib/orioledb/docker/orioledb-config.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2034,SC2154 + +testAlias[orioletest]=postgres + +imageTests[orioletest]=' + orioledb-basics +' diff --git a/contrib/orioledb/docker/tests/orioledb-basics/run.sh b/contrib/orioledb/docker/tests/orioledb-basics/run.sh new file mode 100755 index 00000000000..c1bf4aee145 --- /dev/null +++ b/contrib/orioledb/docker/tests/orioledb-basics/run.sh @@ -0,0 +1,259 @@ +#!/bin/bash +# shellcheck disable=SC2119,SC2120 +set -eo pipefail + +image="$1" + +export POSTGRES_USER='my cool orioledb user' +export POSTGRES_PASSWORD='my cool orioledb password' +export POSTGRES_DB='my cool orioledb database' + +cname="orioletest-container-$RANDOM-$RANDOM" +cid="$(docker run -d -e POSTGRES_USER -e POSTGRES_PASSWORD -e POSTGRES_DB --name "$cname" "$image")" +trap 'docker rm -vf "$cid" > /dev/null' EXIT + +psql() { + docker run --rm -i \ + --link "$cname":orioletest \ + --entrypoint psql \ + -e PGPASSWORD="$POSTGRES_PASSWORD" \ + "$image" \ + --host orioletest \ + --username "$POSTGRES_USER" \ + --dbname "$POSTGRES_DB" \ + --quiet --no-align --tuples-only \ + --set=ON_ERROR_STOP=1 \ + "$@" +} + +# Set default values for POSTGRES_TEST_TRIES and 
POSTGRES_TEST_SLEEP if they are not set. +# You can change the default value of POSTGRES_TEST_TRIES and the POSTGRES_TEST_SLEEP in the CI build settings. +# For special cases like Buildx/qemu tests, you may need to set POSTGRES_TEST_TRIES to 42. +: "${POSTGRES_TEST_TRIES:=15}" +: "${POSTGRES_TEST_SLEEP:=2}" +tries="$POSTGRES_TEST_TRIES" +while ! echo 'SELECT 1' | psql &>/dev/null; do + ((tries--)) + if [ $tries -le 0 ]; then + echo >&2 'postgres failed to accept connections in a reasonable amount of time!' + echo 'SELECT 1' | psql # to hopefully get a useful error message + false + fi + sleep "$POSTGRES_TEST_SLEEP" +done + + +# minimal OrioleDB test +psql <<'EOSQL' + + CREATE EXTENSION IF NOT EXISTS orioledb; + SELECT orioledb_commit_hash(); + CREATE TABLE o_test_generated ( + a int, + b int GENERATED ALWAYS AS (a * 2) STORED + ) USING orioledb; + INSERT INTO o_test_generated VALUES (1), (2); + SELECT * FROM o_test_generated; + +EOSQL + +echo "SELECT version();" | psql +echo "\dx" | psql + +# Helper: assert two values are equal; print PASS/FAIL and exit on failure +assert_eq() { + local description="$1" + local expected="$2" + local actual="$3" + if [ "$expected" = "$actual" ]; then + echo "PASS: $description" + else + echo "FAIL: $description" + echo " expected: $(printf '%q' "$expected")" + echo " actual: $(printf '%q' "$actual")" + exit 1 + fi +} + +echo "" +echo "=== Test: Dev functions available (required for regression tests) ===" +# Verify that IS_DEV=1 build produced all dev-only functions. +# If any are missing, .dockerignore likely leaks a stale generated SQL file. 
+result=$(echo "SELECT count(*) FROM pg_proc WHERE proname = 'orioledb_parallel_debug_start';" | psql) +assert_eq "dev functions: orioledb_parallel_debug_start (orioledb--1.0_dev.sql)" "1" "$result" + +result=$(echo "SELECT count(*) FROM pg_proc WHERE proname = 'orioledb_rewind_set_complete';" | psql) +assert_eq "dev functions: orioledb_rewind_set_complete (orioledb--1.4--1.5_dev.sql)" "1" "$result" + +result=$(echo "SELECT count(*) FROM pg_proc WHERE proname = 'orioledb_insert_sys_xid_undo_location';" | psql) +assert_eq "dev functions: orioledb_insert_sys_xid_undo_location (orioledb--1.5--1.6_dev.sql)" "1" "$result" + +echo "" +echo "=== Test: Primary key CRUD ===" +psql <<'EOSQL' + CREATE TABLE o_test_pk ( + id integer NOT NULL PRIMARY KEY, + val text + ) USING orioledb; + INSERT INTO o_test_pk VALUES (1, 'one'), (2, 'two'), (3, 'three'); + UPDATE o_test_pk SET val = 'ONE' WHERE id = 1; + DELETE FROM o_test_pk WHERE id = 3; + SELECT id, val FROM o_test_pk ORDER BY id; +EOSQL + +result=$(echo "SELECT val FROM o_test_pk WHERE id = 1;" | psql) +assert_eq "primary key: UPDATE changed value" "ONE" "$result" + +result=$(echo "SELECT count(*) FROM o_test_pk;" | psql) +assert_eq "primary key: 2 rows after insert/update/delete" "2" "$result" + +echo "" +echo "=== Test: Secondary index scan ===" +psql <<'EOSQL' + CREATE TABLE o_test_idx ( + id integer NOT NULL PRIMARY KEY, + score integer + ) USING orioledb; + CREATE INDEX o_test_idx_score ON o_test_idx (score); + INSERT INTO o_test_idx SELECT i, i * 10 FROM generate_series(1, 20) AS i; +EOSQL + +result=$(echo "SELECT count(*) FROM o_test_idx WHERE score BETWEEN 50 AND 100;" | psql) +assert_eq "secondary index: rows with score 50-100" "6" "$result" + +# Force index scan and verify the planner actually uses our secondary index. 
+explain_plan=$(echo "SET enable_seqscan = off; EXPLAIN SELECT count(*) FROM o_test_idx WHERE score BETWEEN 50 AND 100;" | psql) +if [[ "$explain_plan" != *"o_test_idx_score"* ]]; then + echo "FAIL: secondary index: expected EXPLAIN to use o_test_idx_score" >&2 + echo "$explain_plan" >&2 + exit 1 +fi +echo "PASS: secondary index: EXPLAIN confirms index scan on o_test_idx_score" + +echo "" +echo "=== Test: NULL handling ===" +psql <<'EOSQL' + CREATE TABLE o_test_nulls ( + id integer NOT NULL PRIMARY KEY, + val integer + ) USING orioledb; + INSERT INTO o_test_nulls VALUES (1, 10), (2, NULL), (3, 30), (4, NULL); +EOSQL + +result=$(echo "SELECT count(*) FROM o_test_nulls WHERE val IS NULL;" | psql) +assert_eq "nulls: IS NULL count" "2" "$result" + +result=$(echo "SELECT count(*) FROM o_test_nulls WHERE val IS NOT NULL;" | psql) +assert_eq "nulls: IS NOT NULL count" "2" "$result" + +echo "" +echo "=== Test: INSERT ON CONFLICT DO NOTHING ===" +psql <<'EOSQL' + CREATE TABLE o_test_ioc ( + id integer NOT NULL PRIMARY KEY, + val text + ) USING orioledb; + INSERT INTO o_test_ioc VALUES (1, 'original'); + INSERT INTO o_test_ioc VALUES (1, 'conflict') ON CONFLICT (id) DO NOTHING; + INSERT INTO o_test_ioc VALUES (2, 'new') ON CONFLICT (id) DO NOTHING; +EOSQL + +result=$(echo "SELECT val FROM o_test_ioc WHERE id = 1;" | psql) +assert_eq "ioc: DO NOTHING keeps original value" "original" "$result" + +result=$(echo "SELECT count(*) FROM o_test_ioc;" | psql) +assert_eq "ioc: 2 rows total" "2" "$result" + +echo "" +echo "=== Test: INSERT ON CONFLICT DO UPDATE (upsert) ===" +psql <<'EOSQL' + CREATE TABLE o_test_upsert ( + id integer NOT NULL PRIMARY KEY, + hits integer NOT NULL DEFAULT 0 + ) USING orioledb; + INSERT INTO o_test_upsert VALUES (1, 1); + INSERT INTO o_test_upsert AS t VALUES (1, 1) + ON CONFLICT (id) DO UPDATE SET hits = t.hits + EXCLUDED.hits; + INSERT INTO o_test_upsert AS t VALUES (1, 1) + ON CONFLICT (id) DO UPDATE SET hits = t.hits + EXCLUDED.hits; +EOSQL + 
+result=$(echo "SELECT hits FROM o_test_upsert WHERE id = 1;" | psql) +assert_eq "upsert: hits accumulated to 3" "3" "$result" + +echo "" +echo "=== Test: DDL ALTER TABLE ADD COLUMN ===" +psql <<'EOSQL' + CREATE TABLE o_test_ddl ( + id integer NOT NULL PRIMARY KEY, + val text + ) USING orioledb; + INSERT INTO o_test_ddl VALUES (1, 'hello'); + ALTER TABLE o_test_ddl ADD COLUMN extra integer DEFAULT 42; +EOSQL + +result=$(echo "SELECT extra FROM o_test_ddl WHERE id = 1;" | psql) +assert_eq "ddl: added column has default value" "42" "$result" + +echo "" +echo "=== Test: Foreign key constraint ===" +psql <<'EOSQL' + CREATE TABLE o_test_fk_parent ( + id integer NOT NULL PRIMARY KEY, + val text + ) USING orioledb; + CREATE TABLE o_test_fk_child ( + id integer NOT NULL PRIMARY KEY, + parent_id integer NOT NULL REFERENCES o_test_fk_parent (id) + ) USING orioledb; + INSERT INTO o_test_fk_parent VALUES (1, 'parent'); + INSERT INTO o_test_fk_child VALUES (10, 1); +EOSQL + +result=$(echo "SELECT count(*) FROM o_test_fk_child;" | psql) +assert_eq "fk: child row inserted" "1" "$result" + +# FK violation must raise an error +fk_err=$(psql <<'EOSQL' 2>&1 || true + INSERT INTO o_test_fk_child VALUES (99, 999); +EOSQL +) +if echo "$fk_err" | grep -q 'violates foreign key constraint'; then + echo "PASS: fk: violation correctly rejected" +else + echo "FAIL: fk: expected FK violation error, got: $fk_err" + exit 1 +fi + +echo "" +echo "=== Test: TRUNCATE ===" +psql <<'EOSQL' + CREATE TABLE o_test_truncate ( + id integer NOT NULL PRIMARY KEY + ) USING orioledb; + INSERT INTO o_test_truncate SELECT i FROM generate_series(1, 100) AS i; + TRUNCATE o_test_truncate; +EOSQL + +result=$(echo "SELECT count(*) FROM o_test_truncate;" | psql) +assert_eq "truncate: table is empty after TRUNCATE" "0" "$result" + +echo "" +echo "=== Test: Generated columns ===" +psql <<'EOSQL' + CREATE TABLE o_test_generated_multi ( + a integer NOT NULL PRIMARY KEY, + b integer GENERATED ALWAYS AS (a * a) STORED, + c 
text GENERATED ALWAYS AS ('item_' || a::text) STORED + ) USING orioledb; + INSERT INTO o_test_generated_multi (a) VALUES (3), (5); +EOSQL + +result=$(echo "SELECT b FROM o_test_generated_multi WHERE a = 5;" | psql) +assert_eq "generated: square of 5 is 25" "25" "$result" + +result=$(echo "SELECT c FROM o_test_generated_multi WHERE a = 3;" | psql) +assert_eq "generated: text column for a=3" "item_3" "$result" + +echo "" +echo "All smoke tests passed." diff --git a/contrib/orioledb/include/btree/btree.h b/contrib/orioledb/include/btree/btree.h new file mode 100644 index 00000000000..4fcd5ec69b7 --- /dev/null +++ b/contrib/orioledb/include/btree/btree.h @@ -0,0 +1,432 @@ +/*------------------------------------------------------------------------- + * + * btree.h + * General declarations for OrioleDB B-tree implementation + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/btree.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_H__ +#define __BTREE_H__ + +#include "transam/oxid.h" +#include "transam/undo.h" +#include "utils/seq_buf.h" + +#include "access/sdir.h" +#include "lib/stringinfo.h" +#include "storage/bufpage.h" +#include "storage/fd.h" +#include "storage/off.h" + +#define BTREE_NUM_META_LWLOCKS (128) + +typedef struct BTreeDescr BTreeDescr; +typedef struct BTreeIterator BTreeIterator; +typedef struct CheckpointFileHeader CheckpointFileHeader; + +typedef uint16 OIndexNumber; +typedef uint64 OTupleXactInfo; + +#define PrimaryIndexNumber (0) +#define BridgeIndexNumber (0xFFFD) +#define TOASTIndexNumber (0xFFFE) +#define InvalidIndexNumber (0xFFFF) + +typedef enum BTreeKeyType +{ + BTreeKeyLeafTuple, + BTreeKeyNonLeafKey, + BTreeKeyBound, + BTreeKeyUniqueLowerBound, + BTreeKeyUniqueUpperBound, + /* following values are never passed to comparison function */ + BTreeKeyNone, + BTreeKeyPageHiKey, + BTreeKeyRightmost 
+} BTreeKeyType; + +#define IS_BOUND_KEY_TYPE(keyType) \ + ((keyType) == BTreeKeyBound || \ + (keyType) == BTreeKeyUniqueLowerBound || \ + (keyType) == BTreeKeyUniqueUpperBound) + +typedef int (*OBTreeKeyCmp) (BTreeDescr *descr, + void *p1, BTreeKeyType k1, + void *p2, BTreeKeyType k2); + +typedef struct +{ + OInMemoryBlkno rootPageBlkno; + uint32 rootPageChangeCount; + OInMemoryBlkno metaPageBlkno; +} BTreeRootInfo; + +typedef enum +{ + /* in-memory only BTree: no eviction and no checkpoint support */ + BTreeStorageInMemory, + /* no checkpoint support, but pages can be evicted to disk */ + BTreeStorageTemporary, + /* like BTreeStoragePersistence, but no WAL for data modifications */ + BTreeStorageUnlogged, + /* pages support both checkpointing and eviction */ + BTreeStoragePersistence +} BTreeStorageType; + +typedef enum BTreeOperationType +{ + BTreeOperationInsert, + BTreeOperationLock, + BTreeOperationUpdate, + BTreeOperationDelete +} BTreeOperationType; + +typedef enum BTreeLeafTupleDeletedStatus +{ + BTreeLeafTupleNonDeleted = 0, + BTreeLeafTupleDeleted = 1, + BTreeLeafTupleMovedPartitions = 2, + BTreeLeafTuplePKChanged = 3 +} BTreeLeafTupleDeletedStatus; + +typedef struct +{ + Pointer data; + uint8 formatFlags; +} OTuple; + +#define O_TUPLE_IS_NULL(tup) ((tup).data == NULL) +#define O_TUPLE_SET_NULL(tup) \ + do { \ + (tup).data = NULL; \ + (tup).formatFlags = 0; \ + } while (false) + +typedef union +{ + struct + { + File *files; + int filesAllocated; + } array; + struct s3Files_hash *hash; +} OSmgr; + +typedef enum +{ + OTupleLength, + OKeyLength, + OTupleKeyLength, + OTupleKeyLengthNoVersion +} OLengthType; + +typedef struct +{ + /* + * Get the length of the given `tuple`, measured as `type`. Must be safe + * for critical sections. + */ + int (*len) (BTreeDescr *desc, OTuple tuple, OLengthType type); + + /* + * Converts a BTreeKeyLeafTuple into a BTreeKeyNonLeafKey. If `data` is + * given, the result is written there.
Otherwise, it may allocate memory or use static + * memory for the result (the `*allocated` flag reflects this). When + * `data` is given, this function must be safe for the critical section. + */ + OTuple (*tuple_make_key) (BTreeDescr *desc, OTuple tuple, Pointer data, + bool keepVersion, bool *allocated); + + JsonbValue *(*key_to_jsonb) (BTreeDescr *desc, OTuple key, + JsonbParseState **state); + bool (*needs_undo) (BTreeDescr *desc, BTreeOperationType action, + OTuple oldTuple, OTupleXactInfo oldXactInfo, bool oldDeleted, + OTuple newTuple, OXid newOxid); + uint32 (*hash) (BTreeDescr *desc, OTuple tuple, BTreeKeyType tupleType); + uint32 (*unique_hash) (BTreeDescr *desc, OTuple tuple); + OBTreeKeyCmp cmp; +} BTreeOps; + +#define MAX_NUM_DIRTY_PARTS 4 + +/* + * Pending data file parts to be synchronized with S3. + */ +typedef struct +{ + struct + { + uint32 chkpNum; + int32 segNum; + int32 partNum; + } dirtyParts[MAX_NUM_DIRTY_PARTS]; + S3TaskLocation writeMaxLocation; +} BTreeS3PartsInfo; + +/* + * Backend-local free-extent list used by user temporary trees. + * + * User temporary trees live entirely inside the process that created them + * (root, meta page and data file are all backend-private), so their free + * space map must not touch shared checkpoint state. Instead of routing + * freed extents through the checkpoint-tagged seq bufs, we keep a plain + * array of free extents on the descriptor. get_free_disk_extent() serves + * allocations from this array first, falling back to extending the data + * file. + * + * This path is taken only when `ppool` is the backend-local pool; system + * trees that happen to be BTreeStorageTemporary still share their pool and + * continue to use the shared seq-buf machinery. 
+ */ +typedef struct BTreeLocalFreeExtents +{ + FileExtent *items; + int size; + int capacity; +} BTreeLocalFreeExtents; + +struct BTreeDescr +{ + BTreeRootInfo rootInfo; + void *arg; + OSmgr smgr; + ORelOids oids; + Oid tablespace; + OIndexType type; + PagePool *ppool; + OCompress compress; + uint8 fillfactor; + UndoLogType undoType; + BTreeStorageType storageType; + + /* + * Per-backend private seq buf descriptors. The corresponding shared + * state lives in the BTreeMetaPage (freeBuf, nextChkp[], tmpBuf[]). + * + * freeBuf – reads the free-extent file produced by the previous + * checkpoint so that the current checkpoint can reuse those disk + * locations. + * + * nextChkp[2] – writes the checkpoint map file for the current + * checkpoint. Indexed by (checkpointNumber % 2) so that two consecutive + * checkpoints use different slots and can coexist without interference. + * Only one slot is active at any time; the other is uninitialised or + * belongs to the previous (already completed) checkpoint. + * + * tmpBuf[2] – writes the temporary file used during the current + * checkpoint walk to track dirty and newly placed pages. Also indexed by + * (checkpointNumber % 2). The file is consumed during post-processing + * (sort + hole-punch) and removed once the checkpoint is complete. + */ + SeqBufDescPrivate freeBuf; + SeqBufDescPrivate nextChkp[2]; + SeqBufDescPrivate tmpBuf[2]; + BTreeS3PartsInfo buildPartsInfo[2]; + OXid createOxid; + BTreeOps *ops; + + /* + * Backend-local free space map for user temporary trees. Lazily allocated + * on first free_extent_for_checkpoint() call; NULL otherwise.
+ */ + BTreeLocalFreeExtents *localFreeExtents; +}; + +static inline int +o_btree_len(BTreeDescr *desc, OTuple tuple, OLengthType type) +{ + return desc->ops->len(desc, tuple, type); +} + +static inline OTuple +o_btree_tuple_make_key(BTreeDescr *desc, OTuple tuple, Pointer data, + bool keepVersion, bool *allocated) +{ + return desc->ops->tuple_make_key(desc, tuple, data, keepVersion, allocated); +} + +static inline JsonbValue * +o_btree_key_to_jsonb(BTreeDescr *desc, OTuple key, JsonbParseState **state) +{ + return desc->ops->key_to_jsonb(desc, key, state); +} + +static inline bool +o_btree_needs_undo(BTreeDescr *desc, BTreeOperationType action, + OTuple oldTuple, OTupleXactInfo oldXactInfo, bool oldDeleted, + OTuple newTuple, OXid newOxid) +{ + return (desc->ops->needs_undo != NULL) && + desc->ops->needs_undo(desc, action, oldTuple, oldXactInfo, + oldDeleted, newTuple, newOxid); +} + +static inline uint32 +o_btree_hash(BTreeDescr *desc, OTuple tuple, BTreeKeyType tupleType) +{ + return desc->ops->hash(desc, tuple, tupleType); +} + +static inline uint32 +o_btree_unique_hash(BTreeDescr *desc, OTuple tuple) +{ + return desc->ops->unique_hash(desc, tuple); +} + +static inline int +o_btree_cmp(BTreeDescr *desc, void *p1, BTreeKeyType k1, + void *p2, BTreeKeyType k2) +{ + return desc->ops->cmp(desc, p1, k1, p2, k2); +} + + +typedef struct BTreePageItemLocator BTreePageItemLocator; + +typedef struct +{ + OInMemoryBlkno blkno; + uint32 pageChangeCount; +} BTreeLocationHint; + +typedef struct +{ + BTreeLocationHint hint; + CommitSeqNo csn; + uint32 version; +} ORowIdAddendumCtid; + +typedef struct +{ + BTreeLocationHint hint; + CommitSeqNo csn; + uint8 flags; +} ORowIdAddendumNonCtid; + +typedef struct +{ + ItemPointerData bridgeCtid; + bool bridgeChanged; +} ORowIdBridgeData; + +bytea *o_new_rowid(OIndexDescr *primary, TupleTableSlot *slot, + Datum *rowid_values, bool *rowid_isnull, + CommitSeqNo tupleCsn, BTreeLocationHint *hint); + +/* + * Check if given tree has 
assigned datoid, reloid and relnode. + */ +#define TREE_HAS_OIDS(desc) (ORelOidsIsValid((desc)->oids)) + +/* + * Get number of tree leaf pages. + */ +#define TREE_NUM_LEAF_PAGES(desc) \ + (pg_atomic_read_u32(&BTREE_GET_META(desc)->leafPagesNum)) + +/* + * Check if given tree needs WAL and XIP records. Currently, only primary index + * tree and TOAST tree need it. Argument is (BTreeDescr *). + */ +#define TREE_NEEDS_WAL(desc) \ + (TREE_HAS_OIDS(desc) && \ + ((desc)->type == oIndexPrimary || (desc)->type == oIndexToast)) + +/* btree.c */ +typedef enum OBTreeModifyCallbackAction +{ + OBTreeCallbackActionDoNothing = 1, + OBTreeCallbackActionUpdate = 2, + OBTreeCallbackActionDelete = 3, + OBTreeCallbackActionLock = 4, + OBTreeCallbackActionUndo = 5 +} OBTreeModifyCallbackAction; + +typedef enum OBTreeWaitCallbackAction +{ + OBTreeCallbackActionXidNoWait = 1, + OBTreeCallbackActionXidWait = 2, + OBTreeCallbackActionXidExit = 3 +} OBTreeWaitCallbackAction; + +typedef enum OBTreeModifyResult +{ + OBTreeModifyResultInserted = 1, + OBTreeModifyResultUpdated = 2, + OBTreeModifyResultDeleted = 3, + OBTreeModifyResultLocked = 4, + OBTreeModifyResultFound = 5, + OBTreeModifyResultNotFound = 6 +} OBTreeModifyResult; + +typedef enum RowLockMode +{ + RowLockKeyShare = 0, + RowLockShare = 1, + RowLockNoKeyUpdate = 2, + RowLockUpdate = 3 +} RowLockMode; + +#define ROW_LOCKS_CONFLICT(lock1, lock2) ((lock1) + (lock2) >= 3) + +/* + * OTupleXactInfo contains information about transaction, lock mode, lock only + * flag. 
+ */ +#define XACT_INFO_LOCK_ONLY_BIT \ + UINT64CONST(0x1000000000000000) +#define XACT_INFO_LOCK_MODE_MASK \ + UINT64CONST(0x0C00000000000000) +#define XACT_INFO_LOCK_OXID_MASK \ + UINT64CONST(0x03FFFFFFFFFFFFFF) +#define XACT_INFO_LOCK_MODE_SHIFT \ + (58) +#define XACT_INFO_IS_LOCK_ONLY(xactInfo) \ + ((xactInfo) & XACT_INFO_LOCK_ONLY_BIT) +#define XACT_INFO_MAP_CSN(xactInfo) \ + (oxid_get_csn(XACT_INFO_GET_OXID((xactInfo)), false)) +#define XACT_INFO_GET_OXID(xactInfo) \ + ((xactInfo) & XACT_INFO_LOCK_OXID_MASK) +#define XACT_INFO_OXID_EQ(xactInfo, oxid) \ + (XACT_INFO_GET_OXID((xactInfo)) == (oxid)) +#define XACT_INFO_OXID_IS_CURRENT(xactInfo) \ + (XACT_INFO_GET_OXID((xactInfo)) == get_current_oxid_if_any()) +#define XACT_INFO_IS_FINISHED(xactInfo) \ + (xid_is_finished(XACT_INFO_GET_OXID(xactInfo))) +#define XACT_INFO_FINISHED_FOR_EVERYBODY(xactInfo) \ + (xid_is_finished_for_everybody(XACT_INFO_GET_OXID(xactInfo))) +#define XACT_INFO_GET_LOCK_MODE(xactInfo) \ + (((xactInfo) & XACT_INFO_LOCK_MODE_MASK) >> XACT_INFO_LOCK_MODE_SHIFT) +#define OXID_GET_XACT_INFO(oxid, lockmode, lockonly) \ + (AssertMacro(((lockmode) & (XACT_INFO_LOCK_MODE_MASK >> XACT_INFO_LOCK_MODE_SHIFT)) == (lockmode)), \ + (OTupleXactInfo)(oxid) | ((OTupleXactInfo) (lockmode) << XACT_INFO_LOCK_MODE_SHIFT) | \ + ((lockonly) ? 
XACT_INFO_LOCK_ONLY_BIT : 0)) + +/* btree/btree.c */ +extern LWLockPadded *unique_locks; +extern int num_unique_locks; +typedef struct ItemPointerData ItemPointerData; + +extern void o_btree_check_size_of_tuple(int len, char *relation_name, bool index); +extern void o_btree_init_unique_lwlocks(void); +extern void o_btree_init(BTreeDescr *descr); +extern void o_btree_cleanup_pages(OInMemoryBlkno root, OInMemoryBlkno metaPageBlkno, + uint32 rootPageChangeCount); +extern ItemPointerData btree_ctid_get_and_inc(BTreeDescr *desc); +extern ItemPointerData btree_bridge_ctid_get_and_inc(BTreeDescr *desc, bool *overflow); +extern void btree_ctid_update_if_needed(BTreeDescr *desc, ItemPointerData ctid); +extern void btree_desc_stopevent_params_internal(BTreeDescr *desc, + JsonbParseState **state); +extern void btree_page_stopevent_params_internal(BTreeDescr *desc, Page p, + JsonbParseState **state); +extern Jsonb *btree_page_stopevent_params(BTreeDescr *desc, Page p); +extern Jsonb *btree_downlink_stopevent_params(BTreeDescr *desc, Page p, + BTreePageItemLocator *loc); + +#endif /* __BTREE_H__ */ diff --git a/contrib/orioledb/include/btree/build.h b/contrib/orioledb/include/btree/build.h new file mode 100644 index 00000000000..ed6040ddab7 --- /dev/null +++ b/contrib/orioledb/include/btree/build.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * build.h + * Declarations for sort-based B-tree index building. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/btree/build.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_BUILD_H__ +#define __BTREE_BUILD_H__ + +#include "btree.h" + +typedef struct Tuplesortstate Tuplesortstate; + +extern void btree_write_index_data(BTreeDescr *desc, TupleDesc tupdesc, + Tuplesortstate *sortstate, + uint64 ctid, uint64 bridge_ctid, + CheckpointFileHeader *file_header); +extern S3TaskLocation btree_write_file_header(BTreeDescr *desc, + CheckpointFileHeader *file_header); + +#endif /* __BTREE_BUILD_H__ */ diff --git a/contrib/orioledb/include/btree/check.h b/contrib/orioledb/include/btree/check.h new file mode 100644 index 00000000000..30b50cfe334 --- /dev/null +++ b/contrib/orioledb/include/btree/check.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * + * check.h + * Declarations for checking the OrioleDB B-tree structure. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/btree/check.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_CHECK_H__ +#define __BTREE_CHECK_H__ + +#include "btree/btree.h" + +typedef struct +{ + int from; + int to; + int leaf_count; + int node_count; +} BTreeCompressRange; + +typedef struct +{ + int errors; + int oversize; + int nranges; + int64 totalSize; + int64 totalCompressedSize; + BTreeCompressRange *ranges; +} BTreeCompressStats; + +extern bool check_btree(BTreeDescr *desc, bool force_file_check, + bool wait_for_checkpoint); +extern void check_btree_compression(BTreeDescr *desc, + BTreeCompressStats *stats, + OCompress lvl); + +#endif /* __BTREE_CHECK_H__ */ diff --git a/contrib/orioledb/include/btree/fastpath.h b/contrib/orioledb/include/btree/fastpath.h new file mode 100644 index 00000000000..f14b0af5d0f --- /dev/null +++ b/contrib/orioledb/include/btree/fastpath.h @@ -0,0 +1,63 @@ +/*------------------------------------------------------------------------- + * + * fastpath.h + * Declarations for fastpath intra-page navigation in B-tree. + * + * Copyright (c) 2025-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/btree/fastpath.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_FASTPATH_H__ +#define __BTREE_FASTPATH_H__ + +#include "btree/btree.h" +#include "btree/find.h" +#include "btree/page_contents.h" + +#define FASTPATH_FIND_DOWNLINK_MAX_KEYS (4) +#define FASTPATH_FIND_DOWNLINK_FLAG_MINUS_INF (1) +#define FASTPATH_FIND_DOWNLINK_FLAG_PLUS_INF (2) + +typedef void (*ArraySearchFunc) (Pointer p, int stride, + int *lower, int *upper, Datum keyDatum); + +typedef struct +{ + bool enabled; + bool inclusive; + int numKeys; + int length; + + Datum offsets[FASTPATH_FIND_DOWNLINK_MAX_KEYS]; + ArraySearchFunc funcs[FASTPATH_FIND_DOWNLINK_MAX_KEYS]; + Datum values[FASTPATH_FIND_DOWNLINK_MAX_KEYS]; + uint8 flags[FASTPATH_FIND_DOWNLINK_MAX_KEYS]; +} FastpathFindDownlinkMeta; + +typedef enum +{ + OBTreeFastPathFindOK, + OBTreeFastPathFindRetry, + OBTreeFastPathFindFailure, + OBTreeFastPathFindSlowpath +} OBTreeFastPathFindResult; + +extern void can_fastpath_find_downlink(OBTreeFindPageContext *context, + void *key, + BTreeKeyType keyType, + FastpathFindDownlinkMeta *meta); +extern OBTreeFastPathFindResult fastpath_find_chunk(Pointer pagePtr, + OInMemoryBlkno blkno, + FastpathFindDownlinkMeta *meta, + int *chunkIndex); +extern OBTreeFastPathFindResult fastpath_find_downlink(Pointer pagePtr, + OInMemoryBlkno blkno, + FastpathFindDownlinkMeta *meta, + BTreePageItemLocator *loc, + BTreeNonLeafTuphdr **tuphdrPtr); + +#endif /* __BTREE_FASTPATH_H__ */ diff --git a/contrib/orioledb/include/btree/find.h b/contrib/orioledb/include/btree/find.h new file mode 100644 index 00000000000..ab74b5bcc50 --- /dev/null +++ b/contrib/orioledb/include/btree/find.h @@ -0,0 +1,114 @@ +/*------------------------------------------------------------------------- + * + * find.h + * Declarations for finding page in orioledb B-tree. + * + * Copyright (c) 2021-2026, Oriole DB Inc. 
+ * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/find.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_FIND_H__ +#define __BTREE_FIND_H__ + +#include "btree.h" +#include "btree/page_contents.h" + +typedef struct +{ + OInMemoryBlkno blkno; + uint32 pageChangeCount; + BTreePageItemLocator locator; +} OBtreePageFindItem; + +struct PartialPageState +{ + Page src; + bool isPartial; + bool hikeysChunkIsLoaded; + bool chunkIsLoaded[BTREE_PAGE_MAX_CHUNKS]; +}; + +typedef struct +{ + BTreeDescr *desc; + Pointer img; + Pointer parentImg; + char imgData[ORIOLEDB_BLCKSZ]; + char parentImgData[ORIOLEDB_BLCKSZ]; + PartialPageState partial; + CommitSeqNo csn; + CommitSeqNo imgReadCsn; + UndoLocation imgUndoLoc; + int index; + OBtreePageFindItem items[ORIOLEDB_MAX_DEPTH]; + OTupleXactInfo insertXactInfo; + OTuple insertTuple; + + /* + * When BTREE_PAGE_FIND_LOKEY_SIBLING is not set, lokey contains the + * hikey of the left sibling of the parent. Otherwise, it contains the + * hikey of the left sibling. + */ + OFixedKey lokey; + + /* + * Helps to avoid overwriting a hikey by an inconsistent read of the + * image from the undo log. + * + * BTREE_PAGE_FIND_LOKEY_UNDO is set when present.
+ */ + OFixedKey undoLokey; + uint16 flags; +} OBTreeFindPageContext; + +/* OBTreeFindPageContext flags */ +#define BTREE_PAGE_FIND_KEEP_LOKEY (0x0001) +#define BTREE_PAGE_FIND_LOKEY_EXISTS (0x0002) +#define BTREE_PAGE_FIND_LOKEY_SIBLING (0x0004) +#define BTREE_PAGE_FIND_LOKEY_UNDO (0x0008) +#define BTREE_PAGE_FIND_TRY_LOCK (0x0010) +#define BTREE_PAGE_FIND_FIX_LEAF_SPLIT (0x0020) +#define BTREE_PAGE_FIND_NO_FIX_SPLIT (0x0040) +#define BTREE_PAGE_FIND_MODIFY (0x0080) +#define BTREE_PAGE_FIND_FETCH (0x0100) +#define BTREE_PAGE_FIND_IMAGE (0x0200) +#define BTREE_PAGE_FIND_DOWNLINK_LOCATION (0x0400) +#define BTREE_PAGE_FIND_READ_CSN (0x0800) + +#define BTREE_PAGE_FIND_SET(context, flag) ((context)->flags |= BTREE_PAGE_FIND_##flag) +#define BTREE_PAGE_FIND_UNSET(context, flag) ((context)->flags &= ~(BTREE_PAGE_FIND_##flag)) +#define BTREE_PAGE_FIND_IS(context, flag) (((context)->flags & BTREE_PAGE_FIND_##flag)? true : false) + +typedef enum +{ + OFindPageResultSuccess, + OFindPageResultFailure, + OFindPageResultInserted +} OFindPageResult; + +extern bool btree_page_search(BTreeDescr *desc, Page p, Pointer key, + BTreeKeyType keyType, PartialPageState *partial, + BTreePageItemLocator *locator); +extern void init_page_find_context(OBTreeFindPageContext *context, + BTreeDescr *desc, + CommitSeqNo csn, uint16 flags); + +extern OFindPageResult find_page(OBTreeFindPageContext *context, void *key, + BTreeKeyType keyType, uint16 targetLevel); +extern OFindPageResult refind_page(OBTreeFindPageContext *context, void *key, + BTreeKeyType keyType, uint16 level, + OInMemoryBlkno blkno, uint32 pageChangeCount); + +extern bool find_right_page(OBTreeFindPageContext *context, OFixedKey *hikey); +extern bool find_left_page(OBTreeFindPageContext *context, OFixedKey *hikey); +extern OTuple btree_find_context_lokey(OBTreeFindPageContext *context); +extern void btree_find_context_from_modify_to_read(OBTreeFindPageContext *context, + Pointer key, + BTreeKeyType keyType, + uint16 level); + 
+#endif /* __BTREE_FIND_H__ */ diff --git a/contrib/orioledb/include/btree/insert.h b/contrib/orioledb/include/btree/insert.h new file mode 100644 index 00000000000..0008a1b1560 --- /dev/null +++ b/contrib/orioledb/include/btree/insert.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * insert.h + * Declarations for inserting tuples into B-tree. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/insert.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_INSERT_H__ +#define __BTREE_INSERT_H__ + +#include "btree.h" +#include "btree/find.h" + +extern void o_btree_split_fix_and_unlock(BTreeDescr *descr, + OInMemoryBlkno left_blkno); +extern void o_btree_split_fix_for_right_page_and_unlock(BTreeDescr *desc, + OInMemoryBlkno rightBlkno); +extern void o_btree_insert_tuple_to_leaf(OBTreeFindPageContext *context, + OTuple tuple, LocationIndex tuplen, + BTreeLeafTuphdr *leaf_header, + bool replace, + int reserve_kind); +extern bool o_btree_split_is_incomplete(OInMemoryBlkno left_blkno, + uint32 pageChangeCount, + bool *relocked); + +#endif /* __BTREE_INSERT_H__ */ diff --git a/contrib/orioledb/include/btree/io.h b/contrib/orioledb/include/btree/io.h new file mode 100644 index 00000000000..d7f203b38ae --- /dev/null +++ b/contrib/orioledb/include/btree/io.h @@ -0,0 +1,71 @@ +/*------------------------------------------------------------------------- + * + * io.h + * Declarations for orioledb B-tree IO. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/btree/io.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_IO_H__ +#define __BTREE_IO_H__ + +#include "btree.h" +#include "btree/find.h" +#include "btree/undo.h" + +typedef enum OWalkPageResult +{ + OWalkPageSkipped, + OWalkPageWritten, + OWalkPageEvicted, + OWalkPageMerged, +} OWalkPageResult; + +extern Size btree_io_shmem_needs(void); +extern void btree_io_shmem_init(Pointer buf, bool found); +extern void btree_io_error_cleanup(void); +extern void request_btree_io_lwlocks(void); +extern int assign_io_num(OInMemoryBlkno blkno, OffsetNumber offnum); +extern OWalkPageResult walk_page(OInMemoryBlkno blkno, bool evict); +extern void unlock_io(int ionum); +extern void wait_for_io_completion(int ionum); +extern bool cleanup_btree_files(OIndexKey key, bool fsync); +extern bool fsync_btree_files(OIndexKey key); +extern int OFileRead(File file, char *buffer, int amount, off_t offset, + uint32 wait_event_info); +extern int OFileWrite(File file, char *buffer, int amount, off_t offset, + uint32 wait_event_info); +extern void btree_init_smgr(BTreeDescr *descr); +extern void btree_open_smgr(BTreeDescr *descr); +extern void btree_close_smgr(BTreeDescr *descr); +extern char *btree_filename(OIndexKey key, int segno, uint32 chkpNum); +extern char *btree_smgr_filename(BTreeDescr *desc, off_t offset, + uint32 chkpNum); +extern int btree_smgr_read(BTreeDescr *desc, char *buffer, uint32 chkpNum, + int amount, off_t offset); +extern void btree_smgr_writeback(BTreeDescr *desc, uint32 chkpNum, + off_t offset, int amount); +extern void btree_smgr_sync(BTreeDescr *desc, uint32 chkpNum, off_t length); +extern void btree_smgr_punch_hole(BTreeDescr *desc, uint32 chkpNum, + off_t offset, int length); +extern void punch_fd_hole(int fd, off_t offset, off_t length, + const char *fileName); +extern void init_btree_io_lwlocks(void); +extern bool read_page_from_disk(BTreeDescr *desc, Pointer 
img, uint64 downlink, FileExtent *extent); +extern void load_page(OBTreeFindPageContext *context); +extern uint64 perform_page_io(BTreeDescr *desc, OInMemoryBlkno blkno, + Page img, uint32 checkpoint_number, + bool copy_blkno, bool *dirty_parent); +extern uint64 perform_page_io_autonomous(BTreeDescr *desc, uint32 chkpNum, + Page img, FileExtent *extent); +extern uint64 perform_page_io_build(BTreeDescr *desc, Page img, + FileExtent *extent, BTreeMetaPage *metaPageBlkno); +extern BTreeDescr *index_oids_get_btree_descr(ORelOids oids, OIndexType type); +extern void try_to_punch_holes(BTreeDescr *desc); + +#endif /* __BTREE_IO_H__ */ diff --git a/contrib/orioledb/include/btree/iterator.h b/contrib/orioledb/include/btree/iterator.h new file mode 100644 index 00000000000..9024ef2e091 --- /dev/null +++ b/contrib/orioledb/include/btree/iterator.h @@ -0,0 +1,93 @@ +/*------------------------------------------------------------------------- + * + * iterator.h + * Declarations of orioledb B-tree iterator. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/iterator.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_ITERATOR_H__ +#define __BTREE_ITERATOR_H__ + +#include "btree.h" +#include "btree/page_contents.h" + +/* + * Return values for TupleFetchCallback used by o_find_tuple_version(). + * + * The callback is invoked while traversing the undo chain: first for the + * on-page tuple, then for each historical version from undo. It selects + * among versions that share the same snapshot csn/xlogptr (e.g. multiple uncommitted + * versions within the same in-progress transaction). + * + * OTupleFetchMatch - the tuple matches; stop and return it. + * OTupleFetchNotMatch - no match found; stop and return NULL. + * OTupleFetchNext - skip this version; continue to the next undo record. 
+ */ +typedef enum +{ + OTupleFetchNext, + OTupleFetchMatch, + OTupleFetchNotMatch +} TupleFetchCallbackResult; + +typedef TupleFetchCallbackResult (*TupleFetchCallback) (OTuple tuple, + OXid tupOxid, + OSnapshot *oSnapshot, + void *arg, + bool oxidIsFinished); + +extern OTuple o_btree_find_tuple_by_key(BTreeDescr *desc, void *key, + BTreeKeyType kind, + OSnapshot *read_o_snapshot, + CommitSeqNo *out_csn, + MemoryContext mcxt, + BTreeLocationHint *hint); + +extern BTreeIterator *o_btree_iterator_create(BTreeDescr *desc, void *key, + BTreeKeyType kind, + OSnapshot *o_snapshot, + ScanDirection scanDir); +extern void o_btree_iterator_set_tuple_ctx(BTreeIterator *it, + MemoryContext tupleCxt); +extern void o_btree_iterator_set_callback(BTreeIterator *it, + TupleFetchCallback callback, + void *arg); +extern OTuple o_btree_iterator_fetch(BTreeIterator *it, + CommitSeqNo *tuple_csn, + void *end, BTreeKeyType endType, + bool endIsIncluded, + BTreeLocationHint *hint); +extern OTuple btree_iterate_raw(BTreeIterator *it, void *end, + BTreeKeyType endKind, bool endInclude, + bool *scanEnd, BTreeLocationHint *hint); +extern OTuple btree_iterate_all(BTreeIterator *it, void *end, + BTreeKeyType endKind, bool endInclude, + bool *scanEnd, BTreeLocationHint *hint, + BTreeLeafTuphdr **tupHdr); +extern void btree_iterator_free(BTreeIterator *it); + +extern OTuple o_btree_find_tuple_by_key_cb(BTreeDescr *desc, void *key, + BTreeKeyType kind, + OSnapshot *read_o_snapshot, + CommitSeqNo *out_csn, + MemoryContext mcxt, + BTreeLocationHint *hint, + bool *deleted, + TupleFetchCallback cb, + void *arg); + +extern OTuple o_find_tuple_version(BTreeDescr *desc, Page p, + BTreePageItemLocator *loc, + OSnapshot *oSnapshot, + CommitSeqNo *tupleCsn, + MemoryContext mcxt, + TupleFetchCallback cb, + void *arg); + +#endif /* __BTREE_ITERATOR_H__ */ diff --git a/contrib/orioledb/include/btree/merge.h b/contrib/orioledb/include/btree/merge.h new file mode 100644 index 00000000000..796470a43f7 --- 
/dev/null +++ b/contrib/orioledb/include/btree/merge.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * merge.h + * Declarations for B-tree pages merge. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/merge.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_MERGE_H__ +#define __BTREE_MERGE_H__ + +#include "btree/btree.h" +#include "btree/page_contents.h" + +extern bool btree_try_merge_pages(BTreeDescr *desc, + OInMemoryBlkno parent_blkno, + OFixedKey *parent_hikey, + bool *merge_parent, + OInMemoryBlkno left_blkno, + BTreePageItemLocator *right_loc, + OInMemoryBlkno right_blkno, + bool checkpoint); +extern bool btree_try_merge_and_unlock(BTreeDescr *desc, OInMemoryBlkno blkno, + bool nested, bool wait_io); +extern bool is_page_too_sparse(BTreeDescr *desc, Page p); + +#endif /* __BTREE_MERGE_H__ */ diff --git a/contrib/orioledb/include/btree/modify.h b/contrib/orioledb/include/btree/modify.h new file mode 100644 index 00000000000..8e10ad1d7e3 --- /dev/null +++ b/contrib/orioledb/include/btree/modify.h @@ -0,0 +1,105 @@ +/*------------------------------------------------------------------------- + * + * modify.h + * Declarations for OrioleDB B-tree modification. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/btree/modify.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_MODIFY_H__ +#define __BTREE_MODIFY_H__ + +#include "btree.h" + +typedef struct BTreeModifyCallbackInfo +{ + OBTreeWaitCallbackAction (*waitCallback) (BTreeDescr *desc, + OTuple oldTup, + OTuple *newTup, + OXid oxid, + OTupleXactInfo prevXactInfo, + UndoLocation location, + RowLockMode *lockMode, + BTreeLocationHint *hint, + void *arg); + OBTreeModifyCallbackAction (*modifyCallback) (BTreeDescr *desc, + OTuple oldTup, + OTuple *newTup, + OXid oxid, + OTupleXactInfo prevXactInfo, + UndoLocation location, + RowLockMode *lockMode, + BTreeLocationHint *hint, + void *arg); + OBTreeModifyCallbackAction (*modifyDeletedCallback) (BTreeDescr *desc, + OTuple oldTup, + OTuple *newTup, + OXid oxid, + OTupleXactInfo prevXactInfo, + BTreeLeafTupleDeletedStatus deleted, + UndoLocation location, + RowLockMode *lockMode, + BTreeLocationHint *hint, + void *arg); + bool needsUndoForSelfCreated; + void *arg; + + /* + * Optional hook fired once per successful PK-side modification, while the + * affected leaf page is still locked. Called with the freshly created + * undo location (real value from make_undo_record), or with the + * WaitingSkUndoLoc sentinel when the self-created shortcut skips undo + * entirely. Used by the table AM to install the PK-applied/SK-pending + * marker before the page lock drops, eliminating the race window that + * would exist if the marker were written by the caller after + * o_btree_modify() returned. The hook receives the same `arg` as the + * other callbacks above; it is expected to extract the OTableDescr from + * whatever arg type the caller chose.
+ */ + void (*postUndoRecorded) (UndoLocation undoLoc, void *arg); +} BTreeModifyCallbackInfo; + +extern BTreeModifyCallbackInfo nullCallbackInfo; + +extern bool o_btree_autonomous_insert(BTreeDescr *desc, OTuple tuple); +extern bool o_btree_autonomous_delete(BTreeDescr *desc, OTuple key, BTreeKeyType keyType, + BTreeLocationHint *hint); +extern OBTreeModifyResult o_btree_modify(BTreeDescr *desc, + BTreeOperationType action, + OTuple tuple, + BTreeKeyType tupleType, + Pointer key, + BTreeKeyType keyType, + OXid oxid, CommitSeqNo csn, + RowLockMode lockMode, + BTreeLocationHint *hint, + BTreeModifyCallbackInfo *callbackInfo); +extern OBTreeModifyResult o_btree_delete_moved_partitions(BTreeDescr *desc, + Pointer key, + BTreeKeyType keyType, + OXid oxid, CommitSeqNo csn, + BTreeLocationHint *hint, + BTreeModifyCallbackInfo *callbackInfo); +extern OBTreeModifyResult o_btree_delete_pk_changed(BTreeDescr *desc, + Pointer key, + BTreeKeyType keyType, + OXid oxid, CommitSeqNo csn, + BTreeLocationHint *hint, + BTreeModifyCallbackInfo *callbackInfo); +extern OBTreeModifyResult o_btree_insert_unique(BTreeDescr *desc, + OTuple tuple, + BTreeKeyType tupleType, + Pointer key, + BTreeKeyType keyType, + OXid my_oxid, CommitSeqNo my_csn, + RowLockMode lock_mode, + BTreeLocationHint *hint, + BTreeModifyCallbackInfo *callbackInfo, + IndexUniqueCheck checkUnique); + +#endif /* __BTREE_MODIFY_H__ */ diff --git a/contrib/orioledb/include/btree/page_chunks.h b/contrib/orioledb/include/btree/page_chunks.h new file mode 100644 index 00000000000..a6000db06eb --- /dev/null +++ b/contrib/orioledb/include/btree/page_chunks.h @@ -0,0 +1,73 @@ +/*------------------------------------------------------------------------- + * + * page_chunks.h + * Declarations for routines dealing with OrioleDB page chunks. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/btree/page_chunks.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_PAGE_CHUNKS_H__ +#define __BTREE_PAGE_CHUNKS_H__ + +#include "btree/page_contents.h" + +typedef enum BTreeItemPageFitType +{ + BTreeItemPageFitAsIs, + BTreeItemPageFitCompactRequired, + BTreeItemPageFitSplitRequired +} BTreeItemPageFitType; + +typedef struct +{ + Pointer data; + LocationIndex size; + uint8 flags; +} BTreePageItem; + +extern bool partial_load_hikeys_chunk(PartialPageState *partial, Page img); +extern bool partial_load_chunk(PartialPageState *partial, Page img, + OffsetNumber chunkOffset, + BTreePageItemLocator *loc); +extern BTreeItemPageFitType page_locator_fits_item(BTreeDescr *desc, + Page p, + BTreePageItemLocator *locator, + LocationIndex size, + bool replace, + CommitSeqNo csn); +extern void o_btree_page_calculate_statistics(BTreeDescr *desc, Pointer p); +extern void init_page_first_chunk(BTreeDescr *desc, Page p, + LocationIndex hikeySize); +extern void page_chunk_fill_locator(Page p, OffsetNumber chunkOffset, + BTreePageItemLocator *locator); +extern void page_item_fill_locator(Page p, OffsetNumber itemOffset, + BTreePageItemLocator *locator); +extern void page_item_fill_locator_backwards(Page p, OffsetNumber itemOffset, + BTreePageItemLocator *locator); +extern bool page_locator_next_chunk(Page p, BTreePageItemLocator *locator); +extern bool page_locator_prev_chunk(Page p, BTreePageItemLocator *locator); +extern void page_locator_insert_item(Page p, BTreePageItemLocator *locator, + LocationIndex itemsize); +extern bool page_locator_fits_new_item(Page p, BTreePageItemLocator *locator, + LocationIndex itemsize); +extern LocationIndex page_locator_get_item_size(Page p, + BTreePageItemLocator *locator); +extern void page_locator_resize_item(Page p, BTreePageItemLocator *locator, + LocationIndex newsize); +extern void page_locator_delete_item(Page p, BTreePageItemLocator 
*locator); +extern void page_split_chunk_if_needed(BTreeDescr *desc, Page p, + BTreePageItemLocator *locator); +extern void btree_page_reorg(BTreeDescr *desc, Page p, BTreePageItem *items, + OffsetNumber count, LocationIndex hikeySize, + OTuple hikey); +extern void split_page_by_chunks(BTreeDescr *desc, Page p); +extern bool page_locator_find_real_item(Page p, PartialPageState *partial, + BTreePageItemLocator *locator); +extern OffsetNumber page_locator_get_offset(Page p, BTreePageItemLocator *locator); + +#endif /* __BTREE_PAGE_CHUNKS_H__ */ diff --git a/contrib/orioledb/include/btree/page_contents.h b/contrib/orioledb/include/btree/page_contents.h new file mode 100644 index 00000000000..a569483a51a --- /dev/null +++ b/contrib/orioledb/include/btree/page_contents.h @@ -0,0 +1,456 @@ +/*------------------------------------------------------------------------- + * + * page_contents.h + * Declarations of OrioleDB B-tree page structure. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/page_contents.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_PAGE_CONTENTS_H__ +#define __BTREE_PAGE_CONTENTS_H__ + +#include "btree/page_state.h" +#include "s3/queue.h" + +#define NUM_SEQ_SCANS_ARRAY_SIZE 32 + +/* The structure of BTree meta page. Referenced by metaPageBlkno. */ +typedef struct +{ + OrioleDBPageHeader o_header; + SeqBufDescShared freeBuf; + SeqBufDescShared nextChkp[2]; + SeqBufDescShared tmpBuf[2]; + pg_atomic_uint64 numFreeBlocks; + pg_atomic_uint64 datafileLength[2]; + LWLock metaLock; + LWLock copyBlknoLock; + + /* + * A surrogate index key value which can be incremented on an INSERT + * operation. + * + * It can be used (and incremented) as an index key by primary index only + * if an index key isn't defined. 
+ */ + pg_atomic_uint64 ctid; + /* ctid used to map index record to primary key */ + pg_atomic_uint64 bridge_ctid; + pg_atomic_uint32 leafPagesNum; + + /* Number of running sequential scans depending on the checkpoint number */ + pg_atomic_uint32 numSeqScans[NUM_SEQ_SCANS_ARRAY_SIZE]; + + /* + * Additional protection: set when btree pages are freed while the + * resource owner hasn't released its seq scans yet (other transactions + * are excluded by locks). Defers freeing the meta page until the last + * scan is released. + */ + bool toBeFreedOnSeqScanRelease; + + bool dirtyFlag1; + bool dirtyFlag2; + + BTreeS3PartsInfo partsInfo[2]; + + LWLock punchHolesLock; + uint32 punchHolesChkpNum; +} BTreeMetaPage; + +StaticAssertDecl(sizeof(BTreeMetaPage) <= ORIOLEDB_BLCKSZ, + "BTreeMetaPage struct doesn't fit to the page size"); + +#define BTREE_GET_META(desc) \ + ((BTreeMetaPage *) O_GET_IN_MEMORY_PAGE((desc)->rootInfo.metaPageBlkno)) + +typedef struct +{ + uint32 shortLocation:12, + offset:10, + hikeyShortLocation:7, + chunkKeysFixed:1, + hikeyFlags:2; +} BTreePageChunkDesc; + +#define SHORT_LOCATION_MULTIPLIER (4) +#define HIKEY_SHORT_LOCATION_LIMIT ((1<<7) * SHORT_LOCATION_MULTIPLIER) +#define LOCATION_GET_SHORT(l) \ + (AssertMacro(((l) & 3) == 0), (l) / 4) +#define SHORT_GET_LOCATION(s) \ + ((s) * 4) + + +typedef struct +{ + LocationIndex items[1]; +} BTreePageChunk; + +#define BTREE_PAGE_MAX_ITEMS \ + ((ORIOLEDB_BLCKSZ - sizeof(BTreePageHeader)) / \ + (MAXIMUM_ALIGNOF + sizeof(LocationIndex))) + +#define BTREE_PAGE_MAX_CHUNKS \ + ((512 - offsetof(BTreePageHeader, chunkDesc)) / \ + (MAXIMUM_ALIGNOF + sizeof(BTreePageChunkDesc))) + +struct BTreePageItemLocator +{ + OffsetNumber chunkOffset; + OffsetNumber itemOffset; + OffsetNumber chunkItemsCount; + LocationIndex chunkSize; + BTreePageChunk *chunk; +}; + +/* The header of the B-tree pages */ +typedef struct +{ + OrioleDBPageHeader o_header; + + /* Link to the page-level undo item and corresponding CSN */ + 
UndoLocation undoLocation; + CommitSeqNo csn; + + uint64 rightLink; + uint32 flags:6, + + /* + * For non-leafs, level of page in the tree. Unused for leafs. + */ + field1:11, + + /* + * For leafs, number of bytes occupied by deleted tuples which could be + * potentially vacated during page compaction. For non-leafs, number of + * on-disk downlinks. + */ + field2:15; + + LocationIndex maxKeyLen; + OffsetNumber prevInsertOffset; + OffsetNumber chunksCount; + OffsetNumber itemsCount; + OffsetNumber hikeysEnd; + LocationIndex dataSize; + BTreePageChunkDesc chunkDesc[1]; +} BTreePageHeader; + +/* Flags of B-tree pages */ +#define O_BTREE_FLAG_LEFTMOST (0x0001) +#define O_BTREE_FLAG_RIGHTMOST (0x0002) +#define O_BTREE_FLAG_LEAF (0x0004) +#define O_BTREE_FLAG_BROKEN_SPLIT (0x0008) +#define O_BTREE_FLAG_PRE_CLEANUP (0x0010) +#define O_BTREE_FLAG_HIKEYS_FIXED (0x0020) +#define O_BTREE_FLAGS_ROOT_INIT (O_BTREE_FLAG_LEAF | O_BTREE_FLAG_RIGHTMOST | O_BTREE_FLAG_LEFTMOST) + +/* Check given property of B-tree page */ +#define O_PAGE_IS(page, property) ((((BTreePageHeader *)(page))->flags & O_BTREE_FLAG_##property) != 0) + +#define BTREE_PAGE_HIKEYS_END(desc, p) (O_PAGE_IS(p, LEAF) ? 256 : 512) + +typedef struct PartialPageState PartialPageState; + +/* Macros for accessing B-tree page items (low 14 bits: offset, bits above: flags) */ +#define ITEM_GET_OFFSET(item) ((item) & 0x3FFF) +#define ITEM_GET_FLAGS(item) ((item) >> 14) +#define ITEM_SET_FLAGS(item, flags) ((flags) ? ((item) | ((LocationIndex) (1) << 14)) : ((item) & ~((LocationIndex) (1) << 14))) + +#define BTREE_PAGE_LOCATOR_FIRST(p, locptr) \ + page_item_fill_locator((p), 0, (locptr)) +#define BTREE_PAGE_LOCATOR_LAST(p, locptr) \ + page_item_fill_locator_backwards((p), BTREE_PAGE_ITEMS_COUNT(p) - 1, (locptr)) +#define BTREE_PAGE_LOCATOR_TAIL(p, locptr) \ + page_item_fill_locator_backwards((p), BTREE_PAGE_ITEMS_COUNT(p), (locptr)) +#define BTREE_PAGE_LOCATOR_NEXT(p, locptr) \ + ((++(locptr)->itemOffset < (locptr)->chunkItemsCount) ?
true : page_locator_next_chunk((p), (locptr))) +#define BTREE_PAGE_LOCATOR_PREV(p, locptr) \ + (((locptr)->itemOffset > 0) ? (locptr)->itemOffset-- : page_locator_prev_chunk((p), (locptr))) +#define BTREE_PAGE_LOCATOR_IS_VALID(p, locptr) \ + ((void) (p), (locptr)->chunk != NULL && (locptr)->itemOffset < (locptr)->chunkItemsCount) +#define BTREE_PAGE_FOREACH_ITEMS(p, locptr) \ + for (BTREE_PAGE_LOCATOR_FIRST((p), (locptr)); \ + BTREE_PAGE_LOCATOR_IS_VALID((p), (locptr)); \ + BTREE_PAGE_LOCATOR_NEXT((p), (locptr))) +#define BTREE_PAGE_LOCATOR_SET_INVALID(locptr) \ + ((locptr)->chunk = 0) +#define BTREE_PAGE_LOCATOR_GET_ITEM(p, locptr) \ + ((void) (p), (Pointer) (locptr)->chunk + \ + ITEM_GET_OFFSET((locptr)->chunk->items[(locptr)->itemOffset])) +#define BTREE_PAGE_OFFSET_GET_LOCATOR(p, offset, locptr) \ + (page_item_fill_locator((p), (offset), (locptr))) +#define BTREE_PAGE_LOCATOR_GET_OFFSET(p, locptr) \ + (((BTreePageHeader *) (p))->chunkDesc[(locptr)->chunkOffset].offset + \ + (locptr)->itemOffset) +#define BTREE_PAGE_GET_ITEM_SIZE(p, locptr) \ + (page_locator_get_item_size((p), (locptr))) +#define BTREE_PAGE_GET_ITEM_OFFSET(p, locptr) \ + ((LocationIndex) ((Pointer) (locptr)->chunk - (Pointer) (p)) + \ + ITEM_GET_OFFSET((locptr)->chunk->items[(locptr)->itemOffset])) +#define BTREE_PAGE_GET_ITEM_FLAGS(p, locptr) \ + ((void) (p), ITEM_GET_FLAGS((locptr)->chunk->items[(locptr)->itemOffset])) +#define BTREE_PAGE_SET_ITEM_FLAGS(p, locptr, flags) \ + ((void) (p), (locptr)->chunk->items[(locptr)->itemOffset] = ITEM_SET_FLAGS((locptr)->chunk->items[(locptr)->itemOffset], (flags))) +#define BTREE_PAGE_READ_LEAF_ITEM(tuphdr, tup, p, locptr) \ + do { \ + Pointer __item = BTREE_PAGE_LOCATOR_GET_ITEM(p, locptr); \ + Assert(O_PAGE_IS(p, LEAF)); \ + (tuphdr) = (BTreeLeafTuphdr *) __item; \ + (tup).data = __item + BTreeLeafTuphdrSize; \ + (tup).formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locptr); \ + } while (false) +#define BTREE_PAGE_READ_INTERNAL_ITEM(tuphdr, tup, p, locptr) \ 
+ do { \ + Pointer __item = BTREE_PAGE_LOCATOR_GET_ITEM(p, locptr); \ + Assert(!O_PAGE_IS(p, LEAF)); \ + (tuphdr) = (BTreeNonLeafTuphdr *) __item; \ + (tup).data = __item + BTreeNonLeafTuphdrSize; \ + (tup).formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locptr); \ + } while (false) +#define BTREE_PAGE_READ_LEAF_TUPLE(tup, p, locptr) \ + do { \ + Pointer __item = BTREE_PAGE_LOCATOR_GET_ITEM(p, locptr); \ + Assert(O_PAGE_IS(p, LEAF)); \ + (tup).formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locptr); \ + (tup).data = __item + BTreeLeafTuphdrSize; \ + } while (false) +#define BTREE_PAGE_READ_INTERNAL_TUPLE(tup, p, locptr) \ + do { \ + Pointer __item = BTREE_PAGE_LOCATOR_GET_ITEM(p, locptr); \ + Assert(!O_PAGE_IS(p, LEAF)); \ + (tup).formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locptr); \ + (tup).data = __item + BTreeNonLeafTuphdrSize; \ + } while (false) +#define BTREE_PAGE_READ_TUPLE(tup, p, locptr) \ + do { \ + Pointer __item = BTREE_PAGE_LOCATOR_GET_ITEM(p, locptr); \ + (tup).formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locptr); \ + if (O_PAGE_IS(p, LEAF)) \ + (tup).data = __item + BTreeLeafTuphdrSize; \ + else \ + (tup).data = __item + BTreeNonLeafTuphdrSize; \ + } while (false) +#define BTREE_PAGE_ITEMS_COUNT(p) \ + (((BTreePageHeader *)(p))->itemsCount) +#define BTREE_PAGE_READ_UNDO_ITEM(tuphdr, tup, rec) \ + do { \ + (tuphdr) = (BTreeLeafTuphdr *) (rec); \ + (tup).data = (Pointer) (rec) + BTreeLeafTuphdrSize; \ + (tup).formatFlags = (tuphdr)->formatFlags; \ + } while (false) +#define BTREE_PAGE_GET_HIKEY(hikey, p) \ + (hikey) = page_get_hikey((p)) +#define BTREE_PAGE_GET_HIKEY_SIZE(p) \ + (page_get_hikey_size((p))) +#define BTREE_PAGE_SET_HIKEY_FLAGS(p, flags) \ + (page_set_hikey_flags((p), (flags))) +#define BTREE_PAGE_FREE_SPACE(p) \ + (ORIOLEDB_BLCKSZ - ((BTreePageHeader *) (p))->dataSize) + +#define BTREE_PAGE_GET_RIGHTLINK(p) (((BTreePageHeader *)(p))->rightLink) + +#define PAGE_GET_LEVEL(p) (O_PAGE_IS(p, LEAF) ? 
0 : ((BTreePageHeader *)(p))->field1) +#define PAGE_SET_LEVEL(p, level) (AssertMacro(!O_PAGE_IS(p, LEAF)), ((BTreePageHeader *)(p))->field1 = (level)) +#define PAGE_GET_N_ONDISK(p) (AssertMacro(!O_PAGE_IS(p, LEAF)), ((BTreePageHeader *)(p))->field2) +#define PAGE_SET_N_ONDISK(p, n) (AssertMacro(!O_PAGE_IS(p, LEAF)), ((BTreePageHeader *)(p))->field2 = (n)) +#define PAGE_INC_N_ONDISK(p) (AssertMacro(!O_PAGE_IS(p, LEAF)), ((BTreePageHeader *)(p))->field2++) +#define PAGE_DEC_N_ONDISK(p) (AssertMacro(!O_PAGE_IS(p, LEAF)), ((BTreePageHeader *)(p))->field2--) +#define PAGE_GET_N_VACATED(p) (AssertMacro(O_PAGE_IS(p, LEAF)), ((BTreePageHeader *)(p))->field2) +#define PAGE_SET_N_VACATED(p, n) (AssertMacro(O_PAGE_IS(p, LEAF)), ((BTreePageHeader *)(p))->field2 = (n)) +#define PAGE_ADD_N_VACATED(p, s) (AssertMacro(O_PAGE_IS(p, LEAF)), ((BTreePageHeader *)(p))->field2 += (s)) +#define PAGE_SUB_N_VACATED(p, s) (AssertMacro(O_PAGE_IS(p, LEAF)), \ + AssertMacro(((BTreePageHeader *)(p))->field2 >= (s)), \ + ((BTreePageHeader *)(p))->field2 -= (s)) + +/* Header of non-leaf tuple */ +typedef struct +{ + uint64 downlink; +} BTreeNonLeafTuphdr; + +/* Header of leaf tuple */ +typedef struct +{ + OTupleXactInfo xactInfo:61, + deleted:2, + chainHasLocks:1; + UndoLocation undoLocation:62, + formatFlags:2; +} BTreeLeafTuphdr; + +#define BTreeNonLeafTuphdrSize MAXALIGN(sizeof(BTreeNonLeafTuphdr)) +#define BTreeLeafTuphdrSize MAXALIGN(sizeof(BTreeLeafTuphdr)) + +#define DOWNLINK_DISK_BIT (UINT64CONST(1)<<63) +#define DOWNLINK_IO_BUF_MASK (UINT64CONST(0xFFFFFFFF00000000)) +#define DOWNLINK_GET_IN_MEMORY_BLKNO(downlink) ((uint32) (downlink)) +#define DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(downlink) (((uint32) ((downlink) >> 32)) & 0x7FFFFFFF) +#define MAKE_IN_MEMORY_DOWNLINK(blkno, changeCount) ((uint64) (blkno) | ((uint64) (changeCount) << 32)) +#define DOWNLINK_IS_IN_MEMORY(downlink) (((downlink) & DOWNLINK_DISK_BIT) == (uint64) 0) +#define DOWNLINK_IS_IN_IO(downlink) (((downlink) & 
DOWNLINK_IO_BUF_MASK) == DOWNLINK_IO_BUF_MASK) +#define DOWNLINK_IS_ON_DISK(downlink) (!DOWNLINK_IS_IN_MEMORY(downlink) && !DOWNLINK_IS_IN_IO(downlink)) +#define MAKE_IO_DOWNLINK(locknum) ((uint64)(locknum) | DOWNLINK_IO_BUF_MASK) +#define DOWNLINK_GET_IO_LOCKNUM(downlink) ((uint32) ((downlink) & UINT64CONST(0xFFFFFFFF))) + +#define MAKE_ON_DISK_DOWNLINK(extent) (((uint64)((extent).len) << 48) | (uint64)((extent).off) | DOWNLINK_DISK_BIT) +#define DOWNLINK_GET_DISK_OFF(downlink) ((uint64) ((downlink) & UINT64CONST(0xFFFFFFFFFFFF))) +#define DOWNLINK_GET_DISK_LEN(downlink) ((uint16) (((downlink) & UINT64CONST(0x7FFF000000000000)) >> 48)) +#define InvalidDiskDownlink UINT64_MAX +#define DiskDownlinkIsValid(downlink) ((downlink) != InvalidDiskDownlink) + +/* Macros for work with rightlink */ +#define InvalidRightLink (0xFFFFFFFFFFFFFFFF) +#define RightLinkIsValid(rightLink) ((rightLink) != InvalidRightLink) +#define MAKE_IN_MEMORY_RIGHTLINK(blkno, changeCount) (MAKE_IN_MEMORY_DOWNLINK((blkno), (changeCount))) +#define RIGHTLINK_GET_BLKNO(rightLink) (DOWNLINK_GET_IN_MEMORY_BLKNO((rightLink))) +#define RIGHTLINK_GET_CHANGECOUNT(rightLink) (DOWNLINK_GET_IN_MEMORY_CHANGECOUNT((rightLink))) + +/* Tuple and key max sizes */ +#define O_BTREE_MAX_TUPLE_SIZE MAXALIGN_DOWN((ORIOLEDB_BLCKSZ - sizeof(BTreePageHeader)) / 3 - sizeof(LocationIndex) - BTreeLeafTuphdrSize) +#define O_BTREE_MAX_KEY_SIZE O_BTREE_MAX_TUPLE_SIZE + +typedef struct +{ + OTuple tuple; + char fixedData[O_BTREE_MAX_TUPLE_SIZE]; +} OFixedTuple; + +typedef struct +{ + OTuple tuple; + char fixedData[O_BTREE_MAX_KEY_SIZE]; +} OFixedKey; + +/* + * Fixed structure for storage of B-tree key. Separate key length field, + * saves us from getting length of inconsistent key. 
+ */ +typedef struct +{ + union + { + char fixedData[O_BTREE_MAX_KEY_SIZE]; + void *p; /* for alignment purposes */ + } data; + uint8 formatFlags; + bool notNull; + int len; +} OFixedShmemKey; + +typedef enum ReadPageResult +{ + ReadPageResultOK, + ReadPageResultWrongPageChangeCount, + ReadPageResultFailed +} ReadPageResult; + +extern bool o_btree_read_page(BTreeDescr *desc, OInMemoryBlkno blkno, + uint32 pageChangeCount, Page img, CommitSeqNo csn, + void *key, BTreeKeyType keyType, + OFixedKey *lokey, PartialPageState *partial, + bool loadHikeysChunk, UndoLocation *undoLocation, + CommitSeqNo *readCsn); +extern ReadPageResult o_btree_try_read_page(BTreeDescr *desc, + OInMemoryBlkno blkno, + uint32 pageChangeCount, Page img, + CommitSeqNo csn, + Pointer key, BTreeKeyType keyType, + PartialPageState *partial, + bool loadHikeysChunk, + CommitSeqNo *readCsn); +extern UndoLocation read_page_from_undo(BTreeDescr *desc, Page img, + UndoLocation undo_loc, + CommitSeqNo csn, void *key, + BTreeKeyType keyType, OFixedKey *lokey); + +extern void init_new_btree_page(BTreeDescr *desc, OInMemoryBlkno blkno, + uint16 flags, uint16 level, bool noLock); +extern void init_meta_page(OInMemoryBlkno blkno, uint32 leafPagesNum); +extern LocationIndex page_get_vacated_skip_item(BTreeDescr *desc, + Page p, + CommitSeqNo csn, + LocationIndex skipOffset); +extern LocationIndex page_get_vacated_space(BTreeDescr *desc, Page p, + CommitSeqNo csn); +extern void null_unused_bytes(Page img); +extern void put_page_image(OInMemoryBlkno blkno, Page img); +extern void page_cut_first_key(Page node); + +typedef struct ItemPointerData ItemPointerData; +extern ItemPointerData btree_ctid_get_and_inc(BTreeDescr *desc); +extern void btree_ctid_update_if_needed(BTreeDescr *desc, ItemPointerData ctid); + +extern void copy_fixed_tuple(BTreeDescr *desc, OFixedTuple *dst, OTuple src); +extern void copy_fixed_key(BTreeDescr *desc, OFixedKey *dst, OTuple src); +extern void copy_fixed_page_key(BTreeDescr *desc, 
OFixedKey *dst, + Page p, BTreePageItemLocator *loc); +extern void copy_fixed_hikey(BTreeDescr *desc, OFixedKey *dst, Page p); +extern void clear_fixed_tuple(OFixedTuple *dst); +extern void clear_fixed_key(OFixedKey *dst); + +extern void copy_fixed_shmem_key(BTreeDescr *desc, OFixedShmemKey *dst, + OTuple src); +extern void copy_fixed_shmem_page_key(BTreeDescr *desc, OFixedShmemKey *dst, + Page p, BTreePageItemLocator *loc); +extern void copy_fixed_shmem_hikey(BTreeDescr *desc, OFixedShmemKey *dst, + Page p); +extern void clear_fixed_shmem_key(OFixedShmemKey *dst); +extern OTuple fixed_shmem_key_get_tuple(OFixedShmemKey *src); +extern void copy_from_fixed_shmem_key(OFixedKey *dst, OFixedShmemKey *src); + +extern OTuple page_get_hikey(Page p); +extern int page_get_hikey_size(Page p); +extern void page_set_hikey_flags(Page p, uint8 flags); +extern bool page_fits_hikey(Page p, LocationIndex newHikeySize); +extern void page_resize_hikey(Page p, LocationIndex newHikeySize); +extern void btree_page_update_max_key_len(BTreeDescr *desc, Page p); + +typedef enum +{ + OPageWaitExclusive, + OPageWaitNonExclusive, + OPageWaitInsert, + OPageWaitWakeUp +} OPageWaiterStatus; + +/* + * Shared-memory state for a process waiting to insert a tuple into a page + * (or lock it). When the lock holder performs a group insert optimization, + * it creates undo records on behalf of the waiter using this state. + * + * This should be in page_state.h but depends on O_BTREE_MAX_KEY_SIZE. + */ +typedef struct +{ + ORelOids reloids; + OPageWaiterStatus status; + uint32 pageChangeCount; + + /* + * Waiter's autonomous nesting level at the time it queued. Used by the + * inserter in add_new_undo_stack_item_to_process() to index into the + * correct undoStackLocations slot. Must match the slot that + * GET_CUR_UNDO_STACK_LOCATIONS() would return for the waiter. 
+ */ + int autonomousNestingLevel; + uint8 tupleFlags; + bool inserted; + Size reservedUndoSize; + UndoLocation undoLocation; + uint32 next; + union + { + char fixedData[BTreeLeafTuphdrSize + O_BTREE_MAX_KEY_SIZE]; + Datum datum; /* keep here for alignment */ + } tupleData; +} OPageWaiterShmemState; + +extern OPageWaiterShmemState *lockerStates; + +#endif /* __BTREE_PAGE_CONTENTS_H__ */ diff --git a/contrib/orioledb/include/btree/page_state.h b/contrib/orioledb/include/btree/page_state.h new file mode 100644 index 00000000000..aa4194b8d05 --- /dev/null +++ b/contrib/orioledb/include/btree/page_state.h @@ -0,0 +1,92 @@ +/*------------------------------------------------------------------------- + * + * page_state.h + * Declarations of OrioleDB B-tree page state. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/page_state.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_PAGE_STATE_H__ +#define __BTREE_PAGE_STATE_H__ + +#include "btree.h" +#include "page_contents.h" + +/* Flags stored in OrioleDBPageHeader.state */ +#define PAGE_STATE_LOCKED_FLAG UINT64CONST(0x0000000000040000) +#define PAGE_STATE_NO_READ_FLAG UINT64CONST(0x0000000000080000) +#define PAGE_STATE_CHANGE_COUNT_ONE UINT64CONST(0x0000000000100000) +#define PAGE_STATE_CHANGE_COUNT_MASK UINT64CONST(0x000FFFFFFFF00000) +#define PAGE_STATE_CHANGE_NON_WAITERS_MASK UINT64CONST(0x000FFFFFFFFC0000) +#define PAGE_STATE_CHANGE_USAGE_COUNT_MASK UINT64CONST(0x00F0000000000000) +#define PAGE_STATE_CHANGE_USAGE_COUNT_ONE UINT64CONST(0x0010000000000000) +#define PAGE_STATE_CHANGE_USAGE_COUNT_SHIFT (52) +#define PAGE_STATE_LIST_TAIL_MASK UINT64CONST(0x000000000003FFFF) + +#define PAGE_STATE_INVALID_PROCNO PAGE_STATE_LIST_TAIL_MASK + +/* Macros for dealing with OrioleDBPageHeader.state */ +#define O_PAGE_STATE_IS_LOCKED(state) ((state) & PAGE_STATE_LOCKED_FLAG) +#define 
O_PAGE_STATE_LOCK(state) ((state) | PAGE_STATE_LOCKED_FLAG) +#define O_PAGE_STATE_BLOCK_READ(state) ((state) | PAGE_STATE_LOCKED_FLAG | PAGE_STATE_NO_READ_FLAG) +#define O_PAGE_STATE_READ_IS_BLOCKED(state) ((state) & PAGE_STATE_NO_READ_FLAG) + +#define O_PAGE_STATE_GET_USAGE_COUNT(state) (((state) & PAGE_STATE_CHANGE_USAGE_COUNT_MASK) >> PAGE_STATE_CHANGE_USAGE_COUNT_SHIFT) +#define O_PAGE_STATE_SET_USAGE_COUNT(state, usageCount) (((state) & ~PAGE_STATE_CHANGE_USAGE_COUNT_MASK) | ((uint64) (usageCount) << PAGE_STATE_CHANGE_USAGE_COUNT_SHIFT)) + +#define BTREE_PAGE_MAX_CHUNK_ITEMS \ + (ORIOLEDB_BLCKSZ / (MAXIMUM_ALIGNOF + sizeof(LocationIndex))) + +#define BTREE_PAGE_MAX_SPLIT_ITEMS (2 * BTREE_PAGE_MAX_CHUNK_ITEMS) + +typedef enum +{ + OLockPageWithTupleResultLocked, + OLockPageWithTupleResultRefindNeeded, + OLockPageWithTupleResultInserted +} OLockPageWithTupleResult; + +/* + * Enable this to recheck page struct on every unlock. + */ +/* #define CHECK_PAGE_STRUCT */ + +#ifdef CHECK_PAGE_STRUCT +extern void o_check_page_struct(BTreeDescr *desc, Page p); +#endif + +extern Size page_state_shmem_needs(void); +extern void page_state_shmem_init(Pointer buf, bool found); +extern bool have_locked_pages(void); +extern int get_waiters_with_tuples(BTreeDescr *desc, + OInMemoryBlkno blkno, + int result[BTREE_PAGE_MAX_SPLIT_ITEMS]); +extern void mark_waiter_tuples_inserted(int procnums[BTREE_PAGE_MAX_SPLIT_ITEMS], + int count); +extern void lock_page(OInMemoryBlkno blkno); +extern OLockPageWithTupleResult lock_page_with_tuple(BTreeDescr *desc, + OInMemoryBlkno *blkno, + uint32 *pageChangeCount, + OTupleXactInfo xactInfo, + OTuple tuple); +extern void relock_page(OInMemoryBlkno blkno); +extern bool try_lock_page(OInMemoryBlkno blkno); +extern void delare_page_as_locked(OInMemoryBlkno blkno); +extern bool page_is_locked(OInMemoryBlkno blkno); +extern void page_block_reads(OInMemoryBlkno blkno); +extern void unlock_page(OInMemoryBlkno blkno); +extern void 
unlock_page_after_split(OInMemoryBlkno blkno); +extern void release_all_page_locks(void); +extern void page_wait_for_read_enable(OInMemoryBlkno blkno); +extern void btree_register_inprogress_split(OInMemoryBlkno rightBlkno); +extern void btree_unregister_inprogress_split(OInMemoryBlkno rightBlkno); +extern void btree_mark_incomplete_splits(void); +extern void btree_split_mark_finished(OInMemoryBlkno rightBlkno, bool use_lock, + bool success); + +#endif /* __BTREE_PAGE_STATE_H__ */ diff --git a/contrib/orioledb/include/btree/print.h b/contrib/orioledb/include/btree/print.h new file mode 100644 index 00000000000..16aaae82582 --- /dev/null +++ b/contrib/orioledb/include/btree/print.h @@ -0,0 +1,53 @@ +/*------------------------------------------------------------------------- + * + * print.h + * Declarations of OrioleDB B-tree printing routines. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/print.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_PRINT_H__ +#define __BTREE_PRINT_H__ + +#include "btree.h" + +typedef enum +{ + BTreeNotPrint = 0, + BTreePrintAbsolute, + BTreePrintRelative +} BTreePrintOption; + +typedef struct +{ + BTreePrintOption pagePrintType; + BTreePrintOption csnPrintType; + BTreePrintOption backendIdPrintType; + BTreePrintOption undoLogLocationPrintType; + BTreePrintOption idsPrintType; + BTreePrintOption changeCountPrintType; + BTreePrintOption checkpointNumPrintType; + bool printRowVersion; + bool printStateValue; + bool printFileOffset; + bool printFormatFlags; + bool printFixedFlags; + bool truncateValues; +} BTreePrintOptions; + +/* Tuples and keys printing func */ +typedef void (*PrintFunc) (BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +extern void o_print_btree_pages(BTreeDescr *desc, StringInfo outbuf, + PrintFunc keyPrintFunc, + PrintFunc tuplePrintFunc, + Pointer 
printArg, + BTreePrintOptions *options, int depth); + +#endif /* __BTREE_PRINT_H__ */ diff --git a/contrib/orioledb/include/btree/scan.h b/contrib/orioledb/include/btree/scan.h new file mode 100644 index 00000000000..45de80019b6 --- /dev/null +++ b/contrib/orioledb/include/btree/scan.h @@ -0,0 +1,59 @@ +/*------------------------------------------------------------------------- + * + * scan.h + * Declarations for sequential scan of OrioleDB B-tree. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/scan.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_SCAN_H__ +#define __BTREE_SCAN_H__ + +#include "btree/btree.h" +#include "btree/page_contents.h" + +#include "executor/tuptable.h" +#include "utils/sampling.h" + +typedef struct +{ + int pageLoadTrancheId, + downlinksPublishTrancheId; +} BTreeScanShmem; + +typedef struct BTreeSeqScan BTreeSeqScan; + +typedef struct BTreeSeqScanCallbacks +{ + bool (*isRangeValid) (OTuple low, OTuple high, void *arg); + bool (*getNextKey) (OFixedKey *key, bool inclusive, void *arg); +} BTreeSeqScanCallbacks; + +extern BTreeScanShmem *btreeScanShmem; + +extern Size btree_scan_shmem_needs(void); +extern void btree_scan_init_shmem(Pointer ptr, bool found); +extern BTreeSeqScan *make_btree_seq_scan(BTreeDescr *desc, + OSnapshot *oSnapshot, + void *poscan); +extern BTreeSeqScan *make_btree_seq_scan_cb(BTreeDescr *desc, + OSnapshot *oSnapshot, + BTreeSeqScanCallbacks *cb, + void *arg); +extern BTreeSeqScan *make_btree_sampling_scan(BTreeDescr *desc, + BlockSampler sampler); +extern OTuple btree_seq_scan_getnext(BTreeSeqScan *scan, MemoryContext mctx, + CommitSeqNo *tupleCsn, + BTreeLocationHint *hint); +extern OTuple btree_seq_scan_getnext_raw(BTreeSeqScan *scan, MemoryContext mctx, + bool *end, BTreeLocationHint *hint); +extern void free_btree_seq_scan(BTreeSeqScan *scan); +extern void 
seq_scans_cleanup(void); +extern int meta_page_get_num_seq_scans(OInMemoryBlkno metaPageBlkno); + +#endif /* __BTREE_SCAN_H__ */ diff --git a/contrib/orioledb/include/btree/split.h b/contrib/orioledb/include/btree/split.h new file mode 100644 index 00000000000..eee32db1d8c --- /dev/null +++ b/contrib/orioledb/include/btree/split.h @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * split.h + * Declarations for splitting B-tree pages. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/split.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_SPLIT_H__ +#define __BTREE_SPLIT_H__ + +#include "btree.h" +#include "btree/find.h" +#include "btree/page_chunks.h" + +typedef struct +{ + BTreePageItem items[BTREE_PAGE_MAX_SPLIT_ITEMS]; + int itemsCount; + int hikeySize; + int maxKeyLen; + int hikeysEnd; + bool leaf; +} BTreeSplitItems; + +extern void make_split_items(BTreeDescr *desc, Page page, + BTreeSplitItems *items, + OffsetNumber *offset, Pointer tupleheader, + OTuple tuple, LocationIndex tuplesize, + bool replace, CommitSeqNo csn); +extern void perform_page_compaction(BTreeDescr *desc, OInMemoryBlkno blkno, + BTreeSplitItems *items, bool needsUndo, + CommitSeqNo csn); +extern void perform_page_split(BTreeDescr *desc, OInMemoryBlkno blkno, + OInMemoryBlkno new_blkno, + BTreeSplitItems *items, + OffsetNumber left_count, + OTuple splitkey, LocationIndex splitkey_len, + CommitSeqNo csn, UndoLocation undoLoc); +extern OffsetNumber btree_page_split_location(BTreeDescr *desc, + BTreeSplitItems *items, + OffsetNumber targetLocation, + float4 spaceRatio, + OTuple *split_item); +extern bool split_items_fit_single_page(BTreeSplitItems *items); +extern bool btree_page_split_can_succeed(BTreeSplitItems *items); +extern OffsetNumber btree_get_split_left_count(BTreeDescr *desc, Page page, +
OffsetNumber offset, bool replace, + BTreeSplitItems *items, + OTuple *split_key, + LocationIndex *split_key_len); + +#endif /* __BTREE_SPLIT_H__ */ diff --git a/contrib/orioledb/include/btree/undo.h b/contrib/orioledb/include/btree/undo.h new file mode 100644 index 00000000000..c50e5f6a632 --- /dev/null +++ b/contrib/orioledb/include/btree/undo.h @@ -0,0 +1,179 @@ +/*------------------------------------------------------------------------- + * + * undo.h + * Declarations of B-tree undo records and routines dealing with them. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/btree/undo.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BTREE_UNDO_H__ +#define __BTREE_UNDO_H__ + +#include "btree/page_contents.h" + +/* + * B-Tree page image types which can be stored in undo log. + */ +typedef enum +{ + /* produced by page compaction */ + UndoPageImageCompact, + /* produced by page split */ + UndoPageImageSplit, + /* produced by page merge */ + UndoPageImageMerge, + /* unknown value for default init */ + UndoPageImageInvalid +} UndoPageImageType; + +/* + * B-Tree page images header in undo log.
+ */ +typedef struct +{ + UndoPageImageType type; + uint8 splitKeyFlags; + LocationIndex splitKeyLen; +} UndoPageImageHeader; + +/* + * Status of existing lock on the tuple made by the same transaction. + */ +typedef enum +{ + BTreeModifyNoLock = 1, + BTreeModifyWeakerLock = 2, + BTreeModifySameOrStrongerLock = 3 +} BTreeModifyLockStatus; + +/* Undo records */ +typedef struct +{ + UndoStackItem header; + BTreeOperationType action; + ORelOids oids; + OInMemoryBlkno blkno; + uint32 pageChangeCount; + BTreeLeafTuphdr tuphdr; +} BTreeModifyUndoStackItem; + +typedef struct +{ + OnCommitUndoStackItem header; + Oid datoid; + Oid relid; + Oid oldRelnode; + int oldNumTrees; + Oid newRelnode; + int newNumTrees; + bool fsync; + OIndexKey trees[FLEXIBLE_ARRAY_MEMBER]; +} RelnodeUndoStackItem; + +/* size of image in undo log produced by page compaction */ +#define O_COMPACT_UNDO_IMAGE_SIZE (MAXALIGN(sizeof(UndoPageImageHeader)) + ORIOLEDB_BLCKSZ) +/* max size of image in undo log produced by page split */ +#define O_MAX_SPLIT_UNDO_IMAGE_SIZE (MAXALIGN(sizeof(UndoPageImageHeader)) + ORIOLEDB_BLCKSZ + O_BTREE_MAX_KEY_SIZE) +/* size of image in undo log produced by page split */ +#define O_SPLIT_UNDO_IMAGE_SIZE(splitKeySize) (MAXALIGN(sizeof(UndoPageImageHeader)) + ORIOLEDB_BLCKSZ + MAXALIGN(splitKeySize)) +/* max size of update undo record */ +#define O_UPDATE_MAX_UNDO_SIZE (sizeof(BTreeModifyUndoStackItem) + O_BTREE_MAX_TUPLE_SIZE) +/* on modification we should reserve size for split and update undo records */ +#define O_MODIFY_UNDO_RESERVE_SIZE (2 * (O_MAX_SPLIT_UNDO_IMAGE_SIZE + O_UPDATE_MAX_UNDO_SIZE)) +/* size of image in undo log produced by page merge */ +#define O_MERGE_UNDO_IMAGE_SIZE (MAXALIGN(sizeof(UndoPageImageHeader)) + ORIOLEDB_BLCKSZ * 2) +/* undo location of a page image */ +#define O_UNDO_GET_IMAGE_LOCATION(undo_loc, left) ((undo_loc) + MAXALIGN(sizeof(UndoPageImageHeader)) + ((left) ?
0 : ORIOLEDB_BLCKSZ)) +/* maximum size of undo record */ +#define O_MAX_UNDO_RECORD_SIZE O_MERGE_UNDO_IMAGE_SIZE + +extern bool page_item_rollback(BTreeDescr *desc, Page p, BTreePageItemLocator *locator, + bool loop, BTreeLeafTuphdr *non_lock_tuphdr_ptr, + UndoLocation nonLockUndoLocation); +extern UndoLocation make_undo_record(BTreeDescr *desc, OTuple tuple, + bool is_tuple, + BTreeOperationType action, + OInMemoryBlkno blkno, + uint32 pageChangeCount, + BTreeLeafTuphdr *curTupHdr); +extern void make_waiter_undo_record(BTreeDescr *desc, OInMemoryBlkno blkno, + int pgprocno, + OPageWaiterShmemState *lockerState); +extern void get_page_from_undo(BTreeDescr *desc, UndoLocation undo_loc, Pointer key, + BTreeKeyType kind, Pointer dest, + bool *is_left, bool *is_right, OFixedKey *lokey, + OFixedKey *page_lokey, OTuple *page_hikey); +extern UndoLocation page_add_image_to_undo(BTreeDescr *desc, Pointer p, + CommitSeqNo imageCsn, + OTuple *splitKey, LocationIndex splitKeyLen); +extern UndoLocation make_merge_undo_image(BTreeDescr *desc, Pointer left, + Pointer right, CommitSeqNo imageCsn); +extern bool row_lock_conflicts(BTreeLeafTuphdr *pageTuphdr, + BTreeLeafTuphdr *conflictTupHdr, + UndoLogType undoType, + UndoLocation *conflictUndoLocation, + RowLockMode mode, + OXid my_oxid, CommitSeqNo my_csn, + OInMemoryBlkno blkno, + UndoLocation savepointUndoLocation, + bool *redundant_row_locks, + BTreeModifyLockStatus *lock_status); +extern void remove_redundant_row_locks(BTreeLeafTuphdr *tuphdr_ptr, + BTreeLeafTuphdr *conflictTuphdrPtr, + UndoLogType undoType, + UndoLocation *conflictTupHdrUndoLocation, + RowLockMode mode, OXid my_oxid, + OInMemoryBlkno blkno, + UndoLocation savepointUndoLocation); +extern UndoLocation find_non_lock_only_undo_record(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr); +extern void modify_undo_callback(UndoLogType undoType, + UndoLocation location, + UndoStackItem *baseItem, + OXid oxid, + OUndoCallbackStage stage, + bool changeCountsValid); 
+extern void lock_undo_callback(UndoLogType undoType, UndoLocation location, + UndoStackItem *baseItem, + OXid oxid, OUndoCallbackStage stage, + bool changeCountsValid); +extern void btree_relnode_undo_callback(UndoLogType undoType, + UndoLocation location, + UndoStackItem *baseItem, OXid oxid, + OUndoCallbackStage stage, + bool changeCountsValid); +extern void get_prev_leaf_header_from_undo(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr, + bool inPage); +extern bool get_prev_leaf_header_from_undo_if_exists(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr); +extern void get_prev_leaf_header_and_tuple_from_undo(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr, + OTuple *tuple, + LocationIndex sizeAvailable); +extern void update_leaf_header_in_undo(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr, + UndoLocation location); +extern void add_undo_truncate_relnode(ORelOids oldOids, OIndexKey *oldTrees, + int oldNumTrees, + ORelOids newOids, OIndexKey *newTrees, + int newNumTrees, + bool fsync); +extern void add_undo_drop_relnode(ORelOids oids, OIndexKey *trees, + int numTrees); +extern void add_undo_create_relnode(ORelOids oids, OIndexKey *trees, + int numTrees, bool fsync); +extern void check_pending_truncates(void); +extern UndoLocation walk_undo_range_with_buf(UndoLogType undoType, UndoLocation location, + UndoLocation toLoc, + OXid oxid, OUndoCallbackStage stage, + UndoLocation *onCommitLocation, + bool changeCountsValid); + + +#endif /* __BTREE_UNDO_H__ */ diff --git a/contrib/orioledb/include/catalog/free_extents.h b/contrib/orioledb/include/catalog/free_extents.h new file mode 100644 index 00000000000..4c0294a9cc1 --- /dev/null +++ b/contrib/orioledb/include/catalog/free_extents.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * free_extents.h + * Routines for an orioledb free file extents list. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/catalog/free_extents.h + * + *------------------------------------------------------------------------- + */ +#ifndef __FREE_EXTENTS_H__ +#define __FREE_EXTENTS_H__ + +#include "catalog/sys_trees.h" + +extern FileExtent get_extent(BTreeDescr *desc, uint16 len); +extern void free_extent(BTreeDescr *desc, FileExtent extent); + +typedef void (*ForEachExtentCallback) (BTreeDescr *desc, FileExtent extent, void *arg); +extern void foreach_free_extent(BTreeDescr *desc, ForEachExtentCallback callback, + void *arg); +extern void add_free_extents_from_tmp(BTreeDescr *desc, bool remove); + +/* + * Returns true if `desc` uses the backend-local page pool. Only such trees + * (user temporary tables) need the backend-local free space map below; + * system trees that happen to be BTreeStorageTemporary still use a shared + * pool and continue to rely on the checkpoint-tagged seq bufs. + */ +extern bool btree_desc_is_local_temp(BTreeDescr *desc); + +/* + * Backend-local free extent list for user temporary trees. These helpers + * are process-local and do not consult any shared checkpoint state. + */ +extern void local_free_extents_push(BTreeDescr *desc, FileExtent extent); +extern bool local_free_extents_pop(BTreeDescr *desc, uint16 len, + FileExtent *extent); +extern void local_free_extents_cleanup(BTreeDescr *desc); + +#endif /* __FREE_EXTENTS_H__ */ diff --git a/contrib/orioledb/include/catalog/indices.h b/contrib/orioledb/include/catalog/indices.h new file mode 100644 index 00000000000..50ea391fa37 --- /dev/null +++ b/contrib/orioledb/include/catalog/indices.h @@ -0,0 +1,136 @@ +/*------------------------------------------------------------------------- + * + * indices.h + * Indices routines. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/catalog/indices.h + * + *------------------------------------------------------------------------- + */ +#ifndef __INDICES_H__ +#define __INDICES_H__ + +#include "postgres.h" + +#include "orioledb.h" + +#include "catalog/o_tables.h" +#include "tableam/descr.h" + +#define recovery_first_worker (0) +#define recovery_last_worker (recovery_pool_size_guc - 1) +#define recovery_workers (recovery_pool_size_guc) +#define index_build_leader (recovery_pool_size_guc) +#define index_build_first_worker (recovery_pool_size_guc + 1) +#define index_build_last_worker (recovery_pool_size_guc + recovery_idx_pool_size_guc - 1) +#define index_build_workers (recovery_idx_pool_size_guc - 1) + +/* + * Status record for spooling/sorting phase. + */ +typedef struct oIdxSpool +{ + Tuplesortstate **sortstates; /* state data for tuplesort.c */ + Relation index; + OTable *o_table; + OTable *old_o_table; + OTableDescr *descr; + OTableDescr *old_descr; + bool isunique; + +} oIdxSpool; + +/* + * Status for index builds performed in parallel. This is allocated in a + * dynamic shared memory segment or recovery workers shared memory pool. + * Note that there is a separate tuplesort TOC entry, private to tuplesort.c + * but allocated by this module on its behalf. + */ +typedef struct oIdxShared +{ + /* + * These fields are not modified during the sort. They primarily exist + * for the benefit of worker processes that need to create oIdxSpool state + * corresponding to that used by the leader. + */ + bool isunique; + bool isconcurrent; + int scantuplesortstates; + + /* + * workersdonecv is used to monitor the progress of workers. All parallel + * participants must indicate that they are done before leader can use + * mutable state that workers maintain during scan (and before leader can + * proceed to tuplesort_performsort()). 
+ */ + ConditionVariable workersdonecv; + + /* recoveryjoinedcv is used to coordinate index build queue in recovery */ + ConditionVariable recoveryjoinedcv; + + /* + * mutex protects all fields before heapdesc. + * + * These fields contain status information of interest to B-Tree index + * builds that must work just the same when an index is built in parallel. + */ + slock_t mutex; + + /* + * Mutable state that is maintained by workers, and reported back to + * leader at end of parallel scan. + * + * nparticipantsdone is number of worker processes finished. + * + * reltuples is the total number of input heap tuples. + * + * indtuples is the total number of tuples that made it into the index. + */ + int nparticipantsdone; + int nrecoveryworkersjoined; + + double reltuples; + double indtuples[INDEX_MAX_KEYS]; + + /* Oriole-specific */ + void (*worker_heap_sort_fn) (oIdxSpool *, void *, Sharedsort **, int worker_sortmem, bool progress); + ParallelOScanDescData poscan; + OIndexNumber ix_num; + Size o_table_size; + Size old_o_table_size; + bool isrebuild; + char o_table_serialized[FLEXIBLE_ARRAY_MEMBER]; + /* old_o_table_serialized follows */ +} oIdxShared; + +extern void o_define_index_validate(ORelOids oids, Relation index, IndexInfo *indexInfo, OTable *o_table); +extern void o_define_index(Relation heap, Relation index, Oid indoid, bool reindex, + OIndexNumber old_ix_num, Oid oldTblRelnode, + IndexBuildResult *result); + +extern void o_index_drop(Relation tbl, OIndexNumber ix_num); +extern OIndexNumber o_find_ix_num_by_name(OTableDescr *descr, + char *ix_name); +extern bool is_in_indexes_rebuild(void); + +extern void rebuild_indices_insert_placeholders(OTableDescr *descr); +extern void rebuild_indices(OTable *old_o_table, OTableDescr *old_descr, + OTable *o_table, OTableDescr *descr, + bool in_dedicated_recovery_worker, + IndexBuildResult *result); +extern void assign_new_oids(OTable *oTable, Relation rel, bool drop_pkey); +extern void recreate_o_table(OTable *old_o_table, 
OTable *o_table); +extern void build_secondary_index(Oid oldTblRelnode, OTable *o_table, + OTableDescr *descr, OIndexNumber ix_num, + bool in_dedicated_recovery_worker, + IndexBuildResult *result); +PGDLLEXPORT void _o_index_parallel_build_main(dsm_segment *seg, shm_toc *toc); +extern void _o_index_parallel_build_inner(dsm_segment *seg, shm_toc *toc, + OTable *recovery_o_table, OTable *recovery_old_o_table); +extern Size _o_index_parallel_estimate_shared(Size o_table_size); +extern void drop_primary_index(Relation rel, OTable *o_table); +#endif /* __INDICES_H__ */ diff --git a/contrib/orioledb/include/catalog/o_indices.h b/contrib/orioledb/include/catalog/o_indices.h new file mode 100644 index 00000000000..945c66a9aa2 --- /dev/null +++ b/contrib/orioledb/include/catalog/o_indices.h @@ -0,0 +1,121 @@ +/*------------------------------------------------------------------------- + * + * o_indices.h + * Declarations for orioledb indices system tree. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/catalog/o_indices.h + * + *------------------------------------------------------------------------- + */ +#ifndef __O_INDICES_H__ +#define __O_INDICES_H__ + +#include "orioledb.h" + +#include "catalog/o_tables.h" +#include "tuple/format.h" + +typedef struct +{ + ORelOids indexOids; + OIndexType indexType; + uint32 indexVersion; + ORelOids tableOids; + char table_persistence; + uint8 fillfactor; + uint16 data_version; + Oid tablespace; + OXid createOxid; + NameData name; + bool primaryIsCtid; + bool bridging; + OCompress compress; + bool nulls_not_distinct; + /* number of fields added using INCLUDE command explicitly */ + /* pkey fields added implicitly in o_define_index_validate not counted */ + uint16 nIncludedFields; + uint16 nLeafFields; + uint16 nNonLeafFields; + + /* + * TOAST index: pkey field amount, excluding included fields, including 2 + * fields: attnum and chunknum. Primary index: amount of unique fields in + * index. Unique index: field amount, excluding included and pkey fields. + * Regular index: all field amount. + */ + uint16 nUniqueFields; + /* non-TOAST index: field amount, excluding included and pkey fields */ + /* TOAST index: pkey field amount, excluding included fields */ + uint16 nKeyFields; + /* size of primaryFieldsAttnums */ + uint16 nPrimaryFields; + /* where primary key fields located in index tuple */ + AttrNumber primaryFieldsAttnums[INDEX_MAX_KEYS]; + + /* + * Fields above are stored in SYS_TREES_O_INDICES and + * serialized/deserialized by serialize_o_index()/deserialize_o_index(). + * Fields below are also stored in SYS_TREES_O_INDICES, but they are + * palloc'ed by deserialize_o_index(). + * + * Be careful while adding new fields in order to not break binary + * backward compatibility of the database. 
+ */ + + OTableField *leafTableFields; + OTableIndexField *leafFields; + List *predicate; /* list of Expr */ + char *predicate_str; + List *expressions; /* list of Expr */ + + /* + * duplicated non-pkey fields, elements: lists of 2 elements: (fieldnum, + * original fieldnum) primary index cannot have duplicate fields in + * postgres + */ + List *duplicates; + Oid *exclops; + bool immediate; + MemoryContext index_mctx; +} OIndex; + +/* callback for o_indices_foreach_oids() */ +typedef void (*OIndexOidsCallback) (OIndexType type, ORelOids treeOids, + ORelOids tableOids, Oid tablespace, void *arg); + +typedef enum +{ + OIndexVersionReset, + OIndexVersionPass, +} OIndexVersionMode; + +extern OIndex *make_o_index(OTable *table, OIndexNumber ixNum, OIndexVersionMode ixVerMode); + +typedef enum +{ + oTableSourceTable = 0, + oTableSourceContext = 1 +} OTableSource; + +extern void o_index_fill_descr(OIndexDescr *descr, OIndex *oIndex, void *o_table_source, OTableSource source); + +extern void free_o_index(OIndex *o_index); +extern bool o_indices_add(OTable *table, OIndexNumber ixNum, OXid oxid, + CommitSeqNo csn); +extern bool o_indices_del(OTable *table, OIndexNumber ixNum, OXid oxid, + CommitSeqNo csn); +extern OIndex *o_indices_get(ORelOids oids, OIndexType type); +extern OIndex *o_indices_get_extended(ORelOids oids, OIndexType type, OTableFetchContext ctx); + +extern bool o_indices_update(OTable *table, OIndexNumber ixNum, + OXid oxid, CommitSeqNo csn); +extern bool o_indices_find_table_oids(ORelOids indexOids, OIndexType type, + OSnapshot *oSnapshot, + ORelOids *tableOids); +extern void o_indices_foreach_oids(OIndexOidsCallback callback, void *arg); + +#endif diff --git a/contrib/orioledb/include/catalog/o_sys_cache.h b/contrib/orioledb/include/catalog/o_sys_cache.h new file mode 100644 index 00000000000..aa5795cc368 --- /dev/null +++ b/contrib/orioledb/include/catalog/o_sys_cache.h @@ -0,0 +1,669 @@ 
+/*------------------------------------------------------------------------- + * + * o_sys_cache.h + * Generic interface for system catalog duplicate trees. + * + * Generic system catalog tree interface that is used to prevent syscache + * usage during recovery. System catalog cache trees should use o_sys_cache_* + * functions in sysTreesMeta (sys_trees.c), but if sys cache is + * not TOAST tup_print function should be also provided. + * Sys cache lookups are also cached in local backend memory. + * Cache entry invalidation is performed by syscache hook. + * Instead of physical deletion of sys cache entry we mark it as deleted. + * Normally only not deleted entries are used. During recovery we use + * sys cache entries according to current WAL position. + * Physical deletion of deleted values is performed during checkpoint, + * which is also called after successful recovery. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/catalog/o_sys_cache.h + * + *------------------------------------------------------------------------- + */ + +#ifndef __O_SYS_CACHE_H__ +#define __O_SYS_CACHE_H__ + +#include "orioledb.h" + +#include "btree/undo.h" +#include "catalog/o_tables.h" +#include "catalog/sys_trees.h" +#include "miscadmin.h" +#include "recovery/recovery.h" +#include "utils/catcache.h" +#include "utils/pg_locale.h" +#include "utils/resowner.h" +#if PG_VERSION_NUM < 170000 /* resowner_private.h was removed in PostgreSQL 17 */ +#include "utils/resowner_private.h" +#endif +#include "access/xlogrecovery.h" + +/* + * Database oid to be used for sys cache entries comparison. 
+ */ +extern Oid o_sys_cache_search_datoid; + +typedef uint32 OSysCacheHashKey; /* GetSysCacheHashValue result type */ + +typedef struct OSysCache OSysCache; + +typedef struct OSysCacheKeyCommon +{ + Oid datoid; + XLogRecPtr lsn; + bool deleted; + int dataLength; +} OSysCacheKeyCommon; + +typedef struct OSysCacheKey +{ + OSysCacheKeyCommon common; + Datum keys[FLEXIBLE_ARRAY_MEMBER]; +} OSysCacheKey; + +#define O_KEY_GET_NAME(key, att_num) (((key)->common.dataLength == 0) ? \ + DatumGetName((key)->keys[att_num]) : \ + ((Name) (((Pointer) (key)) + (key)->keys[att_num]))) + +typedef struct OSysCacheBound +{ + OSysCacheKey *key; + int nkeys; +} OSysCacheBound; + +/* Key of entry stored in non-TOAST sys cache tree */ +typedef struct +{ + OSysCacheKeyCommon common; + Datum keys[1]; +} OSysCacheKey1; + +typedef struct +{ + OSysCacheKeyCommon common; + Datum keys[2]; +} OSysCacheKey2; + +typedef struct +{ + OSysCacheKeyCommon common; + Datum keys[3]; +} OSysCacheKey3; + +typedef struct +{ + OSysCacheKeyCommon common; + Datum keys[4]; +} OSysCacheKey4; + +/* Key of chunks of entry stored in TOAST sys cache tree */ +typedef struct OSysCacheToastChunkKeyCommon +{ + uint32 chunknum; +} OSysCacheToastChunkKeyCommon; + +/* Key of chunks of entry stored in TOAST sys cache tree */ +typedef struct +{ + OSysCacheToastChunkKeyCommon common; + OSysCacheKey sys_cache_key; +} OSysCacheToastChunkKey; + +/* Key of chunks of entry stored in TOAST sys cache tree */ +typedef struct +{ + OSysCacheToastChunkKeyCommon common; + OSysCacheKey1 sys_cache_key; +} OSysCacheToastChunkKey1; + +/* Key of chunks of entry stored in TOAST sys cache tree */ +typedef struct +{ + OSysCacheToastChunkKeyCommon common; + OSysCacheKey2 sys_cache_key; +} OSysCacheToastChunkKey2; + +/* Key of chunks of entry stored in TOAST sys cache tree */ +typedef struct +{ + OSysCacheToastChunkKeyCommon common; + OSysCacheKey3 sys_cache_key; +} OSysCacheToastChunkKey3; + +/* Key of chunks of entry stored in TOAST sys cache 
tree */ +typedef struct +{ + OSysCacheToastChunkKeyCommon common; + OSysCacheKey4 sys_cache_key; +} OSysCacheToastChunkKey4; + +/* Key by which entry searched in TOAST sys cache tree */ +typedef struct +{ + OSysCacheToastChunkKeyCommon common; + OSysCacheKey *key; + bool lsn_cmp; +} OSysCacheToastKeyBound; + +/* Chunks of entry stored in TOAST sys cache tree */ +typedef struct OSysCacheToastChunkCommon +{ + uint32 dataLength; +} OSysCacheToastChunkCommon; + +/* Chunks of entry stored in TOAST sys cache tree */ +typedef struct OSysCacheToastChunk1 +{ + OSysCacheToastChunkKey1 key; + OSysCacheToastChunkCommon common; + char data[FLEXIBLE_ARRAY_MEMBER]; +} OSysCacheToastChunk1; + +/* Chunks of entry stored in TOAST sys cache tree */ +typedef struct OSysCacheToastChunk2 +{ + OSysCacheToastChunkKey2 key; + OSysCacheToastChunkCommon common; + char data[FLEXIBLE_ARRAY_MEMBER]; +} OSysCacheToastChunk2; + +/* Chunks of entry stored in TOAST sys cache tree */ +typedef struct OSysCacheToastChunk3 +{ + OSysCacheToastChunkKey3 key; + OSysCacheToastChunkCommon common; + char data[FLEXIBLE_ARRAY_MEMBER]; +} OSysCacheToastChunk3; + +/* Chunks of entry stored in TOAST sys cache tree */ +typedef struct OSysCacheToastChunk4 +{ + OSysCacheToastChunkKey4 key; + OSysCacheToastChunkCommon common; + char data[FLEXIBLE_ARRAY_MEMBER]; +} OSysCacheToastChunk4; + +typedef struct OSysCacheFuncs +{ + /* + * Should be always set. Used in invalidation hook to cleanup entry saved + * in fastcache. Also used inside o_sys_cache_add_if_needed. + */ + void (*free_entry) (Pointer entry); + + /* + * Should be always set. Used inside o_sys_cache_add_if_needed and + * o_sys_cache_update_if_needed. On add entry_ptr is NULL, entry should be + * created and returned. + */ + void (*fill_entry) (Pointer *entry_ptr, OSysCacheKey *key, + Pointer arg); + + /* + * Used in toast sys cache trees. Should return pointer to binary + * serialized data and it's length. 
+ */ + Pointer (*toast_serialize_entry) (Pointer entry, int *len); + + /* + * Used in toast sys cache trees. Should return pointer to constructed + * entry of a tree. + */ + Pointer (*toast_deserialize_entry) (MemoryContext mcxt, + Pointer data, + Size length); +} OSysCacheFuncs; + +typedef uint32 (*O_CCHashFN) (OSysCacheKey *key, int att_num); + +typedef struct OSysCache +{ + int sys_tree_num; + bool is_toast; + Oid cc_indexoid; + int cacheId; + int nkeys; + Oid keytypes[CATCACHE_MAXKEYS]; + int data_len; + MemoryContext mcxt; /* context where stored entries from fast + * cache */ + HTAB *fast_cache; /* contains OSysCacheHashEntry-s */ + O_CCHashFN cc_hashfunc[CATCACHE_MAXKEYS]; + OSysCacheHashKey last_fast_cache_key; + Pointer last_fast_cache_entry; + OSysCacheFuncs *funcs; +} OSysCache; + +/* + * Initializes all sys catalog caches. + */ +extern void o_sys_caches_init(void); + +extern OSysCache *o_create_sys_cache(int sys_tree_num, bool is_toast, + Oid cc_indexoid, /* cacheinfo indoid */ + int cacheId, /* cacheinfo array index */ + int nkeys, + Oid *keytypes, + int data_len, + HTAB *fast_cache, + MemoryContext mcxt, + OSysCacheFuncs *funcs); +extern Pointer o_sys_cache_search(OSysCache *sys_cache, int nkeys, + OSysCacheKey *key); +extern void o_sys_cache_add_if_needed(OSysCache *sys_cache, OSysCacheKey *key, + Pointer arg); +extern void o_sys_cache_update_if_needed(OSysCache *sys_cache, + OSysCacheKey *key, Pointer arg); +extern bool o_sys_cache_delete(OSysCache *sys_cache, OSysCacheKey *key); + +extern void o_cache_table_types(OTable *o_table); +extern void o_cache_index_types(OTable *o_table, OTableIndex *o_table_index); +extern void o_cache_type(Oid datoid, Oid typoid, Oid opclass, + XLogRecPtr insert_lsn); + +/* + * safe version that collect processed types to prevent recursion + * when collecting any functions for type + */ +extern void o_cache_type_safe(Oid datoid, Oid typoid, Oid opclass, + XLogRecPtr insert_lsn, List **processed); +extern bool 
custom_type_try_add_hash_fn_if_needed(Oid typoid, + Oid opclass, + List **processed); +extern void o_validate_composite_type(Oid typoid, Oid opclass); +extern Oid o_get_hash_proc_by_btree_opclass(Oid btreeOpclass); + +extern void o_sys_caches_delete_by_lsn(XLogRecPtr checkPointRedo); + +extern void orioledb_setup_syscache_hooks(void); + +extern int o_sys_cache_key_length(BTreeDescr *desc, OTuple tuple); +extern int o_sys_cache_tup_length(BTreeDescr *desc, OTuple tuple); +extern int o_sys_cache_cmp(BTreeDescr *desc, void *p1, BTreeKeyType k1, + void *p2, BTreeKeyType k2); +extern void o_sys_cache_key_print(BTreeDescr *desc, StringInfo buf, + OTuple key_tup, Pointer arg); +extern JsonbValue *o_sys_cache_key_to_jsonb(BTreeDescr *desc, OTuple tup, + JsonbParseState **state); + +extern int o_sys_cache_toast_chunk_length(BTreeDescr *desc, OTuple tuple); +extern int o_sys_cache_toast_cmp(BTreeDescr *desc, void *p1, + BTreeKeyType k1, void *p2, + BTreeKeyType k2); +extern void o_sys_cache_toast_key_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); +extern JsonbValue *o_sys_cache_toast_key_to_jsonb(BTreeDescr *desc, + OTuple tup, + JsonbParseState **state); +extern void o_sys_cache_toast_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); +extern void o_sys_cache_delete_callback(UndoLogType undoType, UndoLocation location, + UndoStackItem *baseItem, OXid oxid, + OUndoCallbackStage stage, bool changeCountsValid); + +#define O_SYS_CACHE_INIT_FUNC(cache_name) \ +void o_##cache_name##_init(MemoryContext mcxt, HTAB *fastcache) + +#define OSC_REP_PREF1(PREFIX,X) PREFIX X##1 +#define OSC_REP_PREF2(PREFIX,X) OSC_REP_PREF1(PREFIX,X), PREFIX X##2 +#define OSC_REP_PREF3(PREFIX,X) OSC_REP_PREF2(PREFIX,X), PREFIX X##3 +#define OSC_REP_PREF4(PREFIX,X) OSC_REP_PREF3(PREFIX,X), PREFIX X##4 + +#define OSC_REP_DATUM(ONES,X) OSC_REP_PREF##ONES(Datum,X) + +#define OSC_REP_ARGS(ONES) OSC_REP_PREF##ONES(,arg) + +#define O_SYS_CACHE_ARGS(nkeys) 
OSC_REP_DATUM(nkeys,arg) + +#define O_SYS_CACHE_DECLS(cache_name, elem_type, nkeys) \ +extern O_SYS_CACHE_INIT_FUNC(cache_name); \ +extern bool o_##cache_name##_delete(Oid datoid, O_SYS_CACHE_ARGS(nkeys)); \ +void o_##cache_name##_update_if_needed(Oid datoid, O_SYS_CACHE_ARGS(nkeys), \ + Pointer arg); \ +void o_##cache_name##_add_if_needed(Oid datoid, O_SYS_CACHE_ARGS(nkeys), \ + XLogRecPtr insert_lsn, \ + Pointer arg); \ +extern int no_such_variable + +#define O_SYS_CACHE_FUNCS(cache_name, elem_type, nkeys) \ + static inline elem_type *o_##cache_name##_search( \ + Oid datoid, O_SYS_CACHE_ARGS(nkeys), XLogRecPtr search_lsn, \ + int nkeys_arg); \ + elem_type * \ + o_##cache_name##_search(Oid datoid, O_SYS_CACHE_ARGS(nkeys), \ + XLogRecPtr search_lsn, int nkeys_arg) \ + { \ + OSysCacheKeyCommon common = { 0 }; \ + OSysCacheKey##nkeys key; \ + Datum keys[nkeys] = {OSC_REP_ARGS(nkeys)}; \ + ASAN_UNPOISON_MEMORY_REGION(&key, sizeof(key)); \ + ASAN_UNPOISON_MEMORY_REGION(&key.keys, sizeof(key.keys)); \ + memset(&key, 0, sizeof(key)); \ + memcpy(&key.keys, &keys, sizeof(keys)); \ + common.datoid = datoid; \ + common.lsn = search_lsn; \ + key.common = common; \ + return (elem_type *) \ + o_sys_cache_search(cache_name, nkeys_arg, \ + (OSysCacheKey *) &key); \ + } \ + bool \ + o_##cache_name##_delete(Oid datoid, O_SYS_CACHE_ARGS(nkeys)) \ + { \ + OSysCacheKey##nkeys key = {.common = {.datoid = datoid}, \ + .keys = {OSC_REP_ARGS(nkeys)}}; \ + return o_sys_cache_delete(cache_name, (OSysCacheKey *) &key); \ + } \ + void \ + o_##cache_name##_update_if_needed(Oid datoid, O_SYS_CACHE_ARGS(nkeys), \ + Pointer arg) \ + { \ + OSysCacheKey##nkeys key = {.common = {.datoid = datoid}, \ + .keys = {OSC_REP_ARGS(nkeys)}}; \ + o_sys_cache_update_if_needed(cache_name, (OSysCacheKey *) &key, \ + arg); \ + } \ + void \ + o_##cache_name##_add_if_needed(Oid datoid, O_SYS_CACHE_ARGS(nkeys), \ + XLogRecPtr insert_lsn, \ + Pointer arg) \ + { \ + OSysCacheKeyCommon common = { 0 }; \ + 
OSysCacheKey##nkeys key; \ + Datum keys[nkeys] = {OSC_REP_ARGS(nkeys)}; \ + ASAN_UNPOISON_MEMORY_REGION(&key, sizeof(key)); \ + ASAN_UNPOISON_MEMORY_REGION(&key.keys, sizeof(key.keys)); \ + memset(&key, 0, sizeof(key)); \ + memcpy(&key.keys, &keys, sizeof(keys)); \ + common.datoid = datoid; \ + common.lsn = insert_lsn; \ + key.common = common; \ + o_sys_cache_add_if_needed(cache_name, (OSysCacheKey *) &key, arg); \ + } \ + extern int no_such_variable + +static inline void +o_sys_cache_set_datoid_lsn(XLogRecPtr *cur_lsn, Oid *datoid) +{ + if (cur_lsn) + *cur_lsn = is_recovery_in_progress() ? GetXLogReplayRecPtr(NULL) : + GetXLogWriteRecPtr(); + + if (datoid) + { + if (OidIsValid(MyDatabaseId)) + { + *datoid = MyDatabaseId; + } + else + { + Assert(OidIsValid(o_sys_cache_search_datoid)); + *datoid = o_sys_cache_search_datoid; + } + } +} + +extern void o_set_syscache_hooks(void); +extern void o_unset_syscache_hooks(void); +extern void o_reset_syscache_hooks(void); +extern bool o_is_syscache_hooks_set(void); + +/* o_enum_cache.c */ + +typedef struct OEnumData +{ + Oid oid; + float4 enumsortorder; +} OEnumData; + +typedef struct +{ + OSysCacheKey2 key; + OEnumData data; +} OEnum; + +typedef struct +{ + OSysCacheKey1 key; + Oid enumtypid; +} OEnumOid; + +O_SYS_CACHE_DECLS(enum_cache, OEnum, 2); +O_SYS_CACHE_DECLS(enumoid_cache, OEnumOid, 1); +extern void o_enum_cache_add_all(Oid datoid, Oid enum_oid, + XLogRecPtr insert_lsn); +extern HeapTuple o_enum_cache_search_htup(TupleDesc tupdesc, Oid enumtypid, + Name enumlabel); +extern void o_enum_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); +extern void o_enum_cache_delete_all(Oid datoid, Oid enum_oid); + +extern HeapTuple o_enumoid_cache_search_htup(TupleDesc tupdesc, Oid enum_oid); +extern void o_enumoid_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +extern void o_load_enum_cache_data_hook(TypeCacheEntry *tcache); + +/* o_range_cache.c */ +typedef struct +{ + 
OSysCacheKey1 key; + Oid rngsubtype; + Oid rngcollation; + Oid rngsubopc; +} ORange; + +O_SYS_CACHE_DECLS(range_cache, ORange, 1); +extern HeapTuple o_range_cache_search_htup(TupleDesc tupdesc, Oid rngtypid); +extern void o_range_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +typedef struct +{ + OSysCacheKey1 key; + Oid rngtypid; +} OMultiRange; + +O_SYS_CACHE_DECLS(multirange_cache, OMultiRange, 1); +extern HeapTuple o_multirange_cache_search_htup(TupleDesc tupdesc, + Oid rngmultitypid); +extern void o_multirange_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +/* o_class_cache.c */ +typedef struct OClass OClass; + +typedef struct OClassArg +{ + bool column_drop; + int dropped; + bool found; +} OClassArg; + +O_SYS_CACHE_DECLS(class_cache, OClass, 1); +extern TupleDesc o_class_cache_search_tupdesc(Oid cc_reloid); +extern void o_class_cache_preload_for_column(Oid typoid); + +/* o_opclass_cache.c */ +typedef struct OOpclass +{ + OSysCacheKey1 key; + Oid opfamily; + Oid inputtype; + + /* + * We do not want to set FmgrInfo.fn_oid as random value. 
+ */ + Oid cmpOid; + Oid ssupOid; +} OOpclass; + +O_SYS_CACHE_DECLS(opclass_cache, OOpclass, 1); +extern OOpclass *o_opclass_get(Oid opclassoid, Oid datoid); +extern HeapTuple o_opclass_cache_search_htup(TupleDesc tupdesc, + Oid opclassoid); +extern void o_opclass_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +/* o_proc_cache.c */ +typedef struct OProc OProc; + +typedef struct OProcArg +{ + Oid collation; + List **processed; +} OProcArg; + +O_SYS_CACHE_DECLS(proc_cache, OProc, 1); +extern Datum o_fmgr_sql(PG_FUNCTION_ARGS); +extern void o_proc_cache_validate_add(Oid datoid, Oid procoid, Oid fncollation, + char *func_type, char *used_for, + List **processed); +extern void o_proc_cache_fill_finfo(FmgrInfo *finfo, Oid procoid, Oid datoid); +extern HeapTuple o_proc_cache_search_htup(TupleDesc tupdesc, Oid procoid); + +/* o_type_cache.c */ +typedef struct OType +{ + OSysCacheKey1 key; + NameData typname; + int16 typlen; + bool typbyval; + char typalign; + char typstorage; + Oid typcollation; + Oid typrelid; + char typtype; + char typcategory; + bool typispreferred; + bool typisdefined; + regproc typinput; + regproc typoutput; + regproc typreceive; + regproc typsend; + Oid typelem; + char typdelim; + Oid typbasetype; + int32 typtypmod; + Oid typsubscript; + Oid default_btree_opclass; + Oid default_hash_opclass; +} OType; + +O_SYS_CACHE_DECLS(type_cache, OType, 1); +extern HeapTuple o_type_cache_search_htup(TupleDesc tupdesc, Oid typeoid); +extern void o_type_cache_fill_info(Oid typeoid, int16 *typlen, bool *typbyval, + char *typalign, char *typstorage, + Oid *typcollation); +extern Oid o_type_cache_default_opclass(Oid typeoid, Oid am_id); +extern void o_type_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); +extern bool o_type_cache_get_typtype(Oid typeoid, char *typtype); + +/* o_aggregate_cache.c */ +typedef struct OAggregate OAggregate; + +O_SYS_CACHE_DECLS(aggregate_cache, OAggregate, 1); +extern HeapTuple 
o_aggregate_cache_search_htup(TupleDesc tupdesc, Oid aggfnoid); + +/* o_operator_cache.c */ +typedef struct OOperator +{ + OSysCacheKey1 key; + regproc oprcode; +} OOperator; + +O_SYS_CACHE_DECLS(operator_cache, OOperator, 1); +extern HeapTuple o_operator_cache_search_htup(TupleDesc tupdesc, Oid operoid); +extern void o_operator_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); +extern Oid o_operator_cache_get_oprcode(Oid operoid); + +/* o_amop_cache.c */ +typedef struct OAmOp +{ + OSysCacheKey3 key; + Oid amopmethod; + int16 amopstrategy; + Oid amopfamily; + Oid amoplefttype; + Oid amoprighttype; +} OAmOp; + +typedef struct OAmOpStrat +{ + OSysCacheKey4 key; + Oid amopopr; +} OAmOpStrat; + +O_SYS_CACHE_DECLS(amop_cache, OAmOp, 3); +extern HeapTuple o_amop_cache_search_htup(TupleDesc tupdesc, Oid amopopr, + char amoppurpose, Oid amopfamily); +extern List *o_amop_cache_search_htup_list(TupleDesc tupdesc, Oid amopopr); +extern void o_amop_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +O_SYS_CACHE_DECLS(amop_strat_cache, OAmOpStrat, 4); +extern HeapTuple o_amop_strat_cache_search_htup(TupleDesc tupdesc, + Oid amopfamily, + Oid amoplefttype, + Oid amoprighttype, + int16 amopstrategy); +extern void o_amop_strat_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +/* o_amproc_cache.c */ +typedef struct OAmProc +{ + OSysCacheKey4 key; + regproc amproc; +} OAmProc; + +O_SYS_CACHE_DECLS(amproc_cache, OAmProc, 4); +extern HeapTuple o_amproc_cache_search_htup(TupleDesc tupdesc, + Oid amprocfamily, + Oid amproclefttype, + Oid amprocrighttype, + int16 amprocnum); +extern void o_amproc_cache_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +/* o_collation_cache.c */ +O_SYS_CACHE_DECLS(collation_cache, OCollation, 1); +extern HeapTuple o_collation_cache_search_htup(TupleDesc tupdesc, Oid colloid); +extern void orioledb_save_collation(Oid colloid); + +/* 
o_database_cache.c */ +O_SYS_CACHE_DECLS(database_cache, ODatabase, 1); +extern int32 o_database_cache_get_database_encoding(void); +extern void o_database_cache_set_database_encoding(void); +#if PG_VERSION_NUM >= 170000 +extern void o_database_cache_set_default_locale_provider(void); +#endif +extern void o_database_cache_set_lc_collate(void); + +static inline void +o_set_sys_cache_search_datoid(Oid datoid) +{ + if (o_sys_cache_search_datoid != datoid) + { + o_sys_cache_search_datoid = datoid; + if (!OidIsValid(MyDatabaseId)) + { + o_database_cache_set_database_encoding(); +#if PG_VERSION_NUM >= 170000 + o_database_cache_set_default_locale_provider(); +#endif + o_database_cache_set_lc_collate(); + } + } +} + +extern void o_get_prefixes_for_tablespace(Oid datoid, Oid tablespace, + char **prefix, char **db_prefix); + +#endif /* __O_SYS_CACHE_H__ */ diff --git a/contrib/orioledb/include/catalog/o_tables.h b/contrib/orioledb/include/catalog/o_tables.h new file mode 100644 index 00000000000..0965ec5d67e --- /dev/null +++ b/contrib/orioledb/include/catalog/o_tables.h @@ -0,0 +1,374 @@ +/*------------------------------------------------------------------------- + * + * o_tables.h + * Routines for orioledb tables system tree. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/catalog/o_tables.h + * + *------------------------------------------------------------------------- + */ +#ifndef __O_TABLES_H__ +#define __O_TABLES_H__ + +#include "btree/btree.h" +#include "catalog/sys_trees.h" + +#include "access/tupdesc.h" +#include "access/tupdesc_details.h" +#include "executor/execExpr.h" +#include "catalog/objectaddress.h" +#include "nodes/parsenodes.h" +#include "orioledb.h" + +/* + * Describes a field of an orioledb table. 
+ */ +typedef struct +{ + NameData name; + Oid typid; + Oid collation; + int32 typmod BKI_DEFAULT(-1); + int32 ndims; + bool byval; + bool droped; + bool notnull; + int16 typlen; + char align; + char storage; + char compression; + bool hasmissing; + bool hasdef; + char generated; +} OTableField; + +/* + * Describes an index field of an orioledb table. + */ +typedef struct +{ + int attnum; + Oid collation; + Oid opclass; + SortByDir ordering; + SortByNulls nullsOrdering; + Oid hash_fn_oid; +} OTableIndexField; + +/* + * Describes an index of an orioledb table. + */ +typedef struct +{ + NameData name; + ORelOids oids; + OIndexType type; + OCompress compress; + uint32 version; + bool nulls_not_distinct; + uint8 nfields; + /* number of index fields */ + uint8 nkeyfields; + uint8 fillfactor; + OTableIndexField fields[INDEX_MAX_KEYS]; + uint8 nexprfields; + OTableField *exprfields; + List *expressions; /* list of Expr */ + char *predicate_str; + List *predicate; /* list of Expr */ + Oid tablespace; + Oid *exclops; + bool immediate; + MemoryContext index_mctx; +} OTableIndex; + +/* + * Describes an orioledb table. + */ +typedef struct +{ + ORelOids oids; + ORelOids toast_oids; + + /* + * Per-table index version counters used for sys-tree visibility (MVCC) + * during recovery and logical decoding. + * + * OrioleDB stores catalog-like metadata in system trees. For some + * operations the "same" logical index (e.g. the table's primary index) + * may be replaced by a new metadata record while keeping stable identity + * attributes (relation OIDs, names, etc.). + * + * To make each incarnation unambiguous, OIndex records are keyed not only + * by (table oids, index type) but also by a monotonically changing + * version. These fields keep the current version for the corresponding + * index kind and are copied into ORelFetchContext.version when we need to + * read the matching OIndex from SYS_TREES. 
+ * + * O_TABLE_INVALID_VERSION means "index does not exist / version is + * unknown". + */ + uint32 toast_ixversion; /* TOAST-index version for current table */ + uint32 primary_ixversion; /* Primary-index version for current table */ + uint32 bridge_ixversion; /* Bridge-index version for current table */ + ORelOids bridge_oids; + OCompress default_compress; + OCompress primary_compress; + OCompress toast_compress; + bool index_bridging; + uint16 nfields; + uint16 primary_init_nfields; + uint16 nindices; + Oid tid_btree_ops_oid; /* have to store it here */ + Oid tid_hash_fn_oid; /* have to store it here */ + Oid int2_hash_fn_oid; /* have to store it here */ + Oid int4_hash_fn_oid; /* have to store it here */ + bool has_primary; + char persistence; + uint8 fillfactor; + uint16 data_version; + OTableIndex *indices; + OTableField *fields; + AttrMissing *missing; /* missing attributes values, NULL if none */ + Oid tablespace; + uint32 version; /* not serialized in serialize_o_table */ + MemoryContext tbl_mctx; /* not serialized in serialize_o_table */ +} OTable; + +#define OGetTableContext(table) \ + ((table)->tbl_mctx ? \ + (table)->tbl_mctx : \ + ((table)->tbl_mctx = AllocSetContextCreate(TopMemoryContext, \ + "OTableContext", \ + ALLOCSET_DEFAULT_SIZES))) + +/* + * Maximum number of retries when deserialization fails due to truncated toast + * data (missing chunks from a concurrent write race condition). + */ +#define O_DESERIALIZE_MAX_RETRIES 100 + +/* Parameters for a deserialization retry exponential backoff. */ +#define O_DESERIALIZE_RETRY_MIN_DURATION (1000L) +#define O_DESERIALIZE_RETRY_MAX_DURATION (100000L) + +extern void o_table_fill_index(OTable *o_table, OIndexNumber ix_num, + Relation index_rel); + +/* Creates and fills OTable. 
*/ +extern OTable *o_table_tableam_create(ORelOids oids, TupleDesc tupdesc, + char relpersistence, uint8 fillfactor, + Oid tablespace, bool bridging); + +OTableField *o_tables_get_builtin_field(Oid type); +extern void o_tables_tupdesc_init_builtin(TupleDesc desc, AttrNumber att_num, + char *name, Oid type); + +extern TupleDesc o_table_fields_make_tupdesc(OTableField *fields, int nfields); + +/* Returns tuple descriptor of the OTable */ +extern TupleDesc o_table_tupdesc(OTable *o_table); + +/* Finds table field by its name */ +extern OTableField *o_table_field_by_name(OTable *table, const char *name); + +/* Drops a table by oids from o_tables list */ +extern OTable *o_tables_drop_by_oids(ORelOids oids, OXid oxid, CommitSeqNo csn); + +/* Drops all tables from o_tables list */ +extern void o_tables_drop_all(OXid oxid, CommitSeqNo csn, Oid database_id); + +/* Drops all columns of a specific type */ +extern void o_tables_drop_columns_by_type(OXid oxid, CommitSeqNo csn, Oid type_oid); + +/* Drops all temporary tables that left after crash */ +extern void o_tables_truncate_all_unlogged(void); + +/* Adds a new table to o_tables list */ +extern bool o_tables_add(OTable *table, OXid oxid, CommitSeqNo csn); + +/* Returns OTable by its oids */ +extern OTable *o_tables_get(ORelOids oids); + +/* Returns OTable by its oids, version and snapshot */ +extern OTable *o_tables_get_extended(ORelOids oids, OTableFetchContext ctx); + +/* Returns OTable by its index oids */ +extern OTable *o_tables_get_by_tree(ORelOids oids, OIndexType type); + +/* Returns number of OrioleDB tables in the database */ +extern int o_tables_num(Oid datoid); + +/* Updates OTable description in o_tables list */ +extern bool o_tables_update(OTable *table, OXid oxid, CommitSeqNo csn); + +/* Invalidates descriptors after o_tables_update */ +void o_tables_after_update(OTable *o_table, OXid oxid, CommitSeqNo csn); + +/* Free memory of OTable struct */ +extern void o_table_free(OTable *table); + +extern OIndexKey 
*o_table_make_index_keys(OTable *table, int *num); + +/* callback for o_tables_foreach() */ +typedef void (*OTablesCallback) (OTable *descr, void *arg); + +/* callback for o_tables_foreach_oids() */ +typedef void (*OTablesOidsCallback) (ORelOids oids, void *arg); + +/* Iterates through o_tables list. */ +extern void o_tables_foreach(OTablesCallback callback, + OSnapshot *oSnapshot, + void *arg); +extern void o_tables_foreach_oids(OTablesOidsCallback callback, + OSnapshot *oSnapshot, + void *arg); + +Pointer serialize_o_table(OTable *o_table, int *size); + +OTable *deserialize_o_table(Pointer data, Size length); + +/* + * We can't use relation_open/LockRelationId locks to protect relations that + * belong to other database. + * + * We must use this locks to protect critical code sections interacting with + * relations from other databases (workers code, walk_page() for backends). + * + * TableAM handler functions are already protected by top-level, there are no + * need on this locks nested TableAM handler functions. 
+ */ +extern bool o_tables_rel_try_lock_extended(ORelOids *oids, int lockmode, bool *nested, bool checkpoint); +extern void o_tables_rel_lock_extended(ORelOids *oids, int lockmode, bool checkpoint); +extern void o_tables_rel_lock_extended_no_inval(ORelOids *oids, int lockmode, + bool checkpoint); +extern void o_tables_rel_lock_exclusive_no_inval_no_log(ORelOids *oids); +extern void o_tables_rel_unlock_extended(ORelOids *oids, int lockmode, bool checkpoint); + +/* Deserialize OTable stored in O_TABLES sys tree */ +extern void o_serialize_node(Node *node, StringInfo str); +extern Node *o_deserialize_node(Pointer *ptr); +extern bool o_deserialize_node_safe(Pointer *ptr, Pointer data, Size length, Node **out); +extern void o_serialize_string(char *serialized, StringInfo str); +extern char *o_deserialize_string(Pointer *ptr); +extern bool o_deserialize_string_safe(Pointer *ptr, Pointer data, Size length, char **out); + +static inline bool +o_tables_rel_try_lock(ORelOids *oids, int lockmode, bool *nested) +{ + return o_tables_rel_try_lock_extended(oids, lockmode, nested, false); +} + +static inline void +o_tables_rel_lock(ORelOids *oids, int lockmode) +{ + o_tables_rel_lock_extended(oids, lockmode, false); +} + +static inline void +o_tables_rel_unlock(ORelOids *oids, int lockmode) +{ + o_tables_rel_unlock_extended(oids, lockmode, false); +} + +extern void o_table_fill_oids(OTable *oTable, Relation rel, + const RelFileNode *newrnode, + bool drop_pkey); +extern Datum o_eval_default(OTable *o_table, Relation rel, + Node *expr, TupleTableSlot *scantuple, + bool byval, int16 typlen, bool *isNull); +extern void o_table_resize_constr(OTable *o_table); +extern void o_table_fill_constr(OTable *o_table, Relation rel, int fieldnum, + OTableField *old_field, OTableField *field); +extern void o_tupdesc_load_constr(TupleDesc tupdesc, OTable *o_table, + OIndexDescr *descr); +extern char *o_get_type_name(Oid typid, int32 typmod); + +static inline int +o_table_fieldnum(OTable *table, 
const char *name) +{ + int i; + + for (i = 0; i < table->nfields; i++) + { + if (table->fields[i].droped) + continue; + if (pg_strcasecmp(NameStr(table->fields[i].name), name) == 0) + return i; + } + return i; +} + +extern void orioledb_attr_to_field(OTableField *field, Form_pg_attribute attr); + +extern void o_tables_meta_lock(void); +extern void o_tables_meta_lock_no_wal(void); + +static inline void +o_tables_rel_meta_lock(Relation rel) +{ + if (!rel || rel->rd_rel->relpersistence != RELPERSISTENCE_TEMP) + o_tables_meta_lock(); + else + o_tables_meta_lock_no_wal(); +} +static inline void +o_tables_table_meta_lock(OTable *o_table) +{ + if (!o_table || o_table->persistence != RELPERSISTENCE_TEMP) + o_tables_meta_lock(); + else + o_tables_meta_lock_no_wal(); +} + +extern void o_tables_meta_unlock(ORelOids oids, Oid oldRelnode); +extern void o_tables_meta_unlock_no_wal(void); + +static inline void +o_tables_rel_meta_unlock(Relation rel, Oid oldRelnode) +{ + if (!rel) + { + ORelOids tmpOids = {InvalidOid, InvalidOid, InvalidOid}; + + o_tables_meta_unlock(tmpOids, oldRelnode); + } + else if (rel->rd_rel->relpersistence != RELPERSISTENCE_TEMP) + { + ORelOids oids; + + ORelOidsSetFromRel(oids, rel); + o_tables_meta_unlock(oids, oldRelnode); + } + else + { + o_tables_meta_unlock_no_wal(); + } +} +static inline void +o_tables_table_meta_unlock(OTable *o_table, Oid oldRelnode) +{ + if (!o_table) + { + ORelOids tmpOids = {InvalidOid, InvalidOid, InvalidOid}; + + o_tables_meta_unlock(tmpOids, oldRelnode); + } + else if (o_table->persistence != RELPERSISTENCE_TEMP) + { + o_tables_meta_unlock(o_table->oids, oldRelnode); + } + else + o_tables_meta_unlock_no_wal(); +} + +extern Oid o_saved_relrewrite; +extern List *o_reuse_indices; + +extern void redefine_pkey_for_rel(Relation rel); + +#endif /* __O_TABLES_H__ */ diff --git a/contrib/orioledb/include/catalog/sys_trees.h b/contrib/orioledb/include/catalog/sys_trees.h new file mode 100644 index 00000000000..9f9127d072a --- 
/dev/null +++ b/contrib/orioledb/include/catalog/sys_trees.h @@ -0,0 +1,170 @@ +/*------------------------------------------------------------------------- + * + * sys_trees.h + * Headers for system trees + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/catalog/sys_trees.h + * + *------------------------------------------------------------------------- + */ +#ifndef __SYS_TREES_H__ +#define __SYS_TREES_H__ + +#include "btree/btree.h" +#include "btree/print.h" + +#include "utils/catcache.h" + +#define SYS_TREES_DATOID (1) + +#define SYS_TREES_SHARED_ROOT_INFO (1) +#define SYS_TREES_O_TABLES (2) +#define SYS_TREES_O_INDICES (3) +#define SYS_TREES_OPCLASS_CACHE (4) +#define SYS_TREES_ENUM_CACHE (5) +#define SYS_TREES_ENUMOID_CACHE (6) +#define SYS_TREES_RANGE_CACHE (7) +#define SYS_TREES_CLASS_CACHE (8) +#define SYS_TREES_EXTENTS_OFF_LEN (9) +#define SYS_TREES_EXTENTS_LEN_OFF (10) +#define SYS_TREES_PROC_CACHE (11) +#define SYS_TREES_TYPE_CACHE (12) +#define SYS_TREES_AGG_CACHE (13) +#define SYS_TREES_OPER_CACHE (14) +#define SYS_TREES_AMOP_CACHE (15) +#define SYS_TREES_AMPROC_CACHE (16) +#define SYS_TREES_COLLATION_CACHE (17) +#define SYS_TREES_DATABASE_CACHE (18) +#define SYS_TREES_AMOP_STRAT_CACHE (19) +#define SYS_TREES_EVICTED_DATA (20) +#define SYS_TREES_CHKP_NUM (21) +#define SYS_TREES_MULTIRANGE_CACHE (22) +#define SYS_TREES_CATALOG_XID_UNDO_LOCATION (23) +#define SYS_TREES_NUM (23) + +#define IS_SYS_TREE_OIDS(oids) \ + ((oids).datoid == SYS_TREES_DATOID) + +#define OIDS_EQ_SYS_TREE(oids, systree) \ + ((oids).datoid == SYS_TREES_DATOID && \ + (oids).reloid == (systree) && \ + (oids).relnode == (systree)) + +#define IS_TYPCACHE_SYSTREE(systree) \ + ((systree) == SYS_TREES_ENUM_CACHE || \ + (systree) == SYS_TREES_ENUMOID_CACHE || \ + (systree) == SYS_TREES_TYPE_CACHE || \ + (systree) == SYS_TREES_MULTIRANGE_CACHE) + +#define O_OPCLASS_PROSRC_MAXLEN 512 + +typedef struct 
+{ + Oid datoid; + Oid relnode; +} SharedRootInfoKey; + +typedef struct +{ + SharedRootInfoKey key; + BTreeRootInfo rootInfo; + bool placeholder; +} SharedRootInfo; + +#define O_TABLE_INVALID_VERSION UINT32_MAX + +typedef struct +{ + ORelOids oids; + uint32 chunknum; + uint32 version; +} OTableChunkKey; + +typedef struct +{ + OTableChunkKey key; + OXid oxid; +} OTableChunkBoundKey; + +typedef struct +{ + OTableChunkKey key; + uint32 dataLength; + char data[FLEXIBLE_ARRAY_MEMBER]; +} OTableChunk; + +typedef struct +{ + OIndexType type; + ORelOids oids; + uint32 chunknum; + uint32 version; +} OIndexChunkKey; + +typedef struct +{ + OIndexChunkKey key; + OXid oxid; +} OIndexChunkBoundKey; + +typedef struct +{ + OIndexChunkKey key; + uint32 dataLength; + char data[FLEXIBLE_ARRAY_MEMBER]; +} OIndexChunk; + +/* + * The FileExtent type stores the length of an extent in an unsigned 16-bit value. + * That is enough for FileExtent purposes, but extents inside free B-trees + * can be larger than 2^16. + */ +typedef struct +{ + uint64 offset; + uint64 length; +} FreeTreeFileExtent; + +/* + * Tuple stored in a free B-tree nodes and tuples. 
+ */ +typedef struct +{ + FreeTreeFileExtent extent; + OIndexType ixType; + Oid datoid; + Oid relnode; +} FreeTreeTuple; + +typedef struct +{ + SharedRootInfoKey key; + uint32 checkpointNumbers[2]; +} ChkpNumTuple; + +typedef struct +{ + TransactionId xid; + UndoLocation undoLocation; +} ReplicationRetainUndoTuple; + +extern Size sys_trees_shmem_needs(void); +extern void sys_trees_shmem_init(Pointer ptr, bool found); +extern BTreeDescr *get_sys_tree(int tree_num); +extern BTreeDescr *get_sys_tree_no_init(int tree_num); +extern BTreeStorageType sys_tree_get_storage_type(int tree_num); +extern bool sys_tree_is_temporary(int tree_num); +extern bool sys_tree_supports_transactions(int tree_num); +extern PrintFunc sys_tree_key_print(BTreeDescr *desc); +extern PrintFunc sys_tree_tup_print(BTreeDescr *desc); +extern void sys_tree_set_extra(int tree_num, Pointer extra); +extern Pointer sys_tree_get_extra(int tree_num); +#ifdef IS_DEV +extern const text *inspect_sys_tree_structure(int systree, int depth); +#endif + +#endif /* __SYS_TREES_H__ */ diff --git a/contrib/orioledb/include/checkpoint/checkpoint.h b/contrib/orioledb/include/checkpoint/checkpoint.h new file mode 100644 index 00000000000..4de277f027d --- /dev/null +++ b/contrib/orioledb/include/checkpoint/checkpoint.h @@ -0,0 +1,288 @@ +/*------------------------------------------------------------------------- + * + * checkpoint.h + * Declarations for checkpoint. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/checkpoint/checkpoint.h + * + *------------------------------------------------------------------------- + */ +#ifndef __CHECKPOINT_H__ +#define __CHECKPOINT_H__ + +#include "orioledb.h" + +#include "btree/page_contents.h" +#include "rewind/rewind.h" + +#include "access/xlogdefs.h" + +struct CheckpointFileHeader +{ + uint64 ctid; + uint64 bridgeCtid; + uint64 rootDownlink; + uint64 datafileLength; + uint64 numFreeBlocks; + uint32 leafPagesNum; +}; + +typedef enum +{ + NextKeyNone, + NextKeyValue, + NextKeyGreatest +} NextKeyType; + +typedef enum +{ + CheckpointBoundNone, + CheckpointBoundHikey, + CheckpointBoundRightmost +} CheckpointBound; + +typedef struct +{ + /* + * Checkpoint bound type for current level, helps to determine the + * checkpoint number for eviction + */ + CheckpointBound bound; + + /* + * Type of next key which will be added to image. + */ + NextKeyType nextkeyType; + + /* + * Contains data which will be written to disk. + */ + char image[ORIOLEDB_BLCKSZ]; + + /* + * Hikey of current page. It helps to determine checkpoint number for + * eviction and filling bound for autonomous pages. + */ + OFixedShmemKey hikey; + /* Next key which can be added to current node. */ + OFixedShmemKey nextkey; + /* Lokey of current BTree page */ + OFixedShmemKey lokey; + + /* + * Number of the current BTree page. OInvalidInMemoryBlkno if no page + * is being processed at the moment. + */ + OInMemoryBlkno blkno; + + /* + * Number of the page containing the relevant hikey. Might be different from + * `blkno` for autonomous pages. + */ + OInMemoryBlkno hikeyBlkno; + + /* Current valid offset on BTree page. */ + OffsetNumber offset; + /* Is current in memory page leftmost on a BTree level. */ + bool leftmost; + + /* + * Is current page autonomous. + * + * Autonomous pages do not equal to in memory BTree pages and written to + * disk as far as filling or reach CheckpointPageInfo.hikey. 
+ */ + bool autonomous; + + /* + * Is the image already contains an internal tuple with a valid key. + * + * If autonomous stack[level - 1].image was not written to disk no sense + * to reinsert the same internal tuple key to the current level (with + * autonomous image too). + * + * We can avoid reinsert of an internal tuple in this case. + * + * Although this approach helps to flush autonomous images stack to disk + * if needed. + */ + bool autonomousTupleExist; + + /* + * Helps to setup O_BTREE_FLAG_LEFTMOST for autonomous pages. We can not + * use CheckpointPageInfo.leftmost flag because it used for navigation + * through OrioleDB BTree. + */ + bool autonomousLeftmost; +} CheckpointPageInfo; + +typedef enum +{ + CurKeyLeast, + CurKeyValue, + CurKeyGreatest, + CurKeyFinished +} CurKeyType; + +#define SHARED_ROOT_INFO_INSERT_NUM_LOCKS 128 + +#define XID_RECS_QUEUE_SIZE (max_procs * 4) + +/* + * Tag stored in an XidFileRec to distinguish the kinds of records the file + * carries. The first UndoLogsCount values are 1:1 with UndoLogType for + * "active undo at checkpoint time" records. The next UndoLogsCount values + * are the "rewind" variants (same UndoLogType, but the record describes a + * not-yet-applied rewind cleanup) and are at offset UndoLogsCount. Values + * beyond that are independent kinds carried in the same file, the only one + * today being XidRecPendingSkFixup, written by the checkpointer to record + * a transaction that had applied its primary index change but had not yet + * updated the corresponding secondary indices at the checkpoint boundary. + * + * The numeric layout matches the pre-XidRecKind file format, so existing + * on-disk values keep their meaning. 
+ */ +typedef enum +{ + XidRecUndoRegular = UndoLogRegular, + XidRecUndoRegularPageLevel = UndoLogRegularPageLevel, + XidRecUndoSystem = UndoLogSystem, + XidRecRewindUndoRegular = UndoLogsCount + UndoLogRegular, + XidRecRewindUndoRegularPageLevel = UndoLogsCount + UndoLogRegularPageLevel, + XidRecRewindUndoSystem = UndoLogsCount + UndoLogSystem, + XidRecPendingSkFixup = 2 * UndoLogsCount +} XidRecKind; + +typedef struct +{ + OXid oxid; + XidRecKind kind; + UndoStackLocations undoLocation; + UndoLocation retainLocation; +} XidFileRec; + +/* Rewind kinds are shifted by UndoLogsCount compared to their UndoLogType base */ +#define XID_REC_REWIND_TYPES_OFFSET UndoLogsCount + +typedef struct +{ + uint64 controlIdentifier; + uint32 changecount; + uint32 lastCheckpointNumber; + OIndexType treeType; + Oid datoid; + Oid reloid; + Oid relnode; + Oid tablespace; + bool completed; + CurKeyType curKeyType; + OFixedShmemKey curKeyValue; + CheckpointPageInfo stack[ORIOLEDB_MAX_DEPTH]; + /* pid of the worker */ + pid_t pid; + double dirtyPagesEstimate; + uint64 pagesWritten; + /* helps to avoid skip a new table for the checkpoint in progress */ + int oTablesMetaTrancheId; + LWLock oTablesMetaLock; + int oSysTreesTrancheId; + LWLock oSysTreesLock; + int oSharedRootInfoInsertTrancheId; + LWLock oSharedRootInfoInsertLocks[SHARED_ROOT_INFO_INSERT_NUM_LOCKS]; + struct Latch *checkpointerLatch; + pg_atomic_uint32 autonomousLevel; + XLogRecPtr replayStartPtr; + XLogRecPtr controlReplayStartPtr; + XLogRecPtr sysTreesStartPtr; + XLogRecPtr controlSysTreesStartPtr; + XLogRecPtr toastConsistentPtr; + XLogRecPtr controlToastConsistentPtr; + pg_atomic_uint64 mmapDataLength; + + /* + * Shared memory queue of records for writing to the xids file. Backends + * write to this queue last undo position on transaction commit/abort. + * Checkpoint writes current undo positions for in-progress transactions. 
+ */ + uint32 xidQueueCheckpointNum; + int oXidQueueTrancheId; + LWLock oXidQueueLock; + int oXidQueueFlushTrancheId; + LWLock oXidQueueFlushLock; + int copyBlknoTrancheId; + int oMetaTrancheId; + int punchHolesTrancheId; + pg_atomic_uint64 xidRecLastPos; + pg_atomic_uint64 xidRecFlushPos; + XidFileRec xidRecQueue[FLEXIBLE_ARRAY_MEMBER]; +} CheckpointState; + +#define XID_FILENAME_FORMAT (ORIOLEDB_DATA_DIR"/%u.xid") +/* Macro arguments are parenthesized, so any pointer expression works as 'state'. */ +#define chkp_inc_changecount_before(state) \ + do { \ + (state)->changecount++; \ + pg_write_barrier(); \ + } while (0) + +#define chkp_inc_changecount_after(state) \ + do { \ + pg_write_barrier(); \ + (state)->changecount++; \ + Assert(((state)->changecount & 1) == 0); \ + } while (0) + +#define chkp_save_changecount_before(state, save_changecount) \ + do { \ + (save_changecount) = (state)->changecount; \ + pg_read_barrier(); \ + } while (0) + +#define chkp_save_changecount_after(state, save_changecount) \ + do { \ + pg_read_barrier(); \ + (save_changecount) = (state)->changecount; \ + } while (0) + +extern CheckpointState *checkpoint_state; + +extern Size checkpoint_shmem_size(void); +extern void checkpoint_shmem_init(Pointer ptr, bool found); +extern uint32 o_get_latest_chkp_num(Oid datoid, Oid relnode, + uint32 max_chkp_num, bool *found); +extern void o_update_latest_chkp_num(Oid datoid, Oid relnode, uint32 chkp_num); +extern void o_delete_chkp_num(Oid datoid, Oid relnode); + +extern void o_perform_checkpoint(XLogRecPtr redo_pos, int flags); +extern void o_after_checkpoint_cleanup_hook(XLogRecPtr checkPointRedo, + int flags); + +extern bool page_is_under_checkpoint(BTreeDescr *desc, OInMemoryBlkno blkno, + bool includingHikeyBlkno); +extern bool tree_is_under_checkpoint(BTreeDescr *desc); +extern bool get_checkpoint_number(BTreeDescr *desc, OInMemoryBlkno blkno, uint32 *checkpoint_number, bool *copy_blkno); +extern uint32 get_cur_checkpoint_number(ORelOids *oids, OIndexType type, bool *checkpoint_concurrent); +extern bool can_use_checkpoint_extents(BTreeDescr 
*desc, uint32 chkp_num); +extern void free_extent_for_checkpoint(BTreeDescr *desc, FileExtent *extent, uint32 chkp_num); +extern void backend_set_autonomous_level(CheckpointState *state, uint32 level); +extern bool tbl_data_exists(ORelOids *oids, Oid tablespace); +extern void evictable_tree_init(BTreeDescr *desc, bool init_shmem, + bool *was_evicted); +extern void checkpointable_tree_init(BTreeDescr *desc, bool init_shmem, + bool *was_evicted); +extern void checkpointable_tree_free(BTreeDescr *desc); +extern void systrees_modify_start(void); +extern void systrees_modify_end(bool any_wal); +extern void systrees_lock_callback(UndoLogType undoType, + UndoLocation location, + UndoStackItem *baseItem, OXid oxid, + OUndoCallbackStage stage, bool changeCountsValid); +extern void before_writing_xids_file(int chkpnum); +extern void write_to_xids_queue(XidFileRec *rec); +void checkpoint_write_rewind_item(RewindItem *rewindItem); + +#endif /* __CHECKPOINT_H__ */ diff --git a/contrib/orioledb/include/checkpoint/control.h b/contrib/orioledb/include/checkpoint/control.h new file mode 100644 index 00000000000..8d11ef88875 --- /dev/null +++ b/contrib/orioledb/include/checkpoint/control.h @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * control.h + * Declarations for control file. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/checkpoint/control.h + * + *------------------------------------------------------------------------- + */ +#ifndef __CHECKPOINT_CONTROL_H__ +#define __CHECKPOINT_CONTROL_H__ + +#include "postgres.h" + +#include "orioledb.h" + +typedef struct +{ + UndoLocation lastUndoLocation; + UndoLocation checkpointRetainStartLocation; + UndoLocation checkpointRetainEndLocation; +} CheckpointUndoInfo; + +#define NUM_CHECKPOINTABLE_UNDO_LOGS 2 + +/* + * Bump every time CheckpointControl structure changes its format. 
+ * Also on-the-flight conversion routine should be added to + * check_checkpoint_control() + */ +#define ORIOLEDB_CHECKPOINT_CONTROL_VERSION 1 + +/* + * To ensure correct reading of controlFileVersion, changes in struct layout + * are permitted only between .controlFileVersion and .crc, that + * should be the last. + */ +typedef struct +{ + uint64 controlIdentifier; + uint32 lastCheckpointNumber; + uint32 controlFileVersion; + CommitSeqNo lastCSN; + OXid lastXid; + UndoLocation lastUndoLocation; + XLogRecPtr toastConsistentPtr; + XLogRecPtr replayStartPtr; + XLogRecPtr sysTreesStartPtr; + uint64 mmapDataLength; + CheckpointUndoInfo undoInfo[NUM_CHECKPOINTABLE_UNDO_LOGS]; + UndoLocation checkpointRetainStartLocation; + UndoLocation checkpointRetainEndLocation; + OXid checkpointRetainXmin; + OXid checkpointRetainXmax; + uint32 binaryVersion; + bool s3Mode; + /* CRC of all fields above. It should be last */ + pg_crc32c crc; +} CheckpointControl; + +#define CONTROL_FILENAME ORIOLEDB_DATA_DIR"/control" + +#define GetCheckpointableUndoLog(i) \ + (AssertMacro((i) >= 0 && (i) < 2), \ + (i) == 0 ? UndoLogRegular : UndoLogSystem) + +/* + * Physical size of the orioledb_data/control file. Note that this is considerably + * bigger than the actually used size (ie, sizeof(CheckpointControl)). + * The idea is to keep the physical size constant independent of format + * changes, so that get_checkpoint_control_data will deliver a suitable wrong-version + * message instead of a read error if it's looking at an incompatible file. 
+ */ +#define CHECKPOINT_CONTROL_FILE_SIZE 8192 + +extern bool get_checkpoint_control_data(CheckpointControl *control); +extern void check_checkpoint_control(CheckpointControl *control); +extern void write_checkpoint_control(CheckpointControl *control); + +#endif /* __CHECKPOINT_CONTROL_H__ */ diff --git a/contrib/orioledb/include/indexam/handler.h b/contrib/orioledb/include/indexam/handler.h new file mode 100644 index 00000000000..32cb5269b1e --- /dev/null +++ b/contrib/orioledb/include/indexam/handler.h @@ -0,0 +1,22 @@ +/*------------------------------------------------------------------------- + * + * handler.h + * Declarations of index access method handler + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/indexam/handler.h + * + *------------------------------------------------------------------------- + */ +#ifndef __INDEXAM_HANDLER_H__ +#define __INDEXAM_HANDLER_H__ + +#include "access/amapi.h" + +extern IndexAmRoutine *orioledb_indexam_routine_hook(Oid tamoid, + Oid amhandler); + +#endif diff --git a/contrib/orioledb/include/orioledb.h b/contrib/orioledb/include/orioledb.h new file mode 100644 index 00000000000..525521919e6 --- /dev/null +++ b/contrib/orioledb/include/orioledb.h @@ -0,0 +1,584 @@ +/*------------------------------------------------------------------------- + * + * orioledb.h + * Common declarations for orioledb engine. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/orioledb.h + * + *------------------------------------------------------------------------- + */ +#ifndef __ORIOLEDB_H__ +#define __ORIOLEDB_H__ + +#include "access/nbtree.h" +#include "access/reloptions.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "common/int.h" +#include "nodes/extensible.h" +#include "miscadmin.h" +#include "port/atomics.h" +#include "storage/bufpage.h" +#include "storage/fd.h" +#include "storage/lock.h" +#include "storage/procarray.h" +#include "storage/spin.h" +#include "utils/builtins.h" +#include "utils/jsonb.h" +#include "utils/typcache.h" +#include "utils/rel.h" +#include "utils/relcache.h" + +#if defined __has_include +#if __has_include ("sanitizer/asan_interface.h") +#include "sanitizer/asan_interface.h" +#endif +#endif + +#ifndef ASAN_UNPOISON_MEMORY_REGION +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +#endif + +/* + * Currently OrioleDB has the following version-related values: + * + * Reference-only value: + * + * - ORIOLEDB_VERSION - is an external text value meaning the current release. + * + * ORIOLEDB_VERSION is output by orioledb_version() SQL function and its change doesn't mean any + * changes in the code that can introduce incompatibilities. But if values for compatibility (below) + * should be bumped, it's enough to do this once before next release. I.e. compatibility is + * neecessary between releases, not between different commits belonging to one release. + * + * Values to reflect incompatibilities and limitations: + * + * - ORIOLEDB_WAL_VERSION - Version of OrioleDB WAL format (see include/recovery/wal.h) + * + * ORIOLEDB_WAL_VERSION is used for on-the fly conversion of WAL. As WAL can be transferred between + * different clusters ORIOLEDB_WAL_VERSION compatibility is not limited to the same + * ORIOLEDB_BINARY_VERSION (see below). 
Compatibility is only one-way: if read versions are lower + * than current, WAL will be converted seamlessly when it is read. But if read versions are greater + * than current there is a difference for WAL used in recovery and WAL used for logical decoding: + * For recovery: Cluster will shut down (recovery failed) + * For logical decoding: Logical decoding will fail and throw error. Cluster will continue working. + * + * - ORIOLEDB_CHECKPOINT_CONTROL_VERSION - Version of OrioleDB control file format + * (see include/checkpoint/control.h) + * + * ORIOLEDB_CHECKPOINT_CONTROL_VERSION is intended for on-the-fly conversion of CheckpointControl + * structure. Compatibility is only one-way: if read versions are lower than current, + * CheckpointControl will be converted seamlessly. Otherwise cluster will refuse to start with + * error. Note that CheckpointControl structure is used for reading cluster's + * ORIOLEDB_BINARY_VERSION (see below). (As we have only one ORIOLEDB_CHECKPOINT_CONTROL_VERSION so far, + * only the check is implemented, no conversion yet) + * + * - ORIOLEDB_BINARY_VERSION - Clusters with different ORIOLEDB_BINARY_VERSION are binarily + * incompatible. + * + * ORIOLEDB_BINARY_VERSION of a cluster is written to a checkpoint control file. At start OrioleDB + * will check if current ORIOLEDB_BINARY_VERSION is equal to what is in the control file. Otherwise + * the cluster will refuse to start. (We don't go on to the following version checks in this case) + * + * - ORIOLEDB_SYS_TREE_VERSION - Version of OrioleDB system trees format + * - ORIOLEDB_PAGE_VERSION - Version of OrioleDB page format + * - ORIOLEDB_COMPRESS_VERSION - Version of OrioleDB page compression format + * + * These ORIOLEDB_SYS_TREE_VERSION, ORIOLEDB_PAGE_VERSION, ORIOLEDB_COMPRESS_VERSION reflect + * incompatibilities that could be converted on-the-fly. Compatibility is only one-way: if read + * versions are greater than current, cluster will shut down with error. 
If read version is lower + * than current - seamless conversion will occur at the first reading. + * + * As said above, these values are not checked if ORIOLEDB_BINARY_VERSION is different, so each of + * these values makes sense only within one ORIOLEDB_BINARY_VERSION value. + */ +#define ORIOLEDB_VERSION "OrioleDB pre-2 beta 16" +#define ORIOLEDB_BINARY_VERSION 9 /* Binary compatibility version, stored in the checkpoint control file */ +#define ORIOLEDB_SYS_TREE_VERSION 1 /* Version of system trees format */ +#define ORIOLEDB_PAGE_VERSION 1 /* Version of binary page format */ +#define ORIOLEDB_COMPRESS_VERSION 1 /* Version of page compression (only + * for compressed pages) */ + +#define ORIOLEDB_DATA_DIR "orioledb_data" +#define ORIOLEDB_UNDO_DIR "orioledb_undo" +#define ORIOLEDB_RMGR_ID (129) +#define ORIOLEDB_XLOG_CONTAINER (0x00) + +/* + * perform_page_split() removes the key data from the first right page downlink. + * But the data can be useful for debugging. This macro controls that behavior. + * + * See usage in perform_page_split(). + */ +#define ORIOLEDB_CUT_FIRST_KEY 1 +/* maximum depth of a BTree */ +#define ORIOLEDB_MAX_DEPTH 32 +/* size of an OrioleDB BTree page */ +#define ORIOLEDB_BLCKSZ 8192 +/* size of an on-disk compressed page chunk */ +#define ORIOLEDB_COMP_BLCKSZ 512 +/* size of a data file segment */ +#define ORIOLEDB_SEGMENT_SIZE (1024 * 1024 * 1024) +/* size of an S3 data file part */ +#define ORIOLEDB_S3_PART_SIZE (1024 * 1024) + +#define GetMaxBackends() MaxBackends + +/* + * Number of orioledb page. + * If high bit is set, it means the page is in the local memory. 
+ */ +typedef uint32 OInMemoryBlkno; +#define OInvalidInMemoryBlkno ((OInMemoryBlkno) 0xFFFFFFFF) /* all 32 bits set */ +#define OInMemoryBlknoIsValid(blockNumber) \ + ((bool) ((OInMemoryBlkno) (blockNumber) != OInvalidInMemoryBlkno)) +#define ORootPageIsValid(desc) (OInMemoryBlknoIsValid((desc)->rootInfo.rootPageBlkno)) +#define OMetaPageIsValid(desc) (OInMemoryBlknoIsValid((desc)->rootInfo.metaPageBlkno)) + +/* Undo log location: bit 61 set marks it invalid; the low 61 bits carry the value */ +typedef uint64 UndoLocation; +#define InvalidUndoLocation UINT64CONST(0x2000000000000000) +#define MaxUndoLocation UINT64CONST(0x1FFFFFFFFFFFFFFE) +#define UndoLocationValueMask UINT64CONST(0x1FFFFFFFFFFFFFFF) +#define UndoLocationIsValid(loc) (((loc) & InvalidUndoLocation) == 0) /* true when bit 61 is clear */ +#define UndoLocationGetValue(loc) ((loc) & UndoLocationValueMask) /* drops bit 61 and everything above it */ + +/* + * Sentinel for ODBProcData.pendingSkUndoLoc. Set by the PK btree_modify + * when no undo record was produced because the table was created in the + * current transaction (see the self-created shortcut in + * o_btree_modify_internal()). The checkpointer treats this as "wait + * until this backend leaves the PK-applied/SK-pending window" instead of + * recording a fix-up entry: a self-created table is private to the + * in-progress txn, so no other backend can stall the SK btree_modify and + * the wait is bounded. Distinguishable from InvalidUndoLocation by the + * low bit, while still failing UndoLocationIsValid(). 
+ */ +#define WaitingSkUndoLoc UINT64CONST(0x2000000000000001) + +/* Identifier for orioledb transaction */ +typedef uint64 OXid; +#define InvalidOXid UINT64CONST(0x7FFFFFFFFFFFFFFF) +#define OXidIsValid(oxid) ((oxid) != InvalidOXid) +#define LXID_NORMAL_FROM (1) + +/* Index number */ +typedef uint16 OIndexNumber; + +/* Index type */ +typedef enum +{ + oIndexInvalid = 0, + oIndexToast = 1, + oIndexBridge = 2, + oIndexPrimary = 3, + oIndexUnique = 4, + oIndexRegular = 5, + oIndexExclusion = 6, +} OIndexType; + +static inline OIndexType +o_index_rel_get_ix_type(Relation index) +{ + /* First matching rd_index flag wins: primary, then unique, then exclusion; otherwise regular. */ + if (index->rd_index->indisprimary) + return oIndexPrimary; + + if (index->rd_index->indisunique) + return oIndexUnique; + + if (index->rd_index->indisexclusion) + return oIndexExclusion; + + return oIndexRegular; +} + +#define PROC_XID_ARRAY_SIZE 32 + +typedef enum +{ + /* Invalid value. */ + UndoLogNone = -1, + + /* + * Undo log for row-level record of modifications of user data. + */ + UndoLogRegular = 0, + + /* + * Undo log for page-level record of modifications of user data. + */ + UndoLogRegularPageLevel = 1, + + /* + * Undo log for modification of system trees. + */ + UndoLogSystem = 2, + + UndoLogsCount = 3 +} UndoLogType; + +#define GET_PAGE_LEVEL_UNDO_TYPE(undoType) \ + (((undoType) == UndoLogRegular) ? 
UndoLogRegularPageLevel : (undoType)) + +typedef struct +{ + OXid oxid; + VirtualTransactionId vxid; +} XidVXidMapElement; + +typedef struct +{ + pg_atomic_uint64 location; + pg_atomic_uint64 branchLocation; + pg_atomic_uint64 subxactLocation; + pg_atomic_uint64 onCommitLocation; +} UndoStackSharedLocations; + +typedef struct +{ + pg_atomic_uint64 reservedUndoLocation; + pg_atomic_uint64 transactionUndoRetainLocation; + pg_atomic_uint64 snapshotRetainUndoLocation; +} UndoRetainSharedLocations; + +typedef struct +{ + UndoRetainSharedLocations undoRetainLocations[(int) UndoLogsCount]; + pg_atomic_uint64 commitInProgressXlogLocation; + int autonomousNestingLevel; + LWLock undoStackLocationsFlushLock; + bool flushUndoLocations; + bool waitingForOxid; + pg_atomic_uint64 xmin; + + /* + * Undo location of the most recent PK modification whose secondary-index + * counterparts are still pending. Set after the PK btree_modify and + * before the WAL write, cleared by tuple_complete_modification. Always + * refers to UndoLogRegular; the other undo types do not participate in + * PK/SK recovery fix-up. 
+ */ + pg_atomic_uint64 pendingSkUndoLoc; + UndoStackSharedLocations undoStackLocations[PROC_XID_ARRAY_SIZE][(int) UndoLogsCount]; + XidVXidMapElement vxids[PROC_XID_ARRAY_SIZE]; +} ODBProcData; + +typedef struct +{ + Oid datoid; + Oid reloid; + Oid relnode; +} ORelOids; + +typedef uint64 S3TaskLocation; + +typedef RelFileLocator RelFileNode; +#define PG_FUNCNAME_MACRO __func__ +#define ORelOidsSetFromRel(oids, rel) \ + do { \ + (oids).datoid = MyDatabaseId; \ + (oids).reloid = (rel)->rd_id; \ + (oids).relnode = (rel)->rd_locator.relNumber; \ + } while (0) +#define RelIsInMyDatabase(rel) ((rel)->rd_locator.dbOid == MyDatabaseId) +#define RelGetNode(rel) ((rel)->rd_locator) +#define RelFileNodeGetNode(node) ((node)->relNumber) +#define IndexStmtGetOldNode(stmt) ((stmt)->oldNumber) +#define RelationSetNewRelfilenode(relation, persistence) \ + RelationSetNewRelfilenumber(relation, persistence) + +#define ORelOidsIsValid(oids) (OidIsValid((oids).datoid) && OidIsValid((oids).reloid) && OidIsValid((oids).relnode)) +#define ORelOidsIsEqual(l, r) ((l).datoid == (r).datoid && (l).reloid == (r).reloid && (l).relnode == (r).relnode) +#define ORelOidsSetInvalid(oids) \ + ((oids).datoid = (oids).reloid = (oids).relnode = InvalidOid) + +#if PG_VERSION_NUM >= 170000 + +#define LXID vxid.lxid +#define REORDER_BUFFER_TUPLE_TYPE HeapTuple +/* Renaming */ +#define TRANSAM_VARIABLES TransamVariables +#define WAIT_EVENT_MQ_PUT_MESSAGE WAIT_EVENT_MESSAGE_QUEUE_PUT_MESSAGE +#define vacuum_is_relation_owner vacuum_is_permitted_for_relation +#define ResourceOwnerEnlargeCatCacheRefs ResourceOwnerEnlarge +#define ResourceOwnerEnlargeCatCacheListRefs ResourceOwnerEnlarge +/* Join BackendId and ProcNumber */ +#define BACKENDID procNumber +#define PROCBACKENDID vxid.procNumber +#define MYPROCNUMBER MyProcNumber +#define MyBackendId MyProcNumber +#define PROCNUMBER(proc) GetNumberFromPGProc(proc) +/* Deprecated */ +#define palloc0fast palloc0 + +#else + +#define LXID lxid +#define 
REORDER_BUFFER_TUPLE_TYPE ReorderBufferTupleBuf * +/* Before renaming */ +#define TRANSAM_VARIABLES ShmemVariableCache +/* BackendId and ProcNumber were separate */ +#define BACKENDID backendId +#define PROCBACKENDID backendId +#define MYPROCNUMBER MyProc->pgprocno +#define PROCNUMBER(proc) ((proc)->pgprocno) + +#endif + +typedef struct +{ + uint64 len:16, + off:48; +} FileExtent; /* len and off are 16-bit and 48-bit bit-fields declared on one uint64 */ + +/* + * Should be used as the beginning of the header in all orioledb shared in-memory pages: + * BTree pages, Meta-pages, SeqBuf pages, etc. + * + * When writing a page to disk: + * - OrioleDBPageHeader is replaced by OrioleDBOndiskPageHeader of the same size. + * - Necessary information related to checkpoint is moved to OrioleDBOndiskPageHeader. + * - All other information (related to compression and page format version) is initialized as needed. + * + * When reading a page from disk: + * - Necessary information (related to checkpoint and compression) is extracted from OrioleDBOndiskPageHeader. + * - Compression info and page format version are used for checks and decompression (if needed) + * - OrioleDBPageHeader from the decompressed page is redundant and is not used (but we check checkpointNum in it just in case) + * - Page header is replaced by an empty OrioleDBPageHeader of the same size. + * - Necessary information related to checkpoint is restored to OrioleDBPageHeader. + */ +typedef struct +{ + pg_atomic_uint64 state; + uint32 pageChangeCount; + uint32 checkpointNum; +} OrioleDBPageHeader; + +/* + * Should be used as the beginning of the header in all orioledb disk pages. + * (See the extensive comment on OrioleDBPageHeader above) + */ +typedef struct +{ + /* + * We save the number of chunks inside downlinks instead of the size of compressed + * data because it helps us to avoid setting the dirty flag for the parent too often + * if the page is changed. + * + * The header of compressed data contains compressed data length. 
+ */ + uint32 checkpointNum; /* Checkpoint number for both compressed and + * not compressed pages */ + uint16 compress_page_size; /* Reserved for compressed pages. Empty + * for non-compressed */ + uint8 compress_version; /* Reserved for compressed pages. Empty + * for non-compressed */ + + /* + * Version of binary page format for possible conversion. For compressed + * pages it should be used for conversion of uncompressed images + */ + uint8 page_version; + uint32 reserved1; + uint32 reserved2; +} OrioleDBOndiskPageHeader; + +StaticAssertDecl(sizeof(OrioleDBPageHeader) == sizeof(OrioleDBOndiskPageHeader), + "sizes of OrioleDBPageHeader and OrioleDBOndiskPageHeader are not equal"); +#define O_PAGE_HEADER_SIZE sizeof(OrioleDBPageHeader) +#define O_PAGE_HEADER(page) ((OrioleDBPageHeader *)(page)) + +#define O_PAGE_CHANGE_COUNT_MAX (0x7FFFFFFF) +#define InvalidOPageChangeCount (O_PAGE_CHANGE_COUNT_MAX) +#define O_PAGE_CHANGE_COUNT_INC(page) \ + do { /* one statement, safe inside unbraced if/else; wraps to 0 at O_PAGE_CHANGE_COUNT_MAX; evaluates 'page' more than once */ \ + O_PAGE_HEADER(page)->pageChangeCount = \ + (O_PAGE_HEADER(page)->pageChangeCount >= O_PAGE_CHANGE_COUNT_MAX) ? 0 : O_PAGE_HEADER(page)->pageChangeCount + 1; \ + } while (0) +#define O_PAGE_GET_CHANGE_COUNT(p) (O_PAGE_HEADER(p)->pageChangeCount) + +#define S3_OFFSET_MASK (0x00FFFFFFFF) +#define S3_CHKP_NUM_MASK (0xFF00000000) +#define S3_CHKP_NUM_SHIFT (32) +#define S3_GET_CHKP_NUM(offset) (((offset) & S3_CHKP_NUM_MASK) >> S3_CHKP_NUM_SHIFT) /* bits 32..39 of offset */ + +#define InvalidFileExtentLen (0) +#define InvalidFileExtentOff (UINT64CONST(0xFFFFFFFFFFFF)) +#define FileExtentLenIsValid(len) ((len) != InvalidFileExtentLen) +#define FileExtentOffIsValid(off) ((off) < InvalidFileExtentOff) +#define FileExtentIsValid(extent) (FileExtentLenIsValid((extent).len) && FileExtentOffIsValid((extent).off)) +#define CompressedSize(page_size) ((page_size) == ORIOLEDB_BLCKSZ \ + ? 
ORIOLEDB_BLCKSZ \ + : ((page_size) + sizeof(OrioleDBOndiskPageHeader) + ORIOLEDB_COMP_BLCKSZ - 1)) +#define FileExtentLen(page_size) (CompressedSize(page_size) / ORIOLEDB_COMP_BLCKSZ) /* count of ORIOLEDB_COMP_BLCKSZ chunks; the "+ ORIOLEDB_COMP_BLCKSZ - 1" in CompressedSize() makes the division round up */ + +typedef struct +{ + ORelOids oids; + int ionum; + FileExtent fileExtent; + uint32 flags:4, + type:28; /* flags and type are bit-fields declared on one uint32 */ + OInMemoryBlkno leftBlkno; +} OrioleDBPageDesc; + +/* Variables defined in orioledb.c */ +extern Size orioledb_buffers_size; +extern Size orioledb_buffers_count; +extern Size orioledb_temp_buffers_count; +extern Size undo_circular_buffer_size; +extern uint32 undo_buffers_count; +extern Size xid_circular_buffer_size; +extern Size rewind_circular_buffer_size; +extern double regular_block_undo_circular_buffer_fraction; +extern double system_undo_circular_buffer_fraction; +extern uint32 xid_buffers_count; +extern uint32 rewind_buffers_count; +extern Pointer o_shared_buffers; +extern ODBProcData *oProcData; +extern int max_procs; +extern Page *local_ppool_pages; +extern OrioleDBPageDesc *page_descs; +extern OrioleDBPageDesc *local_ppool_page_descs; +extern bool remove_old_checkpoint_files; +extern bool skip_unmodified_trees; +extern bool debug_disable_bgwriter; +extern MemoryContext btree_insert_context; +extern MemoryContext btree_seqscan_context; +extern double o_checkpoint_completion_ratio; +extern int max_io_concurrency; +extern bool use_mmap; +extern bool use_device; +extern bool orioledb_use_sparse_files; +extern int device_fd; +extern char *device_filename; +extern Pointer mmap_data; +extern Size device_length; +extern int default_compress; +extern int default_primary_compress; +extern int default_toast_compress; +extern bool orioledb_table_description_compress; +extern BlockNumber max_bridge_ctid_blkno; +extern bool orioledb_s3_mode; +extern int s3_num_workers; +extern int s3_desired_size; +extern int s3_queue_size_guc; +extern char *s3_host; +extern bool s3_use_https; +extern char *s3_region; +extern char *s3_prefix; +extern char *s3_accesskey; +extern char *s3_secretkey; +extern char *s3_cainfo; 
+extern bool enable_rewind; +extern int rewind_max_time; +extern int rewind_max_transactions; +extern int logical_xid_buffers_guc; +extern bool orioledb_strict_mode; +extern XLogRecPtr replay_until_lsn; + +/* For page eviction/read checkpoint test only */ +extern uint32 min_read_page_checkpoint; +extern uint32 max_read_page_checkpoint; + +#define GET_CUR_PROCDATA() \ + (AssertMacro(MYPROCNUMBER >= 0 && \ + MYPROCNUMBER < max_procs), \ + &oProcData[MYPROCNUMBER]) /* address of the oProcData entry indexed by MYPROCNUMBER */ +/* Mask that strips the high bit (the local-page marker) from a blkno */ +#define O_BLKNO_MASK ((OInMemoryBlkno) 0x7FFFFFFF) +#define O_PAGE_IS_LOCAL(blkno) ((blkno) >> 31 != 0) /* NB: OInvalidInMemoryBlkno (0xFFFFFFFF) has the high bit set too */ +#define O_GET_IN_MEMORY_PAGE(blkno) \ + (AssertMacro(OInMemoryBlknoIsValid(blkno)), \ + (O_PAGE_IS_LOCAL(blkno) ? local_ppool_pages[(blkno) & O_BLKNO_MASK] : \ + (Page)(o_shared_buffers + (((uint64) ((blkno) & O_BLKNO_MASK)) * ((uint64) ORIOLEDB_BLCKSZ))))) /* shared pages sit ORIOLEDB_BLCKSZ apart inside o_shared_buffers */ +#define O_GET_IN_MEMORY_PAGEDESC(blkno) \ + (AssertMacro(OInMemoryBlknoIsValid(blkno)), \ + (O_PAGE_IS_LOCAL(blkno) ? 
local_ppool_page_descs + ((blkno) & O_BLKNO_MASK) : \ + page_descs + ((blkno) & O_BLKNO_MASK))) +#define O_GET_IN_MEMORY_PAGE_CHANGE_COUNT(blkno) \ + (O_PAGE_GET_CHANGE_COUNT(O_GET_IN_MEMORY_PAGE(blkno))) + +extern void orioledb_check_shmem(void); + +typedef int OCompress; +#define O_COMPRESS_DEFAULT (10) +#define InvalidOCompress (-1) +#define OCompressIsValid(compress) ((compress) != InvalidOCompress) + +typedef struct ORelOptions +{ + StdRdOptions std_options; + int compress_offset; + int primary_compress_offset; + int toast_compress_offset; + bool index_bridging; +} ORelOptions; + +typedef struct OBTOptions +{ + BTOptions bt_options; + int compress_offset; + bool orioledb_index; +} OBTOptions; + +extern int16 o_parse_compress(const char *value); +extern void o_invalidate_oids(ORelOids oids); + +#define EXPR_ATTNUM (FirstLowInvalidHeapAttributeNumber - 1) + +/* orioledb.c */ +typedef enum OPagePoolType +{ + OPagePoolMain = 0, + OPagePoolFreeTree = 1, + OPagePoolCatalog = 2 +} OPagePoolType; +#define OPagePoolTypesCount 3 + +typedef struct PagePool PagePool; +typedef struct LocalPagePool LocalPagePool; +struct BTreeDescr; + +extern LocalPagePool local_ppool; + +extern void o_verify_dir_exists_or_create(char *dirname, bool *created, bool *found); +extern uint64 orioledb_device_alloc(struct BTreeDescr *descr, uint32 size); +extern PagePool *get_ppool(OPagePoolType type); +extern PagePool *get_ppool_by_blkno(OInMemoryBlkno blkno); +extern OInMemoryBlkno get_dirty_pages_count_sum(void); +extern void jsonb_push_key(JsonbParseState **state, char *key); +extern void jsonb_push_null_key(JsonbParseState **state, char *key); +extern void jsonb_push_bool_key(JsonbParseState **state, char *key, bool value); +extern void jsonb_push_int8_key(JsonbParseState **state, char *key, int64 value); +extern void jsonb_push_string_key(JsonbParseState **state, const char *key, const char *value); +extern bool is_bump_memory_context(MemoryContext mxct); +extern void 
o_page_desc_init(OrioleDBPageDesc *desc); + +extern CheckPoint_hook_type next_CheckPoint_hook; + +/* tableam_handler.c */ +extern bool is_orioledb_rel(Relation rel); + +typedef struct OTableDescr OTableDescr; +typedef struct OIndexDescr OIndexDescr; + +/* ddl.c */ +extern List *reindex_list; +extern IndexBuildResult o_pkey_result; +extern bool o_in_add_column; + +extern void orioledb_setup_ddl_hooks(void); +extern void o_ddl_cleanup(void); +extern void o_drop_table(ORelOids oids); + +/* scan.c */ +extern CustomScanMethods o_scan_methods; + +#endif /* __ORIOLEDB_H__ */ diff --git a/contrib/orioledb/include/recovery/internal.h b/contrib/orioledb/include/recovery/internal.h new file mode 100644 index 00000000000..e6b67b0d1a8 --- /dev/null +++ b/contrib/orioledb/include/recovery/internal.h @@ -0,0 +1,261 @@ +/*------------------------------------------------------------------------- + * + * internal.h + * Internal declarations for orioledb engine recovery. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/recovery/internal.h + * + *------------------------------------------------------------------------- + */ +#ifndef __RECOVERY_INTERNAL_H__ +#define __RECOVERY_INTERNAL_H__ + +#include "postgres.h" + +#include "orioledb.h" + +#include "postmaster/bgworker.h" + +/* + * Recovery transaction support functions. 
+ */ +extern void recovery_init(int worker_id); +extern void recovery_switch_to_oxid(OXid oxid, int worker_id); +extern void recovery_finish_current_oxid(CommitSeqNo csn, XLogRecPtr ptr, + int worker_id, bool sync); +extern void recovery_savepoint(SubTransactionId parentSubid, int worker_id); +extern void recovery_rollback_to_savepoint(SubTransactionId parentSubid, int worker_id); +extern void recovery_finish(int worker_id); +extern void update_recovery_undo_loc_flush(bool single, int worker_id); +extern void recovery_on_proc_exit(int code, Datum arg); + +extern Pointer recovery_first_queue; +extern uint64 recovery_queue_data_size; + +#define GET_WORKER_QUEUE(worker_id) ((void*)(recovery_first_queue \ + + recovery_queue_data_size * (worker_id))) /* queues start at recovery_first_queue, recovery_queue_data_size apart */ +#define GET_WORKER_ID(hash) ((hash) % recovery_pool_size_guc) /* recovery_pool_size_guc is declared in recovery/recovery.h */ + +/* + * Format of the recovery messages sent from the master to the workers. + */ +#define RECOVERY_MSG_OPERATION_MASK (0xFF) /* presumably selects the RecoveryMsgType part of RecoveryMsgHeader.type - TODO confirm */ + +typedef enum +{ + RecoveryMsgTypeInsert, + RecoveryMsgTypeUpdate, + RecoveryMsgTypeDelete, + RecoveryMsgTypeBridgeErase, + RecoveryMsgTypeCommit, + RecoveryMsgTypeRollback, + RecoveryMsgTypeFinished, + RecoveryMsgTypeSynchronize, + RecoveryMsgTypeToastConsistent, + RecoveryMsgTypeSavepoint, + RecoveryMsgTypeRollbackToSavepointt, /* NOTE(review): the doubled "t" looks like a typo; renaming means updating every user */ + RecoveryMsgTypeLeaderParallelIndexBuild, + RecoveryMsgTypeWorkerParallelIndexBuild, + RecoveryMsgTypeInit, + RecoveryMsgTypeReinsert +} RecoveryMsgType; + +#define RECOVERY_MODIFY_OXID (0x0100) /* flag bit above RECOVERY_MSG_OPERATION_MASK */ +#define RECOVERY_MODIFY_OIDS (0x0200) /* flag bit above RECOVERY_MSG_OPERATION_MASK */ + +#define RECOVERY_QUEUE_BUF_SIZE (8 * 1024) + + +typedef struct +{ + uint32 type; +} RecoveryMsgHeader; + +typedef struct +{ + RecoveryMsgHeader header; + bool needsFeedback; + OXid oxid; + XLogRecPtr ptr; +} RecoveryMsgOXidPtr; + +typedef struct +{ + RecoveryMsgHeader header; + XLogRecPtr ptr; +} RecoveryMsgPtr; + +typedef struct +{ + RecoveryMsgHeader header; + Size o_table_size; + char o_table_serialized[FLEXIBLE_ARRAY_MEMBER]; +} RecoveryMsgIdxBuild; + +typedef struct +{ + RecoveryMsgHeader 
header; + ORelOids oids; + ORelOids old_oids; + OIndexNumber ix_num; + uint32 o_table_version; + uint32 old_o_table_version; + uint64 current_position; + bool isrebuild; + OXid oxid; +} RecoveryMsgLeaderIdxBuild; + +typedef struct +{ + RecoveryMsgHeader header; + OXid oxid; + dsm_handle seg_handle; +} RecoveryMsgWorkerIdxBuild; + +typedef struct +{ + RecoveryMsgHeader header; +} RecoveryMsgEmpty; + +typedef struct +{ + uint32 finishRequestCheckpointNumber; + uint32 immediateRequestCheckpointNumber; + uint32 completedCheckpointNumber; + uint32 recoveryMainCompletedCheckpointNumber; + slock_t exitLock; +} RecoveryUndoLocFlush; + +typedef struct +{ + pg_atomic_uint64 commitPtr; + pg_atomic_uint64 retainPtr; + uint32 flushedUndoLocCompletedCheckpointNumber; + pg_atomic_flag hasTempFile; +} RecoveryWorkerPtrs; + +typedef struct +{ + RecoveryMsgHeader header; + OXid oxid; + SubTransactionId parentSubId; +} RecoveryMsgSavepoint; + +typedef struct +{ + RecoveryMsgHeader header; + OXid oxid; + XLogRecPtr ptr; + SubTransactionId parentSubId; +} RecoveryMsgRollbackToSavepoint; + +typedef struct ParallelRecoveryContext +{ + int nworkers; /* Number of recovery workers except a leader */ + shm_toc_estimator estimator; + dsm_segment *seg; + void *private_memory; + shm_toc *toc; +} ParallelRecoveryContext; + +#define O_PARALLEL_RECOVERY_MAGIC 0xD42E9F13 + +extern bool toast_consistent; +extern pg_atomic_uint32 *worker_finish_count; +extern pg_atomic_uint32 *idx_worker_finish_count; +extern pg_atomic_uint32 *worker_ptrs_changes; +extern RecoveryWorkerPtrs *worker_ptrs; +extern pg_atomic_uint64 *recovery_ptr; +extern pg_atomic_uint64 *recovery_main_retain_ptr; +extern bool *recovery_single_process; +extern RecoveryUndoLocFlush *recovery_undo_loc_flush; + +extern bool *was_in_recovery; +extern pg_atomic_uint32 *after_recovery_cleaned; + +extern pg_atomic_uint64 *recovery_index_next_pos; +extern pg_atomic_uint64 *recovery_index_completed_pos; +extern ConditionVariable 
*recovery_index_cv; + +/* + * Recovery master/workers functions. + */ +extern BackgroundWorkerHandle *recovery_worker_register(int worker_id); +PGDLLEXPORT void recovery_worker_main(Datum main_arg); + +/* + * Functions to work with parallel recovery contexts. + */ +extern ParallelRecoveryContext *CreateParallelRecoveryContext(int nworkers); +extern void InitializeParallelRecoveryDSM(ParallelRecoveryContext *context); +extern void DestroyParallelRecoveryContext(ParallelRecoveryContext *context); + +/* + * Recovery utility. + */ +extern void apply_modify_record(OTableDescr *descr, OIndexDescr *id, + uint16 type, OTuple p); +extern bool apply_btree_modify_record(BTreeDescr *tree, + RecoveryMsgType type, + OTuple ptr, OXid oxid, CommitSeqNo csn); +extern void replay_erase_bridge_item(OIndexDescr *bridge, ItemPointer iptr); + +extern OBTreeModifyCallbackAction recovery_insert_primary_callback(BTreeDescr *descr, + OTuple tup, OTuple *newtup, + OXid oxid, OTupleXactInfo xactInfo, + UndoLocation location, RowLockMode *lock_mode, + BTreeLocationHint *hint, + void *arg); +extern OBTreeModifyCallbackAction recovery_delete_primary_callback(BTreeDescr *descr, + OTuple tup, OTuple *newtup, + OXid oxid, OTupleXactInfo xactInfo, + UndoLocation location, RowLockMode *lock_mode, + BTreeLocationHint *hint, + void *arg); +extern OBTreeModifyCallbackAction recovery_insert_overwrite_callback(BTreeDescr *descr, + OTuple tup, OTuple *newtup, + OXid oxid, OTupleXactInfo xactInfo, + UndoLocation location, RowLockMode *lock_mode, + BTreeLocationHint *hint, + void *arg); +extern OBTreeModifyCallbackAction recovery_delete_overwrite_callback(BTreeDescr *descr, + OTuple tup, OTuple *newtup, + OXid oxid, OTupleXactInfo xactInfo, + UndoLocation location, RowLockMode *lock_mode, + BTreeLocationHint *hint, + void *arg); + +extern OBTreeModifyCallbackAction recovery_insert_deleted_primary_callback(BTreeDescr *descr, + OTuple tup, OTuple *newtup, + OXid oxid, OTupleXactInfo xactInfo, + 
BTreeLeafTupleDeletedStatus deleted, + UndoLocation location, RowLockMode *lock_mode, + BTreeLocationHint *hint, + void *arg); +extern OBTreeModifyCallbackAction recovery_delete_deleted_primary_callback(BTreeDescr *descr, + OTuple tup, OTuple *newtup, + OXid oxid, OTupleXactInfo xactInfo, + BTreeLeafTupleDeletedStatus deleted, + UndoLocation location, RowLockMode *lock_mode, + BTreeLocationHint *hint, + void *arg); +extern OBTreeModifyCallbackAction recovery_insert_deleted_overwrite_callback(BTreeDescr *descr, + OTuple tup, OTuple *newtup, + OXid oxid, OTupleXactInfo xactInfo, + BTreeLeafTupleDeletedStatus deleted, + UndoLocation location, RowLockMode *lock_mode, + BTreeLocationHint *hint, + void *arg); +extern OBTreeModifyCallbackAction recovery_delete_deleted_overwrite_callback(BTreeDescr *descr, + OTuple tup, OTuple *newtup, + OXid oxid, OTupleXactInfo xactInfo, + BTreeLeafTupleDeletedStatus deleted, + UndoLocation location, RowLockMode *lock_mode, + BTreeLocationHint *hint, + void *arg); + +#endif /* __RECOVERY_INTERNAL_H__ */ diff --git a/contrib/orioledb/include/recovery/logical.h b/contrib/orioledb/include/recovery/logical.h new file mode 100644 index 00000000000..68ddee5605d --- /dev/null +++ b/contrib/orioledb/include/recovery/logical.h @@ -0,0 +1,26 @@ +/*------------------------------------------------------------------------- + * + * logical.h + * External declarations for logical decoding of OrioleDB tables. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/recovery/logical.h + * + *------------------------------------------------------------------------- + */ +#ifndef __LOGICAL_H__ +#define __LOGICAL_H__ + +#include "btree/btree.h" +#include "recovery/internal.h" + +#include "replication/decode.h" +#include "replication/logical.h" + +extern void orioledb_decode(LogicalDecodingContext *ctx, + XLogRecordBuffer *buf); + +#endif /* __LOGICAL_H__ */ diff --git a/contrib/orioledb/include/recovery/recovery.h b/contrib/orioledb/include/recovery/recovery.h new file mode 100644 index 00000000000..284515baa83 --- /dev/null +++ b/contrib/orioledb/include/recovery/recovery.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * recovery.h + * External declarations for orioledb engine recovery. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/recovery/recovery.h + * + *------------------------------------------------------------------------- + */ +#ifndef __RECOVERY_H__ +#define __RECOVERY_H__ + +#include "btree/btree.h" +#include "recovery/internal.h" +#include "btree/page_contents.h" + +extern void o_recovery_start_hook(void); +extern void orioledb_redo(XLogReaderState *record); +extern void o_xact_redo_hook(TransactionId xid, XLogRecPtr lsn, bool commit); +extern void o_recovery_finish_hook(bool cleanup); + +extern Size recovery_shmem_needs(void); +extern void recovery_shmem_init(Pointer ptr, bool found); +extern bool is_recovery_process(void); +extern CommitSeqNo recovery_map_oxid_csn(OXid oxid, bool *found); +extern void worker_send_msg(int worker_id, Pointer msg, uint64 msg_size); +extern void worker_queue_flush(int worker_id); +extern void idx_workers_shutdown(void); +extern void recovery_send_leader_oids(ORelOids oids, OIndexNumber ix_num, + uint32 o_table_version, + ORelOids old_oids, uint32 old_o_table_version, + bool isrebuild); 
+extern void recovery_send_worker_oids(dsm_handle seg_handle); +extern void workers_send_finish(bool send_to_idx_pool); +extern void update_proc_retain_undo_location(int worker_id); + +static inline bool +is_recovery_in_progress(void) +{ + return is_recovery_process() || RecoveryInProgress(); +} + +extern XLogRecPtr recovery_get_current_ptr(void); +extern XLogRecPtr recovery_get_effective_replay_ptr(void); + +extern bool orioledb_recovery_stops_before_hook(XLogReaderState *record, + TransactionId *recordXid, + TimestampTz *recordXtime); + +extern int recovery_queue_size_guc; +extern int recovery_pool_size_guc; +extern int recovery_idx_pool_size_guc; +extern OXid recovery_oxid; +extern TransactionId recoveryHeapTransactionId; +extern pg_atomic_uint64 *recovery_finished_list_ptr; + +typedef struct BTreeDescr BTreeDescr; + +extern OTuple recovery_rec_insert(BTreeDescr *desc, OTuple tuple, bool *allocated, int *size); +extern OTuple recovery_rec_update(BTreeDescr *desc, OTuple tuple, bool *allocated, int *size); +extern OTuple recovery_rec_delete(BTreeDescr *desc, OTuple tuple, bool *allocated, int *size, char relreplident); +extern OTuple recovery_rec_delete_key(BTreeDescr *desc, OTuple key, bool *allocated, int *size); + +extern void recovery_cleanup_old_files(uint32 max_chkp_num, + bool before_recovery); + +extern void recovery_load_state_from_file(int worker_id, uint32 chkpnum, bool shutdown); +extern bool check_recovery_workers_finished(void); + +#endif /* __RECOVERY_H__ */ diff --git a/contrib/orioledb/include/recovery/wal.h b/contrib/orioledb/include/recovery/wal.h new file mode 100644 index 00000000000..0b3686bee55 --- /dev/null +++ b/contrib/orioledb/include/recovery/wal.h @@ -0,0 +1,230 @@ +/*------------------------------------------------------------------------- + * + * wal.h + * WAL declarations for orioledb. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/recovery/wal.h + * + *------------------------------------------------------------------------- + */ +#ifndef __WAL_H__ +#define __WAL_H__ + +#define WAL_REC_NONE (0) + +#define WAL_CONTAINER_HAS_XACT_INFO (1U << 0) /* container flag bit 0 */ +#define WAL_CONTAINER_HAS_ORIGIN_INFO (1U << 1) /* container flag bit 1 */ + +/* + * Current WAL version of OrioleDB. + * Bump it when the WAL format changes compared to the previous release. + * ORIOLEDB_WAL_VERSION makes sense and should be converted even between + * different ORIOLEDB_BINARY_VERSION values. This is unlike + * ORIOLEDB_SYS_TREE_VERSION, ORIOLEDB_PAGE_VERSION and + * ORIOLEDB_COMPRESS_VERSION (see big comment on versioning + * in include/orioledb.h) + */ +#define ORIOLEDB_WAL_VERSION (17) + +/* + * Value has been fixed at the moment of introducing WAL versioning. + * WAL versions before FIRST_ORIOLEDB_WAL_VERSION are treated as zero + * and still supported for on-the-fly conversion to the current + * ORIOLEDB_WAL_VERSION. (Exact value follows the fact that before + * WAL version was introduced in the beginning of the WAL container, + * WAL container started from rec_type byte with at most 4 lower bits + * occupied. So it's a way to distinguish pre-WAL version container + * from the container with WAL version.) + * + * We should never change this value. + */ +#define FIRST_ORIOLEDB_WAL_VERSION (16) + +/* + * Particular WAL version when per-container flags were added to WAL container. + * + * We should never change this value. + */ +#define ORIOLEDB_CONTAINER_FLAGS_WAL_VERSION (17) + +/* Constants for commitInProgressXlogLocation */ +#define OWalTmpCommitPos (0) +#define OWalInvalidCommitPos UINT64_MAX + + +/* + * Data structures for transactions in-progress recording. 
+ */ +typedef struct +{ + uint8 recType; +} WALRec; /* bare record: the type byte only */ + +typedef struct +{ + uint8 recType; + uint8 oxid[sizeof(OXid)]; /* multi-byte values are declared as uint8 arrays sized with sizeof(type) */ + uint8 logicalXid[sizeof(TransactionId)]; + /* Since ORIOLEDB_WAL_VERSION = 17 */ + uint8 heapXid[sizeof(TransactionId)]; +} WALRecXid; + +typedef struct +{ + uint8 recType; + uint8 topXid[sizeof(TransactionId)]; + uint8 subXid[sizeof(TransactionId)]; +} WALRecSwitchLogicalXid; + +typedef struct +{ + uint8 recType; + uint8 treeType; + uint8 datoid[sizeof(Oid)]; + uint8 reloid[sizeof(Oid)]; + uint8 relnode[sizeof(Oid)]; + /* Since ORIOLEDB_WAL_VERSION = 17 */ + uint8 xmin[sizeof(OXid)]; + uint8 csn[sizeof(CommitSeqNo)]; + uint8 cid[sizeof(CommandId)]; + uint8 version[sizeof(uint32)]; + uint8 baseVersion[sizeof(uint32)]; +} WALRecRelation; + +typedef struct +{ + uint8 recType; + uint8 relreplident; + uint8 relreplident_ix_oid[sizeof(Oid)]; +} WALRecRelReplident; + +typedef struct +{ + uint8 recType; + uint8 datoid[sizeof(Oid)]; + uint8 reloid[sizeof(Oid)]; + uint8 old_relnode[sizeof(Oid)]; + uint8 new_relnode[sizeof(Oid)]; +} WALRecOTablesUnlockMeta; + +/* Modify record that contains one tuple */ +typedef struct +{ + uint8 recType; + uint8 tupleFormatFlags; + uint8 length[sizeof(OffsetNumber)]; + /* tuple[length] */ +} WALRecModify1; + +/* Modify record that contains 2 tuples, old and new. 
Needed for REINSERT and for REPLICA IDENTITY FULL */ +typedef struct +{ + uint8 recType; + uint8 tupleFormatFlags1; + uint8 tupleFormatFlags2; + uint8 length1[sizeof(OffsetNumber)]; + uint8 length2[sizeof(OffsetNumber)]; + /* tuple1[length1] */ + /* tuple2[length2] */ +} WALRecModify2; + +typedef struct +{ + uint8 recType; + uint8 parentSubid[sizeof(SubTransactionId)]; + uint8 logicalXid[sizeof(TransactionId)]; + uint8 parentLogicalXid[sizeof(TransactionId)]; +} WALRecSavepoint; + +typedef struct +{ + uint8 recType; + uint8 parentSubid[sizeof(SubTransactionId)]; + uint8 xmin[sizeof(OXid)]; /* Since ORIOLEDB_WAL_VERSION = 17 */ + uint8 csn[sizeof(CommitSeqNo)]; /* Since ORIOLEDB_WAL_VERSION = 17 */ +} WALRecRollbackToSavepoint; + +typedef struct +{ + uint8 recType; + uint8 xid[sizeof(TransactionId)]; + uint8 xmin[sizeof(OXid)]; + uint8 csn[sizeof(CommitSeqNo)]; +} WALRecJointCommit; + +typedef struct +{ + uint8 recType; + uint8 xmin[sizeof(OXid)]; + uint8 csn[sizeof(CommitSeqNo)]; +} WALRecFinish; + +typedef struct +{ + uint8 recType; + uint8 datoid[sizeof(Oid)]; + uint8 reloid[sizeof(Oid)]; + uint8 relnode[sizeof(Oid)]; +} WALRecTruncate; + +typedef struct +{ + uint8 recType; + uint8 iptr[sizeof(ItemPointerData)]; +} WALRecBridgeErase; + +typedef struct +{ + uint8 xactTime[sizeof(TimestampTz)]; + uint8 xid[sizeof(TransactionId)]; +} WALRecXactInfo; + +typedef struct +{ + uint8 origin_id[sizeof(RepOriginId)]; + uint8 origin_lsn[sizeof(XLogRecPtr)]; +} WALRecOriginInfo; + + +#define LOCAL_WAL_BUFFER_SIZE (8192) +#define ORIOLEDB_WAL_PREFIX "o_wal" +#define ORIOLEDB_WAL_PREFIX_SIZE (5) + +extern const char *wal_record_type_to_string(int wal_record); + +extern void add_rel_wal_record(ORelOids oids, OIndexType type, uint32 version, uint32 base_version); + +extern void add_modify_wal_record(uint8 rec_type, BTreeDescr *desc, + OTuple tuple, OffsetNumber length, char relreplident, uint32 version, uint32 base_version); +extern void add_bridge_erase_wal_record(BTreeDescr 
*desc, ItemPointer iptr, uint32 version, uint32 base_version); +extern void add_o_tables_meta_lock_wal_record(void); +extern void add_o_tables_meta_unlock_wal_record(ORelOids oids, Oid oldRelnode); +extern void add_switch_logical_xid_wal_record(TransactionId logicalXid_top, TransactionId logicalXid_sub); +/* Parameter names mirror the parentSubid / parentLogicalXid fields of WALRecSavepoint. */ +extern void add_savepoint_wal_record(SubTransactionId parentSubid, + TransactionId parentLogicalXid); +extern void add_rollback_to_savepoint_wal_record(SubTransactionId parentSubid); +extern bool local_wal_is_empty(void); +extern XLogRecPtr flush_local_wal(bool isCommit, bool withXactTime); +extern XLogRecPtr wal_commit(OXid oxid, TransactionId logicalXid, + bool isAutonomous); +extern XLogRecPtr wal_joint_commit(OXid oxid, TransactionId logicalXid, + TransactionId xid, bool subTransaction); +extern void wal_after_commit(void); +extern void wal_rollback(OXid oxid, TransactionId logicalXid, + bool isAutonomous); +extern XLogRecPtr log_logical_wal_container(Pointer ptr, int length, + bool withXactTime); +extern void o_wal_insert(BTreeDescr *desc, OTuple tuple, char relreplident, uint32 version); +extern void o_wal_update(BTreeDescr *desc, OTuple tuple, OTuple oldtuple, char relreplident, uint32 version); +extern void o_wal_delete(BTreeDescr *desc, OTuple tuple, char relreplident, uint32 version); +extern void o_wal_delete_key(BTreeDescr *desc, OTuple key, bool is_bridge_index, uint32 version); +extern void o_wal_reinsert(BTreeDescr *desc, OTuple oldtuple, OTuple newtuple, char relreplident, uint32 version); +extern void add_truncate_wal_record(ORelOids oids); +extern bool get_local_wal_has_material_changes(void); +extern void set_local_wal_has_material_changes(bool value); + +#endif /* __WAL_H__ */ diff --git a/contrib/orioledb/include/recovery/wal_reader.h b/contrib/orioledb/include/recovery/wal_reader.h new file mode 100644 index 00000000000..7b2c789dc24 --- /dev/null +++ b/contrib/orioledb/include/recovery/wal_reader.h @@ -0,0 +1,250 @@
+/*------------------------------------------------------------------------- + * + * wal_reader.h + * WAL parser declarations for OrioleDB. + * + * Copyright (c) 2026, Oriole DB Inc. + * Copyright (c) 2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/recovery/wal_reader.h + * + *------------------------------------------------------------------------- + */ +#ifndef __WAL_READER_H__ +#define __WAL_READER_H__ + +#include "recovery/wal_record.h" + +/* + * WalRecord instances are transient and reused across iterations. + * + * Callers must not retain pointers to the record itself. + * Any data that must outlive the callback must be copied. + */ +typedef struct WalRecord +{ + WalRecordType type; + + uint32 offset; + Pointer data; + + ORelOids oids; + OXid oxid; + TransactionId logicalXid; + TransactionId heapXid; + char relreplident; + + union + { + struct + { + OXid xmin; + CommitSeqNo csn; + } finish; + struct + { + TransactionId topXid; + TransactionId subXid; + } swxid; + struct + { + TransactionId xid; + OXid xmin; + CommitSeqNo csn; + } joint_commit; + struct + { + uint8 treeType; + OSnapshot snapshot; + uint32 version; + uint32 base_version; + } relation; + struct + { + Oid relreplident_ix_oid; + } relreplident; + struct + { + ORelOids oids; + Oid oldRelnode; + } unlock; + struct + { + ORelOids oids; + } truncate; + struct + { + SubTransactionId parentSubid; + TransactionId parentLogicalXid; + } savepoint; + struct + { + SubTransactionId parentSubid; + OXid xmin; + CommitSeqNo csn; + } rb_to_sp; + struct + { + ItemPointerData iptr; + } bridge_erase; + + struct + { + OTuple t1; + OffsetNumber len1; + + OTuple t2; + OffsetNumber len2; + + bool read_two_tuples; + } modify; + } u; + +} WalRecord; + +typedef struct WalContainer +{ + uint16 version; + uint8 flags; + + struct + { + TimestampTz xactTime; + TransactionId xid; + } xact_info; + + struct + { + RepOriginId id; + XLogRecPtr lsn; + } origin_info; +} WalContainer; + +/* + * WalParseResult + 
* + * Status codes returned by the WAL container parser and callbacks. + * + * WALPARSE_OK + * Success. + * + * WALPARSE_STOP + * Terminate parser. + * + * WALPARSE_EOF + * Not enough bytes in the input buffer to parse the requested element. + * This is a "need more data" / framing error depending on the caller. + * + * WALPARSE_BAD_TYPE + * Unknown record/flag type, missing descriptor, or otherwise + * unparseable tag. Treated as a hard protocol error. + * + * WALPARSE_BAD_VERSION + * Container version policy rejected by the consumer (typically WAL from + * a newer or unsupported OrioleDB version). + */ +typedef enum WalParseResult +{ + WALPARSE_OK = 0, + WALPARSE_STOP, + WALPARSE_EOF, /* not enough bytes */ + WALPARSE_BAD_TYPE, + WALPARSE_BAD_VERSION +} WalParseResult; + +struct WalReaderState; + +/* + * WalCheckVersionFn + * + * Optional consumer-level version policy check. + * + * Called after container framing has been validated, but before any + * records are delivered. Allows recovery/decoder code to reject WAL + * based on semantic compatibility rules. + */ +typedef WalParseResult (*WalCheckVersionFn) (const struct WalReaderState *r); +typedef WalParseResult (*WalOnContainerFn) (struct WalReaderState *r); +typedef WalParseResult (*WalOnRecordFn) (struct WalReaderState *r, WalRecord *rec); + +/* + * Cursor advancement invariant: + * + * r->ptr must only be advanced by: + * + * - WR_PARSE / WR_SKIP, + * - record parse routines, + * - container flag parsers. + * + * Consumers must never modify ptr. + */ +typedef struct WalReaderState +{ + Pointer start; + Pointer end; + Pointer ptr; + WalContainer container; + + /* Consumer */ + void *ctx; + WalCheckVersionFn check_version; + WalOnContainerFn on_container; + WalOnRecordFn on_record; +} WalReaderState; + +/* + * WalParseFn + * + * Parser routine for a single record type. 
+ * + * The parser must: + * - read exactly this record's payload from r->ptr, + * - populate rec->u.* fields as needed, + * - leave r->ptr positioned at the next record tag. + * + * It must not read beyond r->end; use WR_REQUIRE_SIZE / WR_PARSE / WR_SKIP. + */ +typedef WalParseResult (*WalParseFn) (WalReaderState *r, WalRecord *rec); + +/* + * Reader helpers. + * + * WR_REQUIRE_SIZE() + * Ensures that at least nbytes remain in the input buffer. + * + * WR_PARSE() + * Copies sizeof(*out) bytes from r->ptr into *out and advances r->ptr. + * + * WR_SKIP() + * Advances r->ptr by sz bytes after bounds check. + * + * These macros are the preferred way to move the cursor. Direct arithmetic + * on r->ptr should be avoided outside of low-level parsing code. + * + * All three macros execute "return WALPARSE_EOF" in the calling function + * when the buffer is too short, so they may only be used in functions + * returning WalParseResult. Each expands to a single do { } while (0) + * statement (terminate the invocation with a semicolon) and evaluates + * its arguments more than once, so arguments must not have side effects. + */ + +#define WR_REQUIRE_SIZE(r, nbytes) \ +do { \ + if (((size_t) ((r)->end - (r)->ptr)) < (size_t)(nbytes)) \ + return WALPARSE_EOF; \ +} while (0) + +#define WR_PARSE(r, out) \ +do { \ + WR_REQUIRE_SIZE((r), sizeof(*(out))); \ + memcpy((out), (r)->ptr, sizeof(*(out))); \ + (r)->ptr += sizeof(*(out)); \ +} while (0) + +#define WR_SKIP(r, sz) \ +do { \ + WR_REQUIRE_SIZE((r), (sz)); \ + (r)->ptr += (sz); \ +} while (0) + +extern void build_fixed_tuples(const WalRecord *rec, OFixedTuple *tuple1, OFixedTuple *tuple2); + +extern const char *wal_type_name(WalRecordType type); +extern WalParseResult wal_parse_container(WalReaderState *r, bool allow_logging); + +#endif /* __WAL_READER_H__ */ diff --git a/contrib/orioledb/include/recovery/wal_record.h b/contrib/orioledb/include/recovery/wal_record.h new file mode 100644 index 00000000000..8e181c61431 --- /dev/null +++ b/contrib/orioledb/include/recovery/wal_record.h @@ -0,0 +1,69 @@ +/*------------------------------------------------------------------------- + * + * wal_record.h + * WAL records list. The exact representation of a WAL record is determined + * by a X-Macro, which is not defined in this file; it can be defined by the + * caller for special purposes. + * + * Copyright (c) 2026, Oriole DB Inc.
+ * Copyright (c) 2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/recovery/wal_record.h + * + *------------------------------------------------------------------------- + */ + +#ifndef __WAL_RECORD_H__ +#define __WAL_RECORD_H__ + +/* + * WAL record types + * + * WAL_REC_REINSERT: + * UPDATE with a changed pkey is represented as DELETE + INSERT in OrioleDB + * but externally exported as an UPDATE in logical decoding + * + * WAL_REC_SWITCH_LOGICAL_XID: + * Record type for a case when both heap and Oriole apply changes within a + * single transaction, so one logical xid is assigned by heap, and the other + * is assigned by Oriole. + * This record defines a connection between Oriole's sub-transaction xid and + * the xid of the top heap transaction, which is needed by the logical decoder. + * Otherwise, without this connection, the main transaction is + * split into two independent parts. + * From the logical decoder's point of view this looks like two independent + * transactions that are in fact internally related to each other. This + * causes trouble for the logical decoder with the visibility of heap + * modifications in Oriole's sub-part due to an incorrect state of the + * MVCC-historical snapshot.
+ */ + +/* + * X-macro table of OrioleDB WAL record types. Each row has the form + * X(sym, val, name, fn): the enum symbol, its numeric value, a short string + * name, and an identifier that presumably names the parse routine for the + * record type -- TODO confirm against the WAL reader implementation. + * Expand the table by passing a caller-defined X(); rows must stay joined + * by line continuations. + */ +#define ORIOLE_WAL_RECORDS(X) \ + X(WAL_REC_XID, 1, "XID", wal_parse_rec_xid) \ + X(WAL_REC_COMMIT, 2, "COMMIT", wal_parse_rec_finish) \ + X(WAL_REC_ROLLBACK, 3, "ROLLBACK", wal_parse_rec_finish) \ + X(WAL_REC_RELATION, 4, "RELATION", wal_parse_rec_relation) \ + X(WAL_REC_INSERT, 5, "INSERT", wal_parse_rec_modify) \ + X(WAL_REC_UPDATE, 6, "UPDATE", wal_parse_rec_modify) \ + X(WAL_REC_DELETE, 7, "DELETE", wal_parse_rec_modify) \ + X(WAL_REC_O_TABLES_META_LOCK, 8, "META_LOCK", wal_parse_empty) \ + X(WAL_REC_O_TABLES_META_UNLOCK, 9, "META_UNLOCK", wal_parse_rec_o_tables_meta_unlock) \ + X(WAL_REC_SAVEPOINT, 10, "SAVEPOINT", wal_parse_rec_savepoint) \ + X(WAL_REC_ROLLBACK_TO_SAVEPOINT,11, "RB_TO_SP", wal_parse_rec_rollback_to_savepoint) \ + X(WAL_REC_JOINT_COMMIT, 12, "JOINT_COMMIT", wal_parse_rec_joint_commit) \ + X(WAL_REC_TRUNCATE, 13, "TRUNCATE", wal_parse_rec_truncate) \ + X(WAL_REC_BRIDGE_ERASE, 14, "BRIDGE_ERASE", wal_parse_rec_bridge_erase) \ + X(WAL_REC_REINSERT, 15, "REINSERT", wal_parse_rec_modify) \ + X(WAL_REC_REPLAY_FEEDBACK, 16, "REPLAY_FEEDBACK", wal_parse_empty) \ + X(WAL_REC_SWITCH_LOGICAL_XID, 17, "SWITCH_LOGICAL_XID", wal_parse_rec_switch_logical_xid) \ + X(WAL_REC_RELREPLIDENT, 18, "RELREPLIDENT", wal_parse_rec_relreplident) + +/* + * Enum of WAL record types generated from ORIOLE_WAL_RECORDS: only the + * sym and val columns are used, giving each symbol its explicit value. + */ +typedef enum +{ +#define X(sym, val, name, fn) sym = (val), + ORIOLE_WAL_RECORDS(X) +#undef X +} WalRecordType; + +#endif /* __WAL_RECORD_H__ */ diff --git a/contrib/orioledb/include/rewind/rewind.h b/contrib/orioledb/include/rewind/rewind.h new file mode 100644 index 00000000000..6ade9a73cf3 --- /dev/null +++ b/contrib/orioledb/include/rewind/rewind.h @@ -0,0 +1,197 @@ +/*------------------------------------------------------------------------- + * + * rewind.h + * Routines for rewind worker. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc.
+ * + * IDENTIFICATION + * contrib/orioledb/include/rewind/rewind.h + * + *------------------------------------------------------------------------- + */ +#ifndef __REWIND_WORKER_H__ +#define __REWIND_WORKER_H__ + +#include "c.h" +#include "utils/o_buffers.h" + +#include "access/transam.h" + +#define REWIND_FILE_SIZE (0x1000000) +#define REWIND_BUFFERS_TAG (0) + +extern TransactionId orioledb_vacuum_horizon_hook(void); +extern void register_rewind_worker(void); +extern bool is_rewind_worker(void); +PGDLLEXPORT void rewind_worker_main(Datum); +extern Size rewind_shmem_needs(void); +extern void rewind_init_shmem(Pointer buf, bool found); +extern void checkpoint_write_rewind_xids(void); +extern void add_to_rewind_buffer(OXid oxid, TransactionId xid, int nsubxids, TransactionId *subxids); +extern void save_precommit_xid_subxids(void); +extern TransactionId get_precommit_xid_subxids(int *nsubxids, TransactionId **subxids); +extern void reset_precommit_xid_subxids(void); +extern OXid get_rewind_run_xmin(void); +extern void log_print_rewind_queue(void); + +#define EMPTY_ITEM_TAG (0) +#define REWIND_ITEM_TAG (1) +#define SUBXIDS_ITEM_TAG (2) + +#define SUBXIDS_PER_ITEM (25) + + +#define PG_CTL_CMD_LEN (8) /* Actually we only need 4 extra chars */ +#define PG_CTL_MAX_CMD_LEN (MAXPGPATH + PG_CTL_CMD_LEN) + +/* RewindItem and SubxidsItem should have same size to be castable to each other */ +/* Empty RewindItem and SubxidsItem have invalid oxid and tag */ +typedef struct RewindItem +{ + uint8 tag; + int nsubxids; + OXid oxid; + TransactionId xid; /* regular transaction id if any */ + uint64 onCommitUndoLocation[UndoLogsCount]; + uint64 undoLocation[UndoLogsCount]; + uint64 minRetainLocation[UndoLogsCount]; + FullTransactionId oldestConsideredRunningXid; + OXid runXmin; + TimestampTz timestamp; +} RewindItem; + +typedef struct SubxidsItem +{ + uint8 tag; + int nsubxids; + OXid oxid; /* Redundant, for debug */ + TransactionId subxids[SUBXIDS_PER_ITEM]; +} SubxidsItem; + 
+#define REWIND_DISK_BUFFER_LENGTH (ORIOLEDB_BLCKSZ / sizeof(RewindItem)) + +typedef struct +{ + pg_atomic_uint64 addPosReserved; /* Next adding position available for + * a concurrent add process */ + pg_atomic_uint64 addPosFilledUpto; /* First position that is not yet + * filled, so it cannot yet be + * evicted or read */ + uint64 completePos; /* Next complete position */ + pg_atomic_uint64 evictPos; /* Next evict position. Increments by + * bufferLength only */ + uint64 restorePos; /* Next restore after eviction position. + * Increments by bufferLength only */ + uint64 checkpointPos; /* Already included in checkpoint. Start + * point for writing rewindItem-s into + * checkpoint. */ + uint64 oldToBeCleanedBlockNum; /* First disk block not yet known to + * be fully cleaned; advanced past + * each range punched by cleanup. */ + bool skipCheck; /* Skip timestamp-based check of items to + * process */ + int rewindEvictTrancheId; + LWLock rewindEvictLock; /* Lock during evict/restore page from + * circular buffer against concurrent + * eviction */ + int rewindCheckpointTrancheId; + LWLock rewindCheckpointLock; /* Lock during checkpointing against + * concurrent eviction */ + pg_atomic_uint64 oldestConsideredRunningXid; + pg_atomic_uint64 runXmin; + bool rewindWorkerStopRequested; + bool rewindWorkerStopped; + bool addToRewindQueueDisabled; + TransactionId complete_xid; + OXid complete_oxid; + TimestampTz complete_timestamp; + TransactionId force_complete_xid; + OXid force_complete_oxid; + +} RewindMeta; + +/* + * Queue of rewind items internals: + * -------------------------------- + * F R + * | C+++++++R++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++E++++++++++++A.A | + * C - completePos, R-restorePos, E-evictPos, AF - addPosFilled, AR - addPosReserved + * " " - empty items, "+" - busy items, "." - transient items + * All positions are uint64 and can only increase. + * E and R are modified only under lock.
+ * C is increased only by the single rewind worker and is not locked. + * A(F) and A(R) are atomic uint64 as they are accessed concurrently by backends adding items at transaction commit. + * + * All readable items are from C to AF. Between AF and AR some items are transiently not filled, so they + * are not readable by evict/restore/complete/rewind. But up to the A(R) position they are considered "busy" + * and A(R) is used for calculation of free space for eviction. + * + * Tail and head of this queue are placed into in-memory buffers completeBuffer and addBuffer. + * They use modulo placement so at all times ( R-C < buffer_length ) and ( AR - E < buffer_length). + * + * | C+++++R | or alternatively |++++R C+++++| + * + * F R F R + * | E+++A.A | or alternatively |+++A.A E+++++| + * + * Positions between R and E are written to disk (in full blocks only). + * + * There are the following actions on a queue: + * - Adding: to addBuffer, done by committing backends + * - Eviction: from addBuffer, done by committing backends + * - Restore: to completeBuffer, done by a rewind worker + * - Rewind: from all buffers, done by a backend where rewind is requested. All other backends are shut down + * before actual Rewind starts. + * - Complete: from completeBuffer, done by a rewind worker + * + * Adding to addBuffer: + * (1) checks whether there is enough space; if not, calls eviction first + * (2) reserves space for adding by incrementing AR + * (3) adds items to the reserved space: + * + * Rewind queue items have two types + * - rewindItem: mandatory for each transaction commit + * - subxidsItem: optional, when subxids exist for a transaction. + * + * All items for the same transaction are written together in the following order: + * rewindItem->subxidsItem(1)->...->subxidsItem(n). This is crucial for proper backwards reading during Rewind.
+ * + * (4) pushes AF: + * - if all concurrent backends that incremented AR finished + * - if transient difference AR-AF becomes big (then transaction waits for concurrent ones to complete) + * + * Eviction from addBuffer has two ways: + * - fast: direct transfer from addBuffer to completeBuffer (if there is space for this) + * - ordinary: items written to disk and cleared from addBuffer (by full blocks) + * + * Restore to completeBuffer also has two ways: + * - ordinary: if some items are evicted to disk but not restored yet. So they are read from disk to free + * space in completeBuffer (by full blocks); + * - fast: if items are only in in-memory buffers. So they are directly transferred from addBuffer to + * completeBuffer. + * + * At Rewind, backend reads all existing items from both in-memory buffers and on-disk buffer in reverse, + * i.e. from AF to C. Only one backend remains at the time do_rewind() starts so it works only serially and doesn't + * need locks. RewindWorker applies undoChain from each item and restores global variables relevant to heap + * transaction visibility. Rewind stops (1) on reaching the specified xid/oxid pair or time value, or (2) on reaching + * completePos, so maximum depth of rewind is limited by a rewind buffer. + * + * At Complete, the rewind worker fixes both Oriole and heap transactions remembered in queue rewindItems from C to AF. + * This is done serially but may require restore action when completeBuffer frees enough. Restore takes evictLock + * to avoid concurrent eviction by backends. + * + * Eviction has two locks: + * rewindEvictLock - against concurrent eviction (1) and eviction during restore (2). Only the backend that got + * eviction lock does eviction, others skip and do not wait. + * + * rewindCheckpointLock - against eviction during checkpoint. All backends that couldn't get it wait until + * checkpoint releases the lock.
+ * + * + */ + +#define InvalidRewindPos UINT64_MAX + +#endif /* __REWIND_WORKER_H__ */ diff --git a/contrib/orioledb/include/s3/checkpoint.h b/contrib/orioledb/include/s3/checkpoint.h new file mode 100644 index 00000000000..853e356ee4d --- /dev/null +++ b/contrib/orioledb/include/s3/checkpoint.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * + * checkpoint.h + * Declarations for S3 checkpointing. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/s3/checkpoint.h + * + *------------------------------------------------------------------------- + */ +#ifndef __S3_CHECKPOINT_H__ +#define __S3_CHECKPOINT_H__ + +extern void s3_perform_backup(int flags, S3TaskLocation maxLocation); + +#endif /* __S3_CHECKPOINT_H__ */ diff --git a/contrib/orioledb/include/s3/checksum.h b/contrib/orioledb/include/s3/checksum.h new file mode 100644 index 00000000000..0d1dd0ec236 --- /dev/null +++ b/contrib/orioledb/include/s3/checksum.h @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * checksum.h + * Declarations for calculating checksums of S3-specific data. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/s3/checksum.h + * + *------------------------------------------------------------------------- + */ +#ifndef __S3_CHECKSUM_H__ +#define __S3_CHECKSUM_H__ + +#include "utils/hsearch.h" + +#include "openssl/sha.h" + +#define O_SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1) + +typedef struct S3FileChecksum +{ + char filename[MAXPGPATH]; + char checksum[O_SHA256_DIGEST_STRING_LENGTH]; + + bool changed; /* true if the checksum changed since last + * checkpoint */ + uint32 checkpointNumber; +} S3FileChecksum; + +typedef struct S3ChecksumState +{ + HTAB *hashTable; + uint32 checkpointNumber; + + S3FileChecksum *fileChecksums; /* Buffer of S3FileChecksum entries */ + uint32 fileChecksumsMaxLen; + uint32 fileChecksumsLen; +} S3ChecksumState; + +extern S3ChecksumState *makeS3ChecksumState(uint32 checkpointNumber, + S3FileChecksum *fileChecksums, + uint32 fileChecksumsMaxLen, + const char *filename); +extern void freeS3ChecksumState(S3ChecksumState *state); +extern void flushS3ChecksumState(S3ChecksumState *state, const char *filename); + +extern S3FileChecksum *getS3FileChecksum(S3ChecksumState *state, + const char *filename, + Pointer data, uint64 size); + +#endif /* __S3_CHECKSUM_H__ */ diff --git a/contrib/orioledb/include/s3/control.h b/contrib/orioledb/include/s3/control.h new file mode 100644 index 00000000000..abdb98382ef --- /dev/null +++ b/contrib/orioledb/include/s3/control.h @@ -0,0 +1,21 @@ +/*------------------------------------------------------------------------- + * + * control.h + * Declarations for S3 control and lock files. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/s3/control.h + * + *------------------------------------------------------------------------- + */ +#ifndef __S3_CONTROL_H__ +#define __S3_CONTROL_H__ + +extern bool s3_check_control(const char **errmsgp, const char **errdetailp); +extern void s3_put_lock_file(void); +extern void s3_delete_lock_file(void); + +#endif /* __S3_CONTROL_H__ */ diff --git a/contrib/orioledb/include/s3/headers.h b/contrib/orioledb/include/s3/headers.h new file mode 100644 index 00000000000..04521ce37f3 --- /dev/null +++ b/contrib/orioledb/include/s3/headers.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------- + * + * headers.h + * Declarations for handling of S3-specific data file headers. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/s3/headers.h + * + *------------------------------------------------------------------------- + */ +#ifndef __S3_HEADERS_H__ +#define __S3_HEADERS_H__ + +typedef struct +{ + OIndexKey key; + uint32 checkpointNum; + int segNum; +} S3HeaderTag; + +typedef enum +{ + S3PartStatusNotLoaded = 0, + S3PartStatusLoading = 1, + S3PartStatusLoaded = 2, + S3PartStatusEvicting = 3 +} S3PartStatus; + +/* + * Two S3HeaderTags identify the same on-disk file when they share the same + * datoid, relnode, tablespace, checkpointNum and segNum. reloid and ixType + * are tree-level metadata that do not affect the file path and must not + * participate in the hash/equality check. 
+ */ +#define S3HeaderTagsIsEqual(t1, t2) \ + ((t1).key.oids.datoid == (t2).key.oids.datoid && \ + (t1).key.oids.relnode == (t2).key.oids.relnode && \ + (t1).key.tablespace == (t2).key.tablespace && \ + (t1).checkpointNum == (t2).checkpointNum && \ + (t1).segNum == (t2).segNum) + +extern int s3_headers_buffers_size; + +extern Size s3_headers_shmem_needs(void); +extern void s3_headers_shmem_init(Pointer buf, bool found); +extern void s3_headers_increase_loaded_parts(uint64 inc); +extern uint32 s3_header_get_load_id(S3HeaderTag tag); +extern bool s3_header_lock_part(S3HeaderTag tag, int index, + uint32 *loadId); +extern S3PartStatus s3_header_mark_part_loading(S3HeaderTag tag, int index); +extern void s3_header_mark_part_loaded(S3HeaderTag tag, int index); +extern void s3_header_unlock_part(S3HeaderTag tag, int index, bool setDirty); +extern bool s3_header_mark_part_scheduled_for_write(S3HeaderTag tag, int index); +extern void s3_header_mark_part_writing(S3HeaderTag tag, int index); +extern void s3_header_mark_part_written(S3HeaderTag tag, int index); +extern void s3_header_mark_part_not_written(S3HeaderTag tag, int index); +extern void s3_headers_sync(void); +extern void s3_headers_error_cleanup(void); +extern void s3_headers_try_eviction_cycle(void); + +#endif /* __S3_HEADERS_H__ */ diff --git a/contrib/orioledb/include/s3/queue.h b/contrib/orioledb/include/s3/queue.h new file mode 100644 index 00000000000..1f63f6fdfdb --- /dev/null +++ b/contrib/orioledb/include/s3/queue.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * queue.h + * Declarations for queue of tasks for S3 workers. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/s3/queue.h + * + *------------------------------------------------------------------------- + */ +#ifndef __S3_QUEUE_H__ +#define __S3_QUEUE_H__ + +#include "postgres.h" + +#include "orioledb.h" + +/* Sentinel S3TaskLocation (UINT64_MAX); presumably never a real queue location -- TODO confirm */ +#define InvalidS3TaskLocation (UINT64_MAX) + +extern Size s3_queue_shmem_needs(void); +extern void s3_queue_init_shmem(Pointer ptr, bool found); +extern S3TaskLocation s3_queue_get_insert_location(void); +extern S3TaskLocation s3_queue_put_task(Pointer data, uint32 len); +extern S3TaskLocation s3_queue_try_pick_task(void); +extern Pointer s3_queue_get_task(S3TaskLocation taskLocation); +extern void s3_queue_erase_task(S3TaskLocation taskLocation); +extern void s3_queue_wait_for_location(S3TaskLocation location); + +#endif /* __S3_QUEUE_H__ */ diff --git a/contrib/orioledb/include/s3/requests.h b/contrib/orioledb/include/s3/requests.h new file mode 100644 index 00000000000..cdb6fae16ea --- /dev/null +++ b/contrib/orioledb/include/s3/requests.h @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * requests.h + * Declarations for S3 requests. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc.
+ * + * IDENTIFICATION + * contrib/orioledb/include/s3/requests.h + * + *------------------------------------------------------------------------- + */ +#ifndef __S3_REQUESTS_H__ +#define __S3_REQUESTS_H__ + +#define S3_RESPONSE_OK 200 +#define S3_RESPONSE_NOT_FOUND 404 +#define S3_RESPONSE_CONDITION_CONFLICT 409 +#define S3_RESPONSE_CONDITION_FAILED 412 + +extern long s3_put_file(char *objectname, char *filename, bool ifNoneMatch); +extern void s3_get_file(char *objectname, char *filename); +extern void s3_put_empty_dir(char *objectname); +extern long s3_put_file_part(char *objectname, char *filename, int partnum); +extern void s3_get_file_part(char *objectname, char *filename, int partnum); +extern long s3_put_object_with_contents(char *objectname, Pointer data, + uint64 dataSize, char *dataChecksum, + bool ifNoneMatch); +extern long s3_get_object(char *objectname, StringInfo str, bool missing_ok); +extern void s3_delete_object(char *objectname); + +extern Pointer read_file(const char *filename, uint64 *size); + +#endif /* __S3_REQUESTS_H__ */ diff --git a/contrib/orioledb/include/s3/worker.h b/contrib/orioledb/include/s3/worker.h new file mode 100644 index 00000000000..9ab3266049f --- /dev/null +++ b/contrib/orioledb/include/s3/worker.h @@ -0,0 +1,108 @@ +/*------------------------------------------------------------------------- + * + * worker.h + * Declarations for S3 worker process. + * + * Copyright (c) 2024-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/s3/worker.h + * + *------------------------------------------------------------------------- + */ +#ifndef __S3_WORKER_H__ +#define __S3_WORKER_H__ + +#include "orioledb.h" + +#include "btree/undo.h" +#include "s3/queue.h" + +/* Kinds of work an S3 worker can be asked to do; stored in S3Task.type. */ +typedef enum +{ + S3TaskTypeWriteFile, + S3TaskTypeWriteFilePart, + S3TaskTypeReadFilePart, + S3TaskTypeWriteWALFile, + S3TaskTypeWriteUndoFile, + S3TaskTypeWriteEmptyDir, + S3TaskTypeWriteRootFile, + S3TaskTypeWritePGFile +} S3TaskType; + +/* + * The data structure representing the task for S3 worker. + * + * 'type' presumably selects which member of 'typeSpecific' is meaningful + * (the member names parallel the S3TaskType values) -- TODO confirm. + * Several members end in a variable-length name, so a queued task + * presumably occupies more than sizeof(S3Task) bytes -- TODO confirm + * against the s3_schedule_*() implementations. + */ +typedef struct +{ + S3TaskType type; + union + { + struct + { + uint32 chkpNum; + bool delete; + char filename[FLEXIBLE_ARRAY_MEMBER]; + } writeFile; + struct + { + UndoLogType undoType; + uint64 fileNum; + } writeUndoFile; + struct + { + OIndexKey key; + uint32 chkpNum; + int32 segNum; + int32 partNum; + } filePart; + char walFilename[1]; /* NOTE(review): [1], not FLEXIBLE_ARRAY_MEMBER: a flexible array cannot be a direct union member */ + struct + { + uint32 chkpNum; + char dirname[FLEXIBLE_ARRAY_MEMBER]; + } writeEmptyDir; + struct + { + bool delete; + char filename[FLEXIBLE_ARRAY_MEMBER]; + } writeRootFile; + struct + { + uint32 chkpNum; + char filename[FLEXIBLE_ARRAY_MEMBER]; + } writePGFile; + } typeSpecific; +} S3Task; + +#define FILE_CHECKSUMS_FILENAME ORIOLEDB_DATA_DIR "/file_checksums" + +extern Size s3_workers_shmem_needs(void); +extern void s3_workers_init_shmem(Pointer ptr, bool found); +extern void register_s3worker(int num); +extern void s3_workers_checkpoint_init(void); +extern void s3_workers_checkpoint_finish(void); +PGDLLEXPORT void s3worker_main(Datum); + +extern S3TaskLocation s3_schedule_file_write(uint32 chkpNum, char *filename, + bool delete); +extern S3TaskLocation s3_schedule_empty_dir_write(uint32 chkpNum, + char *dirname); +extern S3TaskLocation s3_schedule_file_part_write(uint32 chkpNum, OIndexKey key, + int32 segNum, int32 partNum); +extern S3TaskLocation s3_schedule_file_part_read(uint32 chkpNum, OIndexKey key, + int32 segNum, int32 partNum); +extern
S3TaskLocation s3_schedule_wal_file_write(char *filename); +extern S3TaskLocation s3_schedule_undo_file_write(UndoLogType undoType, + uint64 fileNum); +extern S3TaskLocation s3_schedule_downlink_load(struct BTreeDescr *desc, + uint64 downlink); +extern S3TaskLocation s3_schedule_root_file_write(char *filename, bool delete); +extern S3TaskLocation s3_schedule_pg_file_write(uint32 chkpNum, char *filename); +extern void s3_load_file_part(uint32 chkpNum, OIndexKey key, int32 segNum, + int32 partNum); +extern void s3_load_map_file(uint32 chkpNum, OIndexKey key); + + +#endif /* __S3_WORKER_H__ */ diff --git a/contrib/orioledb/include/tableam/bitmap_scan.h b/contrib/orioledb/include/tableam/bitmap_scan.h new file mode 100644 index 00000000000..ff61f04503c --- /dev/null +++ b/contrib/orioledb/include/tableam/bitmap_scan.h @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * bitmap_scan.h + * Declarations for bitmap scan of OrioleDB table. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/tableam/bitmap_scan.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_BITMAP_SCAN_H__ +#define __TABLEAM_BITMAP_SCAN_H__ + +#include "btree/scan.h" +#include "tableam/handler.h" +#include "tableam/scan.h" + +#include "executor/executor.h" +#include "lib/rbtree.h" + +typedef struct OBitmapScan OBitmapScan; + +typedef struct OBitmapHeapPlanState +{ + OPlanState o_plan_state; + Plan *bitmapqualplan; + PlanState *bitmapqualplanstate; + /* index quals, in standard expr form */ + List *bitmapqualorig; + /* initialized ExprState for bitmapqualorig, reused across tuples */ + ExprState *bitmapqualorig_state; + Oid typeoid; + OSnapshot oSnapshot; + MemoryContext cxt; + OBitmapScan *scan; + OEACallsCounters *eaCounters; +} OBitmapHeapPlanState; + +extern OBitmapScan *o_make_bitmap_scan(OBitmapHeapPlanState *bitmap_state, + ScanState *ss, + PlanState *bitmapqualplanstate, + Relation rel, Oid typeoid, + OSnapshot *oSnapshot, MemoryContext cxt); +extern TupleTableSlot *o_exec_bitmap_fetch(OBitmapScan *scan, + CustomScanState *node); +extern void o_free_bitmap_scan(OBitmapScan *scan); + +extern RBTree *o_keybitmap_create(void); +extern void o_keybitmap_insert(RBTree *rbtree, uint64 value); +extern void o_keybitmap_intersect(RBTree *a, RBTree *b); +extern void o_keybitmap_union(RBTree *a, RBTree *b); +extern void o_keybitmap_free(RBTree *tree); +extern bool o_keybitmap_is_empty(RBTree *rbtree); +extern bool o_keybitmap_test(RBTree *rbtree, uint64 value); +extern bool o_keybitmap_range_is_valid(RBTree *rbtree, uint64 low, uint64 high); +extern uint64 o_keybitmap_get_next(RBTree *rbtree, uint64 prev, bool *found); + +#endif /* __TABLEAM_BITMAP_SCAN_H__ */ diff --git a/contrib/orioledb/include/tableam/descr.h b/contrib/orioledb/include/tableam/descr.h new file mode 100644 index 00000000000..a6f671ba8a9 --- /dev/null +++ b/contrib/orioledb/include/tableam/descr.h @@ -0,0 +1,371 
@@ +/*------------------------------------------------------------------------- + * + * descr.h + * Declarations of descriptors used for table access method definition. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/tableam/descr.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_DESCR_H__ +#define __TABLEAM_DESCR_H__ + +#include "checkpoint/checkpoint.h" +#include "catalog/sys_trees.h" +#include "utils/seq_buf.h" +#include "s3/queue.h" +#include "tableam/handler.h" +#include "transam/undo.h" +#include "tuple/format.h" + +#include "access/htup_details.h" +#include "utils/resowner.h" +#include "access/nbtree.h" +#include "access/skey.h" +#include "commands/explain.h" +#include "executor/tuptable.h" +#include "nodes/pathnodes.h" + +/* tableam/descr.c */ + +typedef struct +{ + SharedRootInfoKey key; + + CheckpointFileHeader file_header; + S3TaskLocation maxLocation[2]; + EvictedSeqBufData freeBuf; + + /* + * we can hold just offsets here, but SeqBufTag info can be useful on + * debug + */ + EvictedSeqBufData nextChkp; + EvictedSeqBufData tmpBuf; + + bool dirtyFlag1; + bool dirtyFlag2; + + uint32 punchHolesChkpNum; +} EvictedTreeData; + + +typedef struct OComparator OComparator; +typedef struct OComparatorKey OComparatorKey; + +typedef struct OExclusionFn +{ + Oid operator; + + FmgrInfo finfo; +} OExclusionFn; + +typedef struct OHashFnKey +{ + Oid datoid; + Oid hash_fn_oid; +} OHashFnKey; + +typedef struct OHashFn +{ + OHashFnKey key; + FmgrInfo finfo; +} OHashFn; + +#define O_DEFAULT_HASH_FN_OID 1 /* Using 1 here to distinct from uninitialized + * fields */ +extern OHashFn o_default_hash_fn; + +/* + * The index field descriptor + */ +typedef struct +{ + Oid inputtype; + Oid opfamily; + Oid opclass; + Oid collation; + bool ascending; + bool nullfirst; + + /* + * A cached comparator to compare inputtype values according 
to opfamily + * and opclass. + */ + OComparator *comparator; + OExclusionFn *exclusion_fn; + OHashFn *hash_fn; +} OIndexField; + +typedef struct AttrNumberMap +{ + AttrNumber key; + AttrNumber value; +} AttrNumberMap; + +/* + * The index descriptor + */ +struct OIndexDescr +{ + ORelOids oids; + ORelOids tableOids; + uint32 version; + + /* reference count */ + int refcnt; + bool valid; + + BTreeDescr desc; + + /* Name of the index */ + NameData name; + + MemoryContext index_mctx; + List *expressions; /* list of Expr */ + List *predicate; /* list of Expr */ + char *predicate_str; + + List *expressions_state; /* list of ExprState */ + ExprState *predicate_state; + ExprContext *econtext; + + /* Tuple descriptor and format specifier for non-leaf tuples */ + TupleDesc nonLeafTupdesc; + OTupleFixedFormatSpec nonLeafSpec; + + /* Tuple descriptor and format specifier for leaf tuples */ + TupleDesc leafTupdesc; + OTupleFixedFormatSpec leafSpec; + + /* + * Flag to indicate unique index and number of unique fields for unique + * index. + */ + bool unique; + bool immediate; + bool nulls_not_distinct; + int nUniqueFields; + + /* + * Flag indicates that primary key index on the table is surrogate index + * on ctid (no primary key is explicitly defined). + */ + bool primaryIsCtid; + + /* + * Indicates that bridging enabled for table: i.e. there is bridge_ctid + * column in pkey and also bridge index + */ + bool bridging; + + uint8 fillfactor; + + /* Description of the index fields */ + int nFields; + int nKeyFields; + int nIncludedFields; + OIndexField *fields; + + /* + * Attnums for primary key values in the secondary index tuples. We may + * assume that secondary index tuple just contain primary key values in + * the tail. But we would like to save the space if secondary index + * shares some attributes with primary key. 
+ */ + int nPrimaryFields; + AttrNumber primaryFieldsAttnums[INDEX_MAX_KEYS]; + + /* Compression rate used in this index */ + OCompress compress; + + /* + * Attribute numbers of fields in table tupdesc. Counts from 1. Ctid + * counts as the first column if used. + */ + AttrNumber *tableAttnums; + /* The maximal value in tableAttnums */ + int maxTableAttnum; + + /* Used in getsomeattrs to reorder fields during index only scan of pkey */ + AttrNumberMap *pk_tbl_field_map; + + /* Cached comparators used to fill key for pkey tuple search */ + OComparator **pk_comparators; + + /* tupdesc and slots needed for indexam operations */ + TupleDesc itupdesc; + TupleTableSlot *index_slot; + TupleTableSlot *old_leaf_slot; + TupleTableSlot *new_leaf_slot; + /* Copy of duplicates from OIndex */ + List *duplicates; +}; + +/* + * TODO: Remove usage of this function or document its purpose + */ +static inline OffsetNumber +OIndexKeyAttnumToTupleAttnum(BTreeKeyType keyType, OIndexDescr *idx, int attnum) +{ + if (keyType == BTreeKeyLeafTuple && idx->desc.type == oIndexPrimary) + { + Assert((attnum - 1) < idx->leafTupdesc->natts); + return idx->tableAttnums[attnum - 1] + (idx->bridging && !idx->primaryIsCtid ? 1 : 0); + } + else + { + Assert((keyType == BTreeKeyLeafTuple && attnum <= idx->leafTupdesc->natts) || + (keyType == BTreeKeyNonLeafKey && attnum <= idx->nFields)); + return attnum; + } +} + +#define OGetIndexContext(index) \ + ((index)->index_mctx ? 
\ + (index)->index_mctx : \ + ((index)->index_mctx = AllocSetContextCreate(TopMemoryContext, \ + "OIndexContext", \ + ALLOCSET_DEFAULT_SIZES))) +/* True when attnum lies in [nKeyFields, nKeyFields + nIncludedFields) of an index that is neither TOAST nor bridge; arguments are evaluated more than once. */ +#define OIgnoreColumn(descr, attnum) \ + (((descr)->desc.type != oIndexToast && (descr)->desc.type != oIndexBridge) && \ + ((attnum) >= (descr)->nKeyFields) && \ + ((attnum) < ((descr)->nKeyFields + (descr)->nIncludedFields))) + +struct OTableDescr +{ + ORelOids oids; + uint32 version; + + /* reference count */ + int refcnt; + + /* Source table tupdesc (without ctid, etc) */ + TupleDesc tupdesc; + + /* Slots for handling the modifications */ + TupleTableSlot *oldTuple; + TupleTableSlot *newTuple; + + /* + * Description of table indices and toast. indices[0] always points to + * the primary key, the rest of the indices array points to the secondary indices. + */ + OIndexDescr **indices; + OIndexDescr *bridge; + OIndexDescr *toast; + + /* list of TOASTable values */ + AttrNumber *toastable; + /* number of toastable fields */ + int ntoastable; + /* number of trees */ + int nIndices; + /* number of unique trees */ + int nUniqueIndices; + /* OID of the tablespace for table */ + Oid tablespace; + + bool noInvalidation; +}; + +typedef struct +{ + Datum *values; + bool *nulls; + TupleDesc desc; + OTupleFixedFormatSpec *spec; + FmgrInfo *outputFns; + TupleDesc keyDesc; + OTupleFixedFormatSpec *keySpec; + FmgrInfo *keyOutputFns; + bool printRowVersion; + bool truncateValues; +} TuplePrintOpaque; + +#define O_INVALIDATE_OIDS_ON_COMMIT 1 +#define O_INVALIDATE_OIDS_ON_ABORT 2 + +typedef struct +{ + OnCommitUndoStackItem header; + ORelOids oids; + uint32 flags; +} InvalidateUndoStackItem; + +#define GET_PRIMARY(descr) ((descr)->indices[PrimaryIndexNumber]) + +extern OTableFetchContext default_table_fetch_context; + +/* + * Please, read the comment before the o_btree_load_shmem() definition.
+ */ + +extern OTableDescr *o_fetch_table_descr_extended(ORelOids oids, OTableFetchContext ctx); + +extern OIndexDescr *o_fetch_index_descr_extended(ORelOids oids, OIndexType type, + bool lock, OTableFetchContext ctx, OTableFetchContext base_ctx); + +extern OTableDescr *o_fetch_table_descr(ORelOids oids); +extern OIndexDescr *o_fetch_index_descr(ORelOids oids, OIndexType type, + bool lock, bool *nested); + +extern void recreate_table_descr_by_oids(ORelOids oids); +extern void o_fill_tmp_table_descr(OTableDescr *descr, OTable *o_table); +extern void o_free_tmp_table_descr(OTableDescr *descr); + +static inline bool +is_explain_analyze(PlanState *ps) +{ + return ps->state->es_instrument & INSTRUMENT_BUFFERS; +} + +extern void o_btree_load_shmem(BTreeDescr *desc); +extern bool o_btree_load_shmem_checkpoint(BTreeDescr *desc); +extern bool o_btree_try_use_shmem(BTreeDescr *desc); + +extern SharedRootInfo *o_find_shared_root_info(SharedRootInfoKey *key); +extern void o_insert_shared_root_placeholder(Oid datoid, Oid relnode); + +extern OComparator *o_find_comparator(Oid opfamily, + Oid lefttype, + Oid righttype, + Oid collation); +extern int o_call_comparator(OComparator *comparator, Datum left, + Datum right); +extern int o_call_exclusion_fn(OExclusionFn *exclusion_fn, Datum left, Datum right, Oid collation); +extern uint32 o_call_hash_fn(OHashFn *hash_fn, Oid collation, Datum val); +extern void o_invalidate_comparator_cache(Oid opfamily, Oid lefttype, + Oid righttype); + +extern EvictedTreeData *read_evicted_data(Oid datoid, Oid relnode, bool delete); +extern void insert_evicted_data(EvictedTreeData *data); + +extern void oFillFieldOpClassAndComparator(OIndexField *field, Oid datoid, Oid opclassoid, Oid exclusion_op, Oid hash_fn_oid); +extern void o_finish_sort_support_function(OComparator *comparator, SortSupport ssup); + +extern void o_add_invalidate_undo_item(ORelOids oids, uint32 flags); +extern void o_invalidate_undo_item_callback(UndoLogType undoType, + 
UndoLocation location, + UndoStackItem *baseItem, + OXid oxid, OUndoCallbackStage stage, + bool changeCountsValid); + +extern void o_add_invalidate_comparator_undo_item(Oid opfamily, Oid lefttype, Oid righttype); +extern void o_invalidate_comparator_callback(UndoLogType undoType, UndoLocation location, + UndoStackItem *baseItem, + OXid oxid, OUndoCallbackStage stage, + bool changeCountsValid); +extern void reset_saving_inval_messages(void); + +extern void ResourceOwnerRememberOTableDescr(ResourceOwner owner, OTableDescr *descr); +extern void ResourceOwnerForgetOTableDescr(ResourceOwner owner, OTableDescr *descr); +extern void ResourceOwnerRememberOIndexDescr(ResourceOwner owner, OIndexDescr *descr); +extern void ResourceOwnerForgetOIndexDescr(ResourceOwner owner, OIndexDescr *descr); + +#endif diff --git a/contrib/orioledb/include/tableam/handler.h b/contrib/orioledb/include/tableam/handler.h new file mode 100644 index 00000000000..1e360f4e64f --- /dev/null +++ b/contrib/orioledb/include/tableam/handler.h @@ -0,0 +1,227 @@ +/*------------------------------------------------------------------------- + * + * handler.h + * Declarations of table access method handler + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/tableam/handler.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_HANDLER_H__ +#define __TABLEAM_HANDLER_H__ + +#include "btree/btree.h" +#include "btree/undo.h" +#include "catalog/o_tables.h" + +#include "access/tableam.h" +#include "nodes/execnodes.h" +#include "nodes/pathnodes.h" +#include "rewrite/rewriteHandler.h" + +extern bool is_orioledb_rel(Relation rel); +extern OIndexNumber find_tree_in_descr(OTableDescr *descr, ORelOids oids); + +/* EXPLAIN ANALYZE functions call counter */ +typedef struct +{ + uint32 read; /* o_btree_read_page() */ + uint32 write; /* write_page() */ + uint32 load; /* load_page() */ + uint32 lock; /* lock_page() */ + uint32 evict; /* evict_page() */ +} OEACallsCounter; + +#define EA_COUNTERS_NUM (5) /* number of EXPLAIN ANALYZE counters */ + +/* + * EXPLAIN ANALYZE counters for different trees involved in single executor + * node. + */ +typedef struct +{ + /* Identifiers of table being analyzed */ + ORelOids oids; + /* Table descriptor */ + OTableDescr *descr; + /* Counters for primary and secondary indices */ + int nindices; + OEACallsCounter *indices; + /* Counters for TOAST */ + OEACallsCounter toast; + /* Counters for indices of other tables */ + OEACallsCounter others; +} OEACallsCounters; + +/* + * EXPLAIN ANALYZE function call counters. 
+ * will be init and free in tableam_scan.c + */ +extern OEACallsCounters *ea_counters; + +/* returns the OEACallsCounter (in ea_counters) for the tree that owns the given page descriptor */ +static inline OEACallsCounter * +get_ea_counters(OrioleDBPageDesc *desc) +{ + OIndexNumber ix_num = find_tree_in_descr(ea_counters->descr, desc->oids); + + if (ix_num == InvalidIndexNumber) + return &ea_counters->others; + if (ix_num == TOASTIndexNumber) + return &ea_counters->toast; + return &ea_counters->indices[ix_num]; +} + +/* increases EXPLAIN_ANALYZE counter for o_btree_read_page() call; like the other EA_*_INC macros it expands to a bare if-block (not do/while(0)), so do not use it as the body of an unbraced if/else */ +#define EA_READ_INC(blkno) \ + if (ea_counters != NULL) \ + { \ + OrioleDBPageDesc *desc = O_GET_IN_MEMORY_PAGEDESC(blkno); \ + OEACallsCounter *ix_counter = get_ea_counters(desc); \ + if (ix_counter != NULL) \ + ix_counter->read++; \ + } + +/* increases EXPLAIN_ANALYZE counter for write_page() call */ +#define EA_WRITE_INC(blkno) \ + if (ea_counters != NULL) \ + { \ + OrioleDBPageDesc *desc = O_GET_IN_MEMORY_PAGEDESC(blkno); \ + OEACallsCounter *ix_counter = get_ea_counters(desc); \ + if (ix_counter != NULL) \ + ix_counter->write++; \ + } + +/* increases EXPLAIN_ANALYZE counter for load_page() call */ +#define EA_LOAD_INC(blkno) \ + if (ea_counters != NULL) \ + { \ + OrioleDBPageDesc *desc = O_GET_IN_MEMORY_PAGEDESC(blkno); \ + OEACallsCounter *ix_counter = get_ea_counters(desc); \ + if (ix_counter != NULL) \ + ix_counter->load++; \ + } + +/* increases EXPLAIN_ANALYZE counter for lock_page() call */ +#define EA_LOCK_INC(blkno) \ + if (ea_counters != NULL) \ + { \ + OrioleDBPageDesc *desc = O_GET_IN_MEMORY_PAGEDESC(blkno); \ + OEACallsCounter *ix_counter = get_ea_counters(desc); \ + if (ix_counter != NULL) \ + ix_counter->lock++; \ + } + +/* increases EXPLAIN_ANALYZE counter for evict_page() call */ +#define EA_EVICT_INC(blkno) \ + if (ea_counters != NULL) \ + { \ + OrioleDBPageDesc *desc = O_GET_IN_MEMORY_PAGEDESC(blkno); \ + OEACallsCounter *ix_counter = get_ea_counters(desc); \ + if (ix_counter != NULL) \ +
ix_counter->evict++; \ + } + +extern void cleanup_btree(OIndexKey ix_key, bool files, bool fsync); +extern bool o_drop_shared_root_info(Oid datoid, Oid relnode); +extern void o_tableam_descr_init(void); +extern void o_invalidate_descrs(Oid datoid, Oid reloid, Oid relfilenode); +extern bool o_start_saving_inval_messages(void); +extern void o_stop_saving_inval_messages(bool was_saving); +extern void o_replay_saved_inval_messages(void); +extern void init_print_options(BTreePrintOptions *printOptions, VarChar *optionsArg); +extern void orioledb_free_rd_amcache(Relation rel); +extern OTableDescr *relation_get_descr(Relation rel); +extern void table_descr_inc_refcnt(OTableDescr *descr); +extern void table_descr_dec_refcnt(OTableDescr *descr); + +extern Size orioledb_parallelscan_estimate(Relation rel); +extern Size orioledb_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan); +extern Size orioledb_parallelscan_initialize_inner(ParallelTableScanDesc pscan); +extern void orioledb_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan); + +extern int64 orioledb_calculate_relation_size(Relation rel, ForkNumber forkNumber, uint8 method); +extern int64 orioledb_calculate_database_size(Oid dbOid); +extern database_size_hook_type prev_database_size_hook; + +typedef enum +{ + OParallelScanPageInvalid, + OParallelScanPageValid, + OParallelScanPageInProgress +} OParallelScanPageStatus; + +/* + * OrioleDB-specific shared state for parallel table scan. + * + * Each backend participating in a parallel table scan has its own BTreeSeqScan + * in its memory, that contains a pointer to ParallelOScanDescData. The + * information here is sufficient to properly initialize each new BTreeSeqScan + * as workers join the scan, and to coordinate their scans.
+ */ + +typedef struct BTreeIntPageParallelData +{ + char img[ORIOLEDB_BLCKSZ]; /* internal page image */ + OFixedShmemKey prevHikey; /* low key of internal page */ + OffsetNumber offset; + OffsetNumber startOffset; /* first offset on internal page */ + OParallelScanPageStatus status; + int pageno; /* debug only */ + CommitSeqNo imgReadCsn; +} BTreeIntPageParallelData; + +typedef BTreeIntPageParallelData *BTreeIntPageParallel; + +#define O_PARALLEL_LEADER_STARTED 1 +#define O_PARALLEL_FIRST_PAGE_LOADED (1<<1) +#define O_PARALLEL_IS_SINGLE_LEAF_PAGE (1<<2) +#define O_PARALLEL_CURRENT_PAGE (1<<3) /* If set then current + * internal page is in + * intPage[1], and next + * internal page is in + * intPage[0]. If not set - + * vice versa. */ +#define O_PARALLEL_DOWNLINKS_SORTED (1<<4) +#define O_PARALLEL_DSM_CREATED (1<<5) +/* Current / next internal page slot: intPage[1] / intPage[0] when O_PARALLEL_CURRENT_PAGE is set, intPage[0] / intPage[1] otherwise. */ +#define CUR_PAGE(poscan) (&(poscan)->intPage[((poscan)->flags & O_PARALLEL_CURRENT_PAGE) ? 1 : 0]) +#define NEXT_PAGE(poscan) (&(poscan)->intPage[((poscan)->flags & O_PARALLEL_CURRENT_PAGE) ? 
0 : 1]) + +typedef struct ParallelOScanDescData +{ + ParallelTableScanDescData phs_base; /* Shared AM-independent state for + * parallel table scan */ + BTreeIntPageParallelData intPage[2]; + slock_t intpageAccess, + workerStart; /* for sequential workers joining */ + LWLock intpageLoad, /* for sequential internal page loading */ + downlinksPublish; /* workers can put disk downlinks to + * shared state */ + pg_atomic_uint64 downlinksCount; /* number of disk downlinks written to + * shared DSM array */ + pg_atomic_uint64 downlinkIndex; + pg_atomic_uint32 downlinksWritersInProgress; /* number of workers + * currently writing a + * downlink to DSM */ + uint64 dsmAllocated; /* number of slots allocated in DSM array */ + bits8 flags; + int nworkers; /* number of scan workers initialized their + * own seq scan */ + dsm_handle dsmHandle; + /* debug only */ + int cur_int_pageno; +#ifdef USE_ASSERT_CHECKING + bool worker_active[1024]; +#endif +} ParallelOScanDescData; + +typedef ParallelOScanDescData *ParallelOScanDesc; + +extern bool in_nontransactional_truncate; + +#endif diff --git a/contrib/orioledb/include/tableam/index_scan.h b/contrib/orioledb/include/tableam/index_scan.h new file mode 100644 index 00000000000..3e481985450 --- /dev/null +++ b/contrib/orioledb/include/tableam/index_scan.h @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * index_scan.h + * Declarations for index scan of OrioleDB table. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc.
+ * + * IDENTIFICATION + * contrib/orioledb/include/tableam/index_scan.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_INDEX_SCAN_H__ +#define __TABLEAM_INDEX_SCAN_H__ + +#include "tableam/key_range.h" +#include "tableam/scan.h" + +#include "access/sdir.h" + +typedef struct OScanState +{ + IndexScanDescData scandesc; + OIndexNumber ixNum; + MemoryContext cxt; + ScanDirection scanDir; + bool addJunk; + /* is only current index can be used in scan */ + bool onlyCurIx; + bool returning; + bool curKeyRangeIsLoaded; + int numPrefixExactKeys; + bool exact; + OBTreeKeyRange curKeyRange; + BTreeIterator *iterator; + List *indexQuals; + /* used only by direct modify functions */ + CmdType cmd; + OSnapshot oSnapshot; +} OScanState; + +typedef struct OIndexPlanState +{ + OPlanState o_plan_state; + OScanState ostate; + /* Used only in o_explain_custom_scan */ + List *stripped_indexquals; + bool onlyCurIx; + struct ScanKeyData *iss_ScanKeys; + int iss_NumScanKeys; + IndexRuntimeKeyInfo *iss_RuntimeKeys; + int iss_NumRuntimeKeys; + bool iss_RuntimeKeysReady; + ExprContext *iss_RuntimeContext; + + /* + * Keep the index relation open for the lifetime of the scan so that + * ostate.scandesc.indexRelation remains valid. _bt_preprocess_keys and + * related nbtree helpers read rd_opfamily / rd_indoption through this + * pointer; if the relation were closed and the relcache entry evicted, + * the pointer would dangle. + */ + Relation indexRelation; +} OIndexPlanState; + +/* + * iteration code. 
+ */ +extern void init_index_scan_state(OPlanState *o_plan_state, OScanState *ostate, Relation index, + ExprContext *econtext, IndexRuntimeKeyInfo **runtimeKeys, + int *numRuntimeKeys, ScanKeyData **scanKeys, int *numScanKeys); +extern OTuple o_iterate_index(OIndexDescr *indexDescr, OScanState *ostate, + CommitSeqNo *tupleCsn, MemoryContext tupleCxt, + BTreeLocationHint *hint); +extern OTuple o_index_scan_getnext(OTableDescr *descr, OScanState *ostate, + CommitSeqNo *tupleCsn, + bool scan_primary, MemoryContext tupleCxt, + BTreeLocationHint *hint); +extern TupleTableSlot *o_exec_fetch(OScanState *ostate, ScanState *ss); +extern bool o_exec_qual(ExprContext *econtext, ExprState *qual, + TupleTableSlot *slot); +extern TupleTableSlot *o_exec_project(ProjectionInfo *projInfo, + ExprContext *econtext, + TupleTableSlot *scanTuple, + TupleTableSlot *innerTuple); + +/* explain analyze */ +extern void eanalyze_counters_init(OEACallsCounters *eacc, OTableDescr *descr); +extern void eanalyze_counter_explain(OEACallsCounter *counter, char *label, + char *ix_name, ExplainState *es); +extern void eanalyze_counters_explain(OTableDescr *descr, + OEACallsCounters *counters, + ExplainState *es); + +extern int o_get_num_prefix_exact_keys(ScanKey scankey, int nscankeys); + +#endif /* __TABLEAM_INDEX_SCAN_H__ */ diff --git a/contrib/orioledb/include/tableam/key_range.h b/contrib/orioledb/include/tableam/key_range.h new file mode 100644 index 00000000000..b6dc380514a --- /dev/null +++ b/contrib/orioledb/include/tableam/key_range.h @@ -0,0 +1,76 @@ +/*------------------------------------------------------------------------- + * + * key_range.h + * Declarations of range of keys. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/tableam/key_range.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_KEY_RANGE_H__ +#define __TABLEAM_KEY_RANGE_H__ + +#include "tableam/descr.h" + +#define O_VALUE_BOUND_INCLUSIVE 0x01 +#define O_VALUE_BOUND_NULL 0x02 +#define O_VALUE_BOUND_UNBOUNDED 0x04 +#define O_VALUE_BOUND_LOWER 0x08 +#define O_VALUE_BOUND_UPPER 0x10 +#define O_VALUE_BOUND_COERCIBLE 0x20 +#define O_VALUE_BOUND_DIRECTIONS (O_VALUE_BOUND_LOWER | O_VALUE_BOUND_UPPER) +#define O_VALUE_BOUND_NO_VALUE (O_VALUE_BOUND_NULL | O_VALUE_BOUND_UNBOUNDED) +#define O_VALUE_BOUND_MINUS_INFINITY (O_VALUE_BOUND_LOWER | O_VALUE_BOUND_UNBOUNDED) +#define O_VALUE_BOUND_PLUS_INFINITY (O_VALUE_BOUND_UPPER | O_VALUE_BOUND_UNBOUNDED) +#define O_VALUE_BOUND_PLAIN_VALUE (O_VALUE_BOUND_LOWER | O_VALUE_BOUND_INCLUSIVE | O_VALUE_BOUND_COERCIBLE) + +typedef struct +{ + Datum value; + Oid type; + uint8 flags; + + /* + * We're going to do many comparisons between bound value and tuple + * values. It would be very slow to look up the comparator each time. + * So if types don't match, we do cache the comparator.
+ */ + OComparator *comparator; + OExclusionFn *exclusion_fn; +} OBTreeValueBound; + +typedef struct OBtreeRowKeyBound +{ + int nkeys; + int *keynums; + OBTreeValueBound *keys; +} OBtreeRowKeyBound; + +typedef struct +{ + int nkeys; + OBTreeValueBound keys[INDEX_MAX_KEYS]; + int n_row_keys; + OBtreeRowKeyBound *row_keys; +} OBTreeKeyBound; + +typedef struct +{ + bool empty; + OBTreeKeyBound low; + OBTreeKeyBound high; +} OBTreeKeyRange; + +extern bool o_key_data_to_key_range(OBTreeKeyRange *res, + ScanKeyData *keyData, + int numberOfKeys, + BTArrayKeyInfo *arrayKeys, + int numPrefixExactKeys, + int resultNKeys, + OIndexField *fields); + +#endif diff --git a/contrib/orioledb/include/tableam/operations.h b/contrib/orioledb/include/tableam/operations.h new file mode 100644 index 00000000000..e5e7cb91fab --- /dev/null +++ b/contrib/orioledb/include/tableam/operations.h @@ -0,0 +1,166 @@ +/*------------------------------------------------------------------------- + * + * operations.h + * Declarations of table-level operations + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/tableam/operations.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_OPERATIONS_H__ +#define __TABLEAM_OPERATIONS_H__ + +#include "btree/btree.h" +#include "btree/modify.h" +#include "c.h" +#include "catalog/o_tables.h" +#include "tableam/descr.h" +#include "tuple/slot.h" + +#include "access/tableam.h" +#include "nodes/execnodes.h" +#include "nodes/pathnodes.h" +#include "rewrite/rewriteHandler.h" + +/* + * Result of table modification functions. 
+ */ +typedef struct OTableModifyResult +{ + /* result of the modification */ + bool success; + /* a failed modification action */ + BTreeOperationType action; + /* an index number on which the modification action has been failed */ + OIndexNumber failedIxNum; + /* the modified tuple */ + TupleTableSlot *oldTuple; +} OTableModifyResult; + +typedef struct +{ + OTableDescr *desc; + TupleTableSlot *scanSlot; + OTableSlot *newSlot; + OXid conflictOxid; + OXid oxid; + CommitSeqNo csn; + UndoLocation tupUndoLocation; + OIndexNumber conflictIxNum; + bool copyPrimaryOxid; + RowLockMode lockMode; +} InsertOnConflictCallbackArg; + +typedef struct +{ + TupleTableSlot *scanSlot; + TupleTableSlot *tmpSlot; + OTableDescr *descr; + OTableSlot *newSlot; + OXid oxid; + CommitSeqNo csn; + UndoLocation tup_undo_location; + BTreeLeafTupleDeletedStatus deleted; + CommandId modifyCid; + CommandId tupleCid; + bool modified; + bool selfModified; + bool changingPart; + Bitmapset *keyAttrs; + int options; +} OModifyCallbackArg; + +typedef struct +{ + Relation rel; + TupleTableSlot *scanSlot; + OTableDescr *descr; + OXid oxid; + CommitSeqNo csn; + LockWaitPolicy waitPolicy; + UndoLocation tupUndoLocation; + BTreeLeafTupleDeletedStatus deleted; + CommandId modifyCid; + CommandId tupleCid; + bool wouldBlock; + bool modified; + bool selfModified; +} OLockCallbackArg; + +extern TupleTableSlot *o_tbl_insert(OTableDescr *descr, Relation relation, + TupleTableSlot *slot, OXid oxid, + CommitSeqNo csn); +extern TupleTableSlot *o_tbl_insert_with_arbiter(Relation rel, + OTableDescr *descr, + TupleTableSlot *slot, + List *arbiterIndexes, + CommandId cid, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + EState *estate, + ResultRelInfo *resultRelInfo); +extern OBTreeModifyResult o_tbl_index_insert(OTableDescr *descr, + OIndexDescr *id, + OTuple *own_tup, + TupleTableSlot *slot, + OXid oxid, CommitSeqNo csn, + BTreeModifyCallbackInfo *callbackInfo, + IndexUniqueCheck checkUnique); +extern 
OBTreeModifyResult o_tbl_lock(OTableDescr *descr, OBTreeKeyBound *pkey, + LockTupleMode mode, OXid oxid, + OLockCallbackArg *larg, + BTreeLocationHint *hint); +extern OTableModifyResult o_tbl_update(OTableDescr *descr, TupleTableSlot *slot, + OBTreeKeyBound *oldPkey, + Relation rel, OXid oxid, + CommitSeqNo csn, + BTreeLocationHint *hint, + OModifyCallbackArg *arg, + ItemPointer bridge_ctid); +extern OTableModifyResult o_update_secondary_index(OIndexDescr *id, + OIndexNumber ix_num, + bool new_valid, + bool old_valid, + TupleTableSlot *newSlot, + OTuple new_ix_tup, + TupleTableSlot *oldSlot, + OXid oxid, + CommitSeqNo csn, + IndexUniqueCheck checkUnique); +extern OTableModifyResult o_tbl_delete(Relation rel, + OTableDescr *descr, + OBTreeKeyBound *primary_key, + OXid oxid, + CommitSeqNo csn, + BTreeLocationHint *hint, + OModifyCallbackArg *arg); +extern OTableModifyResult o_tbl_index_delete(OIndexDescr *id, + OIndexNumber ix_num, + TupleTableSlot *slot, + OXid oxid, CommitSeqNo csn); +extern void o_check_tbl_update_mres(OTableModifyResult mres, + OTableDescr *descr, + Relation rel, + TupleTableSlot *slot); +extern void o_check_tbl_delete_mres(OTableModifyResult mres, + OTableDescr *descr, Relation rel); + +extern void set_pending_sk_marker(OTableDescr *descr, UndoLocation pkUndoLoc); +extern void set_pending_sk_marker_from_slot(UndoLocation pkUndoLoc, void *arg); +extern void set_pending_sk_marker_from_modify_arg(UndoLocation pkUndoLoc, + void *arg); +extern void fire_sk_modify_pending_stopevent(OTableDescr *descr); +extern void clear_pending_sk_marker(void); + +extern bool o_is_index_predicate_satisfied(OIndexDescr *idx, + TupleTableSlot *slot, + ExprContext *econtext); +extern void o_truncate_table(ORelOids oids, bool missingOK); +extern void o_apply_new_bridge_index_ctid(OTableDescr *descr, Relation relation, TupleTableSlot *slot, CommitSeqNo csn, bool increment_bridge_ctid); +extern int o_exclusion_cmp(OIndexDescr *id, OBTreeKeyBound *key1, OTuple *tuple2); + 
+#endif diff --git a/contrib/orioledb/include/tableam/scan.h b/contrib/orioledb/include/tableam/scan.h new file mode 100644 index 00000000000..5f2417f31f6 --- /dev/null +++ b/contrib/orioledb/include/tableam/scan.h @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * scan.h + * Scan Provider for orioledb tables. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/tableam/scan.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_SCAN_H__ +#define __TABLEAM_SCAN_H__ + +#include "postgres.h" + +#include "nodes/extensible.h" +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" +#include "optimizer/planner.h" + +typedef enum OPlanTag +{ + O_IndexPlan, + O_BitmapHeapPlan, +} OPlanTag; + +typedef struct OPlanState +{ + OPlanTag type; + PlanState *plan_state; +} OPlanState; + +typedef struct OCustomScanState +{ + CustomScanState css; + OEACallsCounters eaCounters; + OPlanState *o_plan_state; +} OCustomScanState; + +extern set_rel_pathlist_hook_type old_set_rel_pathlist_hook; + +extern void orioledb_set_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, + Index rti, RangeTblEntry *rte); +extern bool orioledb_set_plain_rel_pathlist_hook(PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte); + +extern bool is_o_custom_scan(CustomScan *scan); +extern bool is_o_custom_scan_state(CustomScanState *scan); + +#endif /* __TABLEAM_SCAN_H__ */ diff --git a/contrib/orioledb/include/tableam/toast.h b/contrib/orioledb/include/tableam/toast.h new file mode 100644 index 00000000000..93ceaca51fc --- /dev/null +++ b/contrib/orioledb/include/tableam/toast.h @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * toast.h + * Table-level declarations for orioledb TOAST implementation + * + * Copyright (c) 2021-2026, Oriole DB Inc. 
+ * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/tableam/toast.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_TOAST_H__ +#define __TABLEAM_TOAST_H__ + +#include "postgres.h" + +#include "btree/btree.h" +#include "tableam/descr.h" + +/* + * Table-level interface for working with orioledb TOAST with OTableDescr. + */ + +/* external function used by toast_fetch_datum() */ +extern struct varlena *o_detoast(struct varlena *attr); + +/* + * BTree functions. + */ + +/* Prints toast index tuple */ +extern void o_toast_key_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +/* Prints toast table tuple */ +extern void o_toast_tup_print(BTreeDescr *desc, StringInfo buf, + OTuple tup, Pointer arg); + +/* + * Useful definitions and functions which can be used by external code. + */ + +#ifdef WORDS_BIGENDIAN + +#define SET_TOAST_POINTER(PTR) \ + (*((uint8 *) (PTR)) = 0x80) +#define IS_TOAST_POINTER(PTR) \ + (*((uint8 *) (PTR)) == 0x80) + +#else /* !WORDS_BIGENDIAN */ + +#define SET_TOAST_POINTER(PTR) \ + (*((uint8 *) (PTR)) = 0x01) +#define IS_TOAST_POINTER(PTR) \ + (*((uint8 *) (PTR)) == 0x01) + +#endif /* WORDS_BIGENDIAN */ + +#endif /* __TABLEAM_TOAST_H__ */ diff --git a/contrib/orioledb/include/tableam/tree.h b/contrib/orioledb/include/tableam/tree.h new file mode 100644 index 00000000000..9b9c4713569 --- /dev/null +++ b/contrib/orioledb/include/tableam/tree.h @@ -0,0 +1,41 @@ +/*------------------------------------------------------------------------- + * + * tree.h + * Declarations for implementation of BTree interface for OrioleDB + * tables. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc.
+ * + * IDENTIFICATION + * contrib/orioledb/include/tableam/tree.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_TREE_H__ +#define __TABLEAM_TREE_H__ + +#include "tableam/descr.h" +#include "tableam/key_range.h" + +extern void index_btree_desc_init(BTreeDescr *desc, OCompress compress, int fillfactor, + ORelOids oids, OIndexType type, + char persistence, Oid tablespace, + OXid createOxid, void *arg); +extern uint32 o_hash_iptr(OIndexDescr *idx, ItemPointer iptr); +extern void o_fill_key_bound(OIndexDescr *id, OTuple tuple, + BTreeKeyType keyType, OBTreeKeyBound *bound); +extern void o_fill_bridge_index_key_bound(BTreeDescr *secondary, OTuple tuple, OBTreeKeyBound *bound); +extern void o_fill_pindex_tuple_key_bound(BTreeDescr *desc, + OTuple tup, OBTreeKeyBound *bound); +extern int o_idx_cmp_value_bounds(OBTreeValueBound *bound1, + OBTreeValueBound *bound2, + OIndexField *field, + bool *equal); +extern int o_idx_cmp(BTreeDescr *desc, + void *p1, BTreeKeyType keyType1, + void *p2, BTreeKeyType keyType2); +extern int o_idx_cmp_range_key_to_value(OBTreeValueBound *sk1, OIndexField *field, + Datum value, bool isnull); + +#endif /* __TABLEAM_TREE_H__ */ diff --git a/contrib/orioledb/include/tableam/vacuum.h b/contrib/orioledb/include/tableam/vacuum.h new file mode 100644 index 00000000000..82d8654ff1b --- /dev/null +++ b/contrib/orioledb/include/tableam/vacuum.h @@ -0,0 +1,24 @@ +/*------------------------------------------------------------------------- + * + * vacuum.h + * Declarations for vacuuming bridged indexes of OrioleDB + * tables. + * + * Copyright (c) 2025-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc.
+ * + * IDENTIFICATION + * contrib/orioledb/include/tableam/vacuum.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TABLEAM_VACUUM_H__ +#define __TABLEAM_VACUUM_H__ + +#include "access/tableam.h" + +extern void orioledb_vacuum_bridged_indexes(Relation rel, OTableDescr *descr, + struct VacuumParams *params, + BufferAccessStrategy bstrategy); + +#endif /* __TABLEAM_VACUUM_H__ */ diff --git a/contrib/orioledb/include/transam/oxid.h b/contrib/orioledb/include/transam/oxid.h new file mode 100644 index 00000000000..ce83eec7841 --- /dev/null +++ b/contrib/orioledb/include/transam/oxid.h @@ -0,0 +1,241 @@ +/*------------------------------------------------------------------------- + * + * oxid.h + * Declarations for transaction management routines. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/transam/oxid.h + * + *------------------------------------------------------------------------- + */ +#ifndef __OXID_H__ +#define __OXID_H__ + +#include "storage/lmgr.h" + +typedef struct +{ + pg_atomic_uint64 csn; + pg_atomic_uint64 commitPtr; +} OXidMapItem; + +typedef struct +{ + pg_atomic_uint64 nextXid; + pg_atomic_uint64 lastXidWhenUpdatedGlobalXmin; + pg_atomic_uint64 runXmin; + pg_atomic_uint64 globalXmin; + + pg_atomic_uint64 writeInProgressXmin; + pg_atomic_uint64 writtenXmin; + pg_atomic_uint64 checkpointRetainXmin; + pg_atomic_uint64 checkpointRetainXmax; + pg_atomic_uint64 cleanedXmin; + pg_atomic_uint64 cleanedCheckpointXmin; + pg_atomic_uint64 cleanedCheckpointXmax; + + slock_t xminMutex; + + int xidMapTrancheId; + LWLock xidMapWriteLock; + + /* + * sysXidUndoLocationChangeCount with locks are used for caching in + * read_replication_catalog_retain_undo_location() + */ + int sysXidUndoLocationTrancheId; + LWLock sysXidUndoLocationLock; + uint32 sysXidUndoLocationChangeCount; + +} XidMeta; + +extern XidMeta *xid_meta; + 
+typedef struct +{ + TransactionId xid; /* a 32-bit transaction id to be used during + * logical decoding */ + bool useHeap; /* flag indicates if current logical xid was + * allocated when heap xid has been already + * set */ +} LogicalXidCtx; + +typedef struct OSnapshot +{ + CommitSeqNo csn; + XLogRecPtr xlogptr; + XLogRecPtr xmin; + CommandId cid; +} OSnapshot; + +/* + * OTableFetchContext + * + * Encapsulates MVCC visibility context used to fetch relation and index + * descriptors from OrioleDB system catalogs: SYS_TREES_O_TABLES & SYS_TREES_O_INDICES. + * + * The context combines: + * - snapshot: defines transactional visibility rules (xmin/csn/cid/xlogptr) + * - version: explicit schema version of the relation to be fetched (possibly from tuple-level undo chain) + * + * This allows callers to retrieve a descriptor corresponding to a specific + * catalog version as visible at a given snapshot, which is required during + * logical decoding, recovery, and other multi-version catalog access paths. + */ +typedef struct +{ + OSnapshot *snapshot; + uint32 version; +} OTableFetchContext; + +static inline OTableFetchContext +build_fetch_context(OSnapshot *snapshot, uint32 version) +{ + OTableFetchContext ctx = {.snapshot = snapshot,.version = version}; + + return ctx; +} + +extern OSnapshot o_in_progress_snapshot; +extern OSnapshot o_non_deleted_snapshot; + +/* + * orioledb.serializable GUC modes. Selects how OrioleDB handles a + * client request for SERIALIZABLE isolation, since OrioleDB does not + * implement SSI predicate locking. 
+ */ +typedef enum OSerializableMode +{ + O_SERIALIZABLE_TABLE_LOCK, /* coarse ExclusiveLock per table (default) */ + O_SERIALIZABLE_ERROR, /* reject with ERRCODE_FEATURE_NOT_SUPPORTED */ + O_SERIALIZABLE_REPEATABLE_READ /* treat OrioleDB tables as REPEATABLE + * READ */ +} OSerializableMode; + +extern int orioledb_serializable_mode; + +static inline void +o_check_isolation_level(void) +{ + if (XactIsoLevel != XACT_SERIALIZABLE) + return; + + switch ((OSerializableMode) orioledb_serializable_mode) + { + case O_SERIALIZABLE_TABLE_LOCK: + /* Locks are taken in o_serializable_lock_relation(). */ + return; + case O_SERIALIZABLE_ERROR: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("orioledb does not support SERIALIZABLE isolation level"), + errdetail("orioledb.serializable is set to \"error\"."), + errhint("Set orioledb.serializable to 'table_lock' or 'repeatable_read' to enable SERIALIZABLE for OrioleDB tables."))); + break; + case O_SERIALIZABLE_REPEATABLE_READ: + + /* + * Silent downgrade for OrioleDB tables only. OrioleDB's snapshot + * path is CSN-based and per-transaction stable, so accessing an + * OrioleDB relation under XACT_SERIALIZABLE already yields + * REPEATABLE READ semantics; we just skip the table-lock and the + * error. XactIsoLevel is left untouched because PG's SSI + * machinery (for any heap table accessed in the same transaction) + * requires it to stay consistent with MySerializableXact for the + * lifetime of the xact. + */ + return; + } +} + +/* + * SERIALIZABLE table-level lock (orioledb.serializable = 'table_lock'). + * + * OrioleDB doesn't implement SSI; instead, every relation accessed by a + * SERIALIZABLE transaction is protected with a heavyweight ExclusiveLock + * on its OID. 
ExclusiveLock conflicts with RowExclusiveLock and with + * itself, so: + * + * - two SERIALIZABLE transactions touching the same table block on + * each other (lockmgr serializes them); + * - any non-SERIALIZABLE writer (RowExclusiveLock) on the same table + * blocks against an in-flight SERIALIZABLE xact; + * - non-SERIALIZABLE readers (AccessShareLock) are unaffected. + * + * Called from scan/insert/update/delete entry points; the lock is + * released at xact end through PG's normal lock-release machinery. + * Inlined so the fast path (not SERIALIZABLE) is a single branch. + */ +static inline void +o_serializable_lock_relation(Oid relid) +{ + if (XactIsoLevel != XACT_SERIALIZABLE || !OidIsValid(relid)) + return; + if (orioledb_serializable_mode != O_SERIALIZABLE_TABLE_LOCK) + return; + + LockRelationOid(relid, ExclusiveLock); +} + +#define O_LOAD_SNAPSHOT(o_snapshot, snapshot) \ + do { \ + o_check_isolation_level(); \ + (o_snapshot)->xmin = (snapshot)->csnSnapshotData.xmin; \ + (o_snapshot)->csn = (snapshot)->csnSnapshotData.snapshotcsn; \ + (o_snapshot)->xlogptr = (snapshot)->csnSnapshotData.xlogptr; \ + (o_snapshot)->cid = (snapshot)->curcid; \ + } while (false) + +#define O_LOAD_SNAPSHOT_CSN(o_snapshot, csnValue) \ + do { \ + o_check_isolation_level(); \ + (o_snapshot)->xmin = 0; \ + (o_snapshot)->csn = (csnValue); \ + (o_snapshot)->xlogptr = InvalidXLogRecPtr; \ + (o_snapshot)->cid = 0; \ + } while (false) + +#define XLOG_PTR_ALIGN(ptr) ((ptr) + ((ptr) & 1)) + +extern void oxid_subxact_callback(SubXactEvent event, SubTransactionId mySubid, SubTransactionId parentSubid, void *arg); + +extern Size oxid_shmem_needs(void); +extern void oxid_init_shmem(Pointer ptr, bool found); +extern bool wait_for_oxid(OXid oxid, bool errorOk); +extern void oxid_notify(OXid oxid); +extern void oxid_notify_all(void); +extern void advance_oxids(OXid new_xid); +extern OXid get_current_oxid(void); +extern void assign_subtransaction_logical_xid(void); +extern void set_oxid_csn(OXid 
oxid, CommitSeqNo csn); +extern void set_oxid_xlog_ptr(OXid oxid, XLogRecPtr ptr); +extern void set_current_oxid(OXid oxid); +extern void set_current_logical_xid(LogicalXidCtx *in); +extern void parallel_worker_set_oxid(void); +extern void reset_current_oxid(void); +extern OXid get_current_oxid_if_any(void); +extern TransactionId get_current_logical_xid(void); +extern void get_current_logical_xid_ctx(LogicalXidCtx *output); +extern void current_oxid_precommit(void); +extern void current_oxid_xlog_precommit(void); +extern void current_oxid_commit(CommitSeqNo csn); +extern void current_oxid_clear_committing(void); +extern void current_oxid_abort(void); +extern CommitSeqNo oxid_get_csn(OXid oxid, bool getRawCsn); +extern XLogRecPtr oxid_get_xlog_ptr(OXid oxid); +extern void oxid_match_snapshot(OXid oxid, OSnapshot *snapshot, + CommitSeqNo *outCsn, XLogRecPtr *outPtr); +extern void fill_current_oxid_osnapshot(OXid *oxid, OSnapshot *snapshot); +extern void fill_current_oxid_osnapshot_no_check(OXid *oxid, + OSnapshot *snapshot); +extern int oxid_get_procnum(OXid oxid); +extern bool xid_is_finished(OXid xid); +extern bool xid_is_finished_for_everybody(OXid xid); +extern void fsync_xidmap_range(OXid xmin, OXid xmax, uint32 wait_event_info); +extern void clear_rewind_oxid(OXid oxid); +extern bool csn_is_retained_for_rewind(CommitSeqNo csn); + +#endif /* __OXID_H__ */ diff --git a/contrib/orioledb/include/transam/undo.h b/contrib/orioledb/include/transam/undo.h new file mode 100644 index 00000000000..36d89d7ea95 --- /dev/null +++ b/contrib/orioledb/include/transam/undo.h @@ -0,0 +1,463 @@ +/*------------------------------------------------------------------------- + * + * undo.h + * Declarations of undo log routines. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/transam/undo.h + * + *------------------------------------------------------------------------- + */ +#ifndef __UNDO_H__ +#define __UNDO_H__ + +typedef struct +{ + /*--- + * lastUsedLocation - max undo location actually used (within in-memory undo + * buffer) + * advanceReservedLocation - undo location for reservation of future + * records + * minProcReservedLocation - min undo location, where forming undo records + * might exist + * minProcTransactionRetainLocation - min undo location retained by + * transaction + * minProcRetainLocation - min undo location retained by transaction or + * snapshot + * writeInProgressLocation - writing to this location is currently + * in-progress + * writtenLocation - already written to this location + * cleanedLocation - files behind this location are already cleaned + * + * cleanedLocation <= minProcRetainLocation <= minProcTransactionRetainLocation <= minProcReservedLocation + * cleanedLocation <= writtenLocation <= writeInProgressLocation <= minProcReservedLocation + * minProcReservedLocation <= lastUsedLocation <= advanceReservedLocation + * + * writtenLocation + undo_circular_buffer_size >= lastUsedLocation + * + * [checkpointRetainStartLocation; checkpointRetainEndLocation) -- range of + * undo locations required for recovery from the checkpoint. + */ + + /*----- + * lastUsedLocation is a position within a RAM undo log buffer, + * representing a boundary between two areas in the undo log buffer: + * (1) reserved area - location range in undo log buffer, already + * reserved by (granted to) some backends for writing; backends are + * writing their data to this location range at the moment. + * (2) ready-for-reservation area is a pre-reserved, free location range + * in undo log buffer, ready to be reserved (occupied) by any backend. + * + * lastUsedLocation is the first position in the RAM undo log buffer + * from which the ready-for-reservation area begins.
+ * + * RAM undo log buffer visualization: + * <- -----------------|------------------------------ -> + * | + * \ + * lastUsedLocation + * + * lastUsedLocation gets increased in the get_undo_record() method only. + * + * Basic algorithm used by each backend for obtaining a free undo log + * location for an undo record: + * - call get_undo_record() + * - call set_my_reserved_location() + * - read current lastUsedLocation - the first location available + * for the reservation at the moment, + * - set up the per-process undo locations: reservedUndoLocation + * and transactionUndoRetainLocation, from read lastUsedLocation, + * - advance shared meta->lastUsedLocation by the value of the + * reservation size. + */ + pg_atomic_uint64 lastUsedLocation; + + /* + * advanceReservedLocation is used for preliminary reservation of RAM undo + * log buffer's free locations, ready to be obtained by backends. + * advanceReservedLocation is the top value of all monotonically + * increasing undo log buffer locations. advanceReservedLocation gets + * increased in reserve_undo_size_extended() method only. + * + * Pre-reservation must be performed well in advance before the actual + * obtaining (reserving) of undo log locations, because of eviction + * overhead in case of undo log buffer overflow. + * + * reserve_undo_size_extended() method may trigger an eviction process in + * case of undo log buffer overflow and waitForUndoLocation == true, + * otherwise if waitForUndoLocation == false and there is no place in a + * buffer - revert modifications on advanceReservedLocation and return + * failure. + */ + pg_atomic_uint64 advanceReservedLocation; + + /*----- + * Eviction of undo log from the RAM buffer to the file range on disk is + * performed by: + * - background writer, + * - any process during reserve_undo_size_extended(). + * + * Eviction metadata is represented by an interval: (writtenLocation, + * writeInProgressLocation].
writtenLocation is the last location that has + * already been successfully evicted to the file. writeInProgressLocation + * is the last location where eviction is still in-progress. The location + * range between writtenLocation and writeInProgressLocation means the + * area which is currently being evicted to files, i.e., write-to-file + * operation for this area is still in-progress. When writtenLocation + * equals writeInProgressLocation, then there is no in-progress + * eviction process at the moment. + */ + pg_atomic_uint64 writeInProgressLocation; + pg_atomic_uint64 writtenLocation; + + /* + * lastUsedUndoLocationWhenUpdatedMinLocation is modified by the + * update_min_undo_locations() method only and represents the last actual + * lastUsedLocation that has been seen during update_min_undo_locations(). + * lastUsedUndoLocationWhenUpdatedMinLocation is used by + * orioledb_snapshot_hook() for determining whether it is necessary to call + * update_min_undo_locations() method to actualize shared meta's locations. + * NOTE: The update_min_undo_locations() method is called each time after + * 1/10 of the undo log is passed. NOTE: location values in shared meta + * may lag behind each process's actual undoRetainLocations. + */ + pg_atomic_uint64 lastUsedUndoLocationWhenUpdatedMinLocation; + + /* + * minProcTransactionRetainLocation is used for transaction rollback. It + * is a minimum (among all transactions) location that is needed for + * rollback. + * + * minProcRetainLocation - is a minimum (among all snapshots) location + * that is needed for any of the active snapshots. + * + * minRewindRetainLocation is used for the rewind mechanism. The rewind + * mechanism allows rolling back all recent transactions, i.e., allows + * moving a database to a previous time point. In the case of rewind, the + * undo log is retained for a longer period of time to provide recovery + * for more remote timepoints.
+ */ + pg_atomic_uint64 minProcTransactionRetainLocation; + pg_atomic_uint64 minProcRetainLocation; + pg_atomic_uint64 minRewindRetainLocation; + + /* + * minProcReservedLocation is a minimum location (among all backends) + * within the RAM undo log buffer that is actually reserved (obtained) by + * a backend for writing its undo log record to a RAM undo log buffer. The + * process must retain its reservedUndoLocation only while performing a + * write operation to a RAM undo log buffer. When the process finishes the + * write operation for an undo log record to the RAM buffer, it must + * release its reservedUndoLocation as soon as possible. + */ + pg_atomic_uint64 minProcReservedLocation; + + /* + * [checkpointRetainStartLocation; checkpointRetainEndLocation) represents + * the undo range retained for the checkpoint. This range allows the + * rollback of any transaction that was in-progress during checkpointing. + */ + pg_atomic_uint64 checkpointRetainStartLocation; + pg_atomic_uint64 checkpointRetainEndLocation; + + /* + * cleanedLocation - the value of minRetainLocation at the moment of last + * cleanup. + * + * Range [cleanedCheckpointStartLocation, cleanedCheckpointEndLocation] + * means an undo log range, which has been retained during the last + * cleanup, i.e., the last undo log range that is persisted on disk after + * the last cleanup. + * + * cleanedCheckpointStartLocation - the value of + * checkpointRetainStartLocation during the last cleanup. + * cleanedCheckpointEndLocation - value of checkpointRetainEndLocation + * during the last cleanup. 
+ */ + pg_atomic_uint64 cleanedLocation; + pg_atomic_uint64 cleanedCheckpointStartLocation; + pg_atomic_uint64 cleanedCheckpointEndLocation; + + /* + * minUndoLocationsMutex is primarily used by update_min_undo_locations() + * method, also is used by evict_undo_to_disk() method to protect shared + * meta' fields, but is released for eviction writes; also is used in some + * cases to protect shared meta' retain* locations and write/written + * locations. + */ + slock_t minUndoLocationsMutex; + + /* + * minUndoLocationsChangeCount gets increased by + * update_min_undo_locations() method. minUndoLocationsChangeCount is used + * together with wait_for_even_min_undo_locations_changecount() method to + * fix concurrency between update_min_undo_locations() method and + * set_my_reserved_location() & set_my_retain_location() methods. + */ + uint32 minUndoLocationsChangeCount; + + /* + * sysXidUndoLocationChangeCount is a counter used for caching in + * read_replication_catalog_retain_undo_location(). When trying to use + * cached value read_replication_catalog_retain_undo_location() checks if + * this counter is not modified since last call by a concurrent + * insert_replication_catalog_retain_undo_location(), so that it will use + * cached last value without reading actual system tree. + */ + uint32 sysXidUndoLocationChangeCount; + + /* + * writeInProgressChangeCount is used together with a + * wait_for_even_write_in_progress_changecount() method to fix concurrency + * between undo_write() and evict_undo_to_disk() methods (protects shared + * meta' write/written locations). + */ + uint32 writeInProgressChangeCount; + + int undoWriteTrancheId; /* tranche-group ID of the LWLock */ + + /* + * undoWriteLock is an LWLock that is used for protecting disk writes + * (during the eviction process, evict_undo_to_disk()) or for await + * process on in-progress writes. 
+ */ + LWLock undoWriteLock; + + int undoStackLocationsFlushLockTrancheId; +} UndoMeta; + +typedef struct +{ + int pendingTruncatesTrancheId; + LWLock pendingTruncatesLock; + uint64 pendingTruncatesLocation; +} PendingTruncatesMeta; + +typedef struct UndoStackItem UndoStackItem; + +typedef enum +{ + ModifyUndoItemType = 1, + RowLockUndoItemType, + RelnodeUndoItemType, + SysTreesLockUndoItemType, + InvalidateUndoItemType, + BranchUndoItemType, + SubXactUndoItemType, + RewindRelFileNodeUndoItemType, + SysCacheDeleteUndoItemType, + InvalidateComparatorUndoItemType, +} UndoItemType; + +struct UndoStackItem +{ + UndoLocation prev; + LocationIndex itemSize; + uint8 type; + uint8 indexType; +}; + +typedef struct +{ + UndoStackItem base; + UndoLocation onCommitLocation; +} OnCommitUndoStackItem; + +typedef struct +{ + UndoLocation location; + UndoLocation branchLocation; + UndoLocation subxactLocation; + UndoLocation onCommitLocation; +} UndoStackLocations; + +typedef struct +{ + bool needs_wal_flush; + bool has_retained_undo_location[(int) UndoLogsCount]; + bool local_wal_has_material_changes; + XLogRecPtr saved_xidless_commit_lsn; /* During autonomous xact + * xidless_commit_lsn should be + * invalidated. Save it before and + * restore it after autonomous + * xact */ + OXid oxid; + LogicalXidCtx logicalXidContext; +} OAutonomousTxState; + +/* + * Branch undo record: when we apply part of undo (for instance, when we do + * rollback to the savepoint), we still memorize the "long" undo path in the + * "branch" undo record. + */ +typedef struct +{ + UndoStackItem header; + UndoLocation longPathLocation; + UndoLocation prevBranchLocation; +} BranchUndoStackItem; + +/* + * Subxact undo record: memorized undo location for rollback in the future.
+ */ +typedef struct +{ + UndoStackItem header; + UndoLocation prevSubLocation; + SubTransactionId parentSubid; +} SubXactUndoStackItem; + +typedef enum +{ + UndoStackFull, + UndoStackHead, + UndoStackTail +} UndoStackKind; + +/* + * Stage argument passed to every UndoCallback invocation. + * + * OUndoCallbackStageAbort - transaction is being rolled back; the + * callback should undo whatever the + * transaction did. + * + * OUndoCallbackStagePreCommit - called for callOnCommit items before the + * commit WAL record is written. Use this + * stage for durability work that must + * precede the WAL write (e.g. fsync of new + * data files) so that a crash between the + * WAL write and a later fsync cannot leave + * committed data unsynced. + * + * OUndoCallbackStageCommit - transaction has committed and the WAL + * record is already written; the callback + * should perform post-commit cleanup (e.g. + * drop obsolete files). + */ +typedef enum +{ + OUndoCallbackStageAbort, + OUndoCallbackStagePreCommit, + OUndoCallbackStageCommit, +} OUndoCallbackStage; + +extern bool oxid_needs_wal_flush; +extern UndoLocation curRetainUndoLocations[(int) UndoLogsCount]; +extern PendingTruncatesMeta *pending_truncates_meta; + +#define ORIOLEDB_UNDO_DATA_ROW_FILENAME_TEMPLATE (ORIOLEDB_UNDO_DIR "/%02X%08Xrow") +#define ORIOLEDB_UNDO_DATA_PAGE_FILENAME_TEMPLATE (ORIOLEDB_UNDO_DIR "/%02X%08Xpage") +#define ORIOLEDB_UNDO_SYSTEM_FILENAME_TEMPLATE (ORIOLEDB_UNDO_DIR "/%02X%08Xsystem") +#define UNDO_FILE_SIZE (0x4000000) + +#define UNDO_REC_EXISTS(undoType, location) ((location) >= pg_atomic_read_u64(enable_rewind ? 
&get_undo_meta_by_type((undoType))->minRewindRetainLocation : &get_undo_meta_by_type((undoType))->minProcRetainLocation) || \ + ((location) >= pg_atomic_read_u64(&get_undo_meta_by_type((undoType))->checkpointRetainStartLocation) && \ + (location) < pg_atomic_read_u64(&get_undo_meta_by_type((undoType))->checkpointRetainEndLocation))) +#define UNDO_REC_XACT_RETAIN(undoType, location) ((location) >= pg_atomic_read_u64(&get_undo_meta_by_type((undoType))->minProcTransactionRetainLocation)) +#define GET_CUR_UNDO_STACK_LOCATIONS(undoType) (AssertMacro(MYPROCNUMBER >= 0 && MYPROCNUMBER < max_procs), \ + AssertMacro((int) (undoType) >= 0 && (int) (undoType) < (int) UndoLogsCount), \ + &oProcData[MYPROCNUMBER].undoStackLocations[oProcData[MYPROCNUMBER].autonomousNestingLevel][(int) (undoType)]) + +extern Size undo_shmem_needs(void); +extern void undo_shmem_init(Pointer buf, bool found); +extern UndoMeta *get_undo_meta_by_type(UndoLogType undoType); +extern const char *get_undo_type_name(UndoLogType undoType); + +extern void update_min_undo_locations(UndoLogType undoType, + bool undoEviction, + bool do_cleanup); +extern void evict_undo_to_disk(UndoLogType undoType, + UndoLocation targetUndoLocation, + UndoLocation minProcReservedLocation, + bool attempt); +extern bool reserve_undo_size_extended(UndoLogType type, Size size, + bool waitForUndoLocation); +extern void steal_reserved_undo_size(UndoLogType type, Size size); +extern void giveup_reserved_undo_size(UndoLogType type); +extern void fsync_undo_range(UndoLogType undoType, + UndoLocation fromLoc, UndoLocation toLoc, + uint32 wait_event_info); +extern Pointer get_undo_record(UndoLogType undoType, UndoLocation *undoLocation, + Size size); +extern Pointer get_undo_record_unreserved(UndoLogType type, + UndoLocation *undoLocation, + Size size); +extern Size get_reserved_undo_size(UndoLogType undoType); +extern void release_undo_size(UndoLogType undoType); +extern void release_reserved_undo_location(UndoLogType undoType); 
+extern void add_new_undo_stack_item(UndoLogType undoType, + UndoLocation location); +extern UndoLocation get_subxact_undo_location(UndoLogType undoType); +extern void add_new_undo_stack_item_to_process(UndoLogType undoType, + UndoLocation location, + int pgprocno, + int autonomousNestingLevel); +extern void read_shared_undo_locations(UndoStackLocations *to, UndoStackSharedLocations *from); +extern void write_shared_undo_locations(UndoStackSharedLocations *to, UndoStackLocations *from); +extern void get_cur_undo_locations(UndoStackLocations *locations, + UndoLogType undoType); +extern void set_cur_undo_locations(UndoLogType undoType, + UndoStackLocations locations); +extern void reset_cur_undo_locations(void); +extern XLogRecPtr orioledb_get_xidless_commit_lsn(bool *wrote_xlog); +extern void undo_xact_callback(XactEvent event, void *arg); +extern void undo_subxact_callback(SubXactEvent event, SubTransactionId mySubid, + SubTransactionId parentSubid, void *arg); +extern bool have_current_undo(UndoLogType undoType); +extern void apply_undo_branches(UndoLogType undoType, OXid oxid); +extern void apply_undo_stack(UndoLogType undoType, OXid oxid, + UndoStackLocations *toLocation, + bool changeCountsValid); +extern void precommit_undo_stack(UndoLogType undoType, OXid oxid, + bool changeCountsValid); +extern void on_commit_undo_stack(UndoLogType undoType, OXid oxid, + bool changeCountsValid); +extern void free_retained_undo_location(UndoLogType undoType); +extern void start_autonomous_transaction(OAutonomousTxState *state); +extern void abort_autonomous_transaction(OAutonomousTxState *state); +extern void finish_autonomous_transaction(OAutonomousTxState *state); +extern void undo_read(UndoLogType undoType, UndoLocation location, + Size size, Pointer buf); +extern bool undo_read_if_exists(UndoLogType undoType, UndoLocation location, + Size size, Pointer buf); +extern void undo_write(UndoLogType undoType, UndoLocation location, + Size size, Pointer buf); +extern bool 
undo_write_if_exists(UndoLogType undoType, UndoLocation location, + Size size, Pointer buf); +extern void undo_snapshot_register_hook(Snapshot snapshot); +extern void undo_snapshot_deregister_hook(Snapshot snapshot); +extern void orioledb_snapshot_hook(Snapshot snapshot); +extern void add_subxact_undo_item(SubTransactionId parentSubid); +extern void rollback_to_savepoint(UndoLogType undoType, + UndoStackKind kind, + SubTransactionId parentSubid, + bool changeCountsValid); +extern bool undo_type_has_retained_location(UndoLogType undoType); +extern bool have_retained_undo_location(void); +extern UndoLocation get_snapshot_retained_undo_location(UndoLogType undoType); +extern UndoLocation set_my_snapshot_retain_location(UndoLogType undoType); +extern void clear_my_snapshot_retain_location(UndoLogType undoType); +extern void orioledb_reset_xmin_hook(void); +extern void o_add_rewind_relfilenode_undo_item(RelFileNode *onCommit, + RelFileNode *onAbort, + int nOnCommit, int nOnAbort); + +static inline void +reserve_undo_size(UndoLogType type, Size size) +{ + (void) reserve_undo_size_extended(type, size, true); +} + +extern void reset_command_undo_locations(void); +extern CommandId undo_location_get_command(UndoLocation location); +extern UndoLocation current_command_get_undo_location(void); +extern void update_command_undo_location(CommandId commandId, + UndoLocation undoLocation); +extern void o_set_current_command(CommandId commandId); +extern CommandId o_get_current_command(void); +extern UndoLocation get_current_replication_catalog_retain_undo_location(void); + +#endif /* __UNDO_H__ */ diff --git a/contrib/orioledb/include/tuple/format.h b/contrib/orioledb/include/tuple/format.h new file mode 100644 index 00000000000..2b404ad43be --- /dev/null +++ b/contrib/orioledb/include/tuple/format.h @@ -0,0 +1,214 @@ +/*------------------------------------------------------------------------- + * + * format.h + * Declarations for orioledb tuple format. 
+ * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/tuple/format.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TUPLE_FORMAT_H__ +#define __TUPLE_FORMAT_H__ + +#include "postgres.h" + +typedef struct +{ + TupleDesc desc; + char *tp; + bits8 *bp; + uint32 off; + uint16 attnum; + uint16 natts; + bool hasnulls; + bool slow; +} OTupleReaderState; + +typedef struct +{ + uint16 hasnulls:1, + len:15; + uint16 natts; + uint32 version; +} OTupleHeaderData; + +#define O_TUPLE_FLAGS_FIXED_FORMAT 0x1 + +typedef struct +{ + uint16 natts; + uint16 len; +} OTupleFixedFormatSpec; + +typedef OTupleHeaderData *OTupleHeader; +#define SizeOfOTupleHeader MAXALIGN(sizeof(OTupleHeaderData)) + +typedef struct BrigeData +{ + bool is_pkey; + ItemPointer bridge_iptr; + /* compared with InvalidAttrNumber, so should be greater than 0 */ + AttrNumber attnum; +} BrigeData; + +/* + * Works with orioledb table tuples in primary index. It can fetch + * TOAST pointers from table tuple. + */ +#define o_fastgetattr(tup, attnum, tupleDesc, spec, isnull) \ +( \ + AssertMacro((attnum) > 0), \ + (*(isnull) = false), \ + ((tup).formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT) ? \ + ( \ + ((attnum) - 1 < (spec)->natts) ? \ + ( \ + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ? \ + ( \ + fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1), \ + (char *) (tup).data + \ + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff) \ + ) \ + : \ + o_toast_nocachegetattr((tup), (attnum), (tupleDesc), (spec), (isnull)) \ + ) \ + : \ + ( \ + (*(isnull) = true), \ + (Datum) NULL \ + ) \ + ) \ + : \ + ( \ + (!(((OTupleHeader) (tup).data)->hasnulls)) ? \ + ( \ + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ? 
\ + ( \ + fetchatt(TupleDescAttr((tupleDesc), (attnum)-1), \ + (char *) (tup).data + SizeOfOTupleHeader + \ + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff) \ + ) \ + : \ + o_toast_nocachegetattr((tup), (attnum), (tupleDesc), (spec), (isnull)) \ + ) \ + : \ + ( \ + att_isnull((attnum) - 1, (bits8 *) ((tup).data + SizeOfOTupleHeader)) ? \ + ( \ + (*(isnull) = true), \ + (Datum) NULL \ + ) \ + : \ + ( \ + o_toast_nocachegetattr((tup), (attnum), (tupleDesc), (spec), (isnull)) \ + ) \ + ) \ + ) \ +) + +#define o_fastgetattr_ptr(tup, attnum, tupleDesc, spec) \ +( \ + AssertMacro((attnum) > 0), \ + ((tup).formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT) ? \ + ( \ + ((attnum) - 1 < (spec)->natts) ? \ + ( \ + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ? \ + ( \ + (char *) (tup).data + \ + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff \ + ) \ + : \ + o_toast_nocachegetattr_ptr((tup), (attnum), (tupleDesc), (spec)) \ + ) \ + : \ + ( \ + NULL \ + ) \ + ) \ + : \ + ( \ + (!(((OTupleHeader) (tup).data)->hasnulls)) ? \ + ( \ + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ? \ + ( \ + (char *) (tup).data + SizeOfOTupleHeader + \ + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff \ + ) \ + : \ + o_toast_nocachegetattr_ptr((tup), (attnum), (tupleDesc), (spec)) \ + ) \ + : \ + ( \ + att_isnull((attnum) - 1, (bits8 *) ((tup).data + SizeOfOTupleHeader)) ? \ + ( \ + NULL \ + ) \ + : \ + ( \ + o_toast_nocachegetattr_ptr((tup), (attnum), (tupleDesc), (spec)) \ + ) \ + ) \ + ) \ +) + +#define o_tuple_size(tup, spec) \ +( \ + ((tup).formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT) ? \ + ( \ + (spec)->len \ + ) \ + : \ + ( \ + ((OTupleHeader) (tup).data)->len \ + ) \ +) + +#define o_has_nulls(tup) \ +( \ + ((tup).formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT) ? 
\ + ( \ + false \ + ) \ + : \ + ( \ + ((OTupleHeader) (tup).data)->hasnulls \ + ) \ +) + +extern void o_tuple_init_reader(OTupleReaderState *state, OTuple tuple, + TupleDesc desc, OTupleFixedFormatSpec *spec); +extern Datum o_tuple_read_next_field(OTupleReaderState *state, bool *isnull); +extern uint32 o_tuple_next_field_offset(OTupleReaderState *state, + Form_pg_attribute att); +extern ItemPointer o_tuple_get_last_iptr(TupleDesc desc, + OTupleFixedFormatSpec *spec, + OTuple tuple, bool *isnull); +extern Datum o_toast_nocachegetattr(OTuple tuple, int attnum, + TupleDesc tupleDesc, + OTupleFixedFormatSpec *spec, + bool *is_null); +extern Pointer o_toast_nocachegetattr_ptr(OTuple tuple, int attnum, + TupleDesc tupleDesc, + OTupleFixedFormatSpec *spec); +extern Pointer o_tuple_get_data(OTuple tuple, int *size, OTupleFixedFormatSpec *spec); +extern Size o_new_tuple_size(TupleDesc tupleDesc, OTupleFixedFormatSpec *spec, + ItemPointer iptr, BrigeData *bridge_data, uint32 version, + Datum *values, bool *isnull, char *to_toast); +extern void o_tuple_fill(TupleDesc tupleDesc, OTupleFixedFormatSpec *spec, + OTuple *tuple, Size tuple_size, + ItemPointer iptr, BrigeData *bridge_data, uint32 version, + Datum *values, bool *isnull, char *to_toast); +extern OTuple o_form_tuple(TupleDesc tupleDesc, OTupleFixedFormatSpec *spec, + uint32 version, Datum *values, bool *isnull, + BrigeData *bridge_data); +extern uint32 o_tuple_get_version(OTuple tuple); +extern void o_tuple_set_version(OTupleFixedFormatSpec *spec, OTuple *tuple, + uint32 version); +extern void o_tuple_set_ctid(OTuple tuple, ItemPointer iptr); + +#endif /* __TUPLE_FORMAT_H__ */ diff --git a/contrib/orioledb/include/tuple/slot.h b/contrib/orioledb/include/tuple/slot.h new file mode 100644 index 00000000000..bd92d575241 --- /dev/null +++ b/contrib/orioledb/include/tuple/slot.h @@ -0,0 +1,97 @@ +/*------------------------------------------------------------------------- + * + * slot.h + * Declarations for orioledb tuple 
slot implementation + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/tuple/slot.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TUPLE_SLOT_H__ +#define __TUPLE_SLOT_H__ + +#include "postgres.h" +#include "executor/tuptable.h" + +#include "tableam/key_range.h" +#include "tuple/format.h" + +typedef struct OTableSlot +{ + TupleTableSlot base; + + char *data; /* data for materialized slots */ + char *to_toast; + bool *vfree; + Datum *detoasted; + OTuple tuple; + OTableDescr *descr; + bytea *rowid; + CommitSeqNo csn; + int ixnum; + bool leafTuple; + bool bridgeChanged; + uint32 version; + OTupleReaderState state; + BTreeLocationHint hint; + ItemPointerData bridge_ctid; +} OTableSlot; + +#define ORIOLEDB_TO_TOAST_OFF ('\0') +#define ORIOLEDB_TO_TOAST_ON ('y') +#define ORIOLEDB_TO_TOAST_COMPRESSION_TRIED ('c') + +extern PGDLLIMPORT const TupleTableSlotOps TTSOpsOrioleDB; + +extern void tts_orioledb_detoast(TupleTableSlot *slot); +extern void tts_orioledb_store_tuple(TupleTableSlot *slot, OTuple tuple, + OTableDescr *descr, CommitSeqNo csn, + int ixnum, bool shouldfree, + BTreeLocationHint *hint); +extern void tts_orioledb_store_non_leaf_tuple(TupleTableSlot *slot, + OTuple tuple, + OTableDescr *descr, + CommitSeqNo csn, + int ixnum, bool shouldfree, + BTreeLocationHint *hint); +extern OTuple tts_orioledb_make_secondary_tuple(TupleTableSlot *slot, + OIndexDescr *idx, + bool leaf); +extern void tts_orioledb_fill_key_bound(TupleTableSlot *slot, OIndexDescr *idx, + OBTreeKeyBound *bound); +extern char *tss_orioledb_print_idx_key(TupleTableSlot *slot, OIndexDescr *id); +extern void appendStringInfoIndexKey(StringInfo str, TupleTableSlot *slot, + OIndexDescr *id); +extern char *orioledb_print_idx_key(HeapTuple tuple, OIndexDescr *id); +extern void tts_orioledb_toast(TupleTableSlot *slot, OTableDescr *descr); +extern OTuple 
tts_orioledb_form_tuple(TupleTableSlot *slot, + OTableDescr *descr); +extern OTuple tts_orioledb_form_orphan_tuple(TupleTableSlot *slot, + OTableDescr *descr); +extern bool tts_orioledb_insert_toast_values(TupleTableSlot *slot, + OTableDescr *descr, + OXid oxid, CommitSeqNo csn); +extern void tts_orioledb_toast_sort_add(TupleTableSlot *slot, + OTableDescr *descr, + Tuplesortstate *sortstate); +extern bool tts_orioledb_remove_toast_values(TupleTableSlot *slot, + OTableDescr *descr, + OXid oxid, CommitSeqNo csn); +extern bool tts_orioledb_update_toast_values(TupleTableSlot *oldSlot, + TupleTableSlot *newSlot, + OTableDescr *descr, + OXid oxid, CommitSeqNo csn); +extern bool tts_orioledb_modified(TupleTableSlot *oldSlot, + TupleTableSlot *newSlot, + Bitmapset *attrs); +extern void tts_orioledb_set_ctid(TupleTableSlot *slot, ItemPointer iptr); +extern Datum o_get_tbl_att(TupleTableSlot *slot, int attnum, bool primaryIsCtid, + bool *isnull, Oid *typid, bool decompress); +Datum o_get_idx_expr_att(TupleTableSlot *slot, OIndexDescr *idx, + ExprState *exp_state, bool *isnull); + +#endif /* __TUPLE_SLOT_H__ */ diff --git a/contrib/orioledb/include/tuple/sort.h b/contrib/orioledb/include/tuple/sort.h new file mode 100644 index 00000000000..a25e1f70421 --- /dev/null +++ b/contrib/orioledb/include/tuple/sort.h @@ -0,0 +1,31 @@ +/*------------------------------------------------------------------------- + * + * sort.h + * Declarations for implementation of orioledb tuple sorting + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/tuple/sort.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TUPLE_SORT_H +#define __TUPLE_SORT_H + +#include "tableam/descr.h" + +extern Tuplesortstate *tuplesort_begin_orioledb_index(OIndexDescr *idx, + int workMem, + bool randomAccess, + SortCoordinate coordinate); +extern Tuplesortstate *tuplesort_begin_orioledb_toast(OIndexDescr *toast, + OIndexDescr *primary, + int workMem, + bool randomAccess, + SortCoordinate coordinate); +extern OTuple tuplesort_getotuple(Tuplesortstate *state, bool forward); +extern void tuplesort_putotuple(Tuplesortstate *state, OTuple tup); + +#endif /* __TUPLE_SORT_H */ diff --git a/contrib/orioledb/include/tuple/toast.h b/contrib/orioledb/include/tuple/toast.h new file mode 100644 index 00000000000..8a774ded8d8 --- /dev/null +++ b/contrib/orioledb/include/tuple/toast.h @@ -0,0 +1,228 @@ +/*------------------------------------------------------------------------- + * + * toast.h + * Low-level declarations for orioledb TOAST implementation + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/tuple/toast.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TOAST_H__ +#define __TOAST_H__ + +#include "access/htup.h" +#include "access/detoast.h" +#include "access/toast_compression.h" + +#include "orioledb.h" + +#include "btree/iterator.h" +#include "tableam/descr.h" + +/* position after primary ix tuple */ +#define ATTN_POS (1) +#define CHUNKN_POS (2) +#define DATA_POS (3) + +#define TOAST_LEAF_FIELDS_NUM (3) +#define TOAST_NON_LEAF_FIELDS_NUM (2) + +typedef struct Tuplesortstate Tuplesortstate; + +/* + * Low-level orioledb TOAST interface. 
+ */ + +/* Key bound for TOAST BTree */ +typedef struct OToastKey +{ + /* primary index tuple */ + OTuple pk_tuple; + /* current chunk number */ + uint32 chunknum; + /* attribute number of toasted value in table */ + uint16 attnum; +} OToastKey; + +/* + * Stored in orioledb table tuples in the primary index in place of TOASTed values. + */ +typedef struct OToastValue +{ + /* always TOAST pointer (0x80 for big-endian or 0x01 for little-endian) */ + uint8 pointer; + /* compression method of TOASTed data */ + uint8 compression; + /* raw size of TOASTed data without headers */ + int32 raw_size; + /* size of TOASTed data */ + int32 toasted_size; +} OToastValue; + +/* + * API, which encapsulates TOAST key and tuple format. + */ +typedef struct +{ + BTreeDescr *(*getBTreeDesc) (void *arg); + uint32 (*getBTreeVersion) (void *arg); + uint32 (*getBaseBTreeVersion) (void *arg); + uint32 (*getKeySize) (void *arg); + uint32 (*getMaxChunkSize) (void *key, void *arg); + void (*updateKey) (void *key, uint32 chunknum, void *arg); + void *(*getNextKey) (void *key, void *arg); + OTuple (*createTuple) (void *key, Pointer data, uint32 offset, uint32 chunknum, + int length, void *arg); + OTuple (*createKey) (void *key, uint32 chunknum, void *arg); + Pointer (*getTupleData) (OTuple tuple, void *arg); + uint32 (*getTupleChunknum) (OTuple tuple, void *arg); + uint32 (*getTupleDataSize) (OTuple tuple, void *arg); + bool deleteLogFullTuple; + TupleFetchCallback fetchCallback; +} ToastAPI; + +extern ToastAPI tableToastAPI; + +/* + * Generic function for working with TOAST, which can work with different + * API implementations.
+ */ +extern bool generic_toast_insert(ToastAPI *api, void *key, Pointer data, + Size data_size, OXid oxid, CommitSeqNo csn, + void *arg); +extern void generic_toast_sort_add(ToastAPI *api, void *key, Pointer data, + Size data_size, Tuplesortstate *sortstate, + void *arg); +extern bool generic_toast_update(ToastAPI *api, void *key, Pointer data, + Size data_size, OXid oxid, CommitSeqNo csn, + void *arg); +extern bool generic_toast_delete(ToastAPI *api, void *key, OXid oxid, + CommitSeqNo csn, void *arg); + +extern bool generic_toast_insert_optional_wal(ToastAPI *api, void *key, + Pointer data, Size data_size, + OXid oxid, CommitSeqNo csn, + void *arg, bool wal); +extern bool generic_toast_update_optional_wal(ToastAPI *api, void *key, + Pointer data, Size data_size, + OXid oxid, CommitSeqNo csn, + void *arg, bool wal); +extern bool generic_toast_delete_optional_wal(ToastAPI *api, void *key, + OXid oxid, CommitSeqNo csn, + void *arg, bool wal); + +/* Returns tuple only if its size equals data_size, or NULL otherwise */ +extern Pointer generic_toast_get(ToastAPI *api, void *key, Size data_size, + OSnapshot *snapshot, void *arg); + +/* Returns tuple and size of data if found, or NULL otherwise */ +extern Pointer generic_toast_get_any(ToastAPI *api, void *key, + Size *data_size, OSnapshot *snapshot, + void *arg); + +/* + * Same as generic_toast_get_any but: + * - if found_key is not NULL, on success it will contain the found key + * - if found_key contains a valid pointer, it is used as the version callback arg + */ +extern Pointer generic_toast_get_any_with_key(ToastAPI *api, void *key, + Size *data_size, OSnapshot *snapshot, + void *arg, Pointer *found_key); + +/* + * Same as generic_toast_get_any but uses fetch_callback to filter tuples + */ +extern Pointer generic_toast_get_any_with_callback(ToastAPI *api, Pointer key, + Size *data_size, + OSnapshot *snapshot, + void *arg, + TupleFetchCallback fetch_callback, + void *callback_arg); + +/* Copies TupleDescs to toast definition */
+extern void o_toast_init_tupdescs(OIndexDescr *toast, TupleDesc ix_primary); + +/* + * Functions dealing with tableam TOAST trees. + */ +extern bool o_toast_insert(OTableDescr *descr, + OTuple pk, uint16 attn, + Pointer data, Size data_size, + OXid oxid, CommitSeqNo csn); +extern void o_toast_sort_add(OTableDescr *descr, + OTuple pk, uint16 attn, + Pointer data, Size data_size, + Tuplesortstate *sortstate); +extern bool o_toast_delete(OTableDescr *descr, + OTuple pk, uint16 attn, + OXid oxid, CommitSeqNo csn); +extern Pointer o_toast_get(OTableDescr *descr, + OTuple pk, uint16 attn, Size data_size, + OSnapshot *snapshot); + +extern int o_toast_cmp(BTreeDescr *desc, void *p1, BTreeKeyType k1, + void *p2, BTreeKeyType k2); +extern bool o_toast_needs_undo(BTreeDescr *desc, BTreeOperationType action, + OTuple oldTuple, OTupleXactInfo oldXactInfo, bool oldDeleted, + OTuple newTuple, OXid newOxid); + +extern Datum create_o_toast_external(OTableDescr *descr, + OTuple idx_tup, + AttrNumber attnum, + OToastValue *toasted, + CommitSeqNo csn); + +/* gets raw value size without header; varlena checks take the pointer (attr), only toast_raw_datum_size() takes the Datum */ +static inline int32 +o_get_raw_size(Datum value) +{ + struct varlena *attr = (struct varlena *) DatumGetPointer(value); + + if (VARATT_IS_EXTERNAL_ORIOLEDB(DatumGetPointer(value))) + { + OToastExternal ote; + + memcpy(&ote, VARDATA_EXTERNAL(DatumGetPointer(value)), O_TOAST_EXTERNAL_SZ); + return ote.raw_size; + } + else if (VARATT_IS_EXTERNAL(attr)) + return toast_raw_datum_size(value) - VARHDRSZ; + else if (VARATT_IS_COMPRESSED(attr)) + return VARDATA_COMPRESSED_GET_EXTSIZE(attr); + else + return VARSIZE_ANY_EXHDR(attr); +} + +/* gets full value size; varlena checks take the pointer, only toast_datum_size() takes the Datum */ +static inline int32 +o_get_src_size(Datum value) +{ + if (VARATT_IS_EXTERNAL_ORIOLEDB(DatumGetPointer(value))) + { + OToastExternal ote; + + memcpy(&ote, VARDATA_EXTERNAL(DatumGetPointer(value)), O_TOAST_EXTERNAL_SZ); + return ote.toasted_size; + } + else if (VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(value))) + return toast_datum_size(value) + VARHDRSZ; +
else if (VARATT_IS_EXTERNAL(DatumGetPointer(value))) + return toast_datum_size(value); + else + return VARSIZE_ANY(DatumGetPointer(value)); +} + + +/* gets raw decompressed value */ +extern Datum o_get_raw_value(Datum value, bool *free); + +extern Datum o_get_src_value(Datum value, bool *free); + +/* returns true if left and right are equal orioledb TOAST values */ +extern bool o_toast_equal(BTreeDescr *primary, Datum left, Datum right); + +#endif /* __TOAST_H__ */ diff --git a/contrib/orioledb/include/utils/compress.h b/contrib/orioledb/include/utils/compress.h new file mode 100644 index 00000000000..eae0ba5833b --- /dev/null +++ b/contrib/orioledb/include/utils/compress.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * compress.h + * Compression functions for BTree pages. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/utils/compress.h + * + *------------------------------------------------------------------------- + */ +#ifndef __COMPRESS_H__ +#define __COMPRESS_H__ + +extern void o_compress_init(void); +extern Pointer o_compress_page(Pointer page, size_t *size, OCompress lvl); +extern void o_decompress_page(Pointer src, size_t size, Pointer page); +extern OCompress o_compress_max_lvl(void); +extern void validate_compress(OCompress compress, char *prefix); + +#endif /* __COMPRESS_H__ */ diff --git a/contrib/orioledb/include/utils/o_buffers.h b/contrib/orioledb/include/utils/o_buffers.h new file mode 100644 index 00000000000..d840829fef2 --- /dev/null +++ b/contrib/orioledb/include/utils/o_buffers.h @@ -0,0 +1,66 @@ +/*------------------------------------------------------------------------- + * + * o_buffers.h + * Declarations for buffering layer for file access. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc.
+ * + * IDENTIFICATION + * contrib/orioledb/include/utils/o_buffers.h + * + *------------------------------------------------------------------------- + */ +#ifndef __O_BUFFERS_H__ +#define __O_BUFFERS_H__ + +typedef struct OBuffersMeta OBuffersMeta; +typedef struct OBuffersGroup OBuffersGroup; + +#define OBuffersMaxTags (4) +#define OBuffersMaxTagIsValid(tag) \ + ((tag) >= 0 && (tag) < OBuffersMaxTags) + +typedef struct +{ + /* these fields are initialized by user */ + uint64 singleFileSize; + const char *filenameTemplate[OBuffersMaxTags]; + const char *groupCtlTrancheName; + const char *bufferCtlTrancheName; + uint32 buffersCount; + + /* these fields are initialized in o_buffers.c */ + uint32 groupsCount; + OBuffersMeta *metaPageBlkno; + OBuffersGroup *groups; + File curFile; + char curFileName[MAXPGPATH]; + uint32 curFileTag; + uint64 curFileNum; +} OBuffersDesc; + +extern Size o_buffers_shmem_needs(OBuffersDesc *desc); +extern void o_buffers_shmem_init(OBuffersDesc *desc, void *buf, bool found); +extern void o_buffers_read(OBuffersDesc *desc, Pointer buf, + uint32 tag, int64 offset, int64 size); +extern bool o_buffers_read_if_exists(OBuffersDesc *desc, Pointer buf, + uint32 tag, int64 offset, int64 size); +extern void o_buffers_write(OBuffersDesc *desc, Pointer buf, + uint32 tag, int64 offset, int64 size); +extern bool o_buffers_write_if_exists(OBuffersDesc *desc, Pointer buf, + uint32 tag, int64 offset, int64 size); +extern void o_buffers_sync(OBuffersDesc *desc, uint32 tag, int64 fromOffset, + int64 toOffset, uint32 wait_event_info); +extern void o_buffers_unlink_blocks_range(OBuffersDesc *desc, + uint32 tag, + int64 firstBlockNumber, + int64 lastBlockNumber); +extern void unlink_unretained_o_buffers(OBuffersDesc *desc, uint32 tag, + int64 itemsPerBlock, + int64 cleanupStart, int64 cleanupEnd, + int64 chkpRetainStart, + int64 chkpRetainEnd, + int64 transactionRetainStart); + +#endif /* __O_BUFFERS_H__ */ diff --git a/contrib/orioledb/include/utils/page_pool.h
b/contrib/orioledb/include/utils/page_pool.h new file mode 100644 index 00000000000..4de79653c1d --- /dev/null +++ b/contrib/orioledb/include/utils/page_pool.h @@ -0,0 +1,247 @@ +/*------------------------------------------------------------------------- + * + * page_pool.h + * Declarations for OrioleDB logical page pool implementation. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/utils/page_pool.h + * + *------------------------------------------------------------------------- + */ +#ifndef __PAGE_POOL_H__ +#define __PAGE_POOL_H__ + +#include "common/pg_prng.h" +#include "utils/ucm.h" + +/* + * ucm may not work correctly with lesser page pool size + */ +#define PPOOL_MIN_SIZE (1024) +#define PPOOL_MIN_SIZE_BLCKS (PPOOL_MIN_SIZE * ORIOLEDB_BLCKSZ / BLCKSZ) +#define PPOOL_RESERVE_META 0 +#define PPOOL_RESERVE_INSERT 1 +#define PPOOL_RESERVE_FIND 2 +#define PPOOL_RESERVE_SHARED_INFO_INSERT 3 +#define PPOOL_RESERVE_COUNT 4 + +#define PPOOL_KIND_GET_MASK(kind) (1 << (kind)) + +#define PPOOL_RESERVE_META_MASK PPOOL_KIND_GET_MASK(PPOOL_RESERVE_META) +#define PPOOL_RESERVE_INSERT_MASK PPOOL_KIND_GET_MASK(PPOOL_RESERVE_INSERT) +#define PPOOL_RESERVE_FIND_MASK PPOOL_KIND_GET_MASK(PPOOL_RESERVE_FIND) +#define PPOOL_RESERVE_SHARED_INFO_INSERT_MASK PPOOL_KIND_GET_MASK(PPOOL_RESERVE_SHARED_INFO_INSERT) +#define PPOOL_RESERVE_MASK_ALL (PPOOL_RESERVE_META_MASK | PPOOL_RESERVE_INSERT_MASK \ + | PPOOL_RESERVE_FIND_MASK | PPOOL_RESERVE_SHARED_INFO_INSERT_MASK) + +typedef struct PagePool PagePool; + +/* + * Page pool operations - implemented by each pool type + */ +typedef struct PagePoolOps +{ + /* Page allocation/deallocation */ + OInMemoryBlkno (*alloc_page) (PagePool *pool, int pageReserveKind); + OInMemoryBlkno (*alloc_metapage) (PagePool *pool); + void (*free_page) (PagePool *pool, OInMemoryBlkno blkno, bool haveLock); + + /* Page reservation system */ + void (*reserve_pages) 
(PagePool *pool, int pageReserveKind, int count); + void (*release_reserved) (PagePool *pool, uint32 kind_mask); + + OInMemoryBlkno (*free_pages_count) (PagePool *pool); + OInMemoryBlkno (*dirty_pages_count) (PagePool *pool); + bool (*run_maintenance) (PagePool *pool, bool evict, volatile sig_atomic_t *shutdown_requested); + OInMemoryBlkno (*size) (PagePool *pool); + + /* Usage tracking */ + void (*ucm_inc_usage) (PagePool *pool, OInMemoryBlkno blkno); + void (*ucm_init) (PagePool *pool, OInMemoryBlkno blkno); +} PagePoolOps; + +typedef struct PagePool +{ + const PagePoolOps *ops; + /* reserved pages count by type array */ + uint32 numPagesReserved[PPOOL_RESERVE_COUNT]; +} PagePool; + +/* Inline dispatch wrappers over PagePoolOps */ + +static inline OInMemoryBlkno +ppool_alloc_page(PagePool *pool, int pageReserveKind) +{ + return pool->ops->alloc_page(pool, pageReserveKind); +} + +static inline OInMemoryBlkno +ppool_alloc_metapage(PagePool *pool) +{ + return pool->ops->alloc_metapage(pool); +} + +static inline void +ppool_free_page(PagePool *pool, OInMemoryBlkno blkno, bool haveLock) +{ + pool->ops->free_page(pool, blkno, haveLock); +} + +static inline void +ppool_reserve_pages(PagePool *pool, int pageReserveKind, int count) +{ + pool->ops->reserve_pages(pool, pageReserveKind, count); +} + +static inline void +ppool_release_reserved(PagePool *pool, uint32 kind_mask) +{ + pool->ops->release_reserved(pool, kind_mask); +} + +static inline OInMemoryBlkno +ppool_free_pages_count(PagePool *pool) +{ + return pool->ops->free_pages_count(pool); +} + +static inline OInMemoryBlkno +ppool_dirty_pages_count(PagePool *pool) +{ + return pool->ops->dirty_pages_count(pool); +} + +static inline bool +ppool_run_maintenance(PagePool *pool, bool evict, + volatile sig_atomic_t *shutdown_requested) +{ + return pool->ops->run_maintenance(pool, evict, shutdown_requested); +} + +static inline OInMemoryBlkno +ppool_size(PagePool *pool) +{ + return pool->ops->size(pool); +} + +static inline 
void +ppool_ucm_inc_usage(PagePool *pool, OInMemoryBlkno blkno) +{ + pool->ops->ucm_inc_usage(pool, blkno); +} + +static inline void +ppool_ucm_init(PagePool *pool, OInMemoryBlkno blkno) +{ + pool->ops->ucm_init(pool, blkno); +} + +extern void ppool_release_all_pages(void); + +/* Shared memory based page pool handle */ +typedef struct OPagePool +{ + PagePool base; + /* count of available to reserve pages in the pool */ + pg_atomic_uint64 *availablePagesCount; + /* count of dirty pages in the pool */ + pg_atomic_uint32 *dirtyPagesCount; + /* refcount for successful page evictions by all backends */ + pg_atomic_uint64 *pageEvictCount; + /* init position for the ucm */ + OInMemoryBlkno location; + /* offset of the pool in the o_shared_buffers */ + OInMemoryBlkno offset; + /* size of the pool */ + OInMemoryBlkno size; + /* usage counter map and their size in shared memory */ + UsageCountMap ucm; + Size ucmShmemSize; + /* seed for random values */ + pg_prng_state prngSeed; +} OPagePool; + +/* Shared memory based page pool operations */ + +extern Size o_ppool_estimate_space(OPagePool *pool, OInMemoryBlkno offset, OInMemoryBlkno size, bool debug); +extern void o_ppool_shmem_init(OPagePool *pool, Pointer ptr, bool found); + +/* Local memory page pool handler */ +typedef struct LocalPagePool +{ + PagePool base; + MemoryContext slab_context; + uint32 size; + uint32 alloc_current_slot; + uint32 evict_current_slot; + /* count of available to reserve pages in the pool */ + uint32 availablePagesCount; + /* count of dirty pages in the pool */ + uint32 dirtyPagesCount; + uint32 *usage_count; +} LocalPagePool; + +extern void local_ppool_init(LocalPagePool *pool); + +extern int ppool_run_clock_depth; + +#define PAGE_DESC_FLAG_DIRTY 1 /* Modified since the last + * time being written out */ +#define PAGE_DESC_FLAG_CONCURRENT_DIRTY 2 /* Second "dirty" flag used to + * detect changes concurrent to + * write operations */ +#define PAGE_DESC_FLAG_BOTH_DIRTY (PAGE_DESC_FLAG_DIRTY | 
PAGE_DESC_FLAG_CONCURRENT_DIRTY) +#define IS_DIRTY(blkno) (O_GET_IN_MEMORY_PAGEDESC(blkno)->flags & PAGE_DESC_FLAG_DIRTY) +#define IS_DIRTY_CONCURRENT(blkno) (O_GET_IN_MEMORY_PAGEDESC(blkno)->flags & PAGE_DESC_FLAG_CONCURRENT_DIRTY) +#define CLEAN_DIRTY_CONCURRENT(blkno) (O_GET_IN_MEMORY_PAGEDESC(blkno)->flags &= ~PAGE_DESC_FLAG_CONCURRENT_DIRTY) +#define BLKNO_LOCAL_BIT 0x80000000 + +#define MARK_DIRTY_EXTENDED(desc, blkno, skipMeta) \ + do \ + { \ + if (!(skipMeta)) \ + { \ + BTREE_GET_META(desc)->dirtyFlag1 = true; \ + BTREE_GET_META(desc)->dirtyFlag2 = true; \ + } \ + if (!IS_DIRTY(blkno)) { \ + O_GET_IN_MEMORY_PAGEDESC(blkno)->flags |= PAGE_DESC_FLAG_BOTH_DIRTY; \ + if(O_PAGE_IS_LOCAL(blkno)) { \ + ((LocalPagePool*)(desc)->ppool)->dirtyPagesCount++; \ + } else { \ + pg_atomic_fetch_add_u32(((OPagePool*)(desc)->ppool)->dirtyPagesCount, 1); \ + } \ + } \ + else if (!IS_DIRTY_CONCURRENT(blkno)) \ + { \ + O_GET_IN_MEMORY_PAGEDESC(blkno)->flags |= PAGE_DESC_FLAG_CONCURRENT_DIRTY; \ + } \ + } \ + while (0); + +#define MARK_DIRTY(desc, blkno) \ + MARK_DIRTY_EXTENDED(desc, blkno, false) + +#define CLEAN_DIRTY(pool, blkno) \ + if (IS_DIRTY(blkno)) { \ + O_GET_IN_MEMORY_PAGEDESC(blkno)->flags &= ~PAGE_DESC_FLAG_BOTH_DIRTY; \ + if(O_PAGE_IS_LOCAL(blkno)) { \ + ((LocalPagePool*)(pool))->dirtyPagesCount--; \ + } else { \ + pg_atomic_fetch_sub_u32(((OPagePool*)(pool))->dirtyPagesCount, 1); \ + } \ + } + +#define FREE_PAGE_IF_VALID(pool, blkno) \ + if (OInMemoryBlknoIsValid((blkno))) \ + { \ + CLEAN_DIRTY((pool), (blkno)); \ + ppool_free_page((pool), (blkno), false); \ + (blkno) = OInvalidInMemoryBlkno; \ + } \ + + +#endif /* __PAGE_POOL_H__ */ diff --git a/contrib/orioledb/include/utils/planner.h b/contrib/orioledb/include/utils/planner.h new file mode 100644 index 00000000000..0d35e015872 --- /dev/null +++ b/contrib/orioledb/include/utils/planner.h @@ -0,0 +1,27 @@ +/*------------------------------------------------------------------------- + * + * planner.h + * Routines 
for query processing. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/utils/planner.h + * + *------------------------------------------------------------------------- + */ +#ifndef __PLANNER_H__ +#define __PLANNER_H__ + +extern void o_validate_funcexpr(Node *node, char *hint_msg); +extern void o_validate_function_by_oid(Oid procoid, char *hint_msg); + +extern void o_collect_funcexpr(Node *node); +extern void o_collect_op_by_oid(Oid opoid); + +extern void o_collect_function_by_oid(Oid procoid, Oid inputcollid, + List **processed); +extern void o_collect_functions_pstmt(PlannedStmt *pstmt, List **processed); + +#endif diff --git a/contrib/orioledb/include/utils/seq_buf.h b/contrib/orioledb/include/utils/seq_buf.h new file mode 100644 index 00000000000..147b2e83087 --- /dev/null +++ b/contrib/orioledb/include/utils/seq_buf.h @@ -0,0 +1,96 @@ +/*------------------------------------------------------------------------- + * + * seq_buf.h + * Declarations for sequential buffered data access routines. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/utils/seq_buf.h + * + *------------------------------------------------------------------------- + */ +#ifndef __SEQ_BUF_H__ +#define __SEQ_BUF_H__ + +typedef enum +{ + SeqBufPrevPageDone, + SeqBufPrevPageInProgress, + SeqBufPrevPageError +} SeqBufPrevPageState; + +typedef struct +{ + ORelOids oids; + Oid tablespace; +} OIndexKey; + +typedef struct +{ + OIndexKey key; + uint32 num; + char type; +} SeqBufTag; + +#define SeqBufTagEqual(l, r) ((l)->key.oids.datoid == (r)->key.oids.datoid && \ + (l)->key.oids.relnode == (r)->key.oids.relnode && \ + (l)->num == (r)->num && \ + (l)->type == (r)->type) + +typedef struct +{ + slock_t lock; /* spinlock protecting the fields below */ + OInMemoryBlkno pages[2]; /* pages with data */ + int location; + int curPageNum; /* current page in usage from previous two */ + uint32 filePageNum; /* file page currently loaded */ + off_t freeBytesNum; /* how many unread bytes left in a file */ + off_t evictOffset; + SeqBufTag tag; + SeqBufPrevPageState prevPageState; +} SeqBufDescShared; + +#define SEQ_BUF_SHARED_EXIST(shared_ptr) (OInMemoryBlknoIsValid((shared_ptr)->pages[0])) + +typedef struct +{ + SeqBufDescShared *shared; + File file; + SeqBufTag tag; + bool write; +} SeqBufDescPrivate; + +typedef struct +{ + off_t offset; + SeqBufTag tag; +} EvictedSeqBufData; + +typedef enum +{ + SeqBufReplaceSuccess, + SeqBufReplaceAlready, + SeqBufReplaceError +} SeqBufReplaceResult; + +extern bool init_seq_buf(SeqBufDescPrivate *seqBufPrivate, SeqBufDescShared *shared, + SeqBufTag *tag, bool write, bool init_shared, int skip_len, EvictedSeqBufData *evicted); + +extern bool seq_buf_write_u32(SeqBufDescPrivate *seqBufPrivate, uint32 offset); +extern bool seq_buf_read_u32(SeqBufDescPrivate *seqBufPrivate, uint32 *ptr); +extern bool seq_buf_write_file_extent(SeqBufDescPrivate *seqBufPrivate, FileExtent extent); +extern bool seq_buf_read_file_extent(SeqBufDescPrivate *seqBufPrivate, FileExtent 
*extent); + +extern uint64 seq_buf_finalize(SeqBufDescPrivate *seqBufPrivate); +extern char *get_seq_buf_filename(SeqBufTag *tag); +extern uint64 seq_buf_get_offset(SeqBufDescPrivate *seqBufPrivate); +extern SeqBufReplaceResult seq_buf_try_replace(SeqBufDescPrivate *seqBufPrivate, + SeqBufTag *tag, pg_atomic_uint64 *size, + Size data_size); +extern bool seq_buf_file_exist(SeqBufTag *tag); +extern bool seq_buf_remove_file(SeqBufTag *tag); +extern void seq_buf_close_file(SeqBufDescPrivate *seqBufPrivate); + +#endif /* __SEQ_BUF_H__ */ diff --git a/contrib/orioledb/include/utils/stopevent.h b/contrib/orioledb/include/utils/stopevent.h new file mode 100644 index 00000000000..0235fbad1db --- /dev/null +++ b/contrib/orioledb/include/utils/stopevent.h @@ -0,0 +1,47 @@ +/*------------------------------------------------------------------------- + * + * stopevent.h + * Declarations for stop events. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/utils/stopevent.h + * + *------------------------------------------------------------------------- + */ +#ifndef __STOPEVENT_H__ +#define __STOPEVENT_H__ + +#include "utils/jsonb.h" +#include "utils/stopevents_defs.h" + +extern bool enable_stopevents; +extern bool trace_stopevents; +extern MemoryContext stopevents_cxt; + +#define STOPEVENTS_ENABLED() \ + (enable_stopevents || trace_stopevents) + +#define STOPEVENT(event_id, params) \ + do { \ + if (STOPEVENTS_ENABLED()) \ + handle_stopevent((event_id), (params)); \ + } while(0) + +#define STOPEVENT_CONDITION(event_id, params) \ + (STOPEVENTS_ENABLED() && check_stopevent((event_id), (params))) + +extern Size StopEventShmemSize(void); +extern void StopEventShmemInit(Pointer ptr, bool found); +extern Datum pg_stopevent_set(PG_FUNCTION_ARGS); +extern Datum pg_stopevent_reset(PG_FUNCTION_ARGS); +extern Datum pg_stopevents(PG_FUNCTION_ARGS); +extern bool pid_is_waiting_for_stopevent(int pid); 
+extern void handle_stopevent(int event_id, Jsonb *params); +extern bool check_stopevent(int event_id, Jsonb *params); +extern void wait_for_stopevent_enabled(int event_id); +extern void stopevents_make_cxt(void); + +#endif /* __STOPEVENT_H__ */ diff --git a/contrib/orioledb/include/utils/ucm.h b/contrib/orioledb/include/utils/ucm.h new file mode 100644 index 00000000000..bda83486643 --- /dev/null +++ b/contrib/orioledb/include/utils/ucm.h @@ -0,0 +1,83 @@ +/*------------------------------------------------------------------------- + * + * ucm.h + * Declarations of OrioleDB usage count map (UCM). + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/utils/ucm.h + * + *------------------------------------------------------------------------- + */ +#ifndef __UCM_H__ +#define __UCM_H__ + +#include "btree/page_state.h" + +#define UCM_INVALID_LEVEL (0xF) +#define UCM_USAGE_LEVELS (0x7) +#define UCM_FREE_PAGES_LEVEL (0x7) +#define UCM_LEVELS (0x8) + +typedef struct UsageCountMap +{ + pg_atomic_uint32 *epoch; + pg_atomic_uint32 *ucm; + OInMemoryBlkno offset; + OInMemoryBlkno size; + int total; + int nonLeaf; + int rootFactor; + uint32 usageCounter; +} UsageCountMap; + +extern bool skip_ucm; + +extern Size estimate_ucm_space(UsageCountMap *map, OInMemoryBlkno offset, OInMemoryBlkno size); +extern void init_ucm(UsageCountMap *map, Pointer ptr, bool found); +extern void ucm_inc(UsageCountMap *map, OInMemoryBlkno blkno, int prev, int next); +extern void page_inc_usage_count(UsageCountMap *map, OInMemoryBlkno blkno); +extern void page_change_usage_count(UsageCountMap *map, OInMemoryBlkno blkno, uint32 usageCount); +extern bool ucm_check_map(UsageCountMap *map); +extern bool ucm_epoch_needs_shift(UsageCountMap *map); +extern void ucm_epoch_shift(UsageCountMap *map); +extern OInMemoryBlkno ucm_next_blkno(UsageCountMap *map, OInMemoryBlkno init_blkno, uint32 mask_src); +extern OInMemoryBlkno 
ucm_occupy_free_page(UsageCountMap *map); + +static inline uint64 +ucm_update_state(UsageCountMap *map, OInMemoryBlkno blkno, uint64 state) +{ + uint32 epoch = pg_atomic_read_u32(map->epoch), + mask; + uint32 usageCount = O_PAGE_STATE_GET_USAGE_COUNT(state); + + if (usageCount == UCM_INVALID_LEVEL || + usageCount == UCM_FREE_PAGES_LEVEL) + return state; + + Assert(usageCount < UCM_USAGE_LEVELS); + + map->usageCounter++; + + mask = (1 << ((UCM_USAGE_LEVELS + usageCount - epoch) % UCM_USAGE_LEVELS)) - 1; + + if ((map->usageCounter & mask) == 0 && (usageCount + 1) % UCM_USAGE_LEVELS != epoch) + return O_PAGE_STATE_SET_USAGE_COUNT(state, (usageCount + 1) % UCM_USAGE_LEVELS); + else + return state; +} + +static inline void +ucm_after_update_state(UsageCountMap *map, OInMemoryBlkno blkno, + uint64 oldState, uint64 newState) +{ + uint32 oldUsageCount = O_PAGE_STATE_GET_USAGE_COUNT(oldState); + uint32 newUsageCount = O_PAGE_STATE_GET_USAGE_COUNT(newState); + + if (oldUsageCount != newUsageCount) + ucm_inc(map, blkno - map->offset, oldUsageCount, newUsageCount); +} + +#endif /* __UCM_H__ */ diff --git a/contrib/orioledb/include/workers/bgwriter.h b/contrib/orioledb/include/workers/bgwriter.h new file mode 100644 index 00000000000..05738f6795b --- /dev/null +++ b/contrib/orioledb/include/workers/bgwriter.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * bgwriter.h + * Routines for background writer process. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/include/workers/bgwriter.h + * + *------------------------------------------------------------------------- + */ +#ifndef __BGWRITER_H__ +#define __BGWRITER_H__ + +extern bool IsBGWriter; +extern int BGWriterNum; + +extern void register_bgwriter(int num); +PGDLLEXPORT void bgwriter_main(Datum); + +#endif /* __BGWRITER_H__ */ diff --git a/contrib/orioledb/include/workers/interrupt.h b/contrib/orioledb/include/workers/interrupt.h new file mode 100644 index 00000000000..cfca9580a90 --- /dev/null +++ b/contrib/orioledb/include/workers/interrupt.h @@ -0,0 +1,20 @@ +/*------------------------------------------------------------------------- + * + * interrupt.h + * Routines for background workers interrupt handling. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/include/workers/interrupt.h + * + *------------------------------------------------------------------------- + */ +#ifndef __WORKERS_INTERRUPT_H__ +#define __WORKERS_INTERRUPT_H__ + +extern void o_worker_shutdown(int elevel); +extern void o_worker_handle_interrupts(void); + +#endif /* __WORKERS_INTERRUPT_H__ */ diff --git a/contrib/orioledb/make.flags.example b/contrib/orioledb/make.flags.example new file mode 100644 index 00000000000..b9aacee45eb --- /dev/null +++ b/contrib/orioledb/make.flags.example @@ -0,0 +1,8 @@ +# Makefile checks if the variable is defined, the value is ignored +# Remove/comment out the flag to deactivate it + +USE_PGXS=1 + +IS_DEV=1 + +#VALGRIND=1 diff --git a/contrib/orioledb/orioledb.control b/contrib/orioledb/orioledb.control new file mode 100644 index 00000000000..c94648ca693 --- /dev/null +++ b/contrib/orioledb/orioledb.control @@ -0,0 +1,5 @@ +# orioledb extension +comment = 'OrioleDB -- the next generation transactional engine' +default_version = '1.8' +module_pathname = '$libdir/orioledb' +relocatable = true diff --git 
a/contrib/orioledb/orioledb_s3_loader.py b/contrib/orioledb/orioledb_s3_loader.py new file mode 100755 index 00000000000..f3ffab472b9 --- /dev/null +++ b/contrib/orioledb/orioledb_s3_loader.py @@ -0,0 +1,567 @@ +#!/usr/bin/env python3 + +import argparse +import boto3 +import os +import re +import struct +import testgres + +from botocore.config import Config +from botocore.exceptions import ClientError, ParamValidationError +from concurrent.futures import ThreadPoolExecutor +from boto3.s3.transfer import TransferConfig +from threading import Event +from typing import Callable, Dict, Optional +from urllib.parse import urlparse + + +class OrioledbS3ObjectLoader: + + def parse_args(self): + epilog = """ + This util uses boto3 under the hood. + You can set credentials using AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_DEFAULT_REGION variables + Or by using ~/.aws/config file. + Read for more details: + https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables + """ + parser = argparse.ArgumentParser(usage=argparse.SUPPRESS, + epilog=epilog) + parser.add_argument( + '--endpoint', + dest='endpoint', + required=True, + help="AWS url (must contain bucket name if no prefix set)") + parser.add_argument('-d', + '--data-dir', + dest='data_dir', + required=True, + help="Destination data directory") + parser.add_argument( + '--prefix', + dest='prefix', + required=False, + default="", + help="Prefix to prepend to S3 object name (may contain bucket name)" + ) + parser.add_argument('--cert-file', + dest='cert_file', + help="Path to crt file") + parser.add_argument( + '--verbose', + dest='verbose', + action='store_true', + help="More verbose output. 
Downloaded files displayed.") + + try: + args = parser.parse_args() + except SystemExit as e: + if e.code != 0: + parser.print_help() + raise + + if 'cert_file' in args: + verify = args.cert_file + else: + verify = None + + parsed_url = urlparse(args.endpoint) + bucket = parsed_url.netloc.split('.')[0] + raw_endpoint = f"{parsed_url.scheme}://{'.'.join(parsed_url.netloc.split('.')[1:])}" + + splitted_prefix = args.prefix.strip('/').split('/') + splitted_path = parsed_url.path.strip('/').split('/') + prefix = os.path.join(*splitted_path, *splitted_prefix) + splitted_prefix = prefix.split('/') + + bucket_in_endpoint = True + bucket_in_prefix = False + try: + config = Config(s3={'addressing_style': 'virtual'}) + s3_client = boto3.client("s3", + endpoint_url=raw_endpoint, + verify=verify, + config=config) + s3_client.head_bucket(Bucket=bucket) + bucket_name = bucket + except ValueError: + bucket_in_endpoint = False + bucket_in_prefix = True + if bucket_in_prefix: + config = None + bucket = splitted_prefix[0] + prefix = '/'.join(splitted_prefix[1:]) + s3_client = boto3.client( + "s3", + endpoint_url=f"{parsed_url.scheme}://{parsed_url.netloc}", + verify=verify) + try: + s3_client.head_bucket(Bucket=bucket) + except ParamValidationError: + bucket_in_prefix = False + except ClientError: + bucket_in_prefix = False + bucket_name = bucket + + if not bucket_in_endpoint and not bucket_in_prefix: + raise Exception("No valid bucket name in endpoint or prefix") + + self._error_occurred = Event() + self.data_dir = args.data_dir + self.bucket_name = bucket_name + self.prefix = prefix + self.verbose = args.verbose + self.s3 = s3_client + + def run(self): + chkp_num = self.last_checkpoint_number(self.bucket_name) + self.download_files_in_directory(self.bucket_name, + 'data/', + chkp_num, + self.data_dir, + transform=self.transform_pg) + self.download_files_in_directory(self.bucket_name, + 'orioledb_data/', + chkp_num, + f"{self.data_dir}/orioledb_data", + 
transform=self.transform_orioledb, + filter=self.filter_orioledb) + + self.download_unchanged_files( + self.bucket_name, os.path.join("orioledb_data", "file_checksums"), + chkp_num, None) + + self.download_unchanged_small_files( + self.bucket_name, + os.path.join("orioledb_data", "small_file_checksums"), chkp_num, + None) + + control = get_control_data(self.data_dir) + orioledb_control = get_orioledb_control_data(self.data_dir) + self.download_undo(orioledb_control['undoRegularStartLocation'], + orioledb_control['undoRegularEndLocation'], + "orioledb_data/%02X%08Xdata") + self.download_undo(orioledb_control['undoSystemStartLocation'], + orioledb_control['undoSystemEndLocation'], + "orioledb_data/%02X%08Xsystem") + wal_file = control["Latest checkpoint's REDO WAL file"] + local_path = os.path.join(self.data_dir, f"pg_wal/{wal_file}") + wal_file = os.path.join(self.prefix, f"wal/{wal_file}") + self.download_file(self.bucket_name, wal_file, local_path) + + def download_undo(self, startLocation, endLocation, template): + UNDO_FILE_SIZE = 0x4000000 # each undo file covers this many bytes of undo location space + if startLocation >= endLocation: + return + for fileNum in range(startLocation // UNDO_FILE_SIZE, + (endLocation - 1) // UNDO_FILE_SIZE + 1): # the file holding the last byte is included + fileName = template % (fileNum >> 32, fileNum & 0xFFFFFFFF) + localName = os.path.join(self.data_dir, fileName) # store under the data directory, like the WAL file in run() + self.download_file(self.bucket_name, os.path.join(self.prefix, fileName), localName) + + def last_checkpoint_number(self, bucket_name): + paginator = self.s3.get_paginator('list_objects_v2') + + numbers = [] + prefix = os.path.join(self.prefix, 'data/') + for page in paginator.paginate(Bucket=bucket_name, + Prefix=prefix, + Delimiter='/'): + if 'CommonPrefixes' in page: + for prefix in page['CommonPrefixes']: + prefix_key = prefix['Prefix'].rstrip('/') + subdirectory = prefix_key.split('/')[-1] + try: + number = int(subdirectory) + numbers += [number] + except ValueError: + pass + + numbers = sorted(numbers) + + found = False + chkp_list_index = len(numbers) - 1 + + last_chkp_data_dir = 
os.path.join(self.prefix, 'data', + str(numbers[chkp_list_index])) + + while not found and chkp_list_index >= 0: + try: + self.s3.head_object( + Bucket=bucket_name, + Key=f'{last_chkp_data_dir}/global/pg_control') + self.s3.head_object( + Bucket=bucket_name, + Key=f'{last_chkp_data_dir}/orioledb_data/control') + found = True + except ClientError as e: + if e.response['Error']['Code'] == "404": + chkp_list_index -= 1 + if chkp_list_index >= 0: + last_chkp_data_dir = os.path.join( + self.prefix, 'data', str(numbers[chkp_list_index])) + else: + raise + + if chkp_list_index < 0: + raise Exception("Failed to find valid checkpoint in s3 bucket") + + return numbers[chkp_list_index] + + def list_objects(self, bucket_name, directory): + objects = [] + paginator = self.s3.get_paginator('list_objects_v2') + + prefix = os.path.join(self.prefix, directory) + for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix): + if 'Contents' in page: + page_objs = [x["Key"] for x in page['Contents']] + objects.extend(page_objs) + + return objects + + # Reimplement os.makedirs so it also sets mode for intermediate dirs + def makedirs(self, name, mode=0o777, exist_ok=False): + """makedirs(name [, mode=0o777][, exist_ok=False]) + + Super-mkdir; create a leaf directory and all intermediate ones. Works like + mkdir, except that any intermediate path segment (not just the rightmost) + will be created if it does not exist. If the target directory already + exists, raise an OSError if exist_ok is False. Otherwise no exception is + raised. This is recursive. + + """ + head, tail = os.path.split(name) + if not tail: + head, tail = os.path.split(head) + if head and tail and not os.path.exists(head): + try: + self.makedirs(head, mode, exist_ok=exist_ok) + except FileExistsError: + # Defeats race condition when another thread created the path + pass + cdir = os.curdir + if isinstance(tail, bytes): + cdir = bytes(os.curdir, 'ASCII') + if tail == cdir: # xxx/newdir/. 
exists if xxx/newdir exists + return + try: + os.mkdir(name, mode) + except OSError: + # Cannot rely on checking for EEXIST, since the operating system + # could give priority to other errors like EACCES or EROFS + if not exist_ok or not os.path.isdir(name): + raise + + def download_file(self, bucket_name, file_key, local_path) -> bool: + try: + transfer_config = TransferConfig(use_threads=False, + max_concurrency=1) + if file_key[-1] == '/': + dirs = local_path + else: + dirs = '/'.join(local_path.split('/')[:-1]) + self.makedirs(dirs, exist_ok=True, mode=0o700) + if file_key[-1] != '/': + self.s3.download_file(bucket_name, + file_key, + local_path, + Config=transfer_config) + if self.verbose: + print(f"{file_key} -> {local_path}", flush=True) + if re.match(r'.*/orioledb_data/small_files_\d+$', local_path): + base_dir = '/'.join(local_path.split('/')[:-2]) + with open(local_path, 'rb') as file: + data = file.read() + numFiles = struct.unpack('i', data[0:4])[0] + for i in range(0, numFiles): + (nameOffset, dataOffset, + dataLength) = struct.unpack('iii', + data[4 + i * 12:16 + i * 12]) + name = data[nameOffset:data.find(b'\0', nameOffset + )].decode('ascii') + fullname = f"{base_dir}/{name}" + if self.verbose: + print(f"{file_key} -> {fullname}", flush=True) + self.makedirs(os.path.dirname(fullname), + exist_ok=True, + mode=0o700) + with open(fullname, 'wb') as file: + file.write(data[dataOffset:dataOffset + dataLength]) + os.chmod(fullname, 0o600) + os.unlink(local_path) + + except ClientError as e: + if e.response['Error']['Code'] == "404": + print(f"File not found: {file_key}") + else: + print(f"An error occurred: {e}") + self._error_occurred.set() + return False + + return True + + def transform_orioledb(self, val: str) -> str: + offset = 0 + prefix = self.prefix.strip('/') + if prefix != "": + offset = len(prefix.split('/')) + parts = val.split('/') + file_parts = parts[offset + 3].split('.') + result = f"{parts[offset + 2]}/{file_parts[0]}-{parts[offset + 
1]}" + if file_parts[-1] == 'map': + result += '.map' + return result + + def filter_orioledb(self, val: str) -> bool: + offset = 0 + prefix = self.prefix.strip('/') + if prefix != "": + offset = len(prefix.split('/')) + parts = val.split('/') + file_parts = parts[offset + 3].split('.') + is_map = file_parts[-1] == 'map' + return is_map + + def transform_pg(self, val: str) -> str: + offset = 0 + prefix = self.prefix.strip('/') + if prefix != "": + offset = len(prefix.split('/')) + parts = val.split('/') + result = '/'.join(parts[offset + 2:]) + return result + + def download_files_in_directory(self, + bucket_name, + directory, + chkp_num, + local_directory, + transform: Callable[[str], str], + filter: Callable[[str], bool] = None): + last_chkp_dir = os.path.join(directory, str(chkp_num)) + objects = self.list_objects(bucket_name, last_chkp_dir) + max_threads = os.cpu_count() + + with ThreadPoolExecutor(max_threads) as executor: + futures = [] + + for file_key in objects: + local_file = transform(file_key) + if filter and not filter(file_key): + continue + local_path = f"{local_directory}/{local_file}" + future = executor.submit(self.download_file, bucket_name, + file_key, local_path) + futures.append(future) + + for future in futures: + future.result() + + if self._error_occurred.is_set(): + print("An error occurred. 
Stopping all downloads.") + executor.shutdown(wait=False, cancel_futures=True) + break + + def download_unchanged_files(self, bucket_name: str, + file_checksums_name: str, chkp_num: int, + file_checksums: Optional[Dict[str, str]]): + # We won't be able to download unchanged previous files if this is + # the first checkpoint + if chkp_num <= 1: + return + + prev_chkp_num = chkp_num - 1 + prev_chkp_dir = os.path.join(self.prefix, "data", str(prev_chkp_num)) + + if file_checksums is None: + file_checksums_path = os.path.join(self.data_dir, + file_checksums_name) + file_checksums = self.get_unchanged_file_checksums( + file_checksums_path, chkp_num) + + prev_file_checksums = {} + for filename, checkpoint in file_checksums.items(): + # Ignore changed files + if int(checkpoint) == chkp_num: + continue + + # This file needs to be downloaded from pre-previous checkpoint + if int(checkpoint) < prev_chkp_num: + prev_file_checksums[filename] = checkpoint + continue + + remote_file = os.path.join(prev_chkp_dir, filename) + local_file = os.path.join(self.data_dir, filename) + + self.download_file(bucket_name, remote_file, local_file) + + # Some files are still missing + if len(prev_file_checksums) > 0: + # Recursively download unchanged files + self.download_unchanged_files(bucket_name, file_checksums_name, + prev_chkp_num, prev_file_checksums) + + def download_unchanged_small_files(self, bucket_name: str, + file_checksums_name: str, chkp_num: int, + file_checksums: Optional[Dict[str, + str]]): + # We won't be able to download unchanged previous files if this is + # the first checkpoint + if chkp_num <= 1: + return + + prev_chkp_num = chkp_num - 1 + prev_chkp_dir = os.path.join(self.prefix, "data", str(prev_chkp_num)) + + if file_checksums is None: + file_checksums_path = os.path.join(self.data_dir, + file_checksums_name) + file_checksums = self.get_unchanged_file_checksums( + file_checksums_path, chkp_num) + + if len(file_checksums) == 0: + return + + small_files_num = 0 + 
files_restored = 0 + while True: + small_filename = os.path.join("orioledb_data", + f"small_files_{small_files_num}") + remote_path = os.path.join(prev_chkp_dir, small_filename) + temp_path = os.path.join(self.data_dir, + f"{small_filename}.{prev_chkp_num}") + + # Looks like the file doesn't exist, break + if not self.download_file(bucket_name, remote_path, temp_path): + break + + with open(temp_path, 'rb') as file: + data = file.read() + numFiles = struct.unpack('i', data[0:4])[0] + + for i in range(0, numFiles): + (nameOffset, dataOffset, + dataLength) = struct.unpack('iii', + data[4 + i * 12:16 + i * 12]) + + name = data[nameOffset:data.find(b'\0', nameOffset + )].decode('ascii') + fullname = os.path.join(self.data_dir, name) + + if not name in file_checksums: + continue + + if self.verbose: + print(f"{remote_path} -> {fullname}", flush=True) + + self.makedirs(os.path.dirname(fullname), + exist_ok=True, + mode=0o700) + + with open(fullname, 'wb') as file: + file.write(data[dataOffset:dataOffset + dataLength]) + os.chmod(fullname, 0o600) + + files_restored += 1 + # Looks like we restored all unchanged files, break + if files_restored == len(file_checksums): + break + + os.unlink(temp_path) + small_files_num += 1 + + # Looks like we restored all unchanged files, break + if files_restored == len(file_checksums): + break + + # Not all files were restored, check the previous checkpoint recursively + if files_restored < len(file_checksums): + prev_file_checksums = {} + for filename, checkpoint in file_checksums.items(): + if int(checkpoint) < prev_chkp_num: + prev_file_checksums[filename] = checkpoint + + assert len(prev_file_checksums) > 0 + + # Recursively download unchanged small files + self.download_unchanged_small_files(bucket_name, + file_checksums_name, + prev_chkp_num, + prev_file_checksums) + + def get_unchanged_file_checksums(self, file_checksums_name: str, + chkp_num: int) -> dict[str, str]: + res = {} + + pattern_str = r"^FILE: (?P<filename>.+), CHECKSUM: (?P<checksum>.+), 
CHECKPOINT: (?P<checkpoint>\d+)$" + pattern = re.compile(pattern_str) + with open(file_checksums_name) as file: + for line in file: + m = pattern.search(line) + + if m is None or len(m.groups()) != 3: + raise Exception( + f"Invalid line format of the checksum file {file_checksums_name}: {line}" + ) + + line_dict = m.groupdict() + + if int(line_dict["checkpoint"]) > chkp_num: + raise Exception(f'Unexpected checkpoint number "{line}"') + + if int(line_dict["checkpoint"]) < chkp_num: + res[line_dict["filename"]] = line_dict["checkpoint"] + + return res + + +def get_control_data(data_dir: str): + """ + Return contents of pg_control file. + """ + + # this one is tricky (blame PG 9.4) + _params = [testgres.get_bin_path("pg_controldata")] + _params += ["-D"] + _params += [data_dir] + + data = testgres.utils.execute_utility(_params) + + out_dict = {} + + for line in data.splitlines(): + key, _, value = line.partition(':') + out_dict[key.strip()] = value.strip() + + return out_dict + + +def get_orioledb_control_data(data_dir: str): + """ + Return contents of OrioleDB control file. 
+ """ + + f = open(f"{data_dir}/orioledb_data/control", 'rb') + data = f.read(8 * 13) + (undoRegularStartLocation, + undoRegularEndLocation) = struct.unpack('QQ', data[8 * 8:8 * 10]) + (undoSystemStartLocation, + undoSystemEndLocation) = struct.unpack('QQ', data[8 * 11:8 * 13]) + f.close() + + dict = { + 'undoRegularStartLocation': undoRegularStartLocation, + 'undoRegularEndLocation': undoRegularEndLocation, + 'undoSystemStartLocation': undoSystemStartLocation, + 'undoSystemEndLocation': undoSystemEndLocation + } + + return dict + + +if __name__ == '__main__': + loader = OrioledbS3ObjectLoader() + loader.parse_args() + loader.run() diff --git a/contrib/orioledb/requirements.txt b/contrib/orioledb/requirements.txt new file mode 100644 index 00000000000..d2b35b31d04 --- /dev/null +++ b/contrib/orioledb/requirements.txt @@ -0,0 +1,36 @@ +asn1crypto==1.5.1 +blinker==1.9.0 +boto3==1.43.0 +botocore==1.43.2 +certifi==2026.4.22 +cffi==2.0.0 +charset-normalizer==3.4.7 +click==8.3.3 +cryptography==47.0.0 +Flask==3.1.3 +flask-cors==6.0.2 +idna==3.13 +itsdangerous==2.2.0 +Jinja2==3.1.6 +jmespath==1.1.0 +MarkupSafe==3.0.3 +moto==5.1.22 +packaging==26.2 +pg8000==1.31.5 +port-for==1.0.0 +psutil==7.2.2 +psycopg2==2.9.12 +pycparser==3.0 +pyOpenSSL==26.1.0 +python-dateutil==2.9.0.post0 +PyYAML==6.0.3 +requests==2.33.1 +responses==0.26.0 +s3transfer==0.17.0 +scramp==1.4.8 +six==1.17.0 +testgres==1.11.0 +unidiff==0.7.5 +urllib3==2.6.3 +Werkzeug==3.1.8 +xmltodict==1.0.4 diff --git a/contrib/orioledb/sql/orioledb--1.0--1.1.sql b/contrib/orioledb/sql/orioledb--1.0--1.1.sql new file mode 100644 index 00000000000..e018c0cb84f --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.0--1.1.sql @@ -0,0 +1,11 @@ +/* contrib/orioledb/sql/orioledb--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.1'" to load this file. 
\quit + +CREATE FUNCTION orioledb_tbl_bin_structure(relid oid, + print_bytes bool default 'false', + depth int default 32) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.0_dev.sql b/contrib/orioledb/sql/orioledb--1.0_dev.sql new file mode 100644 index 00000000000..35aaa2b8993 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.0_dev.sql @@ -0,0 +1,21 @@ +/* contrib/orioledb/sql/orioledb--1.0_dev.sql */ + +CREATE FUNCTION orioledb_parallel_debug_start() +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_parallel_debug_stop() +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION s3_get(objectname text) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION s3_put(objectname text, filename text) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.0_prod.sql b/contrib/orioledb/sql/orioledb--1.0_prod.sql new file mode 100644 index 00000000000..ecff86fb5b4 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.0_prod.sql @@ -0,0 +1,209 @@ +/* contrib/orioledb/sql/orioledb--1.0_prod.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION orioledb" to load this file. 
\quit + +------------------------------------- +-- Table AM interface functions +------------------------------------- +CREATE FUNCTION orioledb_tableam_handler(internal) +RETURNS table_am_handler +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE ACCESS METHOD orioledb TYPE TABLE +HANDLER orioledb_tableam_handler; + +------------------------------------- +-- Diagnostic functions +------------------------------------- +CREATE FUNCTION orioledb_version() +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_commit_hash() +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_tbl_structure(relid oid, + options varchar default '', + depth int default 32) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_idx_structure(relid oid, + tree_name text, + options varchar default '', + depth int default 32) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_tbl_check(relid oid, force_map_check bool default False) +RETURNS bool +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_compression_max_level() +RETURNS int8 +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_tbl_compression_check +( + level int8, + relid oid, + ranges integer[] default array[1024, 2048, 3072, 4096, 5120, 6144, 7168] +) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_tbl_indices(relid oid) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_sys_tree_structure(num int, + options varchar default '', + depth int default 32) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_tbl_are_indices_equal(idx_oid1 regclass, + idx_oid2 regclass) +RETURNS bool +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_sys_tree_check(num integer, force_map_check bool default False) +RETURNS bool +AS 'MODULE_PATHNAME' +VOLATILE 
LANGUAGE C; + +CREATE FUNCTION orioledb_sys_tree_rows(num integer) +RETURNS SETOF jsonb +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_index_rows(relid oid, OUT total int, OUT dead int) +RETURNS record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_table_oids(OUT datoid oid, OUT reloid oid, OUT relnode oid) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_index_oids(OUT datoid oid, + OUT table_reloid oid, OUT table_relnode oid, + OUT index_reloid oid, OUT index_relnode oid, + OUT index_type text) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_table_pages(relid oid, OUT blkno int8, OUT level int4, OUT rightlink int8, OUT hikey jsonb) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_page_stats(OUT pool_name text, OUT busy_pages int8, OUT free_pages int8, OUT dirty_pages int8, OUT all_pages int8) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_table_description(relid oid) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_table_description(datoid oid, relid oid, relnode oid) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_index_description(IN datoid oid, IN relid oid, IN relnode oid, IN index_type text, + OUT name text, OUT description text) +RETURNS record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_relation_size(relid oid) +RETURNS BIGINT +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +------------------------------------- +-- Debug support functions +------------------------------------- +CREATE FUNCTION orioledb_has_retained_undo() +RETURNS bool +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_evict_pages(relid oid, maxLevel int) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION 
orioledb_write_pages(relid oid) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_ucm_check() +RETURNS bool +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION pg_stopevent_set(eventname text, condition jsonpath) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_stopevent_reset(eventname text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_stopevents(OUT stopevent text, OUT condition jsonpath, OUT waiter_pids int[]) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_recovery_synchronized() +RETURNS boolean +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_get_table_descrs(OUT datoid oid, OUT reloid oid, OUT relnode oid, OUT refcnt oid) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_get_index_descrs(OUT datoid oid, OUT reloid oid, OUT relnode oid, OUT refcnt oid) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE VIEW orioledb_table AS + SELECT t.*, + orioledb_table_description(t.datoid, t.reloid, t.relnode) AS description + FROM orioledb_table_oids() t; + +CREATE VIEW orioledb_index AS + SELECT t.*, + (orioledb_index_description(t.datoid, t.index_reloid, t.index_relnode, t.index_type)).* + FROM orioledb_index_oids() t; + +CREATE VIEW orioledb_table_descr AS + SELECT * FROM orioledb_get_table_descrs(); + +CREATE VIEW orioledb_index_descr AS + SELECT * FROM orioledb_get_index_descrs(); diff --git a/contrib/orioledb/sql/orioledb--1.1--1.2.sql b/contrib/orioledb/sql/orioledb--1.1--1.2.sql new file mode 100644 index 00000000000..0c135b5a9ce --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.1--1.2.sql @@ -0,0 +1,12 @@ +/* contrib/orioledb/sql/orioledb--1.1--1.2.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.2'" to load this file. 
\quit + +CREATE FUNCTION orioledb_get_evicted_trees(OUT datoid oid, + OUT relnode oid, + OUT root_downlink int8, + OUT file_length int8) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.2--1.3.sql b/contrib/orioledb/sql/orioledb--1.2--1.3.sql new file mode 100644 index 00000000000..ee3c7afa5ba --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.2--1.3.sql @@ -0,0 +1,13 @@ +/* contrib/orioledb/sql/orioledb--1.2--1.3.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.3'" to load this file. \quit + +CREATE FUNCTION orioledb_tree_stat(relid regclass, + OUT level int, + OUT count int8, + OUT avgoccupied float8, + OUT avgvacated float8) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.3--1.4.sql b/contrib/orioledb/sql/orioledb--1.3--1.4.sql new file mode 100644 index 00000000000..56663b75bc7 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.3--1.4.sql @@ -0,0 +1,9 @@ +/* contrib/orioledb/sql/orioledb--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.4'" to load this file. \quit + +CREATE FUNCTION orioledb_tbl_indices(relid oid, internal bool, oids bool default false) +RETURNS text +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.4--1.5_dev.sql b/contrib/orioledb/sql/orioledb--1.4--1.5_dev.sql new file mode 100644 index 00000000000..4cda8f32c13 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.4--1.5_dev.sql @@ -0,0 +1,15 @@ +/* contrib/orioledb/sql/orioledb--1.4--1.5_dev.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.5'" to load this file. 
\quit + +CREATE FUNCTION orioledb_rewind_set_complete(xid int, oxid bigint) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_rewind_sync() +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + diff --git a/contrib/orioledb/sql/orioledb--1.4--1.5_prod.sql b/contrib/orioledb/sql/orioledb--1.4--1.5_prod.sql new file mode 100644 index 00000000000..f08801e85c5 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.4--1.5_prod.sql @@ -0,0 +1,55 @@ +/* contrib/orioledb/sql/orioledb--1.4--1.5_prod.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.5'" to load this file. \quit + +-- Rewind by rewind_time (in seconds) back from the present +CREATE FUNCTION orioledb_rewind_by_time(rewind_time int) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +-- Rewind to just before the xid/oxid pair (remembered previously using pg_current_xact_id() and orioledb_get_current_oxid()) +CREATE FUNCTION orioledb_rewind_to_transaction(xid int, oxid bigint) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +-- Rewind to before a particular timestamp +CREATE FUNCTION orioledb_rewind_to_timestamp(rewind_timestamp TimestampTz) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +-- Get current oxid to remember it. This does the same for Oriole transactions that pg_current_xact_id() does for heap transactions +CREATE FUNCTION orioledb_get_current_oxid() +RETURNS bigint +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_get_rewind_queue_length() +RETURNS bigint +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_get_rewind_evicted_length() +RETURNS bigint +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +-- Get last non-complete xid and oxid still in the queue. Mainly intended for tests. 
+CREATE FUNCTION orioledb_get_complete_oxid() +RETURNS bigint +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_get_complete_xid() +RETURNS int +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + + +--CREATE FUNCTION orioledb_rewind_queue_age() +--RETURNS bigint +--AS 'MODULE_PATHNAME' +--VOLATILE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.5--1.6_dev.sql b/contrib/orioledb/sql/orioledb--1.5--1.6_dev.sql new file mode 100644 index 00000000000..361361da258 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.5--1.6_dev.sql @@ -0,0 +1,31 @@ +/* contrib/orioledb/sql/orioledb--1.5--1.6_dev.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.6'" to load this file. \quit + +-- Get current logical xid to remember it +CREATE FUNCTION orioledb_get_current_logical_xid() +RETURNS int8 +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +-- Get current heap xid to remember it +CREATE FUNCTION orioledb_get_current_heap_xid() +RETURNS int8 +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_insert_sys_xid_undo_location(xid int, undoLocation bigint) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_read_sys_xid_undo_location(xid int) +RETURNS bigint +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_int4range_immutable(input_str text) +RETURNS int4range +AS 'MODULE_PATHNAME' +IMMUTABLE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.5--1.6_prod.sql b/contrib/orioledb/sql/orioledb--1.5--1.6_prod.sql new file mode 100644 index 00000000000..bb9721eb490 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.5--1.6_prod.sql @@ -0,0 +1,9 @@ +/* contrib/orioledb/sql/orioledb--1.5--1.6_prod.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.6'" to load this file. 
\quit + +CREATE FUNCTION pg_stopevent_set(eventname text, condition jsonpath, flags text) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; diff --git a/contrib/orioledb/sql/orioledb--1.6--1.7_dev.sql b/contrib/orioledb/sql/orioledb--1.6--1.7_dev.sql new file mode 100644 index 00000000000..cd9f7cc084a --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.6--1.7_dev.sql @@ -0,0 +1,59 @@ +/* contrib/orioledb/sql/orioledb--1.6--1.7_dev.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.7'" to load this file. \quit + +CREATE FUNCTION reset_read_page_checkpoint_stats() +RETURNS VOID +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION fetch_read_page_checkpoint_stats(OUT min_read_page_checkpoint int4, OUT max_read_page_checkpoint int4) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_get_xid_meta( + OUT nextXid int8, + OUT runXmin int8, + OUT globalXmin int8, + OUT lastXidWhenUpdatedGlobalXmin int8, + OUT writeInProgressXmin int8, + OUT writtenXmin int8, + OUT checkpointRetainXmin int8, + OUT checkpointRetainXmax int8, + OUT cleanedXmin int8) +RETURNS record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_get_undo_meta( + OUT undo_type text, + OUT lastUsedLocation int8, + OUT advanceReservedLocation int8, + OUT writeInProgressLocation int8, + OUT writtenLocation int8, + OUT minProcTransactionRetainLocation int8, + OUT minProcRetainLocation int8, + OUT minProcReservedLocation int8, + OUT checkpointRetainStartLocation int8, + OUT checkpointRetainEndLocation int8, + OUT cleanedLocation int8, + OUT cleanedCheckpointStartLocation int8, + OUT cleanedCheckpointEndLocation int8, + OUT minRewindRetainLocation int8) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_get_proc_retain_undo_locations( + OUT pid int4, + OUT procno int4, + OUT undo_type text, + OUT 
reservedUndoLocation int8, + OUT transactionUndoRetainLocation int8, + OUT snapshotRetainUndoLocation int8, + OUT effectiveRetainLocation int8) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.6--1.7_prod.sql b/contrib/orioledb/sql/orioledb--1.6--1.7_prod.sql new file mode 100644 index 00000000000..37e0413e952 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.6--1.7_prod.sql @@ -0,0 +1,29 @@ +/* contrib/orioledb/sql/orioledb--1.6--1.7_prod.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.7'" to load this file. \quit + +CREATE FUNCTION orioledb_rewind_by_time(rewind_time int, attempt_restart bool) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_rewind_to_transaction(xid int, oxid bigint, attempt_restart bool) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_rewind_to_timestamp(rewind_timestamp TimestampTz, attempt_restart bool) +RETURNS void +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_undo_size(OUT undo_type text, OUT undo_size int8) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; + +CREATE FUNCTION orioledb_print_pool_pages(ppool_arg integer DEFAULT NULL, OUT block_num int8, OUT name TEXT, OUT datoid int8, OUT reloid int8, OUT relnode int8, OUT type TEXT, OUT state int8) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +VOLATILE LANGUAGE C; \ No newline at end of file diff --git a/contrib/orioledb/sql/orioledb--1.7--1.8_dev.sql b/contrib/orioledb/sql/orioledb--1.7--1.8_dev.sql new file mode 100644 index 00000000000..11aa61b6a0d --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.7--1.8_dev.sql @@ -0,0 +1,10 @@ +/* contrib/orioledb/sql/orioledb--1.7--1.8_dev.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.8'" to load 
this file. \quit + +DROP FUNCTION orioledb_int4range_immutable(input_str text); +CREATE FUNCTION orioledb_int4range_immutable(input_str text) +RETURNS int4range +AS 'MODULE_PATHNAME' +IMMUTABLE PARALLEL SAFE LANGUAGE C; diff --git a/contrib/orioledb/sql/orioledb--1.7--1.8_prod.sql b/contrib/orioledb/sql/orioledb--1.7--1.8_prod.sql new file mode 100644 index 00000000000..ae6bd92bcf1 --- /dev/null +++ b/contrib/orioledb/sql/orioledb--1.7--1.8_prod.sql @@ -0,0 +1,52 @@ +/* contrib/orioledb/sql/orioledb--1.7--1.8_prod.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION orioledb UPDATE TO '1.8'" to load this file. \quit + + +CREATE FUNCTION orioledb_list_orphaned( + older_than interval default null, + OUT dbname text, + OUT path text, + OUT name text, + OUT size bigint, + OUT mod_time timestamptz, + OUT relfilenode bigint, + OUT reloid bigint, + OUT older bool) +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'orioledb_list_orphaned' +LANGUAGE C VOLATILE; + +CREATE FUNCTION orioledb_list_orphaned_moved( + OUT dbname text, + OUT path text, + OUT name text, + OUT size bigint, + OUT mod_time timestamptz, + OUT relfilenode bigint, + OUT reloid bigint) +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'orioledb_list_orphaned_moved' +LANGUAGE C VOLATILE; + +CREATE FUNCTION orioledb_move_orphaned(older_than interval default null) +RETURNS int +AS 'MODULE_PATHNAME', 'orioledb_move_orphaned' +LANGUAGE C VOLATILE; + +CREATE FUNCTION orioledb_remove_moved_orphaned() +RETURNS void +AS 'MODULE_PATHNAME', 'orioledb_remove_moved_orphaned' +LANGUAGE C VOLATILE; + +CREATE FUNCTION orioledb_move_back_orphaned() +RETURNS int +AS 'MODULE_PATHNAME', 'orioledb_move_back_orphaned' +LANGUAGE C VOLATILE; + +REVOKE EXECUTE ON FUNCTION orioledb_list_orphaned(interval) FROM PUBLIC; +REVOKE EXECUTE ON FUNCTION orioledb_list_orphaned_moved() FROM PUBLIC; +REVOKE EXECUTE ON FUNCTION orioledb_move_orphaned(interval) FROM PUBLIC; +REVOKE EXECUTE ON 
FUNCTION orioledb_remove_moved_orphaned() FROM PUBLIC; +REVOKE EXECUTE ON FUNCTION orioledb_move_back_orphaned() FROM PUBLIC; diff --git a/contrib/orioledb/src/btree/btree.c b/contrib/orioledb/src/btree/btree.c new file mode 100644 index 00000000000..18a0227a03f --- /dev/null +++ b/contrib/orioledb/src/btree/btree.c @@ -0,0 +1,422 @@ +/*------------------------------------------------------------------------- + * + * btree.c + * Routines for OrioleDB B-tree initialization and cleanup. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/src/btree/btree.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/find.h" +#include "btree/insert.h" +#include "btree/io.h" +#include "btree/page_chunks.h" +#include "btree/scan.h" +#include "btree/undo.h" +#include "catalog/o_tables.h" +#include "recovery/recovery.h" +#include "recovery/wal.h" +#include "tableam/descr.h" +#include "tableam/tree.h" +#include "transam/undo.h" +#include "transam/oxid.h" +#include "tuple/format.h" +#include "utils/page_pool.h" +#include "utils/stopevent.h" + +#include "fmgr.h" +#include "miscadmin.h" +#include "utils/fmgrprotos.h" +#include "utils/numeric.h" + +LWLockPadded *unique_locks; +int num_unique_locks; + +void +o_btree_init_unique_lwlocks(void) +{ + num_unique_locks = max_procs * 4; + unique_locks = GetNamedLWLockTranche("orioledb_unique_locks"); +} + +void +o_btree_init(BTreeDescr *desc) +{ + init_new_btree_page(desc, desc->rootInfo.rootPageBlkno, + O_BTREE_FLAGS_ROOT_INIT, 0, false); + init_page_first_chunk(desc, O_GET_IN_MEMORY_PAGE(desc->rootInfo.rootPageBlkno), 0); + unlock_page(desc->rootInfo.rootPageBlkno); + init_meta_page(desc->rootInfo.metaPageBlkno, 1); + + /* + * Always mark root page dirty so that the first checkpoint writes the + * .map file header. 
Without this, a tree that gets evicted before + * checkpoint would leave a .map file with an unwritten header. + */ + MARK_DIRTY(desc, desc->rootInfo.rootPageBlkno); +} + +static bool +get_page_children(OInMemoryBlkno blkno, uint32 pageChangeCount, + OInMemoryBlkno childPageNumbers[BTREE_PAGE_MAX_CHUNK_ITEMS], + uint32 childPageChangeCounts[BTREE_PAGE_MAX_CHUNK_ITEMS], + int *childPagesCount) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + OrioleDBPageDesc *desc = O_GET_IN_MEMORY_PAGEDESC(blkno); + BTreePageItemLocator loc; + int ionum; + +retry: + lock_page(blkno); + if (desc->ionum >= 0) + { + ionum = desc->ionum; + unlock_page(blkno); + + wait_for_io_completion(ionum); + goto retry; + } + *childPagesCount = 0; + + if (O_PAGE_GET_CHANGE_COUNT(p) != pageChangeCount) + { + /* + * It seems that page has been evicted concurrently. So, nothing to + * do. + */ + unlock_page(blkno); + return false; + } + + if (!O_PAGE_IS(p, LEAF)) + { + BTREE_PAGE_FOREACH_ITEMS(p, &loc) + { + BTreeNonLeafTuphdr *tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc); + + if (DOWNLINK_IS_IN_IO(tuphdr->downlink)) + { + ionum = DOWNLINK_GET_IO_LOCKNUM(tuphdr->downlink); + unlock_page(blkno); + + wait_for_io_completion(ionum); + goto retry; + } + else if (DOWNLINK_IS_IN_MEMORY(tuphdr->downlink)) + { + childPageNumbers[*childPagesCount] = DOWNLINK_GET_IN_MEMORY_BLKNO(tuphdr->downlink); + childPageChangeCounts[*childPagesCount] = DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(tuphdr->downlink); + (*childPagesCount)++; + } + } + } + return true; +} + +/* + * Recursively sets O_BTREE_FLAG_PRE_CLEANUP to the given page and all its + * children. 
+ */ +static void +mark_page_pre_cleanup(OInMemoryBlkno blkno, uint32 pageChangeCount) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + BTreePageHeader *header = (BTreePageHeader *) p; + OInMemoryBlkno childPageNumbers[BTREE_PAGE_MAX_CHUNK_ITEMS]; + uint32 childPageChangeCounts[BTREE_PAGE_MAX_CHUNK_ITEMS]; + int childPagesCount; + int i, + ionum; + + if (!get_page_children(blkno, pageChangeCount, + childPageNumbers, childPageChangeCounts, + &childPagesCount)) + return; + + page_block_reads(blkno); + header->flags |= O_BTREE_FLAG_PRE_CLEANUP; + ionum = O_GET_IN_MEMORY_PAGEDESC(blkno)->ionum; + unlock_page(blkno); + + if (ionum >= 0) + wait_for_io_completion(ionum); + + for (i = 0; i < childPagesCount; i++) + mark_page_pre_cleanup(childPageNumbers[i], + childPageChangeCounts[i]); +} + +/* + * Frees given page and all of its children recursively. + */ +static void +free_page(PagePool *pool, OInMemoryBlkno blkno, uint32 pageChangeCount) +{ + OInMemoryBlkno childPageNumbers[BTREE_PAGE_MAX_CHUNK_ITEMS]; + uint32 childPageChangeCounts[BTREE_PAGE_MAX_CHUNK_ITEMS]; + int childPagesCount; + int i; + + if (!get_page_children(blkno, pageChangeCount, + childPageNumbers, childPageChangeCounts, + &childPagesCount)) + return; + Assert(O_PAGE_IS(O_GET_IN_MEMORY_PAGE(blkno), PRE_CLEANUP)); + Assert(O_PAGE_GET_CHANGE_COUNT(O_GET_IN_MEMORY_PAGE(blkno)) == pageChangeCount); + Assert(O_GET_IN_MEMORY_PAGEDESC(blkno)->ionum < 0); + unlock_page(blkno); + + for (i = 0; i < childPagesCount; i++) + free_page(pool, + childPageNumbers[i], + childPageChangeCounts[i]); + + lock_page(blkno); + Assert(O_PAGE_IS(O_GET_IN_MEMORY_PAGE(blkno), PRE_CLEANUP)); + Assert(O_PAGE_GET_CHANGE_COUNT(O_GET_IN_MEMORY_PAGE(blkno)) == pageChangeCount); + Assert(O_GET_IN_MEMORY_PAGEDESC(blkno)->ionum < 0); + page_block_reads(blkno); + CLEAN_DIRTY(pool, blkno); + ppool_free_page(pool, blkno, true); +} + +static inline void +free_meta_page(PagePool *pool, OInMemoryBlkno metaPageBlkno) +{ + BTreeMetaPage *meta_page; + int 
i, + j; + + meta_page = (BTreeMetaPage *) O_GET_IN_MEMORY_PAGE(metaPageBlkno); + + for (i = 0; i < 2; i++) + { + FREE_PAGE_IF_VALID(pool, meta_page->freeBuf.pages[i]); + for (j = 0; j < 2; j++) + { + FREE_PAGE_IF_VALID(pool, meta_page->nextChkp[j].pages[i]); + FREE_PAGE_IF_VALID(pool, meta_page->tmpBuf[j].pages[i]); + } + } + + /* + * Additional protection: the resource owner might not have released its + * seq scans yet (other transactions are excluded by locks). Defer + * freeing the meta page until the last scan is released. + */ + if (meta_page_get_num_seq_scans(metaPageBlkno) == 0) + ppool_free_page(pool, metaPageBlkno, false); + else + meta_page->toBeFreedOnSeqScanRelease = true; +} + +/* + * Two-phase algorithm for page cleanup, which can run concurrently + * with walk_page(). + * + * The first phase sets O_BTREE_FLAG_PRE_CLEANUP, preventing walk_page() from + * evicting or writing these pages. + * + * The second phase cleans the pages previously marked with the + * O_BTREE_FLAG_PRE_CLEANUP flag, from bottom to top. + * + * Therefore walk_page() never gets in trouble trying to find the parent page + * using find_page(). + */ +void +o_btree_cleanup_pages(OInMemoryBlkno rootPageBlkno, OInMemoryBlkno metaPageBlkno, uint32 rootPageChangeCount) +{ + PagePool *pool = get_ppool_by_blkno(rootPageBlkno); + + Assert(OInMemoryBlknoIsValid(rootPageBlkno)); + Assert(OInMemoryBlknoIsValid(metaPageBlkno)); + Assert(pool != NULL); + + mark_page_pre_cleanup(rootPageBlkno, rootPageChangeCount); + free_page(pool, rootPageBlkno, rootPageChangeCount); + + free_meta_page(pool, metaPageBlkno); +} + +void +o_btree_check_size_of_tuple(int len, char *relation_name, bool index) +{ + if (len > O_BTREE_MAX_TUPLE_SIZE) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %d exceeds orioledb maximum %zu for %s \"%s\"", + len, + O_BTREE_MAX_TUPLE_SIZE, + index ? 
"index" : "table", + relation_name))); +} + +ItemPointerData +btree_ctid_get_and_inc(BTreeDescr *desc) +{ + BTreeMetaPage *metaPageBlkno = BTREE_GET_META(desc); + ItemPointerData result; + uint64 ctid = pg_atomic_fetch_add_u64(&metaPageBlkno->ctid, 1); + + Assert(ORootPageIsValid(desc) && OMetaPageIsValid(desc)); + Assert(ctid / (MaxOffsetNumber - FirstOffsetNumber) < InvalidBlockNumber); + + ItemPointerSet(&result, + (uint32) (ctid / (MaxOffsetNumber - FirstOffsetNumber)), + (OffsetNumber) (ctid % (MaxOffsetNumber - FirstOffsetNumber) + FirstOffsetNumber)); + return result; +} + +void +btree_ctid_update_if_needed(BTreeDescr *desc, ItemPointerData ctid) +{ + BTreeMetaPage *metaPageBlkno = BTREE_GET_META(desc); + uint64 old_ctid, + new_ctid; + + Assert(ORootPageIsValid(desc) && OMetaPageIsValid(desc)); + new_ctid = (uint64) ItemPointerGetBlockNumber(&ctid) * (MaxOffsetNumber - FirstOffsetNumber); + new_ctid += ctid.ip_posid - FirstOffsetNumber; + Assert(new_ctid < (uint64) (MaxOffsetNumber - FirstOffsetNumber) * (uint64) InvalidBlockNumber); + + new_ctid++; + do + { + old_ctid = pg_atomic_read_u64(&metaPageBlkno->ctid); + if (old_ctid >= new_ctid) + break; + } while (!pg_atomic_compare_exchange_u64(&metaPageBlkno->ctid, &old_ctid, new_ctid)); +} + +ItemPointerData +btree_bridge_ctid_get_and_inc(BTreeDescr *desc, bool *overflow) +{ + BTreeMetaPage *metaPageBlkno = BTREE_GET_META(desc); + ItemPointerData result; + uint64 ctid = pg_atomic_fetch_add_u64(&metaPageBlkno->bridge_ctid, 1); + + BlockNumber max_block_number = MaxBlockNumber; + + Assert(ORootPageIsValid(desc) && OMetaPageIsValid(desc)); + + if (BlockNumberIsValid(max_bridge_ctid_blkno)) + max_block_number = max_bridge_ctid_blkno; + + *overflow = ctid / MaxHeapTuplesPerPage >= max_block_number; + + ItemPointerSet(&result, + (uint32) (ctid / MaxHeapTuplesPerPage % max_block_number), + (OffsetNumber) (ctid % MaxHeapTuplesPerPage + FirstOffsetNumber)); + return result; +} + +static inline OIndexDescr * 
+o_get_tree_def(BTreeDescr *desc) +{ + return desc->arg; +} + +void +btree_desc_stopevent_params_internal(BTreeDescr *desc, JsonbParseState **state) +{ + jsonb_push_int8_key(state, "datoid", desc->oids.datoid); + jsonb_push_int8_key(state, "reloid", desc->oids.reloid); + jsonb_push_int8_key(state, "relnode", desc->oids.relnode); + + if (IS_SYS_TREE_OIDS(desc->oids)) + jsonb_push_string_key(state, "treeName", "sys_tree"); + else if (desc->type == oIndexToast) + jsonb_push_string_key(state, "treeName", "toast"); + else + jsonb_push_string_key(state, "treeName", o_get_tree_def(desc)->name.data); +} + +void +btree_page_stopevent_params_internal(BTreeDescr *desc, Page p, + JsonbParseState **state) +{ + jsonb_push_int8_key(state, "level", PAGE_GET_LEVEL(p)); + jsonb_push_int8_key(state, "pageChangeCount", O_PAGE_GET_CHANGE_COUNT(p)); + + jsonb_push_key(state, "hikey"); + if (!O_PAGE_IS(p, RIGHTMOST)) + { + OTuple hikey; + + BTREE_PAGE_GET_HIKEY(hikey, p); + (void) o_btree_key_to_jsonb(desc, hikey, state); + } + else + { + JsonbValue jval; + + jval.type = jbvNull; + (void) pushJsonbValue(state, WJB_VALUE, &jval); + } +} + +Jsonb * +btree_page_stopevent_params(BTreeDescr *desc, Page p) +{ + JsonbParseState *state = NULL; + Jsonb *res; + MemoryContext mctx = MemoryContextSwitchTo(stopevents_cxt); + + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + btree_desc_stopevent_params_internal(desc, &state); + btree_page_stopevent_params_internal(desc, p, &state); + res = JsonbValueToJsonb(pushJsonbValue(&state, WJB_END_OBJECT, NULL)); + MemoryContextSwitchTo(mctx); + + return res; +} + +Jsonb * +btree_downlink_stopevent_params(BTreeDescr *desc, Page p, BTreePageItemLocator *loc) +{ + JsonbParseState *state = NULL; + Jsonb *res; + MemoryContext mctx = MemoryContextSwitchTo(stopevents_cxt); + BTreeNonLeafTuphdr *internal_ptr; + + internal_ptr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, loc); + + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + 
btree_desc_stopevent_params_internal(desc, &state); + btree_page_stopevent_params_internal(desc, p, &state); + + jsonb_push_key(&state, "downlink"); + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + jsonb_push_int8_key(&state, "blkno", DOWNLINK_GET_IN_MEMORY_BLKNO(internal_ptr->downlink)); + jsonb_push_int8_key(&state, "pageChangeCount", DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(internal_ptr->downlink)); + jsonb_push_key(&state, "key"); + if (BTREE_PAGE_LOCATOR_GET_OFFSET(p, loc) > 0) + { + OTuple key; + + BTREE_PAGE_READ_INTERNAL_TUPLE(key, p, loc); + (void) o_btree_key_to_jsonb(desc, key, &state); + } + else + { + JsonbValue jval; + + jval.type = jbvNull; + (void) pushJsonbValue(&state, WJB_VALUE, &jval); + } + pushJsonbValue(&state, WJB_END_OBJECT, NULL); + + res = JsonbValueToJsonb(pushJsonbValue(&state, WJB_END_OBJECT, NULL)); + MemoryContextSwitchTo(mctx); + + return res; +} diff --git a/contrib/orioledb/src/btree/build.c b/contrib/orioledb/src/btree/build.c new file mode 100644 index 00000000000..ad41b0bff58 --- /dev/null +++ b/contrib/orioledb/src/btree/build.c @@ -0,0 +1,486 @@ +/*------------------------------------------------------------------------- + * + * build.c + * Routines for sort-based B-tree index building. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ *
+ * IDENTIFICATION
+ *	  contrib/orioledb/src/btree/build.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "orioledb.h"
+
+#include "btree/build.h"
+#include "btree/insert.h"
+#include "btree/io.h"
+#include "btree/page_chunks.h"
+#include "btree/split.h"
+#include "checkpoint/checkpoint.h"
+#include "recovery/recovery.h"
+#include "s3/worker.h"
+#include "tableam/descr.h"
+#include "tuple/toast.h"
+#include "tuple/sort.h"
+#include "transam/oxid.h"
+#include "utils/seq_buf.h"
+#include "utils/page_pool.h"
+
+#include "access/genam.h"
+#include "access/relation.h"
+#include "catalog/index.h"
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "utils/lsyscache.h"
+#include "utils/memdebug.h"
+/*
+ * Per-level state of the bottom-up index build.
+ *
+ * img     - image of the page currently being filled on this level
+ * loc     - locator inside img at which the next item is inserted
+ * key     - key saved for this level; put_item_to_stack() uses it as the key
+ *           of the downlink to the page it writes out, then replaces it with
+ *           the high key of that page
+ * keysize - size of key
+ */
+typedef struct OIndexBuildStackItem
+{
+	char		img[ORIOLEDB_BLCKSZ];
+	BTreePageItemLocator loc;
+	OFixedKey	key;
+	int			keysize;
+} OIndexBuildStackItem;
+
+static bool put_item_to_stack(BTreeDescr *desc, OIndexBuildStackItem *stack,
+							  int level, OTuple tuple, int tuplesize,
+							  Pointer tupleheader, LocationIndex header_size,
+							  int *root_level, BTreeMetaPage *metaPage);
+static bool put_tuple_to_stack(BTreeDescr *desc, OIndexBuildStackItem *stack,
+							   OTuple tuple, int *root_level,
+							   BTreeMetaPage *metaPage);
+static bool put_downlink_to_stack(BTreeDescr *desc, OIndexBuildStackItem *stack,
+								  int level, uint64 downlink, OTuple key,
+								  int keysize, int *root_level,
+								  BTreeMetaPage *metaPage);
+/*
+ * Splits the page image of stack[level] while adding one more item.
+ *
+ * The items from the chosen split location onwards are copied into new_page,
+ * the new item (tupleheader followed by tuple) is appended at the tail of
+ * new_page, and the original image is reorganized with btree_page_reorg() to
+ * keep only the left part.  The high key given to the left page is derived
+ * from the first item of new_page.
+ */
+static void
+stack_page_split(BTreeDescr *desc, OIndexBuildStackItem *stack, int level,
+				 OTuple tuple, int tuplesize, Pointer tupleheader,
+				 LocationIndex header_size, Page new_page)
+{
+	Page		img = stack[level].img;
+	OffsetNumber left_count,
+				rightbound_key_size;
+	bool		key_palloc = false;
+	Pointer		tuple_ptr;
+	OTuple		rightbound_key;
+	bool		leaf = O_PAGE_IS(img, LEAF);
+	BTreePageItemLocator loc,
+				newLoc;
+	BTreeSplitItems items;
+	OffsetNumber offset;
+
+	btree_page_update_max_key_len(desc, img);
+	offset = BTREE_PAGE_LOCATOR_GET_OFFSET(img, &stack[level].loc);
+
+	make_split_items(desc, img, &items, &offset,
+					 tupleheader, tuple, tuplesize, false,
+					 COMMITSEQNO_INPROGRESS);
+
+	left_count = btree_page_split_location(desc, &items, offset, 0.9, NULL);	/* NOTE(review): 0.9 is presumably the target fill ratio of the left page -- confirm */
+
+	/* Distribute the tuples according to the split location */
+	BTREE_PAGE_OFFSET_GET_LOCATOR(img, left_count, &loc);
+	BTREE_PAGE_LOCATOR_FIRST(new_page, &newLoc);
+	while (BTREE_PAGE_LOCATOR_IS_VALID(img, &loc))
+	{
+		LocationIndex itemsize;
+
+		itemsize = BTREE_PAGE_GET_ITEM_SIZE(img, &loc);
+
+		page_locator_insert_item(new_page, &newLoc, itemsize);
+		memcpy(BTREE_PAGE_LOCATOR_GET_ITEM(new_page, &newLoc),
+			   BTREE_PAGE_LOCATOR_GET_ITEM(img, &loc),
+			   itemsize);
+		BTREE_PAGE_SET_ITEM_FLAGS(new_page, &newLoc, BTREE_PAGE_GET_ITEM_FLAGS(stack[level].img, &loc));
+
+		BTREE_PAGE_LOCATOR_NEXT(img, &loc);
+		BTREE_PAGE_LOCATOR_NEXT(new_page, &newLoc);
+	}
+
+	BTREE_PAGE_LOCATOR_TAIL(new_page, &newLoc);	/* the new item goes after the moved ones */
+	page_locator_insert_item(new_page, &newLoc,
+							 MAXALIGN(tuplesize) + header_size);
+	tuple_ptr = BTREE_PAGE_LOCATOR_GET_ITEM(new_page, &newLoc);
+	memcpy(tuple_ptr, tupleheader, header_size);
+	tuple_ptr += header_size;
+	memcpy(tuple_ptr, tuple.data, tuplesize);
+	BTREE_PAGE_SET_ITEM_FLAGS(new_page, &newLoc, tuple.formatFlags);
+
+	/* Setup the new high key on the left page */
+	BTREE_PAGE_LOCATOR_FIRST(new_page, &newLoc);
+	BTREE_PAGE_READ_TUPLE(rightbound_key, new_page, &newLoc);
+	if (leaf)
+	{
+		rightbound_key = o_btree_tuple_make_key(desc, rightbound_key, NULL, false, &key_palloc);
+		rightbound_key_size = o_btree_len(desc, rightbound_key, OKeyLength);
+	}
+	else
+	{
+		rightbound_key_size = BTREE_PAGE_GET_ITEM_SIZE(new_page, &newLoc) -
+			header_size;
+	}
+
+	btree_page_reorg(desc, img, &items.items[0], left_count,
+					 rightbound_key_size, rightbound_key);
+
+	if (key_palloc)
+		pfree(rightbound_key.data);
+}
+/*
+ * Appends one item (tupleheader followed by tuple) to the page being built on
+ * the given stack level.
+ *
+ * When the item fits and the free space left afterwards stays at or above the
+ * reserve computed from desc->fillfactor, it is inserted at the level's
+ * current locator.  Otherwise the page is split with stack_page_split(), the
+ * left page is written with perform_page_io_build(), the right page becomes
+ * the level's current image, and a downlink to the written page is pushed to
+ * level + 1.  *root_level is raised when the split happens on the current
+ * root level.  Always returns true.
+ */
+static bool
+put_item_to_stack(BTreeDescr *desc, OIndexBuildStackItem *stack, int level,
+				  OTuple tuple, int tuplesize, Pointer tupleheader,
+				  LocationIndex header_size, int *root_level,
+				  BTreeMetaPage *metaPage)
+{
+	BTreeItemPageFitType fit;
+	Pointer		tuple_ptr;
+	uint64		downlink = 0;
+
+	Assert(level < ORIOLEDB_MAX_DEPTH);
+
+	if (BTREE_PAGE_FREE_SPACE(stack[level].img) - MAXALIGN(tuplesize) - header_size >=
+		ORIOLEDB_BLCKSZ * (100 - desc->fillfactor) / 100)
+		fit = page_locator_fits_item(desc,
+									 stack[level].img,
+									 &stack[level].loc,
+									 MAXALIGN(tuplesize) + header_size,
+									 false,
+									 COMMITSEQNO_INPROGRESS);
+	else
+		fit = BTreeItemPageFitSplitRequired;
+
+	if (fit == BTreeItemPageFitAsIs)
+	{
+		page_locator_insert_item(stack[level].img, &stack[level].loc,
+								 MAXALIGN(tuplesize) + header_size);
+		tuple_ptr = BTREE_PAGE_LOCATOR_GET_ITEM(stack[level].img, &stack[level].loc);
+		memcpy(tuple_ptr, tupleheader, header_size);
+		tuple_ptr += header_size;
+		memcpy(tuple_ptr, tuple.data, tuplesize);
+		BTREE_PAGE_SET_ITEM_FLAGS(stack[level].img, &stack[level].loc, tuple.formatFlags);
+
+		BTREE_PAGE_LOCATOR_NEXT(stack[level].img, &stack[level].loc);
+	}
+	else
+	{
+		FileExtent	extent;
+		char		new_page[ORIOLEDB_BLCKSZ] = {0};
+		OFixedKey	key;
+		int			keysize;
+		BTreePageHeader *new_page_header = (BTreePageHeader *) new_page;
+		BTreePageHeader *header = (BTreePageHeader *) stack[level].img;
+		BTreePageHeader *parent_header = (BTreePageHeader *) stack[level + 1].img;
+
+		new_page_header->rightLink = InvalidRightLink;
+		new_page_header->csn = COMMITSEQNO_FROZEN;
+		new_page_header->undoLocation = InvalidUndoLocation;
+		new_page_header->o_header.checkpointNum = 0;
+		new_page_header->prevInsertOffset = MaxOffsetNumber;
+
+		new_page_header->flags = O_BTREE_FLAG_RIGHTMOST;
+
+		if (level == 0)
+			new_page_header->flags |= O_BTREE_FLAG_LEAF;
+		else
+			PAGE_SET_LEVEL(new_page, level);
+
+		init_page_first_chunk(desc, new_page, 0);
+
+		header->rightLink = InvalidRightLink;
+		header->csn = COMMITSEQNO_FROZEN;
+		header->undoLocation = InvalidUndoLocation;
+		header->o_header.checkpointNum = 0;
+		header->prevInsertOffset = MaxOffsetNumber;
+
+		header->flags &= ~O_BTREE_FLAG_RIGHTMOST;	/* the new page is the rightmost one now */
+
+		if (level == 0)
+			header->flags |= O_BTREE_FLAG_LEAF;
+
+		stack_page_split(desc, stack, level, tuple, tuplesize, tupleheader,
+						 header_size, new_page);
+
+		if (level == *root_level)
+		{
+			parent_header->flags = O_BTREE_FLAG_RIGHTMOST | O_BTREE_FLAG_LEFTMOST;
+			header->flags |= O_BTREE_FLAG_LEFTMOST;
+			if (level != 0)
+				PAGE_SET_LEVEL(stack[level].img, level);
+
+			*root_level = level + 1;
+		}
+
+		if (level != 0)
+			PAGE_SET_N_ONDISK(stack[level].img,
+							  BTREE_PAGE_ITEMS_COUNT(stack[level].img));
+
+		/* write old page to disk */
+
+		extent.len = InvalidFileExtentLen;
+		extent.off = InvalidFileExtentOff;
+
+		VALGRIND_CHECK_MEM_IS_DEFINED(stack[level].img, ORIOLEDB_BLCKSZ);
+
+		downlink = perform_page_io_build(desc, stack[level].img, &extent, metaPage);
+		if (level == 0)
+			pg_atomic_add_fetch_u32(&metaPage->leafPagesNum, 1);
+
+		copy_fixed_key(desc, &key, stack[level].key.tuple);	/* key for the downlink of the written page */
+		keysize = stack[level].keysize;
+
+		stack[level].keysize = BTREE_PAGE_GET_HIKEY_SIZE(stack[level].img);
+		copy_fixed_hikey(desc, &stack[level].key, stack[level].img);	/* remembered for the next page of this level */
+
+		if (level > 0)
+		{
+#ifdef ORIOLEDB_CUT_FIRST_KEY
+			page_cut_first_key(new_page);
+#endif
+		}
+
+		/* copy new page to stack */
+		memcpy(stack[level].img, new_page, ORIOLEDB_BLCKSZ);
+		BTREE_PAGE_LOCATOR_TAIL(stack[level].img, &stack[level].loc);
+
+		put_downlink_to_stack(desc, stack, level + 1, downlink,
+							  key.tuple, keysize,
+							  root_level, metaPage);
+	}
+	return true;
+}
+/*
+ * Wraps downlink into a zero-initialized BTreeNonLeafTuphdr and appends it,
+ * together with key, to the page being built on the given (non-leaf) level.
+ * Returns the result of put_item_to_stack().
+ */
+static bool
+put_downlink_to_stack(BTreeDescr *desc, OIndexBuildStackItem *stack, int level,
+					  uint64 downlink, OTuple key, int keysize,
+					  int *root_level, BTreeMetaPage *metaPage)
+{
+	BTreeNonLeafTuphdr internal_header = {0};
+	bool		result;
+
+	internal_header.downlink = downlink;
+	result = put_item_to_stack(desc, stack, level, key, keysize,
+							   (Pointer) &internal_header,
+							   sizeof(internal_header), root_level,
+							   metaPage);
+	return result;
+}
+/*
+ * Appends a leaf tuple to level 0 of the build stack.  The leaf header is
+ * marked non-deleted, has no undo location, and carries xactInfo built from
+ * BootstrapTransactionId.
+ */
+static bool
+put_tuple_to_stack(BTreeDescr *desc, OIndexBuildStackItem *stack,
+				   OTuple tuple, int *root_level, BTreeMetaPage *metaPage)
+{
+	BTreeLeafTuphdr leaf_header = {0};
+	int			tuplesize;
+
+	leaf_header.deleted = BTreeLeafTupleNonDeleted;
+	leaf_header.undoLocation = InvalidUndoLocation;
+	leaf_header.xactInfo = OXID_GET_XACT_INFO(BootstrapTransactionId, RowLockUpdate, false);
+	tuplesize = o_btree_len(desc, tuple, OTupleLength);
+	return put_item_to_stack(desc, stack, 0,
+							 tuple, tuplesize, (Pointer) &leaf_header,
+							 sizeof(leaf_header), root_level, metaPage);
+}
+/*
+ * Builds the on-disk tree for desc from the tuples returned by
+ * tuplesort_getotuple(sortstate) and fills *file_header with the result.
+ *
+ * Every tuple is pushed to the leaf level of a stack that holds one page
+ * image per tree level.  After the input is exhausted, the unfinished pages
+ * below the root are written level by level and their downlinks pushed
+ * upwards; finally the root page is written and its downlink, together with
+ * the counters collected in a local BTreeMetaPage, is copied into
+ * *file_header.
+ */
+void
+btree_write_index_data(BTreeDescr *desc, TupleDesc tupdesc,
+					   Tuplesortstate *sortstate,
+					   uint64 ctid, uint64 bridge_ctid,
+					   CheckpointFileHeader *file_header)
+{
+	OTuple		idx_tup;
+	OIndexBuildStackItem *stack;
+	int			root_level = 0,
+				saved_root_level;
+	Page		root_page;
+	uint64		downlink;
+	BTreePageHeader *root_page_header;
+	FileExtent	extent;
+	BTreeMetaPage metaPage = {0};
+	int			i;
+	Datum	   *values;
+	bool	   *isnull;
+	uint32		chkpNum;
+
+	btree_open_smgr(desc);
+
+	stack = (OIndexBuildStackItem *) palloc0(sizeof(OIndexBuildStackItem) * ORIOLEDB_MAX_DEPTH);
+	values = (Datum *) palloc(sizeof(Datum) * tupdesc->natts);	/* NOTE(review): values and isnull are never read in this function */
+	isnull = (bool *) palloc(sizeof(bool) * tupdesc->natts);
+
+	pg_atomic_init_u64(&metaPage.datafileLength[0], 0);
+	pg_atomic_init_u64(&metaPage.datafileLength[1], 0);
+	pg_atomic_init_u64(&metaPage.numFreeBlocks, 0);
+	pg_atomic_init_u32(&metaPage.leafPagesNum, 0);
+	pg_atomic_init_u64(&metaPage.ctid, ctid);
+	pg_atomic_init_u64(&metaPage.bridge_ctid, bridge_ctid);
+	for (i = 0; i < ORIOLEDB_MAX_DEPTH; i++)
+	{
+		/* init_page_first_chunk() needs leaf flag to be set */
+		if (i == 0)
+			((BTreePageHeader *) stack[i].img)->flags = O_BTREE_FLAG_LEAF;
+		init_page_first_chunk(desc, stack[i].img, 0);
+		BTREE_PAGE_LOCATOR_FIRST(stack[i].img, &stack[i].loc);
+	}
+
+	idx_tup = tuplesort_getotuple(sortstate, true);
+	while (!O_TUPLE_IS_NULL(idx_tup))
+	{
+		Assert(o_tuple_size(idx_tup, &((OIndexDescr *) desc->arg)->leafSpec) <= O_BTREE_MAX_TUPLE_SIZE);
+		put_tuple_to_stack(desc, stack, idx_tup, &root_level, &metaPage);
+		idx_tup = tuplesort_getotuple(sortstate, true);
+	}
+
+	pfree(values);
+	pfree(isnull);
+
+	saved_root_level = root_level;	/* root_level may grow while downlinks are pushed below */
+	for (i = 0; i < saved_root_level; i++)
+	{
+		if (i != 0)
+			PAGE_SET_N_ONDISK(stack[i].img, BTREE_PAGE_ITEMS_COUNT(stack[i].img));
+
+		extent.len = InvalidFileExtentLen;
+		extent.off = InvalidFileExtentOff;
+
+		VALGRIND_CHECK_MEM_IS_DEFINED(stack[i].img, ORIOLEDB_BLCKSZ);
+
+		split_page_by_chunks(desc, stack[i].img);
+		downlink = perform_page_io_build(desc, stack[i].img, &extent, &metaPage);
+		if (i == 0)
+			pg_atomic_add_fetch_u32(&metaPage.leafPagesNum, 1);
+
+		put_downlink_to_stack(desc, stack, i + 1, downlink,
+							  stack[i].key.tuple, stack[i].keysize,
+							  &root_level, &metaPage);
+	}
+
+	root_page = stack[root_level].img;
+
+	root_page_header = (BTreePageHeader *) root_page;
+	if (root_level == 0)
+		root_page_header->flags = O_BTREE_FLAGS_ROOT_INIT;
+	root_page_header->rightLink = InvalidRightLink;
+	root_page_header->csn = COMMITSEQNO_FROZEN;
+	root_page_header->undoLocation = InvalidUndoLocation;
+	root_page_header->o_header.checkpointNum = 0;
+	root_page_header->prevInsertOffset = MaxOffsetNumber;
+
+	if (!O_PAGE_IS(root_page, LEAF))
+	{
+		PAGE_SET_N_ONDISK(root_page, BTREE_PAGE_ITEMS_COUNT(root_page));
+		PAGE_SET_LEVEL(root_page, root_level);
+	}
+
+	extent.len = InvalidFileExtentLen;
+	extent.off = InvalidFileExtentOff;
+
+	VALGRIND_CHECK_MEM_IS_DEFINED(root_page, ORIOLEDB_BLCKSZ);
+
+	split_page_by_chunks(desc, root_page);
+	downlink = perform_page_io_build(desc, root_page, &extent, &metaPage);
+	if (root_level == 0)
+		pg_atomic_add_fetch_u32(&metaPage.leafPagesNum, 1);
+
+	btree_close_smgr(desc);
+	pfree(stack);
+
+	if (orioledb_s3_mode)
+		chkpNum = S3_GET_CHKP_NUM(DOWNLINK_GET_DISK_OFF(downlink));
+	else
+		chkpNum = 0;
+
+
memset(file_header, 0, sizeof(*file_header));
+	file_header->rootDownlink = downlink;
+	file_header->datafileLength = pg_atomic_read_u64(&metaPage.datafileLength[chkpNum % 2]);
+	file_header->numFreeBlocks = pg_atomic_read_u64(&metaPage.numFreeBlocks);
+	file_header->leafPagesNum = pg_atomic_read_u32(&metaPage.leafPagesNum);
+	file_header->ctid = pg_atomic_read_u64(&metaPage.ctid);
+	file_header->bridgeCtid = pg_atomic_read_u64(&metaPage.bridge_ctid);
+}
+
+/*
+ * Publishes the checkpoint file header of a freshly built tree.
+ *
+ * For persistent and unlogged trees the header is written at offset 0 of the
+ * 'm' file of the checkpoint number returned by get_cur_checkpoint_number(),
+ * that number is recorded with o_update_latest_chkp_num(), and, in S3 mode,
+ * an upload of the file is scheduled.  For temporary trees the header is
+ * handed to insert_evicted_data() instead of being written to a file.
+ *
+ * Returns the location of the scheduled S3 task, or 0 when none was
+ * scheduled.  Failure to open or write the file is reported as FATAL.
+ */
+S3TaskLocation
+btree_write_file_header(BTreeDescr *desc, CheckpointFileHeader *file_header)
+{
+	File		file;
+	uint32		checkpoint_number;
+	bool		checkpoint_concurrent;
+	char	   *filename;
+	S3TaskLocation result = 0;
+
+	Assert(desc->storageType == BTreeStoragePersistence ||
+		   desc->storageType == BTreeStorageTemporary ||
+		   desc->storageType == BTreeStorageUnlogged);
+
+	checkpoint_number = get_cur_checkpoint_number(&desc->oids, desc->type,
+												  &checkpoint_concurrent);
+
+	if (desc->storageType == BTreeStoragePersistence || desc->storageType == BTreeStorageUnlogged)
+	{
+		SeqBufTag	prev_chkp_tag;
+
+		memset(&prev_chkp_tag, 0, sizeof(prev_chkp_tag));
+		prev_chkp_tag.key.oids = desc->oids;
+		prev_chkp_tag.key.tablespace = desc->tablespace;
+		prev_chkp_tag.num = checkpoint_number;
+		prev_chkp_tag.type = 'm';
+
+		filename = get_seq_buf_filename(&prev_chkp_tag);
+
+		Assert(!DiskDownlinkIsValid(file_header->rootDownlink) ||
+			   FileExtentLenIsValid(DOWNLINK_GET_DISK_LEN(file_header->rootDownlink)));
+
+		elog(DEBUG1, "btree_write_file_header: (%u, %u) chkp=%u "
+			 "rootDownlink=%lu datafileLength=%lu leafPagesNum=%u",
+			 desc->oids.datoid, desc->oids.relnode, checkpoint_number,
+			 (unsigned long) file_header->rootDownlink,
+			 (unsigned long) file_header->datafileLength,
+			 file_header->leafPagesNum);
+
+		file = PathNameOpenFile(filename, O_WRONLY | O_CREAT | PG_BINARY);
+		if (file < 0)
+			ereport(FATAL,
+					(errcode_for_file_access(),
+					 errmsg("could not open checkpoint header file %s: %m",
+							filename)));
+
+		if (OFileWrite(file, (Pointer) file_header,
+					   sizeof(CheckpointFileHeader), 0,
+					   WAIT_EVENT_DATA_FILE_WRITE) !=
+			sizeof(CheckpointFileHeader))
+		{
+			/*
+			 * Do not free filename here: errmsg() still formats it, and
+			 * freeing first could also clobber errno used by %m.  FATAL does
+			 * not return, so nothing is leaked.
+			 */
+			ereport(FATAL,
+					(errcode_for_file_access(),
+					 errmsg("Could not write checkpoint header to file %s: %m",
+							filename)));
+		}
+		FileClose(file);
+		pfree(filename);
+
+		o_update_latest_chkp_num(desc->oids.datoid,
+								 desc->oids.relnode,
+								 checkpoint_number);
+
+		if (orioledb_s3_mode)
+		{
+			OIndexKey	key = {.oids = desc->oids,.tablespace = desc->tablespace};
+
+			result = s3_schedule_file_part_write(checkpoint_number, key, -1, -1);
+		}
+	}
+	else
+	{
+		EvictedTreeData evicted_tree_data = {{0}};
+
+		evicted_tree_data.key.datoid = desc->oids.datoid;
+		evicted_tree_data.key.relnode = desc->oids.relnode;
+		evicted_tree_data.file_header = *file_header;
+		insert_evicted_data(&evicted_tree_data);
+	}
+
+	return result;
+}
diff --git a/contrib/orioledb/src/btree/check.c b/contrib/orioledb/src/btree/check.c
new file mode 100644
index 00000000000..bc04b6fde9b
--- /dev/null
+++ b/contrib/orioledb/src/btree/check.c
@@ -0,0 +1,715 @@
+/*-------------------------------------------------------------------------
+ *
+ * check.c
+ *		Routines for checking OrioleDB B-tree.
+ *
+ * Copyright (c) 2021-2026, Oriole DB Inc.
+ * Copyright (c) 2025-2026, Supabase Inc.
+ *
+ * IDENTIFICATION
+ *	  contrib/orioledb/src/btree/check.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "orioledb.h"
+
+#include "btree/check.h"
+#include "btree/io.h"
+#include "btree/page_chunks.h"
+#include "catalog/free_extents.h"
+#include "checkpoint/checkpoint.h"
+#include "recovery/recovery.h"
+#include "tableam/descr.h"
+#include "utils/compress.h"
+#include "utils/page_pool.h"
+#include "utils/seq_buf.h"
+#include "utils/ucm.h"
+
+#include "pgstat.h"
+#include "access/transam.h"
+
+/*
+ * Dynamic array of file extents.
+ */
+typedef struct
+{
+	/* array of extents */
+	FileExtent *extents;
+	/* number of allocated extents */
+	uint64		allocated;
+	/* number of valid extents in the array */
+	uint64		size;
+	/* number of file blocks covered by the extents in the array */
+	uint64		blocksCount;
+} ExtentsArray;
+
+/*
+ * State shared by the recursive tree walk of check_btree().
+ */
+typedef struct
+{
+	/* file extents of the pages visited by the walk */
+	ExtentsArray busy;
+	BTreeDescr *desc;
+	/* set once any inconsistency has been reported */
+	bool		hasError;
+	OBTreeFindPageContext context;
+} BTreeCheckStatus;
+
+static int	file_extent_cmp(const void *p1, const void *p2);
+static void check_walk_btree(BTreeCheckStatus *status, OInMemoryBlkno blkno,
+							 OInMemoryBlkno parentPagenum);
+static void add_extent(ExtentsArray *arr, FileExtent extent);
+static bool check_extents(ExtentsArray *busy, ExtentsArray *free);
+static void get_free_extents(BTreeDescr *desc, ExtentsArray *free_extents,
+							 bool force_file_check, uint32 chkp_num);
+static void get_free_extents_from_file(SeqBufTag *tag, off_t offset,
+									   ExtentsArray *free_extents,
+									   bool compressed, bool should_exists);
+static bool is_sorted_by_off(ExtentsArray *array);
+static bool is_sorted_by_len_off(ExtentsArray *array);
+
+/*
+ * Checks the consistency of a B-tree and of its file extent bookkeeping.
+ *
+ * The tree is walked from the root, collecting the file extent of every
+ * visited page.  For persistent trees the busy extents are then compared
+ * with the free extents (taken from the map file when force_file_check is
+ * set) and with the data file length, and the map and tmp files of the
+ * previous checkpoint are verified to be sorted.
+ *
+ * With wait_for_checkpoint set, the function retries until the checkpoint
+ * number can be obtained; otherwise it gives up with a NOTICE and returns
+ * false.  Every problem found is reported as a NOTICE.  Returns true when no
+ * problem was found.
+ */
+bool
+check_btree(BTreeDescr *desc, bool force_file_check, bool wait_for_checkpoint)
+{
+	BTreeMetaPage *metaPageBlkno = BTREE_GET_META(desc);
+	BTreeCheckStatus status;
+	ExtentsArray free_extents;
+	uint64		data_file_len = pg_atomic_read_u64(&metaPageBlkno->datafileLength[0]);	/* Fix for S3 mode */
+	bool		is_compressed = OCompressIsValid(desc->compress);
+	uint32		checkpoint_number = 0;
+	bool		copy_blkno;
+
+	memset(&status, 0, sizeof(BTreeCheckStatus));
+	memset(&free_extents, 0, sizeof(ExtentsArray));
+
+	/* get busy file extents */
+	status.desc = desc;
+	status.hasError = false;
+	init_page_find_context(&status.context, desc, COMMITSEQNO_INPROGRESS, BTREE_PAGE_FIND_MODIFY);
+
+	if (wait_for_checkpoint)
+	{
+		/*
+		 * Repeat until we get checkpoint number to avoid spurious failure due
+		 * to concurrent checkpoint when called by amcheck.
+		 */
+		while (!get_checkpoint_number(desc, desc->rootInfo.rootPageBlkno,
+									  &checkpoint_number, &copy_blkno))
+		{
+			CHECK_FOR_INTERRUPTS();
+			pg_usleep(1000L);
+		}
+	}
+	else if (!get_checkpoint_number(desc, desc->rootInfo.rootPageBlkno,
+									&checkpoint_number, &copy_blkno))
+	{
+		elog(NOTICE, "Tree is under checkpoint now");
+		return false;
+	}
+
+	Assert(checkpoint_number > 0);
+
+	check_walk_btree(&status, desc->rootInfo.rootPageBlkno, OInvalidInMemoryBlkno);
+
+	if (status.hasError)
+		return false;
+
+	/* extent bookkeeping is only checked for persistent trees */
+	if (desc->storageType != BTreeStoragePersistence)
+		return true;
+
+	/* get free file extents */
+	get_free_extents(desc, &free_extents, force_file_check,
+					 checkpoint_number - 1);
+
+	if (status.hasError)
+		return false;
+
+	/* check extents */
+	status.hasError = !check_extents(&status.busy, &free_extents);
+
+	if (status.hasError)
+		return false;
+
+	/* uint64 values are cast to match %lu, as elsewhere in this file */
+	if (data_file_len > status.busy.blocksCount + free_extents.blocksCount)
+	{
+		elog(NOTICE, "Not used file blocks from %lu to %lu",
+			 (unsigned long) (status.busy.blocksCount + free_extents.blocksCount),
+			 (unsigned long) data_file_len);
+		status.hasError = true;
+	}
+	else if (data_file_len < status.busy.blocksCount + free_extents.blocksCount)
+	{
+		elog(NOTICE, "Excess file blocks from %lu to %lu",
+			 (unsigned long) data_file_len,
+			 (unsigned long) (status.busy.blocksCount + free_extents.blocksCount));
+		status.hasError = true;
+	}
+
+	/* frees allocated bytes */
+	if (status.busy.size > 0)
+	{
+		Assert(status.busy.extents != NULL);
+		pfree(status.busy.extents);
+	}
+
+	if (free_extents.size > 0)
+	{
+		Assert(free_extents.extents != NULL);
+		pfree(free_extents.extents);
+	}
+
+	if (checkpoint_number > 1)
+	{
+		/* file extents sort check */
+		SeqBufTag	tag;
+		ExtentsArray map_extents,
+					tmp_extents;
+		bool		found;
+
+		memset(&map_extents, 0, sizeof(ExtentsArray));
+		memset(&tmp_extents, 0, sizeof(ExtentsArray));
+
+		/* zero the whole tag first, as btree_write_file_header() does */
+		memset(&tag, 0, sizeof(tag));
+		tag.key.oids = desc->oids;
+		tag.key.tablespace = desc->tablespace;
+		tag.num = o_get_latest_chkp_num(desc->oids.datoid,
+										desc->oids.relnode,
+										checkpoint_number - 1,
+										&found);
+		tag.type = 'm';
+
+		if (seq_buf_file_exist(&tag))
+		{
+			get_free_extents_from_file(&tag, sizeof(CheckpointFileHeader),
+									   &map_extents, is_compressed, false);
+		}
+		else if (found)
+		{
+			elog(NOTICE, "%s not exist", get_seq_buf_filename(&tag));
+			status.hasError = true;
+		}
+
+		tag.type = 't';
+		if (!is_compressed && seq_buf_file_exist(&tag))
+		{
+			get_free_extents_from_file(&tag, 0, &tmp_extents, false, false);
+		}
+
+		if (map_extents.size != 0)
+		{
+			bool		sorted = is_compressed ? is_sorted_by_len_off(&map_extents)
+				: is_sorted_by_off(&map_extents);
+
+			if (!sorted)
+			{
+				tag.type = 'm';
+				elog(NOTICE, "%s file is not sorted", get_seq_buf_filename(&tag));
+				status.hasError = true;
+			}
+			pfree(map_extents.extents);
+		}
+
+		if (tmp_extents.size != 0)
+		{
+			bool		sorted = is_compressed ? is_sorted_by_len_off(&tmp_extents)
+				: is_sorted_by_off(&tmp_extents);
+
+			if (!sorted)
+			{
+				tag.type = 't';
+				elog(NOTICE, "%s file is not sorted", get_seq_buf_filename(&tag));
+				status.hasError = true;
+			}
+			pfree(tmp_extents.extents);
+		}
+	}
+
+	return !status.hasError;
+}
+
+/*
+ * Appends extent into the extents array passed as arg; callback for
+ * foreach_free_extent().
+ */
+static void
+foreach_extent_append(BTreeDescr *desc, FileExtent extent, void *arg)
+{
+	ExtentsArray *arr = (ExtentsArray *) arg;
+
+	add_extent(arr, extent);
+}
+
+/*
+ * Gets free file extents for an index.
+ *
+ * With force_file_check the extents are read from the map file of the latest
+ * checkpoint number known for chkp_num.  Otherwise they are taken from the
+ * free-extent state of the running system plus the tmp files that follow it.
+ */
+static void
+get_free_extents(BTreeDescr *desc, ExtentsArray *free_extents,
+				 bool force_file_check, uint32 chkp_num)
+{
+	SeqBufTag	chkp_tag;
+	bool		is_compressed = OCompressIsValid(desc->compress);
+
+	/* zero the whole tag first, as btree_write_file_header() does */
+	memset(&chkp_tag, 0, sizeof(chkp_tag));
+	chkp_tag.key.oids = desc->oids;
+	chkp_tag.key.tablespace = desc->tablespace;
+
+	if (force_file_check)
+	{
+		bool		found;
+
+		/*
+		 * Reads free blocks from map file.
+		 */
+		chkp_tag.type = 'm';
+		chkp_tag.num = o_get_latest_chkp_num(desc->oids.datoid,
+											 desc->oids.relnode,
+											 chkp_num,
+											 &found);
+
+		get_free_extents_from_file(&chkp_tag, sizeof(CheckpointFileHeader),
+								   free_extents, is_compressed, found);
+	}
+	else if (!is_compressed)
+	{
+		/*
+		 * Reads free blocks as normal process for uncompressed index.
+		 */
+		off_t		freebuf_offset;
+		uint32		num;
+
+		chkp_tag = desc->freeBuf.shared->tag;
+		freebuf_offset = seq_buf_get_offset(&desc->freeBuf);
+
+		get_free_extents_from_file(&chkp_tag, freebuf_offset, free_extents, false, false);
+		for (num = chkp_tag.num; num < chkp_num; num++)
+		{
+			chkp_tag.num = num + 1;
+			chkp_tag.type = 't';
+			get_free_extents_from_file(&chkp_tag, 0, free_extents, false, false);
+		}
+	}
+	else
+	{
+		/*
+		 * Reads free blocks as normal process for compressed index.
+		 * foreach_free_extent reads from in-memory free extent trees, which
+		 * contain extents from consumed .tmp files.  We also need to read any
+		 * unconsumed .tmp files.
+		 */
+		BTreeMetaPage *metaPage = BTREE_GET_META(desc);
+		uint32		num;
+		uint32		consumed_num = metaPage->freeBuf.tag.num;
+
+		foreach_free_extent(desc, foreach_extent_append, (void *) free_extents);
+		for (num = consumed_num + 1; num <= chkp_num + 1; num++)
+		{
+			chkp_tag.num = num;
+			chkp_tag.type = 't';
+			get_free_extents_from_file(&chkp_tag, 0, free_extents, true, false);
+		}
+	}
+}
+
+/*
+ * Appends file extents from file to the free extents array.
+ *
+ * The file named by tag is read from the given offset in chunks of
+ * ORIOLEDB_BLCKSZ bytes until a short read.  Each record is a whole
+ * FileExtent when compressed or use_device is set; otherwise it is a uint32
+ * block offset that stands for an extent of length 1.  A file that cannot be
+ * opened contributes nothing; a NOTICE is raised for it only when
+ * should_exists is set.
+ */
+static void
+get_free_extents_from_file(SeqBufTag *tag, off_t offset,
+						   ExtentsArray *free_extents, bool compressed,
+						   bool should_exists)
+{
+	char	   *path = get_seq_buf_filename(tag);
+	File		fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
+	char		block[ORIOLEDB_BLCKSZ];
+	FileExtent	extent;
+	off_t		nread;
+
+	if (fd == -1)
+	{
+		if (should_exists)
+			ereport(NOTICE, (errcode_for_file_access(),
+							 errmsg("could not open map file %s: %m", path)));
+		pfree(path);
+		return;
+	}
+
+	for (;;)
+	{
+		int			pos = 0;
+
+		nread = OFileRead(fd, block, ORIOLEDB_BLCKSZ, offset,
+						  WAIT_EVENT_DATA_FILE_READ);
+		offset += nread;
+
+		/* decode every record of the chunk just read */
+		while (pos < nread)
+		{
+			if (!compressed && !use_device)
+			{
+				uint32		blkoff;
+
+				memcpy(&blkoff, block + pos, sizeof(uint32));
+				pos += sizeof(uint32);
+				extent.off = blkoff;
+				extent.len = 1;
+			}
+			else
+			{
+				memcpy(&extent, block + pos, sizeof(FileExtent));
+				pos += sizeof(FileExtent);
+			}
+			add_extent(free_extents, extent);
+		}
+
+		/* a short read means the end of the file has been reached */
+		if (nread != ORIOLEDB_BLCKSZ)
+			break;
+	}
+
+	FileClose(fd);
+	pfree(path);
+}
+
+/*
+ * Returns true if the busy and free extents array do not intersect and have no
+ * holes.
+ */
+static bool
+check_extents(ExtentsArray *busy, ExtentsArray *free)
+{
+	FileExtent	cur;
+	uint64		b,
+				f,
+				next_off;
+	bool		result = true;
+
+	qsort(busy->extents, busy->size, sizeof(FileExtent), file_extent_cmp);
+	qsort(free->extents, free->size, sizeof(FileExtent), file_extent_cmp);
+
+	b = 0;
+	f = 0;
+	cur.off = 0;
+	cur.len = 0;
+	while (true)
+	{
+		next_off = cur.off + cur.len;	/* first offset after the last accepted extent */
+
+		while (b < busy->size && next_off > busy->extents[b].off)	/* starts before next_off: overlaps what is already covered */
+		{
+			elog(NOTICE, "Excess busy extent %lu %u",
+				 (unsigned long) busy->extents[b].off,
+				 (unsigned) busy->extents[b].len);
+			result = false;
+			b++;
+		}
+
+		while (f < free->size && next_off > free->extents[f].off)
+		{
+			elog(NOTICE, "Excess free extent %lu %u",
+				 (unsigned long) free->extents[f].off,
+				 (unsigned) free->extents[f].len);
+			result = false;
+			f++;
+		}
+
+		if (f >= free->size && b >= busy->size)
+			break;
+
+		if (f >= free->size || (b < busy->size && file_extent_cmp(&free->extents[f], &busy->extents[b]) > 0))	/* the busy extent comes next */
+		{
+			if (next_off != busy->extents[b].off)	/* gap between next_off and the extent */
+			{
+				elog(NOTICE, "Extent %lu %u is neither free or busy",
+					 (unsigned long) (next_off),
+					 (unsigned) (busy->extents[b].off - next_off));
+				result = false;
+			}
+			cur = busy->extents[b++];
+		}
+		else
+		{
+			if (next_off != free->extents[f].off)
+			{
+				elog(NOTICE, "Extent %lu %u is neither free or busy",
+					 (unsigned long) (next_off),
+					 (unsigned) (free->extents[f].off - next_off));
+				result = false;
+			}
+			cur = free->extents[f++];
+		}
+	}
+
+	return result;
+}
+
+/*
+ * qsort() comparator for FileExtent: orders by off, then by len.
+ */
+static int
+file_extent_cmp(const void *p1, const void *p2)
+{
+	FileExtent	v1 = *((const FileExtent *) p1);
+	FileExtent	v2 = *((const FileExtent *) p2);
+
+	if (v1.off != v2.off)
+		return v1.off > v2.off ? 1 : -1;
+	if (v1.len != v2.len)
+		return v1.len > v2.len ? 1 : -1;
+	return 0;
+}
+
+/*
+ * Returns true if the array is sorted by off order.
+ *
+ * Precisely: the result is true when no extent has a smaller off than its
+ * predecessor; otherwise the first out-of-order pair is reported as a NOTICE
+ * and the result is false.
+ */
+static bool
+is_sorted_by_off(ExtentsArray *array)
+{
+	uint64		pos;
+
+	for (pos = 1; pos < array->size; pos++)
+	{
+		FileExtent *prev = &array->extents[pos - 1];
+		FileExtent *cur = &array->extents[pos];
+
+		if (prev->off <= cur->off)
+			continue;
+
+		elog(NOTICE, "Extents (%lu, %u), (%lu, %u) have wrong sort order",
+			 (unsigned long) prev->off,
+			 (unsigned) prev->len,
+			 (unsigned long) cur->off,
+			 (unsigned) cur->len);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Returns true if the array is sorted by (reverse len, off) order: longer
+ * extents come first, and extents of equal len are ordered by ascending off.
+ * The first out-of-order pair is reported as a NOTICE.
+ */
+static bool
+is_sorted_by_len_off(ExtentsArray *array)
+{
+	uint64		pos;
+
+	for (pos = 1; pos < array->size; pos++)
+	{
+		FileExtent *prev = &array->extents[pos - 1];
+		FileExtent *cur = &array->extents[pos];
+		bool		in_order;
+
+		in_order = (prev->len != cur->len) ? (prev->len > cur->len)
+			: (prev->off <= cur->off);
+		if (in_order)
+			continue;
+
+		elog(NOTICE, "Extents (%lu, %u), (%lu, %u) have wrong sort order",
+			 (unsigned long) prev->off,
+			 (unsigned) prev->len,
+			 (unsigned long) cur->off,
+			 (unsigned) cur->len);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Appends the extent to the array.
+ *
+ * The array starts with room for 16 extents and doubles its capacity every
+ * time it is full.  blocksCount is increased by the length of the extent.
+ */
+static void
+add_extent(ExtentsArray *arr, FileExtent extent)
+{
+	if (arr->size >= arr->allocated)
+	{
+		if (arr->allocated == 0)
+		{
+			arr->allocated = 16;
+			arr->extents = (FileExtent *) palloc(sizeof(FileExtent) * arr->allocated);
+		}
+		else
+		{
+			arr->allocated *= 2;
+			arr->extents = (FileExtent *) repalloc(arr->extents,
+												   sizeof(FileExtent) * arr->allocated);
+		}
+	}
+	arr->extents[arr->size++] = extent;
+	arr->blocksCount += extent.len;
+}
+
+/*
+ * Recursively walks the subtree rooted at the in-memory page blkno.
+ *
+ * The page is locked for the duration of the visit.  The lock on the parent
+ * (if any) is released right after the page has been locked and is taken
+ * again before the page is unlocked.  A right sibling flagged as
+ * BROKEN_SPLIT is reported as an error.  Downlinks that are under I/O are
+ * waited for and downlinks that are on disk are loaded with load_page(); in
+ * both cases the same item is examined again.  The file extent of the page,
+ * when valid, is added to status->busy.
+ */
+static void
+check_walk_btree(BTreeCheckStatus *status, OInMemoryBlkno blkno,
+				 OInMemoryBlkno parentPagenum)
+{
+	Page		p = O_GET_IN_MEMORY_PAGE(blkno);
+	BTreePageHeader *header = (BTreePageHeader *) p;
+	OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno);
+	OBTreeFindPageContext *context = &status->context;
+	FileExtent	extent;
+	uint64		rightLink;
+
+	Assert(OInMemoryBlknoIsValid(blkno));
+
+	lock_page(blkno);
+	if (OInMemoryBlknoIsValid(parentPagenum))
+		unlock_page(parentPagenum);
+
+	/* push this page onto the find context used by load_page() */
+	context->index++;
+	context->items[context->index].blkno = blkno;
+	context->items[context->index].pageChangeCount = O_PAGE_GET_CHANGE_COUNT(p);
+
+	rightLink = header->rightLink;
+	if (RightLinkIsValid(rightLink))
+	{
+		Page		rightP = O_GET_IN_MEMORY_PAGE(RIGHTLINK_GET_BLKNO(rightLink));
+
+		if (O_PAGE_IS(rightP, BROKEN_SPLIT))
+		{
+			elog(NOTICE, "BTree has a broken split.");
+			status->hasError = true;
+		}
+	}
+
+	if (!O_PAGE_IS(p, LEAF))
+	{
+		BTreePageItemLocator loc;
+
+		BTREE_PAGE_LOCATOR_FIRST(p, &loc);
+		while (BTREE_PAGE_LOCATOR_IS_VALID(p, &loc))
+		{
+			Pointer		ptr = BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc);
+			BTreeNonLeafTuphdr *tuphdr = (BTreeNonLeafTuphdr *) ptr;
+
+			if (DOWNLINK_IS_IN_MEMORY(tuphdr->downlink))
+			{
+				check_walk_btree(status, DOWNLINK_GET_IN_MEMORY_BLKNO(tuphdr->downlink),
+								 blkno);
+			}
+			else if (DOWNLINK_IS_IN_IO(tuphdr->downlink))
+			{
+				/* re-examine the same item once the I/O has finished */
+				wait_for_io_completion(DOWNLINK_GET_IO_LOCKNUM(tuphdr->downlink));
+				continue;
+			}
+			else if (DOWNLINK_IS_ON_DISK(tuphdr->downlink))
+			{
+				/* load the child, then re-examine the same item */
+				context->items[context->index].locator = loc;
+				load_page(context);
+				continue;
+			}
+			BTREE_PAGE_LOCATOR_NEXT(p, &loc);
+		}
+	}
+
+	if (FileExtentIsValid(page_desc->fileExtent))
+	{
+		extent = page_desc->fileExtent;
+		add_extent(&status->busy, extent);
+	}
+
+	if (OInMemoryBlknoIsValid(parentPagenum))
+		lock_page(parentPagenum);
+	unlock_page(blkno);
+	context->index--;
+}
+
+/*
+ * Recursively compresses a copy of every page of the subtree rooted at blkno
+ * with level lvl and accounts the compressed sizes in *stats.
+ *
+ * Children are visited first; on-disk children are loaded with load_page()
+ * under the page lock.  The page is then copied into a local buffer, passed
+ * through null_unused_bytes() and compressed.  Results larger than
+ * ORIOLEDB_BLCKSZ are counted in stats->oversize; other results are counted
+ * in the first stats->ranges entry that contains the size.  An error thrown
+ * inside the PG_TRY block is counted in stats->errors and discarded.
+ */
+static void
+btree_check_compression_recursive(BTreeDescr *desc, BTreeCompressStats *stats, OCompress lvl,
+								  OBTreeFindPageContext *context, OInMemoryBlkno blkno)
+{
+	char		buf[ORIOLEDB_BLCKSZ];
+	Page		p = O_GET_IN_MEMORY_PAGE(blkno);
+	size_t		compressed_size;
+	MemoryContext caller_ctx = CurrentMemoryContext;
+
+	ppool_ucm_inc_usage(desc->ppool, blkno);
+
+	context->index++;
+	context->items[context->index].blkno = blkno;
+	context->items[context->index].pageChangeCount = O_PAGE_GET_CHANGE_COUNT(p);
+
+	if (!O_PAGE_IS(p, LEAF))
+	{
+		BTreePageItemLocator loc;
+
+		BTREE_PAGE_LOCATOR_FIRST(p, &loc);
+		while (BTREE_PAGE_LOCATOR_IS_VALID(p, &loc))
+		{
+			Pointer		ptr = BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc);
+			BTreeNonLeafTuphdr *tuphdr = (BTreeNonLeafTuphdr *) ptr;
+
+			if (DOWNLINK_IS_IN_MEMORY(tuphdr->downlink))
+			{
+				btree_check_compression_recursive(desc, stats, lvl, context,
+												  DOWNLINK_GET_IN_MEMORY_BLKNO(tuphdr->downlink));
+			}
+			else if (DOWNLINK_IS_IN_IO(tuphdr->downlink))
+			{
+				wait_for_io_completion(DOWNLINK_GET_IO_LOCKNUM(tuphdr->downlink));
+				continue;
+			}
+			else if (DOWNLINK_IS_ON_DISK(tuphdr->downlink))
+			{
+				context->items[context->index].locator = loc;
+				lock_page(blkno);
+				load_page(context);
+				unlock_page(blkno);
+				continue;
+			}
+			BTREE_PAGE_LOCATOR_NEXT(p, &loc);
+		}
+	}
+
+	memcpy(buf, p, ORIOLEDB_BLCKSZ);
+	null_unused_bytes(buf);
+
+	PG_TRY();
+	{
+		o_compress_page(buf, &compressed_size, lvl);
+
+		stats->totalSize += ORIOLEDB_BLCKSZ;
+		stats->totalCompressedSize += compressed_size;
+
+		if (compressed_size > ORIOLEDB_BLCKSZ)
+		{
+			stats->oversize++;
+		}
+		else
+		{
+			int			i;
+
+			for (i = 0; i < stats->nranges; i++)
+			{
+				if (stats->ranges[i].from <= compressed_size
+					&& compressed_size <= stats->ranges[i].to)
+				{
+					if (O_PAGE_IS(p, LEAF))
+						stats->ranges[i].leaf_count++;
+					else
+						stats->ranges[i].node_count++;
+					break;
+				}
+			}
+		}
+	}
+	PG_CATCH();
+	{
+		/*
+		 * The error is deliberately not re-thrown, so the error state must
+		 * be reset here: otherwise every caught error would stay on the
+		 * error data stack.  Also return to the memory context that was
+		 * current before PG_TRY.
+		 */
+		MemoryContextSwitchTo(caller_ctx);
+		FlushErrorState();
+		stats->errors++;
+	}
+	PG_END_TRY();
+
+	context->index--;
+}
+
+/*
+ * Collects compression statistics for the whole tree of desc at compression
+ * level lvl into *stats.  The relation is locked in AccessShareLock mode for
+ * the duration of the walk.
+ */
+void
+check_btree_compression(BTreeDescr *desc, BTreeCompressStats *stats, OCompress lvl)
+{
+	OBTreeFindPageContext context;
+	bool		recovery = is_recovery_in_progress();
+
+	o_tables_rel_lock_extended(&desc->oids, AccessShareLock, recovery);
+	o_btree_load_shmem(desc);
+	init_page_find_context(&context, desc, COMMITSEQNO_INPROGRESS, BTREE_PAGE_FIND_MODIFY);
+
+	btree_check_compression_recursive(desc, stats, lvl, &context, desc->rootInfo.rootPageBlkno);
+
+	o_tables_rel_unlock_extended(&desc->oids, AccessShareLock, recovery);
+}
diff --git a/contrib/orioledb/src/btree/fastpath.c b/contrib/orioledb/src/btree/fastpath.c
new file mode 100644
index 00000000000..e1f701103a7
--- /dev/null
+++ b/contrib/orioledb/src/btree/fastpath.c
@@ -0,0 +1,712 @@
+/*-------------------------------------------------------------------------
+ *
+ * fastpath.c
+ *		Routines for fastpath intra-page navigation in B-tree.
+ *
+ * The "fast path" navigation enables us to find a downlink (child pointer)
+ * without copying page chunks into local memory and performing a full
+ * binary search on the tuple array.  In certain cases, we can walk a
+ * cache-friendly, fixed-stride array of values that mirrors the page layout,
+ * thereby reducing memory copying, branch mispredictions, and memory
+ * dereferences when descending the tree.
+ *
+ * Copyright (c) 2025-2026, Oriole DB Inc.
+ * Copyright (c) 2025-2026, Supabase Inc.
+ *
+ * IDENTIFICATION
+ *	  contrib/orioledb/src/btree/fastpath.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "orioledb.h"
+
+#include "btree/btree.h"
+#include "btree/fastpath.h"
+#include "btree/find.h"
+#include "postgres_ext.h"
+#include "storage/itemptr.h"
+#include "tableam/key_range.h"
+
+#include "catalog/pg_opclass_d.h"
+#include "commands/defrem.h"
+/*
+ * Describes one key type supported by the fast path: how its values are laid
+ * out inside a tuple and which routine searches an array of them.
+ */
+typedef struct
+{
+	Oid			typeid;			/* type OID, matched against the attribute's atttypid */
+	Oid			opcid;			/* operator class OID, compared with the index field's opclass */
+	int			typlen;			/* number of bytes a value advances the offset by */
+	int			align;			/* alignment passed to TYPEALIGN() for the value's offset */
+	ArraySearchFunc func;		/* search routine stored into the meta structure */
+} ArraySearchDesc;
+
+static ArraySearchDesc *find_array_search_desc_by_typeid(Oid typeid);
+
+static bool find_downlink_get_keys(BTreeDescr *desc,
+								   void *key, BTreeKeyType keyType,
+								   bool *inclusive, int numValues,
+								   Oid *types, Datum *values, uint8 *flags);
+
+static void oid_array_search(Pointer p, int stride, int *lower,
+							 int *upper, Datum keyDatum);
+static void int4_array_search(Pointer p, int stride, int *lower,
+							  int *upper, Datum keyDatum);
+static void int8_array_search(Pointer p, int stride, int *lower,
+							  int *upper, Datum keyDatum);
+static void float4_array_search(Pointer p, int stride, int *lower,
+								int *upper, Datum keyDatum);
+static void float8_array_search(Pointer p, int stride, int *lower,
+								int *upper, Datum keyDatum);
+static void tid_array_search(Pointer p, int stride, int *lower,
+							 int *upper, Datum keyDatum);
+
+ArraySearchDesc arraySearchDescs[] = {
+	{OIDOID, OID_BTREE_OPS_OID, sizeof(Oid), ALIGNOF_INT, oid_array_search},
+	{INT4OID, INT4_BTREE_OPS_OID, sizeof(int32), ALIGNOF_INT, int4_array_search},
+	{INT8OID, INT8_BTREE_OPS_OID, sizeof(int64), ALIGNOF_DOUBLE, int8_array_search},
+	{FLOAT4OID, InvalidOid, sizeof(float4), ALIGNOF_INT, float4_array_search},
+	{FLOAT8OID, FLOAT8_BTREE_OPS_OID, sizeof(float8), ALIGNOF_DOUBLE, float8_array_search},
+	{TIDOID, InvalidOid, sizeof(ItemPointerData), ALIGNOF_SHORT, tid_array_search}
+};
+
+/*
+ * Checks if the "fast path" navigation can be applied to the given
search + * and fills *meta structure if so. + */ +void +can_fastpath_find_downlink(OBTreeFindPageContext *context, + void *key, + BTreeKeyType keyType, + FastpathFindDownlinkMeta *meta) +{ + BTreeDescr *desc = context->desc; + OIndexDescr *id; + Oid types[FASTPATH_FIND_DOWNLINK_MAX_KEYS] = {InvalidOid}; + int i; + int offset; + + ASAN_UNPOISON_MEMORY_REGION(meta, sizeof(*meta)); + + if (!BTREE_PAGE_FIND_IS(context, FETCH) || + IS_SYS_TREE_OIDS(desc->oids)) + { + meta->enabled = false; + return; + } + + id = (OIndexDescr *) desc->arg; + + if (id->nonLeafTupdesc->natts >= FASTPATH_FIND_DOWNLINK_MAX_KEYS || + id->nonLeafSpec.natts != id->nonLeafTupdesc->natts) + { + meta->enabled = false; + return; + } + + if (keyType == BTreeKeyUniqueLowerBound || + keyType == BTreeKeyUniqueUpperBound) + meta->numKeys = id->nUniqueFields; + else if (id->desc.type != oIndexToast && id->desc.type != oIndexBridge) + meta->numKeys = id->nKeyFields; + else + meta->numKeys = id->nonLeafSpec.natts; + + offset = 0; + for (i = 0; i < meta->numKeys; i++) + { + ArraySearchDesc *searchDesc = find_array_search_desc_by_typeid(id->nonLeafTupdesc->attrs[i].atttypid); + OIndexField *field = &id->fields[i]; + + if (!searchDesc || searchDesc->opcid != field->opclass) + { + meta->enabled = false; + return; + } + + offset = TYPEALIGN(searchDesc->align, offset); + meta->funcs[i] = searchDesc->func; + meta->offsets[i] = offset; + types[i] = searchDesc->typeid; + + offset += searchDesc->typlen; + } + + if (!find_downlink_get_keys(context->desc, key, keyType, + &meta->inclusive, meta->numKeys, types, + meta->values, meta->flags)) + { + meta->enabled = false; + return; + } + + meta->enabled = true; + meta->length = MAXALIGN(id->nonLeafSpec.len); +} + +static ArraySearchDesc * +find_array_search_desc_by_typeid(Oid typeid) +{ + int i; + + for (i = 0; i < sizeof(arraySearchDescs) / sizeof(ArraySearchDesc); i++) + { + if (arraySearchDescs[i].typeid == typeid) + { + if (!OidIsValid(arraySearchDescs[i].opcid)) + { 
+ bool was_saving; + + was_saving = o_start_saving_inval_messages(); + arraySearchDescs[i].opcid = GetDefaultOpClass(typeid, BTREE_AM_OID); + o_stop_saving_inval_messages(was_saving); + } + return &arraySearchDescs[i]; + } + } + return NULL; +} + +/* + * Decompose search key into values for the "fast path" tree navigation. + */ +static bool +find_downlink_get_keys(BTreeDescr *desc, void *key, BTreeKeyType keyType, + bool *inclusive, int numValues, Oid *types, + Datum *values, uint8 *flags) +{ + TupleDesc tupdesc; + OTupleFixedFormatSpec *spec; + OIndexDescr *id; + OTuple *tuple; + int i; + + Assert(!IS_SYS_TREE_OIDS(desc->oids)); + + id = (OIndexDescr *) desc->arg; + *inclusive = false; + + if (keyType == BTreeKeyNone || + keyType == BTreeKeyRightmost) + { + for (i = 0; i < numValues; i++) + { + flags[i] = (keyType == BTreeKeyNone) ? FASTPATH_FIND_DOWNLINK_FLAG_MINUS_INF : FASTPATH_FIND_DOWNLINK_FLAG_PLUS_INF; + values[i] = (Datum) 0; + } + return true; + } + + if (keyType == BTreeKeyBound || + keyType == BTreeKeyUniqueLowerBound || + keyType == BTreeKeyUniqueUpperBound) + { + OBTreeKeyBound *bound = (OBTreeKeyBound *) key; + int num = Min(numValues, bound->nkeys); + + for (i = 0; i < num; i++) + { + uint8 f = bound->keys[i].flags; + + if (bound->keys[i].type != types[i]) + return false; + + if (f & O_VALUE_BOUND_UNBOUNDED) + { + flags[i] = (f & O_VALUE_BOUND_LOWER) ? 
FASTPATH_FIND_DOWNLINK_FLAG_MINUS_INF : FASTPATH_FIND_DOWNLINK_FLAG_PLUS_INF; + values[i] = (Datum) 0; + } + else + { + flags[i] = 0; + values[i] = bound->keys[i].value; + } + } + return true; + } + + Assert(keyType == BTreeKeyLeafTuple || + keyType == BTreeKeyNonLeafKey || + keyType == BTreeKeyPageHiKey); + + if (keyType == BTreeKeyPageHiKey) + *inclusive = true; + + if (keyType == BTreeKeyLeafTuple) + { + tupdesc = id->leafTupdesc; + spec = &id->leafSpec; + } + else + { + tupdesc = id->nonLeafTupdesc; + spec = &id->nonLeafSpec; + } + + tuple = (OTuple *) key; + + for (i = 0; i < numValues; i++) + { + bool isnull; + int attnum; + + attnum = OIndexKeyAttnumToTupleAttnum(keyType, id, i + 1); + values[i] = o_fastgetattr(*tuple, attnum, tupdesc, spec, &isnull); + + if (isnull) + flags[i] = (id->fields[i].nullfirst) ? FASTPATH_FIND_DOWNLINK_FLAG_MINUS_INF : FASTPATH_FIND_DOWNLINK_FLAG_PLUS_INF; + else + flags[i] = 0; + } + return true; +} + +OBTreeFastPathFindResult +fastpath_find_downlink(Pointer pagePtr, + OInMemoryBlkno blkno, + FastpathFindDownlinkMeta *meta, + BTreePageItemLocator *loc, + BTreeNonLeafTuphdr **tuphdrPtr) +{ + BTreePageHeader *imgHdr = (BTreePageHeader *) pagePtr; + BTreePageHeader *hdr = (BTreePageHeader *) O_GET_IN_MEMORY_PAGE(blkno); + int lower; + int upper; + int count; + int i; + int chunkIndex; + int itemIndex; + BTreePageChunk *chunk; + int chunkSize, + chunkItemsCount; + Pointer base; + uint64 state; + uint64 imageChangeCount = pg_atomic_read_u64(&imgHdr->o_header.state) & PAGE_STATE_CHANGE_COUNT_MASK; + OBTreeFastPathFindResult result; + static BTreeNonLeafTuphdr tuphdr; + + result = fastpath_find_chunk(pagePtr, blkno, meta, &chunkIndex); + + if (result != OBTreeFastPathFindOK) + return result; + + if (!hdr->chunkDesc[chunkIndex].chunkKeysFixed) + return OBTreeFastPathFindSlowpath; + + chunk = (BTreePageChunk *) ((Pointer) hdr + SHORT_GET_LOCATION(hdr->chunkDesc[chunkIndex].shortLocation)); + if (chunkIndex < imgHdr->chunksCount - 1) + { + 
chunkSize = SHORT_GET_LOCATION(hdr->chunkDesc[chunkIndex + 1].shortLocation) - SHORT_GET_LOCATION(hdr->chunkDesc[chunkIndex].shortLocation); + chunkItemsCount = hdr->chunkDesc[chunkIndex + 1].offset - hdr->chunkDesc[chunkIndex].offset; + } + else + { + chunkSize = imgHdr->dataSize - SHORT_GET_LOCATION(hdr->chunkDesc[chunkIndex].shortLocation); + chunkItemsCount = imgHdr->itemsCount - hdr->chunkDesc[chunkIndex].offset; + } + + pg_read_barrier(); + + if (chunkIndex == 0) + { + count = chunkItemsCount - 1; + base = (Pointer) chunk + MAXALIGN(sizeof(LocationIndex) * chunkItemsCount) + MAXALIGN(sizeof(BTreeNonLeafTuphdr)); + } + else + { + count = chunkItemsCount; + base = (Pointer) chunk + MAXALIGN(sizeof(LocationIndex) * chunkItemsCount); + } + + if (chunkSize != MAXALIGN(sizeof(LocationIndex) * chunkItemsCount) + + MAXALIGN(sizeof(BTreeNonLeafTuphdr)) * chunkItemsCount + + meta->length * count) + return OBTreeFastPathFindSlowpath; + + lower = 0; + upper = count; + for (i = 0; lower < upper && i < meta->numKeys; i++) + { + if (meta->flags[i] == 0) + meta->funcs[i] (base + MAXALIGN(sizeof(BTreeNonLeafTuphdr)) + meta->offsets[i], + MAXALIGN(sizeof(BTreeNonLeafTuphdr)) + meta->length, + &lower, &upper, meta->values[i]); + else if (meta->flags[i] & FASTPATH_FIND_DOWNLINK_FLAG_MINUS_INF) + upper = lower; + else if (meta->flags[i] & FASTPATH_FIND_DOWNLINK_FLAG_PLUS_INF) + lower = upper; + } + + itemIndex = meta->inclusive ? 
lower : upper; + + pg_read_barrier(); + + state = pg_atomic_read_u64(&hdr->o_header.state); + if (O_PAGE_STATE_READ_IS_BLOCKED(state) || + (state & PAGE_STATE_CHANGE_COUNT_MASK) != imageChangeCount) + return OBTreeFastPathFindRetry; + + if (chunkIndex == 0) + { + if (itemIndex == 0) + tuphdr = *((BTreeNonLeafTuphdr *) (base - MAXALIGN(sizeof(BTreeNonLeafTuphdr)))); + else + tuphdr = *((BTreeNonLeafTuphdr *) (base + (MAXALIGN(sizeof(BTreeNonLeafTuphdr)) + meta->length) * (itemIndex - 1))); + *tuphdrPtr = &tuphdr; + loc->chunk = chunk; + loc->chunkItemsCount = chunkItemsCount; + loc->chunkSize = chunkSize; + loc->itemOffset = itemIndex; + loc->chunkOffset = chunkIndex; + } + else + { + if (itemIndex > 0) + { + tuphdr = *((BTreeNonLeafTuphdr *) (base + (MAXALIGN(sizeof(BTreeNonLeafTuphdr)) + meta->length) * (itemIndex - 1))); + *tuphdrPtr = &tuphdr; + loc->chunk = chunk; + loc->chunkItemsCount = chunkItemsCount; + loc->chunkSize = chunkSize; + loc->itemOffset = itemIndex - 1; + loc->chunkOffset = chunkIndex; + } + else + { + chunkIndex--; + if (!hdr->chunkDesc[chunkIndex].chunkKeysFixed) + return OBTreeFastPathFindSlowpath; + + chunk = (BTreePageChunk *) ((Pointer) hdr + SHORT_GET_LOCATION(hdr->chunkDesc[chunkIndex].shortLocation)); + if (chunkIndex < imgHdr->chunksCount - 1) + { + chunkSize = SHORT_GET_LOCATION(hdr->chunkDesc[chunkIndex + 1].shortLocation) - SHORT_GET_LOCATION(hdr->chunkDesc[chunkIndex].shortLocation); + chunkItemsCount = hdr->chunkDesc[chunkIndex + 1].offset - hdr->chunkDesc[chunkIndex].offset; + } + else + { + chunkSize = imgHdr->dataSize - SHORT_GET_LOCATION(hdr->chunkDesc[chunkIndex].shortLocation); + chunkItemsCount = imgHdr->itemsCount - hdr->chunkDesc[chunkIndex].offset; + } + + pg_read_barrier(); + + if (chunkIndex == 0) + { + count = chunkItemsCount - 1; + base = (Pointer) chunk + MAXALIGN(sizeof(LocationIndex) * chunkItemsCount) + MAXALIGN(sizeof(BTreeNonLeafTuphdr)); + } + else + { + count = chunkItemsCount; + base = (Pointer) chunk + 
MAXALIGN(sizeof(LocationIndex) * chunkItemsCount); + } + + if (chunkSize != MAXALIGN(sizeof(LocationIndex) * chunkItemsCount) + + MAXALIGN(sizeof(BTreeNonLeafTuphdr)) * chunkItemsCount + + meta->length * count) + return OBTreeFastPathFindSlowpath; + + itemIndex = chunkItemsCount - 1; + + if (chunkIndex == 0 && itemIndex == 0) + tuphdr = *((BTreeNonLeafTuphdr *) (base - MAXALIGN(sizeof(BTreeNonLeafTuphdr)))); + else + tuphdr = *((BTreeNonLeafTuphdr *) (base + (MAXALIGN(sizeof(BTreeNonLeafTuphdr)) + meta->length) * (count - 1))); + *tuphdrPtr = &tuphdr; + + loc->chunk = chunk; + loc->chunkItemsCount = chunkItemsCount; + loc->chunkSize = chunkSize; + loc->itemOffset = itemIndex; + loc->chunkOffset = chunkIndex; + } + } + + pg_read_barrier(); + + state = pg_atomic_read_u64(&hdr->o_header.state); + if (O_PAGE_STATE_READ_IS_BLOCKED(state) || + (state & PAGE_STATE_CHANGE_COUNT_MASK) != imageChangeCount) + return OBTreeFastPathFindRetry; + + return OBTreeFastPathFindOK; +} + +OBTreeFastPathFindResult +fastpath_find_chunk(Pointer pagePtr, + OInMemoryBlkno blkno, + FastpathFindDownlinkMeta *meta, + int *chunkIndex) +{ + BTreePageHeader *imgHdr = (BTreePageHeader *) pagePtr; + BTreePageHeader *hdr = (BTreePageHeader *) O_GET_IN_MEMORY_PAGE(blkno); + int i; + int lower; + int upper; + int count; + int offset; + Pointer base; + uint64 imageChangeCount = pg_atomic_read_u64(&imgHdr->o_header.state) & PAGE_STATE_CHANGE_COUNT_MASK; + uint64 state; + + if (!O_PAGE_IS(pagePtr, HIKEYS_FIXED)) + return OBTreeFastPathFindSlowpath; + + count = O_PAGE_IS(pagePtr, RIGHTMOST) ? 
imgHdr->chunksCount - 1 : imgHdr->chunksCount; + + offset = SHORT_GET_LOCATION(hdr->chunkDesc[0].hikeyShortLocation); + + pg_read_barrier(); + + if (imgHdr->hikeysEnd - offset != count * meta->length) + return OBTreeFastPathFindSlowpath; + + base = (Pointer) hdr + offset; + lower = 0; + upper = count; + for (i = 0; lower < upper && i < meta->numKeys; i++) + { + if (meta->flags[i] == 0) + meta->funcs[i] (base + meta->offsets[i], + meta->length, &lower, &upper, + meta->values[i]); + else if (meta->flags[i] & FASTPATH_FIND_DOWNLINK_FLAG_MINUS_INF) + upper = lower; + else if (meta->flags[i] & FASTPATH_FIND_DOWNLINK_FLAG_PLUS_INF) + lower = upper; + } + + *chunkIndex = meta->inclusive ? lower : upper; + + pg_read_barrier(); + + /* Possible we need to visit the rightlink */ + if (*chunkIndex >= count) + return OBTreeFastPathFindSlowpath; + + state = pg_atomic_read_u64(&hdr->o_header.state); + if (O_PAGE_STATE_READ_IS_BLOCKED(state) || + (state & PAGE_STATE_CHANGE_COUNT_MASK) != imageChangeCount) + return OBTreeFastPathFindRetry; + + return OBTreeFastPathFindOK; +} + +/* + * Find the given value in the fixed-stride array of integers. The functions + * below do the same for other datatypes. 
+ */ +static void +int4_array_search(Pointer p, int stride, int *lower, int *upper, Datum keyDatum) +{ + int i; + bool lowerSet = false; + int32 key = DatumGetInt32(keyDatum); + + p += *lower * stride; + + for (i = *lower; i < *upper; i++) + { + int32 value = *((int32 *) p); + + if (value == key && !lowerSet) + { + *lower = i; + lowerSet = true; + } + else if (value > key) + { + if (!lowerSet) + *lower = i; + *upper = i; + return; + } + + p += stride; + } + if (!lowerSet) + *lower = *upper; +} + +static void +int8_array_search(Pointer p, int stride, int *lower, int *upper, Datum keyDatum) +{ + int i; + bool lowerSet = false; + int64 key = DatumGetInt64(keyDatum); + + p += *lower * stride; + + for (i = *lower; i < *upper; i++) + { + int64 value = *((int64 *) p); + + if (value == key && !lowerSet) + { + *lower = i; + lowerSet = true; + } + else if (value > key) + { + if (!lowerSet) + *lower = i; + *upper = i; + return; + } + + p += stride; + } + if (!lowerSet) + *lower = *upper; +} + +static void +oid_array_search(Pointer p, int stride, int *lower, int *upper, Datum keyDatum) +{ + int i; + bool lowerSet = false; + Oid key = DatumGetObjectId(keyDatum); + + p += *lower * stride; + + for (i = *lower; i < *upper; i++) + { + Oid value = *((Oid *) p); + + if (value == key && !lowerSet) + { + *lower = i; + lowerSet = true; + } + else if (value > key) + { + if (!lowerSet) + *lower = i; + *upper = i; + return; + } + + p += stride; + } + if (!lowerSet) + *lower = *upper; +} + +static void +float4_array_search(Pointer p, int stride, int *lower, int *upper, Datum keyDatum) +{ + int i; + bool lowerSet = false; + float4 key = DatumGetFloat4(keyDatum); + + p += *lower * stride; + + for (i = *lower; i < *upper; i++) + { + /* cppcheck-suppress invalidPointerCast */ + float4 value = *((float4 *) p); + + if (value == key && !lowerSet) + { + *lower = i; + lowerSet = true; + } + else if (value > key) + { + if (!lowerSet) + *lower = i; + *upper = i; + return; + } + + p += stride; + } + if 
(!lowerSet) + *lower = *upper; +} + +static void +float8_array_search(Pointer p, int stride, int *lower, int *upper, Datum keyDatum) +{ + int i; + bool lowerSet = false; + float8 key = DatumGetFloat8(keyDatum); + + p += *lower * stride; + + for (i = *lower; i < *upper; i++) + { + /* cppcheck-suppress invalidPointerCast */ + float8 value = *((float8 *) p); + + if (value == key && !lowerSet) + { + *lower = i; + lowerSet = true; + } + else if (value > key) + { + if (!lowerSet) + *lower = i; + *upper = i; + return; + } + + p += stride; + } + if (!lowerSet) + *lower = *upper; +} + +static int +tid_cmp(ItemPointer arg1, ItemPointer arg2) +{ + BlockNumber b1 = ItemPointerGetBlockNumberNoCheck(arg1); + BlockNumber b2 = ItemPointerGetBlockNumberNoCheck(arg2); + + if (b1 < b2) + return -1; + else if (b1 > b2) + return 1; + else if (ItemPointerGetOffsetNumberNoCheck(arg1) < + ItemPointerGetOffsetNumberNoCheck(arg2)) + return -1; + else if (ItemPointerGetOffsetNumberNoCheck(arg1) > + ItemPointerGetOffsetNumberNoCheck(arg2)) + return 1; + else + return 0; +} + +static void +tid_array_search(Pointer p, int stride, int *lower, int *upper, Datum keyDatum) +{ + int i; + bool lowerSet = false; + ItemPointer key = DatumGetItemPointer(keyDatum); + + p += *lower * stride; + + for (i = *lower; i < *upper; i++) + { + int cmp = tid_cmp((ItemPointer) p, key); + + if (cmp == 0 && !lowerSet) + { + *lower = i; + lowerSet = true; + } + else if (cmp > 0) + { + if (!lowerSet) + *lower = i; + *upper = i; + return; + } + + p += stride; + } + if (!lowerSet) + *lower = *upper; +} diff --git a/contrib/orioledb/src/btree/find.c b/contrib/orioledb/src/btree/find.c new file mode 100644 index 00000000000..ec3b73e62f3 --- /dev/null +++ b/contrib/orioledb/src/btree/find.c @@ -0,0 +1,1900 @@ +/*------------------------------------------------------------------------- + * + * find.c + * Routines for finding appropriate page in B-tree. + * + * Copyright (c) 2021-2026, Oriole DB Inc. 
+ * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/src/btree/find.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/fastpath.h" +#include "btree/find.h" +#include "btree/insert.h" +#include "btree/io.h" +#include "btree/page_chunks.h" +#include "tableam/descr.h" +#include "utils/stopevent.h" + +#include "access/transam.h" + +typedef struct +{ + OBTreeFindPageContext *context; + void *key; + BTreeKeyType keyType; + Page pagePtr; + int targetLevel; + OInMemoryBlkno blkno; + uint32 pageChangeCount; + PartialPageState *partial; + bool haveLock; + bool inserted; + bool tryLockFailed; +} OBTreeFindPageInternalContext; + +static bool follow_rightlink(OBTreeFindPageInternalContext *intCxt); +static void step_upward_level(OBTreeFindPageInternalContext *intCxt); +static bool btree_find_read_page(OBTreeFindPageContext *context, + OInMemoryBlkno blkno, uint32 pageChangeCount, + bool parent, void *key, BTreeKeyType keyType, + PartialPageState *partial, + bool loadHikeysChunk); +static ReadPageResult btree_find_try_read_page(OBTreeFindPageContext *context, + OInMemoryBlkno blkno, + uint32 pageChangeCount, bool parent, + void *key, BTreeKeyType keyType, + PartialPageState *partial, + bool loadHikeysChunk); + +static OffsetNumber btree_page_binary_search_chunks(BTreeDescr *desc, Page p, + Pointer key, + BTreeKeyType keyType); +static void btree_page_search_items(BTreeDescr *desc, Page p, Pointer key, + BTreeKeyType keyType, + BTreePageItemLocator *locator); +static void refresh_parent_img_chunk(OBTreeFindPageInternalContext *intCxt); + +/* + * Initialize B-tree page find context. 
+ */
+void
+init_page_find_context(OBTreeFindPageContext *context, BTreeDescr *desc,
+					   CommitSeqNo csn, uint16 flags)
+{
+	ASAN_UNPOISON_MEMORY_REGION(context, sizeof(*context));
+	context->partial.isPartial = false;
+	context->desc = desc;
+	context->csn = csn;
+	context->index = 0;
+	context->flags = flags;
+	context->imgUndoLoc = InvalidUndoLocation;
+	context->img = NULL;		/* page image buffers are not set up here */
+	context->parentImg = NULL;
+	O_TUPLE_SET_NULL(context->insertTuple);
+	O_TUPLE_SET_NULL(context->lokey.tuple);
+}
+
+
+/*
+ * Choose the downlink to follow on the non-leaf page in intCxt->pagePtr.
+ *
+ * When fastPathDownlink is set, fastpath_find_downlink() is tried first and
+ * its result is returned unless it reports OBTreeFastPathFindSlowpath.
+ * Otherwise the regular search is done: the hikeys chunk of a partial page
+ * is loaded if needed, the rightlink is followed when required, and the
+ * item is located by key type (last item, first item, or page search
+ * followed by a step to the previous item).
+ *
+ * Returns:
+ *  OBTreeFastPathFindOK      - *loc locates the downlink and *tuphdr points
+ *                              to its non-leaf tuple header.
+ *  OBTreeFastPathFindRetry   - the caller has to repeat the iteration (the
+ *                              partial page could not be read, or the
+ *                              rightlink was followed and the search stepped
+ *                              one level up).
+ *  OBTreeFastPathFindFailure - intCxt->tryLockFailed or intCxt->inserted was
+ *                              set after follow_rightlink(), or the partial
+ *                              page could not be read in TRY_LOCK mode.
+ */
+static OBTreeFastPathFindResult
+page_find_downlink(OBTreeFindPageInternalContext *intCxt,
+				   FastpathFindDownlinkMeta *meta,
+				   int level,
+				   bool fastPathDownlink,
+				   BTreePageItemLocator *loc,
+				   BTreeNonLeafTuphdr **tuphdr)
+{
+	OBTreeFindPageContext *context = intCxt->context;
+	BTreeDescr *desc = context->desc;
+	void	   *key = intCxt->key;
+	BTreeKeyType keyType = intCxt->keyType;
+	bool		itemFound = true;
+
+	if (fastPathDownlink)
+	{
+		OBTreeFastPathFindResult result;
+
+		result = fastpath_find_downlink(intCxt->pagePtr, intCxt->blkno,
+										meta, loc, tuphdr);
+
+		if (result != OBTreeFastPathFindSlowpath)
+			return result;
+	}
+
+	if (intCxt->partial &&
+		intCxt->partial->isPartial &&
+		!intCxt->partial->hikeysChunkIsLoaded)
+	{
+		if (!partial_load_hikeys_chunk(intCxt->partial, intCxt->pagePtr))
+			return OBTreeFastPathFindRetry;
+	}
+
+	/*
+	 * BTreeKeyNone requests leftmost page.  Otherwise, consider following the
+	 * rightlink.
+	 */
+	if (keyType != BTreeKeyNone)
+	{
+		if (follow_rightlink(intCxt))
+		{
+			if (intCxt->tryLockFailed)
+				return OBTreeFastPathFindFailure;
+			if (intCxt->inserted)
+				return OBTreeFastPathFindFailure;
+			Assert(context->index > 0);
+			Assert(!intCxt->haveLock);
+			step_upward_level(intCxt);
+			return OBTreeFastPathFindRetry;
+		}
+	}
+
+	/*
+	 * Choose the appropriate downlink for further search.
+	 */
+	if (keyType == BTreeKeyRightmost)
+		BTREE_PAGE_LOCATOR_LAST(intCxt->pagePtr, loc);
+	else if (keyType == BTreeKeyNone)
+		BTREE_PAGE_LOCATOR_FIRST(intCxt->pagePtr, loc);
+	else
+	{
+		Assert(key);
+		/* Have to do the binary search otherwise */
+		itemFound = btree_page_search(desc, intCxt->pagePtr, key, keyType,
+									  intCxt->partial, loc);
+		if (itemFound)
+		{
+			/* The downlink is the item just before the search position. */
+			BTREE_PAGE_LOCATOR_PREV(intCxt->pagePtr, loc);
+			if (intCxt->partial)
+				itemFound = partial_load_chunk(intCxt->partial,
+											   intCxt->pagePtr,
+											   loc->chunkOffset,
+											   NULL);
+		}
+	}
+
+	if (intCxt->partial)
+	{
+		if (!itemFound || !partial_load_chunk(intCxt->partial,
+											  intCxt->pagePtr,
+											  loc->chunkOffset,
+											  NULL))
+		{
+			Assert(!intCxt->haveLock);
+			if (BTREE_PAGE_FIND_IS(context, TRY_LOCK))
+				return OBTreeFastPathFindFailure;
+			return OBTreeFastPathFindRetry;
+		}
+
+		if (BTREE_PAGE_FIND_IS(context, IMAGE) &&
+			level == intCxt->targetLevel + 1 &&
+			BTREE_PAGE_FIND_IS(context, KEEP_LOKEY))
+		{
+			/*
+			 * We may need to load one more tuple for a backward iteration:
+			 * when the downlink is the first item of its chunk, the previous
+			 * chunk is loaded as well.
+			 */
+			if (loc->itemOffset == 0 && loc->chunkOffset > 0 &&
+				!partial_load_chunk(intCxt->partial, intCxt->pagePtr,
+									loc->chunkOffset - 1, NULL))
+			{
+				Assert(!intCxt->haveLock);
+				return OBTreeFastPathFindRetry;
+			}
+		}
+	}
+
+	*tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(intCxt->pagePtr, loc);
+
+	return OBTreeFastPathFindOK;
+}
+/*
+ * Locate the item for the search key on the page in intCxt->pagePtr.
+ *
+ * When `fastpath` is set and the page image is partial, the chunk is chosen
+ * with fastpath_find_chunk(), only that chunk is loaded, and the item is
+ * searched within it.  If the fast path reports OBTreeFastPathFindSlowpath
+ * (or is not used), the regular search is done: the hikeys chunk of a
+ * partial page is loaded if needed, the rightlink is followed when
+ * required, and the item is located by key type.
+ *
+ * On OBTreeFastPathFindOK *loc locates the item; *tuphdr is set only when
+ * level > 0.  OBTreeFastPathFindRetry asks the caller to repeat the
+ * iteration, OBTreeFastPathFindFailure reports a failed attempt (TRY_LOCK
+ * mode or intCxt->tryLockFailed).
+ */
+static OBTreeFastPathFindResult
+page_find_item(OBTreeFindPageInternalContext *intCxt,
+			   FastpathFindDownlinkMeta *meta,
+			   int level,
+			   bool fastpath,
+			   BTreePageItemLocator *loc,
+			   BTreeNonLeafTuphdr **tuphdr)
+{
+	OBTreeFindPageContext *context = intCxt->context;
+	BTreeDescr *desc = context->desc;
+	void	   *key = intCxt->key;
+	BTreeKeyType keyType = intCxt->keyType;
+	bool		itemFound = true;
+
+	if (fastpath && intCxt->partial->isPartial)
+	{
+		OBTreeFastPathFindResult result;
+		int			chunkIndex;
+
+		Assert(!BTREE_PAGE_FIND_IS(context, MODIFY));
+
+		result = fastpath_find_chunk(intCxt->pagePtr,
+									 intCxt->blkno,
+									 meta,
+									 &chunkIndex);
+
+		if (result == OBTreeFastPathFindOK &&
+			!partial_load_chunk(intCxt->partial,
+								intCxt->pagePtr,
+								chunkIndex,
+								loc))
+			result = OBTreeFastPathFindRetry;
+
+		if (result == OBTreeFastPathFindOK)
+		{
+			if (keyType == BTreeKeyRightmost)
+			{
+				loc->itemOffset = loc->chunkItemsCount - 1;
+			}
+			else if (keyType == BTreeKeyNone)
+			{
+				loc->itemOffset = 0;
+			}
+			else
+			{
+				btree_page_search_items(desc, intCxt->pagePtr,
+										key, keyType, loc);
+			}
+
+			if (page_locator_find_real_item(intCxt->pagePtr,
+											intCxt->partial,
+											loc))
+			{
+				if (level > 0)
+					*tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(intCxt->pagePtr, loc);
+
+				return OBTreeFastPathFindOK;
+			}
+			else
+			{
+				result = OBTreeFastPathFindRetry;
+			}
+		}
+
+		if (result == OBTreeFastPathFindRetry)
+		{
+			/*
+			 * Can not read the partial page; this happens if the page was
+			 * concurrently changed.  But it should not happen under the
+			 * lock_page().
+			 */
+			Assert(!intCxt->haveLock);
+			if (BTREE_PAGE_FIND_IS(context, TRY_LOCK))
+				return OBTreeFastPathFindFailure;
+			return OBTreeFastPathFindRetry;
+		}
+		else if (result == OBTreeFastPathFindFailure)
+		{
+			return OBTreeFastPathFindFailure;
+		}
+		Assert(result == OBTreeFastPathFindSlowpath);
+	}
+
+	if (intCxt->partial &&
+		intCxt->partial->isPartial &&
+		!intCxt->partial->hikeysChunkIsLoaded)
+	{
+		if (!partial_load_hikeys_chunk(intCxt->partial, intCxt->pagePtr))
+			return OBTreeFastPathFindRetry;
+	}
+
+	/*
+	 * BTreeKeyNone requests leftmost page.  Otherwise, consider following the
+	 * rightlink.
+	 */
+	if (keyType != BTreeKeyNone)
+	{
+		if (follow_rightlink(intCxt))
+		{
+			if (intCxt->tryLockFailed)
+				return OBTreeFastPathFindFailure;
+			Assert(context->index > 0);
+			Assert(!intCxt->haveLock);
+			step_upward_level(intCxt);
+			return OBTreeFastPathFindRetry;
+		}
+	}
+
+	/*
+	 * Choose the appropriate downlink for further search.
+	 */
+	if (keyType == BTreeKeyRightmost)
+		BTREE_PAGE_LOCATOR_LAST(intCxt->pagePtr, loc);
+	else if (keyType == BTreeKeyNone)
+		BTREE_PAGE_LOCATOR_FIRST(intCxt->pagePtr, loc);
+	else
+	{
+		Assert(key);
+		/* Have to do the binary search otherwise */
+		itemFound = btree_page_search(desc, intCxt->pagePtr,
+									  key, keyType,
+									  intCxt->partial, loc);
+		if (itemFound && !BTREE_PAGE_FIND_IS(context, MODIFY))
+			itemFound = page_locator_find_real_item(intCxt->pagePtr,
+													intCxt->partial,
+													loc);
+	}
+
+	if (intCxt->partial &&
+		(!itemFound || !partial_load_chunk(intCxt->partial,
+										   intCxt->pagePtr,
+										   loc->chunkOffset,
+										   NULL)))
+	{
+		/*
+		 * Can not read the partial page; this happens if the page was
+		 * concurrently changed.  But it should not happen under the
+		 * lock_page().
+		 */
+		Assert(!intCxt->haveLock);
+		if (BTREE_PAGE_FIND_IS(context, TRY_LOCK))
+			return OBTreeFastPathFindFailure;
+		return OBTreeFastPathFindRetry;
+	}
+
+	if (level > 0)
+		*tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(intCxt->pagePtr, loc);
+
+	return OBTreeFastPathFindOK;
+}
+
+/*
+ * Refresh context->parentImg from the locked shared-memory page held
+ * by `intCxt` and rebind the current locator's chunk onto parentImg.
+ * Used by find_page at level == targetLevel + 1 in IMAGE mode when
+ * intCxt->pagePtr is the real shared-memory page (not parentImg):
+ * without this rebind, the iterator's later find_right_page /
+ * find_left_page would navigate through a chunk pointer into a page
+ * the descent has already unlocked.
+ *
+ * Only the page header (with hikeys) and the chunk that the locator
+ * currently references are copied; the partial state is set up so
+ * other chunks can be loaded on demand by partial_load_chunk if
+ * find_right_page / find_left_page later visit them.  The standard
+ * consistency check in partial_load_chunk then falls through to a
+ * find_page re-descent if the source has been concurrently mutated.
+ */
+static void
+refresh_parent_img_chunk(OBTreeFindPageInternalContext *intCxt)
+{
+	OBTreeFindPageContext *context = intCxt->context;
+	Pointer		src = intCxt->pagePtr;
+	BTreePageItemLocator *locator = &context->items[context->index].locator;
+	BTreePageHeader *hdr = (BTreePageHeader *) src;
+	OffsetNumber chunkOffset = locator->chunkOffset;
+	LocationIndex chunkBegin;
+	LocationIndex chunkEnd;
+
+	/* The chunk spans up to the next chunk, or up to the end of page data. */
+	chunkBegin = SHORT_GET_LOCATION(hdr->chunkDesc[chunkOffset].shortLocation);
+	if (chunkOffset + 1 < hdr->chunksCount)
+		chunkEnd = SHORT_GET_LOCATION(hdr->chunkDesc[chunkOffset + 1].shortLocation);
+	else
+		chunkEnd = hdr->dataSize;
+
+	/* Header including the hikeys chunk. */
+	memcpy(context->parentImg, src, hdr->hikeysEnd);
+	/* The single chunk that `locator` references. */
+	memcpy(context->parentImg + chunkBegin,
+		   src + chunkBegin,
+		   chunkEnd - chunkBegin);
+
+	context->partial.src = src;
+	context->partial.isPartial = true;
+	context->partial.hikeysChunkIsLoaded = true;
+	memset(context->partial.chunkIsLoaded, 0,
+		   sizeof(context->partial.chunkIsLoaded));
+	context->partial.chunkIsLoaded[chunkOffset] = true;
+
+	locator->chunk =
+		(BTreePageChunk *) (context->parentImg + chunkBegin);
+}
+
+/*--
+ * Locate page and location within it for given key
+ *
+ * - context - context of parent pages
+ * - key - key/tuple for search (NULL for the leftmost page)
+ * - keyType - type of the key
+ * - targetLevel - level of the target page to find
+ *
+ * For better efficiency on large pages we use partial approach for page read
+ * from the shared memory. We have 3 alternative types of the call
+ * depending on context->flags:
+ *
+ * 1. BTREE_PAGE_FIND_FETCH - fetches a single tuple. It uses partial read for
+ *	  all pages.
+ *
+ * 2. BTREE_PAGE_FIND_MODIFY - find the page for modification. It uses partial read
+ *	  for all parent pages, calls lock_page() on a target page and searches a tuple
+ *	  on the target page in the shared memory.
+ *
+ * 3. BTREE_PAGE_FIND_IMAGE - copy a target leaf(!) to context->img. It is useful
+ *	  for iteration through the page. Reads parent pages partially and then
+ *	  memcpy()s a leaf page to the context.image. It holds lokey
+ *	  if BTREE_PAGE_FIND_KEEP_LOKEY is set.
+ */ +OFindPageResult +find_page(OBTreeFindPageContext *context, void *key, BTreeKeyType keyType, + uint16 targetLevel) +{ + BTreeDescr *desc = context->desc; + OBTreeFindPageInternalContext intCxt; + BTreePageItemLocator loc; + bool needLock = false, + fetchFlag PG_USED_FOR_ASSERTS_ONLY = BTREE_PAGE_FIND_IS(context, FETCH), + modifyFlag = BTREE_PAGE_FIND_IS(context, MODIFY), + imageFlag = BTREE_PAGE_FIND_IS(context, IMAGE), + tryFlag = BTREE_PAGE_FIND_IS(context, TRY_LOCK), + fixLeafFlag = BTREE_PAGE_FIND_IS(context, FIX_LEAF_SPLIT), + noFixFlag PG_USED_FOR_ASSERTS_ONLY = BTREE_PAGE_FIND_IS(context, NO_FIX_SPLIT), + keepLokeyFlag = BTREE_PAGE_FIND_IS(context, KEEP_LOKEY), + downlinkLocationFlag = BTREE_PAGE_FIND_IS(context, DOWNLINK_LOCATION); + bool shmemIsReloaded = false; + FastpathFindDownlinkMeta fastpathMeta; + Jsonb *params = NULL; + + memset(&intCxt, 0, sizeof(intCxt)); + ASAN_UNPOISON_MEMORY_REGION(&intCxt, sizeof(intCxt)); + intCxt.context = context; + intCxt.key = key; + intCxt.keyType = keyType; + intCxt.targetLevel = targetLevel; + intCxt.inserted = false; + + + ASAN_UNPOISON_MEMORY_REGION(&fastpathMeta, sizeof(fastpathMeta)); + if (STOPEVENTS_ENABLED()) + fastpathMeta.enabled = false; + else + can_fastpath_find_downlink(context, key, keyType, &fastpathMeta); + + /* + * See description of the function. 
+ */ + Assert((imageFlag && (targetLevel <= ORIOLEDB_MAX_DEPTH) && !fetchFlag && !modifyFlag) + || (imageFlag && targetLevel == 0 && !fetchFlag && modifyFlag) + || (!imageFlag && fetchFlag && !modifyFlag && !keepLokeyFlag) + || (!imageFlag && !fetchFlag && modifyFlag && !keepLokeyFlag)); + Assert(!(COMMITSEQNO_IS_NORMAL(context->csn) && modifyFlag)); + + /* resets the context before start */ + if (BTREE_PAGE_FIND_IS(context, KEEP_LOKEY)) + { + BTREE_PAGE_FIND_UNSET(context, LOKEY_EXISTS); + BTREE_PAGE_FIND_UNSET(context, LOKEY_SIBLING); + BTREE_PAGE_FIND_UNSET(context, LOKEY_UNDO); + } + context->imgUndoLoc = InvalidUndoLocation; + context->partial.isPartial = false; + context->index = 0; + + if (!tryFlag) + { + o_btree_load_shmem(desc); + } + else + { + if (!o_btree_try_use_shmem(desc)) + return OFindPageResultFailure; + } + Assert(ORootPageIsValid(desc) && OMetaPageIsValid(desc)); + + /* starts from the rootPageBlkno */ + intCxt.blkno = desc->rootInfo.rootPageBlkno; + intCxt.pageChangeCount = desc->rootInfo.rootPageChangeCount; + while (true) + { + BTreeNonLeafTuphdr *nonLeafHdr = NULL; + int level; + OInMemoryBlkno parentBlkno; + bool wrongChangeCount = false; + Pointer p; + bool fastpath; + + /* + * Local-pool slots are NULLed on eviction, unlike shared-pool slots + * whose shmem page stays readable (only pageChangeCount changes). An + * IN_MEMORY downlink we just descended through may reference a slot + * the backend evicted earlier in the same call chain -- e.g. a + * reserve_page triggered during a seq scan, or a find_page invoked + * from an undo callback. PAGE_GET_LEVEL below would segfault on the + * NULL slot, so step back to the parent and re-resolve the downlink + * (it now points to disk). At the root there is no parent, so report + * failure. 
+ */ + if (O_PAGE_IS_LOCAL(intCxt.blkno) && + local_ppool_pages[intCxt.blkno & O_BLKNO_MASK] == NULL) + { + if (context->index == 0) + return OFindPageResultFailure; + step_upward_level(&intCxt); + continue; + } + + p = O_GET_IN_MEMORY_PAGE(intCxt.blkno); + level = PAGE_GET_LEVEL(p); + + fastpath = fastpathMeta.enabled && !needLock; + fastpath = fastpath && (keyType != BTreeKeyPageHiKey || level > 0); + + intCxt.partial = NULL; + if (!imageFlag || level > 0) + context->partial.isPartial = false; + + /* + * else saves isPartial flag for the parent of the leaf in imageFlag + * case + */ + + if (needLock || (modifyFlag && level == targetLevel)) + { + if (tryFlag) + { + if (!try_lock_page(intCxt.blkno)) + return OFindPageResultFailure; + intCxt.pagePtr = p; + intCxt.haveLock = true; + needLock = false; + } + else if (!O_TUPLE_IS_NULL(context->insertTuple)) + { + OLockPageWithTupleResult result; + + result = lock_page_with_tuple(desc, + &intCxt.blkno, + &intCxt.pageChangeCount, + context->insertXactInfo, + context->insertTuple); + + if (result == OLockPageWithTupleResultLocked) + { + p = O_GET_IN_MEMORY_PAGE(intCxt.blkno); + intCxt.pagePtr = p; + intCxt.haveLock = true; + needLock = false; + } + else if (result == OLockPageWithTupleResultInserted) + { + return OFindPageResultInserted; + } + else + { + Assert(result == OLockPageWithTupleResultRefindNeeded); + wrongChangeCount = true; + } + } + else + { + lock_page(intCxt.blkno); + intCxt.pagePtr = p; + intCxt.haveLock = true; + needLock = false; + } + } + else + { + bool useParentImg = false; + + if (imageFlag) + { + /* + * In BTREE_PAGE_FIND_IMAGE case we read a target targetLevel + * to the context.img without partial and read upper non-leaf + * pages to the context.parentImg partially. + * + * We consider it's OK to return page of lower targetLevel + * than required, if tree doesn't have enough height. That's + * suitable for sequential scan (see btree_scan.c). 
+ */ + if (level <= targetLevel) + { + useParentImg = false; + intCxt.partial = NULL; + Assert(!fastpath); + } + else + { + useParentImg = true; + intCxt.partial = &context->partial; + } + } + else + { + /* + * In other cases we can use the img to hold a partial data. + */ + useParentImg = false; + intCxt.partial = &context->partial; + } + + intCxt.haveLock = false; + if (tryFlag) + { + ReadPageResult result; + + result = btree_find_try_read_page(context, intCxt.blkno, + intCxt.pageChangeCount, + useParentImg, + key, keyType, + intCxt.partial, + !fastpath); + intCxt.pagePtr = useParentImg ? context->parentImg : context->img; + if (result == ReadPageResultWrongPageChangeCount) + { + wrongChangeCount = true; + } + else if (result == ReadPageResultFailed) + { + return OFindPageResultFailure; + } + } + else + { + bool result; + + result = btree_find_read_page(context, intCxt.blkno, + intCxt.pageChangeCount, + useParentImg, key, keyType, + intCxt.partial, + !fastpath); + intCxt.pagePtr = useParentImg ? context->parentImg : context->img; + if (!result) + { + if (context->index == 0) + { + wrongChangeCount = true; + } + else + { + step_upward_level(&intCxt); + continue; + } + } + } + } + + /* Re-try the page level has been changed */ + if (!wrongChangeCount && level != PAGE_GET_LEVEL(intCxt.pagePtr)) + { + if (intCxt.haveLock) + { + unlock_page(intCxt.blkno); + intCxt.haveLock = false; + } + continue; + } + + if (!wrongChangeCount && STOPEVENTS_ENABLED()) + { + params = btree_page_stopevent_params(desc, intCxt.pagePtr); + STOPEVENT(STOPEVENT_PAGE_READ, params); + } + + /* Handle the incorrect root situation */ + if (context->index == 0 && (wrongChangeCount || + intCxt.pageChangeCount != O_PAGE_GET_CHANGE_COUNT(intCxt.pagePtr))) + { + /* Release lock if needed */ + if (intCxt.haveLock) + { + unlock_page(intCxt.blkno); + intCxt.haveLock = false; + } + + /* + * We don't need to re-read shared memory more that once with TRY + * flag. 
+ */ + if (tryFlag && shmemIsReloaded) + return OFindPageResultFailure; + + /* Reload root information from the shared memory */ + desc->rootInfo.rootPageBlkno = OInvalidInMemoryBlkno; + desc->rootInfo.metaPageBlkno = OInvalidInMemoryBlkno; + desc->rootInfo.rootPageChangeCount = 0; + if (tryFlag) + { + if (!o_btree_try_use_shmem(desc)) + return OFindPageResultFailure; + } + else + { + o_btree_load_shmem(desc); + } + shmemIsReloaded = true; + + /* Initiate another attempt */ + intCxt.blkno = desc->rootInfo.rootPageBlkno; + intCxt.pageChangeCount = desc->rootInfo.rootPageChangeCount; + p = O_GET_IN_MEMORY_PAGE(intCxt.blkno); + continue; + } + + if (context->index > 0 && (wrongChangeCount || + intCxt.pageChangeCount != O_PAGE_GET_CHANGE_COUNT(intCxt.pagePtr))) + { + /* + * It's not the expected page, try to refind it. + */ + step_upward_level(&intCxt); + continue; + } + + if (level > targetLevel || (downlinkLocationFlag && level > 0)) + { + OBTreeFastPathFindResult result; + + result = page_find_downlink(&intCxt, &fastpathMeta, level, + fastpath, &loc, &nonLeafHdr); + + Assert(result != OBTreeFastPathFindSlowpath); + + if (result == OBTreeFastPathFindFailure) + return OFindPageResultFailure; + else if (result == OBTreeFastPathFindRetry) + continue; + p = O_GET_IN_MEMORY_PAGE(intCxt.blkno); + } + else + { + OBTreeFastPathFindResult result; + + result = page_find_item(&intCxt, &fastpathMeta, level, + fastpath, &loc, &nonLeafHdr); + + if (result == OBTreeFastPathFindFailure) + return OFindPageResultFailure; + else if (result == OBTreeFastPathFindRetry) + { + if (intCxt.inserted) + return OFindPageResultInserted; + continue; + } + p = O_GET_IN_MEMORY_PAGE(intCxt.blkno); + } + + if (STOPEVENTS_ENABLED()) + { + params = btree_page_stopevent_params(desc, intCxt.pagePtr); + STOPEVENT(STOPEVENT_AFTER_FIND_DOWNLINK, params); + } + + /* Place new item to the context */ + Assert(context->index < ORIOLEDB_MAX_DEPTH); + + context->items[context->index].locator = loc; + 
context->items[context->index].blkno = intCxt.blkno; + context->items[context->index].pageChangeCount = O_PAGE_GET_CHANGE_COUNT(intCxt.pagePtr); + + /* Save the lokey if needed */ + if (keepLokeyFlag && level > 1 && + BTREE_PAGE_LOCATOR_GET_OFFSET(intCxt.pagePtr, &loc) > 0) + { + OTuple lokey; + + Assert(nonLeafHdr); + + BTREE_PAGE_READ_INTERNAL_TUPLE(lokey, intCxt.pagePtr, &loc); + copy_fixed_key(context->desc, &context->lokey, lokey); + BTREE_PAGE_FIND_SET(context, LOKEY_EXISTS); + BTREE_PAGE_FIND_UNSET(context, LOKEY_SIBLING); + BTREE_PAGE_FIND_UNSET(context, LOKEY_UNDO); + } + + if (level != targetLevel && (!imageFlag || level > targetLevel) && !nonLeafHdr) + { + Assert(tryFlag); + if (intCxt.haveLock) + { + unlock_page(intCxt.blkno); + intCxt.haveLock = false; + } + return OFindPageResultFailure; + } + + if (level == targetLevel || (imageFlag && level <= targetLevel)) + { + if (intCxt.haveLock) + { + /* + * The only way the target is reached under a page lock is the + * modify path -- needLock is set only on level > targetLevel + * and is cleared before we step down, and step_upward_level() + * clears haveLock when it unlocks. The IMAGE/FETCH callers + * expect context->img to be populated, which only happens in + * the lockless else branch above; if we ever reached here + * holding a lock without modifyFlag, that contract would be + * silently broken. 
+ */ + Assert(modifyFlag); + + if (level == 0 && fixLeafFlag) + { + /* called from o_btree_normal_modify() */ + /* try to fix incomplete split for leafs here */ + bool relocked = false; + + Assert(!noFixFlag); + + if (O_PAGE_IS(p, BROKEN_SPLIT)) + { + o_btree_split_fix_for_right_page_and_unlock(desc, intCxt.blkno); + intCxt.haveLock = false; + step_upward_level(&intCxt); + continue; + } + else if (relocked) + { + step_upward_level(&intCxt); + continue; + } + } + } + + O_TUPLE_SET_NULL(context->insertTuple); + return OFindPageResultSuccess; + } + else if (!nonLeafHdr) + { + Assert(false); /* make clang static analyzer happy */ + } + else if (DOWNLINK_IS_ON_DISK(nonLeafHdr->downlink)) + { + if (tryFlag) + { + /* + * Don't try to load page from write_page() + */ + if (intCxt.haveLock) + unlock_page(intCxt.blkno); + return OFindPageResultFailure; + } + + if (intCxt.haveLock) + { + load_page(context); + intCxt.blkno = context->items[context->index].blkno; + loc = context->items[context->index].locator; + intCxt.pagePtr = p = O_GET_IN_MEMORY_PAGE(intCxt.blkno); + nonLeafHdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(intCxt.pagePtr, &loc); + + if (level != PAGE_GET_LEVEL(p)) + { + unlock_page(intCxt.blkno); + intCxt.haveLock = false; + continue; + } + + if (imageFlag && level == targetLevel + 1) + { + /* + * Just loaded the target's child into shared memory and + * refound the parent under MODIFY lock; the parent's + * downlinks differ from the pre-load partial read still + * sitting in parentImg. Refresh parentImg and rebind the + * locator before stepping down. 
+ */ + refresh_parent_img_chunk(&intCxt); + } + } + else + { + needLock = true; + continue; + } + } + else if (DOWNLINK_IS_IN_IO(nonLeafHdr->downlink)) + { + int ionum = DOWNLINK_GET_IO_LOCKNUM(nonLeafHdr->downlink); + + if (intCxt.haveLock) + { + unlock_page(intCxt.blkno); + intCxt.haveLock = false; + } + wait_for_io_completion(ionum); + continue; + } + else + { + /* + * IN_MEMORY downlink at the parent of the target in IMAGE mode. + * If we got here under the lock (needLock = true on an earlier + * iteration) intCxt.pagePtr is the real shared-memory page, not + * parentImg, and the locator that find_right_page/find_left_page + * will later consult still has its chunk pointer bound to shared + * memory. Refresh parentImg from the locked page and rebind the + * locator onto parentImg so subsequent reads do not race against + * concurrent writers on the unlocked shared page. + */ + if (imageFlag && level == targetLevel + 1 && + intCxt.haveLock && intCxt.pagePtr != context->parentImg) + refresh_parent_img_chunk(&intCxt); + } + + parentBlkno = intCxt.blkno; + context->index++; + intCxt.blkno = DOWNLINK_GET_IN_MEMORY_BLKNO(nonLeafHdr->downlink); + intCxt.pageChangeCount = DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(nonLeafHdr->downlink); + + if (STOPEVENTS_ENABLED()) + { + params = btree_downlink_stopevent_params(desc, intCxt.pagePtr, &loc); + } + + if (intCxt.haveLock) + { + unlock_page(parentBlkno); + intCxt.haveLock = false; + } + + p = O_GET_IN_MEMORY_PAGE(intCxt.blkno); + STOPEVENT(STOPEVENT_STEP_DOWN, params); + } +} +/* + * Follow rightlinks starting from intCxt->pagePtr while the page is not + * RIGHTMOST and the search key lies beyond the page hikey (always, for + * BTreeKeyRightmost). For BTreeKeyPageHiKey a step is taken only when the + * key is strictly greater than the page hikey; for other key types also + * when it is equal. + * + * With KEEP_LOKEY set, the hikey of each page left behind is copied to + * context->lokey; LOKEY_SIBLING is set only when that page is on + * intCxt->targetLevel. + * + * Returns false once intCxt points at the page to search, keeping the lock + * state it had. Returns true when the caller must refind the page: + * intCxt->haveLock is false then, and intCxt->tryLockFailed or + * intCxt->inserted is set when the reason was a failed try-lock or an + * insertion done by lock_page_with_tuple(). + */ +static bool +follow_rightlink(OBTreeFindPageInternalContext *intCxt) +{ + OBTreeFindPageContext *context = intCxt->context; + BTreeDescr *desc = context->desc; + BTreeKeyType keykind = (intCxt->keyType == BTreeKeyPageHiKey ? + BTreeKeyNonLeafKey : + intCxt->keyType); + int followVal = (intCxt->keyType == BTreeKeyPageHiKey ?
1 : 0); + OTuple pageHiKey; + + if (!O_PAGE_IS(intCxt->pagePtr, RIGHTMOST)) + BTREE_PAGE_GET_HIKEY(pageHiKey, intCxt->pagePtr); + while (!O_PAGE_IS(intCxt->pagePtr, RIGHTMOST) && + (intCxt->keyType == BTreeKeyRightmost || + o_btree_cmp(desc, intCxt->key, keykind, + &pageHiKey, BTreeKeyNonLeafKey) >= followVal)) + { + uint64 rightlink = BTREE_PAGE_GET_RIGHTLINK(intCxt->pagePtr); + + if (!OInMemoryBlknoIsValid(RIGHTLINK_GET_BLKNO(rightlink))) + { + if (intCxt->haveLock) + { + unlock_page(intCxt->blkno); + intCxt->haveLock = false; + } + return true; + } + + if (BTREE_PAGE_FIND_IS(context, KEEP_LOKEY)) + { + copy_fixed_hikey(desc, &context->lokey, intCxt->pagePtr); + \ + Assert(!O_TUPLE_IS_NULL(context->lokey.tuple)); + BTREE_PAGE_FIND_SET(context, LOKEY_EXISTS); + if (PAGE_GET_LEVEL(intCxt->pagePtr) == intCxt->targetLevel) + { + BTREE_PAGE_FIND_SET(context, LOKEY_SIBLING); + BTREE_PAGE_FIND_UNSET(context, LOKEY_UNDO); + } + else + { + BTREE_PAGE_FIND_UNSET(context, LOKEY_SIBLING); + BTREE_PAGE_FIND_UNSET(context, LOKEY_UNDO); + } + } + + if (intCxt->haveLock) + unlock_page(intCxt->blkno); + + intCxt->blkno = RIGHTLINK_GET_BLKNO(rightlink); + /* + * haveLock was left set on purpose by the unlock above: it tells us + * to take the lock on the right page in the same mode. + */ + if (intCxt->haveLock) + { + if (BTREE_PAGE_FIND_IS(context, TRY_LOCK)) + { + if (!try_lock_page(intCxt->blkno)) + { + intCxt->haveLock = false; + intCxt->tryLockFailed = true; + return true; + } + } + else if (!O_TUPLE_IS_NULL(context->insertTuple)) + { + OLockPageWithTupleResult result; + + result = lock_page_with_tuple(desc, + &intCxt->blkno, + &intCxt->pageChangeCount, + context->insertXactInfo, + context->insertTuple); + + if (result == OLockPageWithTupleResultInserted) + { + intCxt->haveLock = false; + intCxt->inserted = true; + return true; + } + else if (result == OLockPageWithTupleResultRefindNeeded) + { + intCxt->haveLock = false; + return true; + } + Assert(result == OLockPageWithTupleResultLocked); + } + else + { + lock_page(intCxt->blkno); + } + intCxt->pagePtr = O_GET_IN_MEMORY_PAGE(intCxt->blkno); +
intCxt->pageChangeCount = O_PAGE_GET_CHANGE_COUNT(intCxt->pagePtr); + if (intCxt->pageChangeCount != + RIGHTLINK_GET_CHANGECOUNT(rightlink)) + { + /* + * The change count of the locked page no longer matches the + * one stored in the rightlink: the split was finished and the + * right page is already merged/evicted. Have to retry. + */ + unlock_page(intCxt->blkno); + intCxt->haveLock = false; + return true; + } + } + else + { + bool useParentImg = (intCxt->pagePtr == context->parentImg); + + if (!btree_find_read_page(context, intCxt->blkno, + RIGHTLINK_GET_CHANGECOUNT(rightlink), + useParentImg, + intCxt->key, + intCxt->keyType, + intCxt->partial, + true)) + return true; + intCxt->pagePtr = useParentImg ? context->parentImg : context->img; + intCxt->pageChangeCount = O_PAGE_GET_CHANGE_COUNT(intCxt->pagePtr); + Assert(RIGHTLINK_GET_CHANGECOUNT(rightlink) == + O_PAGE_GET_CHANGE_COUNT(intCxt->pagePtr)); + } + if (!O_PAGE_IS(intCxt->pagePtr, RIGHTMOST)) + BTREE_PAGE_GET_HIKEY(pageHiKey, intCxt->pagePtr); + } + return false; +} + +/* + * Step back to the parent level so that the caller can retry the search + * from there: release the current page lock if it is held, decrement + * context->index and reload blkno/pageChangeCount from the saved parent + * item. + */ +static void +step_upward_level(OBTreeFindPageInternalContext *intCxt) +{ + OBTreeFindPageContext *context = intCxt->context; + + if (intCxt->haveLock) + { + unlock_page(intCxt->blkno); + intCxt->haveLock = false; + } + context->index--; + intCxt->blkno = context->items[context->index].blkno; + intCxt->pageChangeCount = context->items[context->index].pageChangeCount; +} + +/* + * Re-find the location of previously found key. If search for modification, + * assume lock was relesed (otherwise, no point to refind).
+ */ +OFindPageResult +refind_page(OBTreeFindPageContext *context, void *key, BTreeKeyType keyType, + uint16 level, OInMemoryBlkno _blkno, uint32 _pageChangeCount) +{ + BTreeDescr *desc = context->desc; + OBTreeFindPageInternalContext intCxt; + BTreePageItemLocator loc; + bool item_found = true; + /* + * _blkno and _pageChangeCount name the page located earlier. Every + * mismatch detected below (invalid change count, evicted local slot, + * changed level or change count, failed read) falls back to a full + * find_page() for the same key and level. + */ + ASAN_UNPOISON_MEMORY_REGION(&intCxt, sizeof(intCxt)); + intCxt.context = context; + intCxt.key = key; + intCxt.keyType = keyType; + intCxt.blkno = _blkno; + intCxt.targetLevel = level; + intCxt.pageChangeCount = _pageChangeCount; + intCxt.partial = NULL; + intCxt.inserted = false; + intCxt.tryLockFailed = false; + + if (!BTREE_PAGE_FIND_IS(context, TRY_LOCK)) + { + o_btree_load_shmem(desc); + } + else + { + if (!o_btree_try_use_shmem(desc)) + return OFindPageResultFailure; + } + +retry: + + if (BTREE_PAGE_FIND_IS(context, MODIFY)) + { + Pointer p; + + if (intCxt.pageChangeCount == InvalidOPageChangeCount) + return find_page(context, key, keyType, level); + + /* + * Local-pool slots are NULLed on eviction, unlike shared-pool slots + * where pageChangeCount alone signals replacement (the shmem page + * stays readable). The slot at the caller's saved (blkno, + * pageChangeCount) may have been evicted since, so PAGE_GET_LEVEL + * below would segfault. Fall back to find_page() to resolve the + * downlink from scratch.
+ */ + if (O_PAGE_IS_LOCAL(intCxt.blkno) && + local_ppool_pages[intCxt.blkno & O_BLKNO_MASK] == NULL) + return find_page(context, key, keyType, level); + + if (!O_TUPLE_IS_NULL(context->insertTuple)) + { + OLockPageWithTupleResult result; + + result = lock_page_with_tuple(desc, + &intCxt.blkno, + &intCxt.pageChangeCount, + context->insertXactInfo, + context->insertTuple); + + if (result == OLockPageWithTupleResultInserted) + return OFindPageResultInserted; + else if (result == OLockPageWithTupleResultRefindNeeded) + return find_page(context, key, keyType, level); + Assert(result == OLockPageWithTupleResultLocked); + } + else + { + lock_page(intCxt.blkno); + } + p = O_GET_IN_MEMORY_PAGE(intCxt.blkno); + intCxt.haveLock = true; + intCxt.pagePtr = p; + if (PAGE_GET_LEVEL(p) != level || + O_PAGE_GET_CHANGE_COUNT(p) != intCxt.pageChangeCount) + { + unlock_page(intCxt.blkno); + return find_page(context, key, keyType, level); + } + + if (level == 0 && BTREE_PAGE_FIND_IS(context, FIX_LEAF_SPLIT)) + { + /* called from o_btree_normal_modify() */ + /* try to fix an incomplete split of the leaf here */ + + Assert(!BTREE_PAGE_FIND_IS(context, NO_FIX_SPLIT)); + /* + * NOTE(review): two xxx_and_unlock helpers are called back to + * back on the same blkno below, while the matching branch of + * find_page() calls only + * o_btree_split_fix_for_right_page_and_unlock(). Confirm that + * the second call is intended after the first one has + * (presumably) already released the page lock. + */ + if (O_PAGE_IS(p, BROKEN_SPLIT)) + { + o_btree_split_fix_for_right_page_and_unlock(desc, intCxt.blkno); + intCxt.haveLock = false; + o_btree_split_fix_and_unlock(desc, intCxt.blkno); + goto retry; + } + } + } + else if (BTREE_PAGE_FIND_IS(context, FETCH)) + { + Pointer img; + bool success; + + if (intCxt.pageChangeCount == InvalidOPageChangeCount) + return find_page(context, key, keyType, level); + + context->partial.isPartial = false; + intCxt.partial = &context->partial; + success = btree_find_read_page(context, + intCxt.blkno, + intCxt.pageChangeCount, + false, + key, + keyType, + intCxt.partial, + true); + img = context->img; + + intCxt.haveLock = false; + intCxt.pagePtr = img; + if (!success || + PAGE_GET_LEVEL(img) != level) + { + return find_page(context, key, keyType, level); + } + Assert(O_PAGE_GET_CHANGE_COUNT(img)
== intCxt.pageChangeCount); + } + else + { + Assert(false); + /* quiet compiler warnings */ + intCxt.haveLock = false; + intCxt.pagePtr = NULL; + } + + /* Follow the page rightlink if needed */ + if (keyType != BTreeKeyNone) + { + if (follow_rightlink(&intCxt)) + { + if (intCxt.tryLockFailed) + return OFindPageResultFailure; + if (intCxt.inserted) + return OFindPageResultInserted; + Assert(!intCxt.haveLock); + return find_page(context, key, keyType, level); + } + } + + if (keyType == BTreeKeyRightmost) + { + /* We're looking for the rightmost page, so go to the rightmost downlink */ + BTREE_PAGE_LOCATOR_LAST(intCxt.pagePtr, &loc); + } + else if (keyType == BTreeKeyNone) + { + /* We're looking for the leftmost page, so go to the leftmost downlink */ + BTREE_PAGE_LOCATOR_FIRST(intCxt.pagePtr, &loc); + } + else + { + /* Locate the position of the key within the page */ + Assert(key); + item_found = btree_page_search(desc, intCxt.pagePtr, key, keyType, + intCxt.partial, &loc); + if (item_found) + { + if (BTREE_PAGE_FIND_IS(context, DOWNLINK_LOCATION)) + { + Assert(!O_PAGE_IS(intCxt.pagePtr, LEAF)); + BTREE_PAGE_LOCATOR_PREV(intCxt.pagePtr, &loc); + if (intCxt.partial) + item_found = partial_load_chunk(intCxt.partial, + intCxt.pagePtr, + loc.chunkOffset, + NULL); + } + else if (!BTREE_PAGE_FIND_IS(context, MODIFY)) + item_found = page_locator_find_real_item(intCxt.pagePtr, + intCxt.partial, + &loc); + } + } + /* + * Only the FETCH path above sets intCxt.partial. If the chunk holding + * the located item could not be loaded from the partial image, start + * over from the retry label, which reads the page again. + */ + if (intCxt.partial) + { + if (!item_found) + goto retry; + + if (!partial_load_chunk(intCxt.partial, intCxt.pagePtr, + loc.chunkOffset, NULL)) + goto retry; + } + + context->items[context->index].locator = loc; + context->items[context->index].blkno = intCxt.blkno; + context->items[context->index].pageChangeCount = intCxt.pageChangeCount; + return OFindPageResultSuccess; +} + +/* + * Find the right sibling of the current page. + * + * Old page hikey will be saved to hikey_buf. It helps to avoid redundant + * buffering at BTree iterators code.
+ * + * Returns true on success, false for rightmost page. + */ +bool +find_right_page(OBTreeFindPageContext *context, OFixedKey *hikey) +{ + BTreeDescr *desc = context->desc; + BTreePageItemLocator loc; + OBtreePageFindItem *parentItem, + *item; + int level; + Jsonb *params; + + /* Nothing to do for the rightmost page */ + if (O_PAGE_IS(context->img, RIGHTMOST)) + return false; + + /* + * Currently, the only user of this function is the iterator, which is + * read-only. So there is no support for modification, but it could be + * added later. + */ + Assert(!BTREE_PAGE_FIND_IS(context, MODIFY)); + + if (STOPEVENTS_ENABLED()) + { + params = btree_page_stopevent_params(desc, context->img); + STOPEVENT(STOPEVENT_STEP_RIGHT, params); + } + + level = PAGE_GET_LEVEL(context->img); + + /* We shouldn't be at the root page here: a parent item is required */ + Assert(context->index > 0); + + parentItem = &context->items[context->index - 1]; + item = &context->items[context->index]; + + /* Try to get the next item from the parent page image */ + loc = context->items[context->index - 1].locator; + + Assert(loc.chunk == NULL || + ((Pointer) loc.chunk >= context->parentImg && + (Pointer) loc.chunk < context->parentImg + ORIOLEDB_BLCKSZ)); + + if (BTREE_PAGE_LOCATOR_IS_VALID(context->parentImg, &loc)) + BTREE_PAGE_LOCATOR_NEXT(context->parentImg, &loc); + + /* Save the hikey of the current page for the caller */ + copy_fixed_hikey(desc, hikey, context->img); + + /* Try to load the next page using the next parent downlink */ + if (BTREE_PAGE_LOCATOR_IS_VALID(context->parentImg, &loc)) + { + OTuple internalTuple; + BTreeNonLeafTuphdr *tuphdr = NULL; + bool tup_loaded = true; + + tup_loaded = partial_load_chunk(&context->partial, context->parentImg, + loc.chunkOffset, NULL); + if (tup_loaded) + { + BTREE_PAGE_READ_INTERNAL_ITEM(tuphdr, internalTuple, context->parentImg, &loc); + Assert(tuphdr != NULL); + } + + /* Check that the downlink key is consistent with our hikey */ + if (tup_loaded && DOWNLINK_IS_IN_MEMORY(tuphdr->downlink) && + o_btree_cmp(desc, + hikey, BTreeKeyNonLeafKey, +
&internalTuple, BTreeKeyNonLeafKey) == 0) + { + /* Try to traverse the downlink */ + bool success; + + item->blkno = DOWNLINK_GET_IN_MEMORY_BLKNO(tuphdr->downlink); + item->pageChangeCount = DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(tuphdr->downlink); + + success = btree_find_read_page(context, item->blkno, item->pageChangeCount, + false, &hikey->tuple, BTreeKeyNonLeafKey, NULL, + true); + if (success && + PAGE_GET_LEVEL(context->img) == level) + { + Assert(O_PAGE_GET_CHANGE_COUNT(context->img) == item->pageChangeCount); + BTREE_PAGE_LOCATOR_FIRST(context->img, &item->locator); + parentItem->locator = loc; + return true; + } + } + } + + /* + * Give up on the parent downlink and find the page from the root in the + * usual way. Should happen rarely. + * + * NOTE(review): hikey (an OFixedKey pointer) is passed as the key here, + * while find_left_page() passes &hikey->tuple; presumably equivalent if + * tuple is the first member of OFixedKey -- confirm. + */ + (void) find_page(context, hikey, BTreeKeyNonLeafKey, level); + return true; +} + +/* + * Find the left sibling of the current page. + * + * The expected hikey of the new page (the lokey of the old page) is saved + * to *hikey. It helps to avoid redundant buffering in the BTree iterator + * code. + * + * Returns true on success, false for the leftmost page. + */ +bool +find_left_page(OBTreeFindPageContext *context, OFixedKey *hikey) +{ + BTreeNonLeafTuphdr *tuphdr; + BTreeDescr *desc = context->desc; + OBtreePageFindItem *parentItem, + *item; + int level; + UndoLocation prevLoc; + Jsonb *params; + OTuple imgHikey; + + Assert(BTREE_PAGE_FIND_IS(context, KEEP_LOKEY)); + + /* + * Currently, the only user of this function is the iterator, which is + * read-only. So there is no support for modification, but it could be + * added later. + */ + Assert(!BTREE_PAGE_FIND_IS(context, MODIFY)); + + if (STOPEVENTS_ENABLED()) + { + params = btree_page_stopevent_params(desc, context->img); + STOPEVENT(STOPEVENT_STEP_LEFT, params); + } + + level = PAGE_GET_LEVEL(context->img); + /* Only a leaf page that is not the root page is expected here
*/ + Assert(level == 0); + Assert(context->index > 0); + parentItem = &context->items[context->index - 1]; + item = &context->items[context->index]; + + prevLoc = context->imgUndoLoc; + while (true) + { + /* Nothing to do for the leftmost page */ + if (O_PAGE_IS(context->img, LEFTMOST)) + return false; + + Assert(!O_TUPLE_IS_NULL(btree_find_context_lokey(context))); + copy_fixed_key(desc, hikey, btree_find_context_lokey(context)); + + /* + * The parent-downlink shortcut below is used only when the lokey came + * neither from a same-level sibling reached by rightlink + * (LOKEY_SIBLING) nor from the undo log (LOKEY_UNDO); in those cases + * we go straight to find_page(). + */ + if (!BTREE_PAGE_FIND_IS(context, LOKEY_SIBLING) && + !BTREE_PAGE_FIND_IS(context, LOKEY_UNDO)) + { + BTreePageItemLocator loc = parentItem->locator; + bool next_lokey_loaded = true; + + Assert(loc.chunk == NULL || + ((Pointer) loc.chunk >= context->parentImg && + (Pointer) loc.chunk < context->parentImg + ORIOLEDB_BLCKSZ)); + + /* + * Try to read the image through the previous parent downlink, + * without find_page(). + */ + if (BTREE_PAGE_LOCATOR_IS_VALID(context->parentImg, &loc)) + { + BTREE_PAGE_LOCATOR_PREV(context->parentImg, &loc); + next_lokey_loaded = partial_load_chunk(&context->partial, + context->parentImg, + loc.chunkOffset, + NULL); + } + + if (next_lokey_loaded && BTREE_PAGE_LOCATOR_IS_VALID(context->parentImg, &loc)) + { + tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(context->parentImg, &loc); + + /* + * Only an in-memory downlink can be followed directly; + * otherwise we fall through to find_page() below with the + * lokey that was already copied to hikey. + */ + if (DOWNLINK_IS_IN_MEMORY(tuphdr->downlink)) + { + bool success; + + item->blkno = DOWNLINK_GET_IN_MEMORY_BLKNO(tuphdr->downlink); + item->pageChangeCount = DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(tuphdr->downlink); + + success = btree_find_read_page(context, + item->blkno, + item->pageChangeCount, + false, + NULL, + BTreeKeyRightmost, + NULL, + true); + /* + * The image has the same undo location as the one we + * started from: remember the new parent position and + * keep stepping left. + */ + if (success && + context->imgUndoLoc != InvalidUndoLocation && + prevLoc == context->imgUndoLoc) + { + parentItem->locator = loc; + continue; + } + + + if (success && + PAGE_GET_LEVEL(context->img) == level && + !O_PAGE_IS(context->img,
RIGHTMOST)) + { + BTREE_PAGE_GET_HIKEY(imgHikey, context->img); + + if (o_btree_cmp(desc, &hikey->tuple, BTreeKeyNonLeafKey, + &imgHikey, BTreeKeyNonLeafKey) == 0) + { + Assert(O_PAGE_GET_CHANGE_COUNT(context->img) == item->pageChangeCount); + parentItem->locator = loc; + BTREE_PAGE_LOCATOR_LAST(context->img, &item->locator); + return true; + } + } + } + } + } + + (void) find_page(context, &hikey->tuple, BTreeKeyPageHiKey, level); + + /* find_page() may have changed context->index, so re-resolve the items */ + parentItem = &context->items[context->index - 1]; + item = &context->items[context->index]; + + if (prevLoc != InvalidUndoLocation && prevLoc == context->imgUndoLoc) + continue; + + if (COMMITSEQNO_IS_INPROGRESS(context->csn) && + !O_PAGE_IS(context->img, RIGHTMOST)) + BTREE_PAGE_GET_HIKEY(imgHikey, context->img); + + if (COMMITSEQNO_IS_INPROGRESS(context->csn) && + (O_PAGE_IS(context->img, RIGHTMOST) + || o_btree_cmp(desc, &imgHikey, BTreeKeyNonLeafKey, hikey, BTreeKeyNonLeafKey) != 0)) + { + /* + * The BTree may be changing while we read it in-progress, but + * find_page() always sets the leaf offset to + * BTREE_PAGE_ITEMS_COUNT(page) - 1 for the BTreeKeyPageHiKey + * search case. + * + * The page hikey differs from the expected one, so we must + * refind the leaf offset in this case. + */ + btree_page_search(desc, + context->img, + (Pointer) &hikey->tuple, BTreeKeyNonLeafKey, NULL, + &item->locator); + BTREE_PAGE_LOCATOR_PREV(context->img, &item->locator); + } + + return true; + } + + /* unreachable */ + Assert(false); + return false; +} + +/* + * Return the lokey of context->img. + * + * It assumes that context->img has a lokey. All checks must be done by the + * caller (the BTREE_PAGE_FIND_KEEP_LOKEY flag is set and context->img is not + * the leftmost page). + */ +OTuple +btree_find_context_lokey(OBTreeFindPageContext *context) +{ + BTreePageItemLocator ploc = context->items[context->index - 1].locator; + + Assert(BTREE_PAGE_FIND_IS(context, KEEP_LOKEY)); + + if (BTREE_PAGE_FIND_IS(context, LOKEY_UNDO)) + { + /* + * Hikey of a left sibling taken from the undo log.
+ */ + return context->undoLokey.tuple; + } + else if (BTREE_PAGE_FIND_IS(context, LOKEY_SIBLING)) + { + /* + * Hikey of the left sibling (it had a rightlink to the current page). + */ + return context->lokey.tuple; + } + else if (BTREE_PAGE_LOCATOR_GET_OFFSET(context->parentImg, &ploc) > 0) + { + /* + * Fetch the lokey from the parent image: it is the key of the parent + * item we descended through. + */ + OTuple result; + + BTREE_PAGE_READ_INTERNAL_TUPLE(result, context->parentImg, &ploc); + return result; + } + else + { + /* + * We descended through the first parent item, so use the saved + * hikey of the left sibling of the parent. + */ + Assert(context->flags & BTREE_PAGE_FIND_LOKEY_EXISTS); + return context->lokey.tuple; + } +} +/* + * Point context->img (or context->parentImg when parent is true) at the + * context's own buffer, imgData or parentImgData respectively, and return + * that pointer. + */ +static Pointer +set_page_ptr(OBTreeFindPageContext *context, bool parent) +{ + Pointer pagePtr; + + if (!parent) + pagePtr = context->img = context->imgData; + else + pagePtr = context->parentImg = context->parentImgData; + return pagePtr; +} + +/* + * Reads a page image into context->img (or context->parentImg when parent + * is true) with o_btree_read_page(), according to the find context; the + * undo location of the image is stored in context->imgUndoLoc. + * + * The LOKEY_UNDO flag is always reset first. With KEEP_LOKEY set, + * context->undoLokey is cleared and passed to the read; if it comes back + * non-NULL, LOKEY_UNDO is set. + * + * Returns false if o_btree_read_page() fails. + */ +static bool +btree_find_read_page(OBTreeFindPageContext *context, OInMemoryBlkno blkno, + uint32 pageChangeCount, bool parent, void *key, + BTreeKeyType keyType, PartialPageState *partial, + bool loadHikeysChunk) +{ + bool keep_lokey = BTREE_PAGE_FIND_IS(context, KEEP_LOKEY); + OFixedKey *lokey = keep_lokey ? &context->undoLokey : NULL; + CommitSeqNo *readCsn = BTREE_PAGE_FIND_IS(context, READ_CSN) ?
&context->imgReadCsn : NULL; + bool success; + Pointer pagePtr; + + pagePtr = set_page_ptr(context, parent); + + BTREE_PAGE_FIND_UNSET(context, LOKEY_UNDO); + if (lokey) + clear_fixed_key(lokey); + + success = o_btree_read_page(context->desc, blkno, pageChangeCount, pagePtr, + context->csn, key, keyType, lokey, + partial, loadHikeysChunk, &context->imgUndoLoc, + readCsn); + + if (!success) + return false; + + if (lokey && !O_TUPLE_IS_NULL(lokey->tuple)) + BTREE_PAGE_FIND_SET(context, LOKEY_UNDO); + return true; +} + +/* + * Try variant of btree_find_read_page(): reads a page image into + * context->img (or context->parentImg when parent is true) with + * o_btree_try_read_page() and returns its ReadPageResult as is. + * + * Unlike btree_find_read_page(), no lokey is passed to the read, and neither + * the LOKEY_UNDO flag nor context->imgUndoLoc is touched here. + */ +static ReadPageResult +btree_find_try_read_page(OBTreeFindPageContext *context, OInMemoryBlkno blkno, + uint32 pageChangeCount, bool parent, void *key, + BTreeKeyType keyType, PartialPageState *partial, + bool loadHikeysChunk) +{ + CommitSeqNo *readCsn = BTREE_PAGE_FIND_IS(context, READ_CSN) ? &context->imgReadCsn : NULL; + ReadPageResult result; + Pointer pagePtr; + + pagePtr = set_page_ptr(context, parent); + + result = o_btree_try_read_page(context->desc, blkno, pageChangeCount, + pagePtr, context->csn, + key, keyType, partial, loadHikeysChunk, + readCsn); + + return result; +} +/* + * Switch a find context that was used with MODIFY and IMAGE to read mode: + * clear the MODIFY flag, read the page of the current item into + * context->img and recompute the item locator for the given key. If the + * page can't be read, fall back to a full find_page(). + */ +void +btree_find_context_from_modify_to_read(OBTreeFindPageContext *context, + Pointer key, + BTreeKeyType keyType, + uint16 level) +{ + BTreePageItemLocator loc; + bool success; + + Assert(!BTREE_PAGE_FIND_IS(context, DOWNLINK_LOCATION)); + Assert(BTREE_PAGE_FIND_IS(context, MODIFY)); + Assert(BTREE_PAGE_FIND_IS(context, IMAGE)); + BTREE_PAGE_FIND_UNSET(context, MODIFY); + + success = btree_find_read_page(context, + context->items[context->index].blkno, + context->items[context->index].pageChangeCount, + false, + key, + keyType, + NULL, + true); + + if (!success) + { + (void) find_page(context, key, keyType, level); + return; + } + + if (keyType == BTreeKeyRightmost) + { + /* We're looking for the rightmost page, so go to the
rightmost downlink */ + BTREE_PAGE_LOCATOR_LAST(context->img, &loc); + } + else if (keyType == BTreeKeyNone) + { + /* We're looking for the leftmost page, so go to the leftmost downlink */ + BTREE_PAGE_LOCATOR_FIRST(context->img, &loc); + } + else + { + /* Locate the position of the key within the page image */ + (void) btree_page_search(context->desc, context->img, + key, keyType, + NULL, &loc); + (void) page_locator_find_real_item(context->img, + NULL, + &loc); + } + + context->items[context->index].locator = loc; +} + +/* + * Search for a key within the page. First, it does a binary search for the + * appropriate chunk, then a binary search within the chunk. For a + * BTreeKeyPageHiKey search on a leaf page, the locator is simply set to the + * last item. + * + * This function is aware of partial page reads. Returns true if it managed + * to read the required chunk and false otherwise. When no partial page + * state is given, it always returns true. + */ +bool +btree_page_search(BTreeDescr *desc, Page p, Pointer key, BTreeKeyType keyType, + PartialPageState *partial, BTreePageItemLocator *locator) +{ + OffsetNumber chunkOffset; + bool isLeaf = O_PAGE_IS(p, LEAF); + + if (keyType == BTreeKeyPageHiKey && isLeaf) + { + BTREE_PAGE_LOCATOR_LAST(p, locator); + if (partial && !partial_load_chunk(partial, p, + locator->chunkOffset, NULL)) + return false; + return true; + } + + chunkOffset = btree_page_binary_search_chunks(desc, p, key, keyType); + + if (partial && !partial_load_chunk(partial, p, chunkOffset, NULL)) + return false; + + page_chunk_fill_locator(p, chunkOffset, locator); + + btree_page_search_items(desc, p, key, keyType, locator); + + return true; +} + +/* + * Search for the chunk containing key.
+ */ +static OffsetNumber +btree_page_binary_search_chunks(BTreeDescr *desc, Page p, + Pointer key, BTreeKeyType keyType) +{ + OffsetNumber mid, + low, + high; + int targetCmpVal, + result; + bool nextkey; + BTreePageHeader *header = (BTreePageHeader *) p; + OBTreeKeyCmp cmpFunc = desc->ops->cmp; + + Assert(header->chunksCount > 0); + + low = 0; + high = header->chunksCount - 1; + nextkey = (keyType != BTreeKeyPageHiKey); + + if (high < low) + return low; + + targetCmpVal = nextkey ? 0 : 1; /* move right while cmpFunc() >= this value */ + + /* + * Don't pass BTreeKeyPageHiKey to the comparison function; the nextkey + * flag was set from it instead. + */ + if (keyType == BTreeKeyPageHiKey) + keyType = BTreeKeyNonLeafKey; + + while (high > low) + { + OTuple midTup; + + mid = low + ((high - low) / 2); + Assert(mid < header->chunksCount - 1); + + /* + * We have low <= mid < high, so mid points at a real slot that is + * never the last chunk; its hikey is taken from chunkDesc[mid]. + */ + + midTup.formatFlags = header->chunkDesc[mid].hikeyFlags; + midTup.data = p + SHORT_GET_LOCATION(header->chunkDesc[mid].hikeyShortLocation); + result = cmpFunc(desc, key, keyType, &midTup, BTreeKeyNonLeafKey); + + if (result >= targetCmpVal) + low = mid + 1; + else + high = mid; + } + + return low; +} +/* + * Binary search for the key among the items of the chunk that the locator + * already points to. On return locator->itemOffset holds the position + * found (0 for an empty chunk); locator->itemOffset is also used as a + * scratch cursor while tuples are read during the search. + */ +static void +btree_page_search_items(BTreeDescr *desc, Page p, Pointer key, + BTreeKeyType keyType, BTreePageItemLocator *locator) +{ + OffsetNumber mid, + low, + high; + bool isLeaf = O_PAGE_IS(p, LEAF), + nextkey; + OBTreeKeyCmp cmpFunc = desc->ops->cmp; + BTreeKeyType midkind; + int targetCmpVal, + result; + + midkind = isLeaf ?
BTreeKeyLeafTuple : BTreeKeyNonLeafKey; + + if (locator->chunkItemsCount == 0) + { + locator->itemOffset = 0; + return; + } + + low = 0; + high = locator->chunkItemsCount - 1; + nextkey = (!isLeaf && keyType != BTreeKeyPageHiKey); + + /* + * A hikey search on a leaf never gets here: btree_page_search() handles + * that case itself before calling us. + */ + Assert(!(isLeaf && keyType == BTreeKeyPageHiKey)); + + /* + * Binary search to find the first key on the page >= `key`, or first page + * key > `key` when nextkey is true. + * + * For nextkey=false (cmp=1), the loop invariant is: all slots before + * `low` are < `key`, all slots at or after `high` are >= `key`. + * + * For nextkey=true (cmp=0), the loop invariant is: all slots before `low` + * are <= `key`, all slots at or after `high` are > `key`. + * + * We can fall out when `high` == `low`. + */ + high++; /* establish the loop invariant for high */ + + targetCmpVal = nextkey ? 0 : 1; /* move right while cmpFunc() >= this value */ + + /* + * Don't pass BTreeKeyPageHiKey to the comparison function; the nextkey + * flag was set from it instead. + */ + if (keyType == BTreeKeyPageHiKey) + keyType = BTreeKeyNonLeafKey; + + while (high > low) + { + mid = low + ((high - low) / 2); + /* + * The first item of the first chunk of a non-leaf page is not + * compared: the key is always treated as greater than it. + */ + if (!isLeaf && mid == 0 && locator->chunkOffset == 0) + result = 1; + else + { + OTuple midTup; + + locator->itemOffset = mid; + BTREE_PAGE_READ_TUPLE(midTup, p, locator); + result = cmpFunc(desc, key, keyType, &midTup, midkind); + } + + if (result >= targetCmpVal) + low = mid + 1; + else + high = mid; + } + + locator->itemOffset = low; +} diff --git a/contrib/orioledb/src/btree/insert.c b/contrib/orioledb/src/btree/insert.c new file mode 100644 index 00000000000..07955ee3faf --- /dev/null +++ b/contrib/orioledb/src/btree/insert.c @@ -0,0 +1,1489 @@ +/*------------------------------------------------------------------------- + * + * insert.c + * Routines for implementation of inserting new item into B-tree page. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc.
/*
 * In order to avoid use of the recursion in insert_leaf() we use context.
 *
 * One item describes one pending insertion: either the tuple being inserted
 * (level == 0) or a downlink that has to be inserted into an upper level to
 * finish a page split (level > 0).  Items are chained through `next`.
 */
typedef struct BTreeInsertStackItem
{
	/* next item in the find context. next == NULL if it's last item. */
	struct BTreeInsertStackItem *next;
	/* current find context */
	OBTreeFindPageContext *context;
	/* if level == 0, tuple is BTreeTuple else it is BTreeKey */
	OTuple		tuple;

	/*
	 * if level == 0, tupheader is BTreeLeafTuphdr else it is
	 * BTreeNonLeafTuphdr
	 */
	Pointer		tupheader;
	/* length of the tuple (bytes of tuple.data copied into the page) */
	Size		tuplen;
	/* current level of the insert */
	int			level;

	/*
	 * blkno of the right page of incomplete split, or
	 * OInvalidInMemoryBlkno when this item does not finish a split.
	 */
	OInMemoryBlkno rightBlkno;
	/* true if the item replaces an existing tuple instead of adding one */
	bool		replace;
	/* true if refind_page must be called before using the context */
	bool		refind;
} BTreeInsertStackItem;
/*
 * Returns true if the current page is the left page of an incomplete split.
 * Should always be called before inserting a new tuple into the page.
 *
 * If the page has a valid right link but the split is neither finished nor
 * marked broken yet, the function waits by relocking the left page in a
 * loop.  Every relock sets *relocked to true; the flag is never reset here,
 * so the caller has to initialize it.  If the page's change count no longer
 * equals pageChangeCount after a relock, the function gives up and returns
 * false.
 */
bool
o_btree_split_is_incomplete(OInMemoryBlkno left_blkno, uint32 pageChangeCount,
							bool *relocked)
{
	Page		p = O_GET_IN_MEMORY_PAGE(left_blkno);
	BTreePageHeader *header = (BTreePageHeader *) p;
	uint64		rightLink = header->rightLink;

	if (RightLinkIsValid(rightLink))
	{
		Page		rightP = O_GET_IN_MEMORY_PAGE(RIGHTLINK_GET_BLKNO(rightLink));

		Assert(O_PAGE_GET_CHANGE_COUNT(rightP) == RIGHTLINK_GET_CHANGECOUNT(rightLink));

		/*
		 * NOTE(review): this early exit tests the flag on the left page
		 * (p), while the loop and the checks below test the right page
		 * (rightP).  Confirm that checking p here is intended.
		 */
		if (O_PAGE_IS(p, BROKEN_SPLIT))
			return true;

		/* wait for split finish */
		while (RightLinkIsValid(rightLink) && !O_PAGE_IS(rightP, BROKEN_SPLIT))
		{
			relock_page(left_blkno);
			*relocked = true;

			/* The page was changed while we waited: the caller must retry. */
			if (O_PAGE_GET_CHANGE_COUNT(p) != pageChangeCount)
				return false;

			/* Re-read the right link, it may have changed during relock. */
			rightLink = header->rightLink;
			if (RightLinkIsValid(rightLink))
			{
				rightP = O_GET_IN_MEMORY_PAGE(RIGHTLINK_GET_BLKNO(rightLink));
				Assert(O_PAGE_GET_CHANGE_COUNT(rightP) == RIGHTLINK_GET_CHANGECOUNT(rightLink));
			}
		}

		/* split should be broken or ok after this */
		Assert(O_PAGE_IS(rightP, BROKEN_SPLIT) || !RightLinkIsValid(rightLink));

		if (O_PAGE_IS(rightP, BROKEN_SPLIT))
			return true;
	}
	return false;
}
+ right_blkno = RIGHTLINK_GET_BLKNO(header->rightLink); + if (lock) + lock_page(right_blkno); + + right_page = O_GET_IN_MEMORY_PAGE(right_blkno); + Assert(O_PAGE_GET_CHANGE_COUNT(right_page) == RIGHTLINK_GET_CHANGECOUNT(header->rightLink)); + + insert_item->tuplen = keylen; + insert_item->tuple = key; + + internal_header->downlink = MAKE_IN_MEMORY_DOWNLINK(right_blkno, + O_PAGE_GET_CHANGE_COUNT(right_page)); + + if (lock) + unlock_page(right_blkno); + + insert_item->tupheader = (Pointer) internal_header; +} + +static void +o_btree_split_fill_downlink_item(BTreeInsertStackItem *insert_item, + OInMemoryBlkno left_blkno, + bool lock) +{ + Page left_page = O_GET_IN_MEMORY_PAGE(left_blkno); + OTuple hikey; + OTuple key; + LocationIndex keylen; + BTreeNonLeafTuphdr *internal_header = palloc(sizeof(BTreeNonLeafTuphdr)); + + keylen = BTREE_PAGE_GET_HIKEY_SIZE(left_page); + BTREE_PAGE_GET_HIKEY(hikey, left_page); + key.data = (Pointer) palloc(keylen); + key.formatFlags = hikey.formatFlags; + memcpy(key.data, hikey.data, keylen); + + o_btree_split_fill_downlink_item_with_key(insert_item, left_blkno, lock, + key, keylen, internal_header); +} + +static OInMemoryBlkno +o_btree_finish_root_split_internal(BTreeDescr *desc, + OInMemoryBlkno left_blkno, + BTreeInsertStackItem *insert_item) +{ + BTreeNonLeafTuphdr internal_header; + OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(desc->rootInfo.rootPageBlkno); + BTreePageHeader *left_header, + *root_header; + Pointer ptr; + Page p = O_GET_IN_MEMORY_PAGE(desc->rootInfo.rootPageBlkno), + left_page; + FileExtent root_extent = page_desc->fileExtent; + bool is_leaf = PAGE_GET_LEVEL(p) == 0; + BTreePageItemLocator loc; + + left_page = O_GET_IN_MEMORY_PAGE(left_blkno); + init_new_btree_page(desc, left_blkno, O_BTREE_FLAG_LEFTMOST, PAGE_GET_LEVEL(p), false); + + memcpy(left_page + O_PAGE_HEADER_SIZE, + p + O_PAGE_HEADER_SIZE, + ORIOLEDB_BLCKSZ - O_PAGE_HEADER_SIZE); + + page_block_reads(desc->rootInfo.rootPageBlkno); + + 
init_new_btree_page(desc, desc->rootInfo.rootPageBlkno, + O_BTREE_FLAG_RIGHTMOST | O_BTREE_FLAG_LEFTMOST, + PAGE_GET_LEVEL(left_page) + 1, true); + init_page_first_chunk(desc, p, 0); + + /* restore checkpoint number and file offset for the rootPageBlkno */ + left_header = (BTreePageHeader *) left_page; + root_header = (BTreePageHeader *) p; + root_header->o_header.checkpointNum = left_header->o_header.checkpointNum; + left_header->o_header.checkpointNum = 0; + page_desc->fileExtent = root_extent; + + Assert(left_blkno); + Assert(page_is_locked(desc->rootInfo.rootPageBlkno) || O_PAGE_IS_LOCAL(desc->rootInfo.rootPageBlkno)); + + BTREE_PAGE_LOCATOR_FIRST(p, &loc); + page_locator_insert_item(p, &loc, BTreeNonLeafTuphdrSize); + BTREE_PAGE_LOCATOR_NEXT(p, &loc); + page_locator_insert_item(p, &loc, MAXALIGN(insert_item->tuplen) + BTreeNonLeafTuphdrSize); + + ptr = BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc); + memcpy(ptr, insert_item->tupheader, BTreeNonLeafTuphdrSize); + ptr += BTreeNonLeafTuphdrSize; + memcpy(ptr, insert_item->tuple.data, insert_item->tuplen); + BTREE_PAGE_SET_ITEM_FLAGS(p, &loc, insert_item->tuple.formatFlags); + + if (!(insert_item->tuple.formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT)) + root_header->chunkDesc[0].chunkKeysFixed = 0; + + internal_header.downlink = MAKE_IN_MEMORY_DOWNLINK(left_blkno, + O_PAGE_GET_CHANGE_COUNT(left_page)); + BTREE_PAGE_LOCATOR_FIRST(p, &loc); + ptr = BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc); + memcpy(ptr, &internal_header, BTreeNonLeafTuphdrSize); + + MARK_DIRTY(desc, left_blkno); + MARK_DIRTY(desc, desc->rootInfo.rootPageBlkno); + + O_GET_IN_MEMORY_PAGEDESC(insert_item->rightBlkno)->leftBlkno = left_blkno; + btree_split_mark_finished(insert_item->rightBlkno, false, true); + insert_item->rightBlkno = OInvalidInMemoryBlkno; + + btree_page_update_max_key_len(desc, p); + + unlock_page(desc->rootInfo.rootPageBlkno); + unlock_page(left_blkno); + + if (is_leaf) + pg_atomic_fetch_add_u32(&BTREE_GET_META(desc)->leafPagesNum, 1); + + return 
left_blkno; +} + +/* + * Fixes incomplete split of a non-rootPageBlkno page. + * Left page must be locked. Unlocks left page and all pages used internally. + */ +static void +o_btree_fix_page_split(BTreeDescr *desc, OInMemoryBlkno left_blkno) +{ + BTreeInsertStackItem iitem; + OBTreeFindPageContext context; + Page p = O_GET_IN_MEMORY_PAGE(left_blkno); + BTreePageHeader *header = (BTreePageHeader *) p; + BTreePageHeader *rightHeader = (BTreePageHeader *) p; + OFixedKey key; + OInMemoryBlkno rightBlkno; + int level = PAGE_GET_LEVEL(p); + + Assert(left_blkno != desc->rootInfo.rootPageBlkno); + + iitem.context = &context; + copy_fixed_hikey(desc, &key, p); + rightBlkno = RIGHTLINK_GET_BLKNO(header->rightLink); + rightHeader = (BTreePageHeader *) O_GET_IN_MEMORY_PAGE(rightBlkno); + lock_page(rightBlkno); + Assert(O_PAGE_IS(O_GET_IN_MEMORY_PAGE(rightBlkno), BROKEN_SPLIT)); + START_CRIT_SECTION(); + page_block_reads(rightBlkno); + rightHeader->flags &= ~O_BTREE_FLAG_BROKEN_SPLIT; + + /* + * Register split. That would put back O_BTREE_FLAG_BROKEN_SPLIT on + * error. + */ + btree_register_inprogress_split(rightBlkno); + END_CRIT_SECTION(); + unlock_page(rightBlkno); + unlock_page(left_blkno); + + ppool_reserve_pages(desc->ppool, PPOOL_RESERVE_FIND, 2); + + init_page_find_context(iitem.context, desc, COMMITSEQNO_INPROGRESS, BTREE_PAGE_FIND_MODIFY); + + find_page(iitem.context, &key, BTreeKeyPageHiKey, level + 1); + iitem.rightBlkno = rightBlkno; + iitem.replace = false; + iitem.refind = false; + iitem.level = level + 1; + iitem.next = NULL; + + o_btree_split_fill_downlink_item(&iitem, left_blkno, true); + o_btree_insert_item(&iitem, PPOOL_RESERVE_FIND); +} + +/* + * Fixes incomplete split of a page. + * Left page must be locked. Unlocks left page and all pages used internally. 
/*
 * Fixes incomplete split of a page, starting from its right page.
 *
 * The right page (rightBlkno) is expected to be locked by the caller: it is
 * unlocked here before the left page gets locked.  The left page is taken
 * from the right page's descriptor (leftBlkno).  After locking it, the
 * function verifies that its right link still points to rightBlkno with the
 * change count observed under the right page's lock.  If so, the split is
 * fixed via o_btree_split_fix_and_unlock(); otherwise the left page is just
 * unlocked and nothing else is done.
 */
void
o_btree_split_fix_for_right_page_and_unlock(BTreeDescr *desc, OInMemoryBlkno rightBlkno)
{
	OrioleDBPageDesc *rightPageDesc = O_GET_IN_MEMORY_PAGEDESC(rightBlkno);
	OInMemoryBlkno leftBlkno;
	BTreePageHeader *leftHeader;
	uint64		rightLink;
	uint32		rightChangeCount;

	/* Remember what identifies this split before releasing the right page. */
	leftBlkno = rightPageDesc->leftBlkno;
	rightChangeCount = O_PAGE_GET_CHANGE_COUNT(O_GET_IN_MEMORY_PAGE(rightBlkno));

	unlock_page(rightBlkno);

	lock_page(leftBlkno);
	leftHeader = (BTreePageHeader *) O_GET_IN_MEMORY_PAGE(leftBlkno);
	rightLink = leftHeader->rightLink;

	/* Proceed only if the left page still links to the very same right page. */
	if (RightLinkIsValid(rightLink) &&
		RIGHTLINK_GET_BLKNO(rightLink) == rightBlkno &&
		RIGHTLINK_GET_CHANGECOUNT(rightLink) == rightChangeCount)
	{
		o_btree_split_fix_and_unlock(desc, leftBlkno);
	}
	else
	{
		unlock_page(leftBlkno);
	}
}
/*
 * Describes one tuple queued for insertion by another process (a "waiter").
 * Filled by get_tuple_waiter_infos().
 */
typedef struct
{
	/* flags, data pointer (tuple header first) and total size of the item */
	BTreePageItem item;
	/* position of the waiter in the array of waiter process numbers */
	int			index;
	/* process number of the waiter; indexes lockerStates[] */
	int			pgprocno;
	/* set to true once the waiter's tuple is accepted for insertion */
	bool		inserted;
} TupleWaiterInfo;
+ */ +static int +get_tuple_waiter_infos(BTreeDescr *desc, + int tupleWaiterProcnums[BTREE_PAGE_MAX_SPLIT_ITEMS], + TupleWaiterInfo tupleWaiterInfos[BTREE_PAGE_MAX_SPLIT_ITEMS], + int tupleWaitersCount) +{ + int i; + int totalSize = 0; + + for (i = 0; i < tupleWaitersCount; i++) + { + OPageWaiterShmemState *lockerState = &lockerStates[tupleWaiterProcnums[i]]; + TupleWaiterInfo *tupleWaiterInfo = &tupleWaiterInfos[i]; + OTuple tuple; + + tuple.formatFlags = lockerState->tupleFlags; + tuple.data = &lockerState->tupleData.fixedData[BTreeLeafTuphdrSize]; + + tupleWaiterInfo->item.flags = lockerState->tupleFlags; + tupleWaiterInfo->item.data = lockerState->tupleData.fixedData; + tupleWaiterInfo->item.size = BTreeLeafTuphdrSize + + MAXALIGN(o_btree_len(desc, + tuple, + OTupleLength)); + tupleWaiterInfo->pgprocno = tupleWaiterProcnums[i]; + tupleWaiterInfo->index = i; + tupleWaiterInfo->inserted = false; + totalSize += tupleWaiterInfo->item.size; + } + + return totalSize; +} + +static int +waiter_info_cmp(const void *a, const void *b, void *arg) +{ + TupleWaiterInfo *wa = (TupleWaiterInfo *) a; + TupleWaiterInfo *wb = (TupleWaiterInfo *) b; + OTuple ta; + OTuple tb; + BTreeDescr *desc = (BTreeDescr *) arg; + + ta.formatFlags = wa->item.flags; + ta.data = wa->item.data + BTreeLeafTuphdrSize; + tb.formatFlags = wb->item.flags; + tb.data = wb->item.data + BTreeLeafTuphdrSize; + + return o_btree_cmp(desc, &ta, BTreeKeyLeafTuple, &tb, BTreeKeyLeafTuple); + +} + +/* + * Merge inputItems (existing leaf-page items plus the inserter's new tuple) + * with the queued waiter tuples. Walk both sequences in sort order and + * emit the merged result into outputItems. 
+ * + * The accept/reject decision for each waiter is a single global-budget gate: + * the entire merged byte total (including locator overhead for both halves + * of an eventual split) must fit within the combined two-page budget + * leftSpace + rightSpace, computed under whatever maxKeyLen the candidate + * waiter would inflate to. All inputItems are pre-counted into the running + * total — the original page already accommodated them, so as long as we + * never accept a waiter that pushes the total past the combined budget, + * inputItems are guaranteed to fit somewhere across the two output pages + * (no per-side bookkeeping is needed during the merge — btree_page_split_ + * location() picks the actual split point afterwards). + * + * If a waiter's acceptance would exceed the budget, it (and every later + * waiter) is dropped via finished = true. Conflicting waiters (same key + * as an input) are silently skipped. + * + * Returns true if the merged set doesn't fit a single page (a split is + * required), false otherwise. + */ +static bool +merge_waited_tuples(BTreeDescr *desc, Page p, BTreeSplitItems *outputItems, + BTreeSplitItems *inputItems, + TupleWaiterInfo tupleWaiterInfos[BTREE_PAGE_MAX_SPLIT_ITEMS], + int tupleWaitersCount) +{ + int inputIndex = 0, + outputIndex = 0, + waitersIndex = 0; + int totalSize = 0, + totalCount = 0; + int rightSpace, + singlePageSpace; + int maxKeyLen; + int i; + bool finished = false; + + /* + * Stack of waiters accepted so far, in acceptance (= sort) order. Used + * by the post-pass dry-run gate to drop one waiter at a time without + * re-walking the merge. Each entry records where the waiter ended up in + * outputItems[], its index in tupleWaiterInfos[], and the aligned length + * of its key (so we can decide whether dropping it requires a full + * maxKeyLen rescan). 
+ */ + struct + { + int outputPos; + int waiterIdx; + int keyLen; + } accepted[BTREE_PAGE_MAX_SPLIT_ITEMS] = {0}; + int acceptedTop = 0; + + outputItems->leaf = inputItems->leaf; + outputItems->hikeySize = inputItems->hikeySize; + outputItems->hikeysEnd = inputItems->hikeysEnd; + outputItems->itemsCount = 0; + + /* + * Pre-count every input. After the merge runs, totalSize / totalCount + * always include all inputs plus every accepted waiter, which makes the + * input-fit invariant trivial: as long as the candidate total fits the + * two-page budget, the inputs (a subset of total) fit by construction. + */ + for (i = 0; i < inputItems->itemsCount; i++) + totalSize += inputItems->items[i].size; + totalCount = inputItems->itemsCount; + + maxKeyLen = inputItems->maxKeyLen; + + /* + * The right page inherits the original page's hikey, so its budget + * doesn't change as the merge progresses. singlePageSpace is the + * "no-split" budget — same formula as rightSpace, used at the end to + * decide whether the merged set fits on a single page. + */ + rightSpace = ORIOLEDB_BLCKSZ - + Max(inputItems->hikeysEnd, + MAXALIGN(sizeof(BTreePageHeader)) + inputItems->hikeySize); + singlePageSpace = rightSpace; + + while (inputIndex < inputItems->itemsCount || + (waitersIndex < tupleWaitersCount && !finished)) + { + int cmp; + BTreePageItem item; + bool isWaiter; + + /* Pick the next item in sort order. */ + if (inputIndex >= inputItems->itemsCount) + { + cmp = 1; + } + else if (waitersIndex >= tupleWaitersCount || finished) + { + cmp = -1; + } + else + { + OTuple tup1; + OTuple tup2; + + tup1.formatFlags = inputItems->items[inputIndex].flags; + tup1.data = inputItems->items[inputIndex].data + BTreeLeafTuphdrSize; + tup2.formatFlags = tupleWaiterInfos[waitersIndex].item.flags; + tup2.data = tupleWaiterInfos[waitersIndex].item.data + BTreeLeafTuphdrSize; + cmp = o_btree_cmp(desc, + &tup1, BTreeKeyLeafTuple, + &tup2, BTreeKeyLeafTuple); + + /* Conflicting waiter is silently dropped. 
*/ + if (cmp == 0) + { + waitersIndex++; + continue; + } + } + + Assert(cmp != 0); + isWaiter = (cmp > 0); + + if (isWaiter) + { + OTuple tup; + int newKeyLen, + newMaxKeyLen, + newLeftSpace, + candidateTotalSize; + + item = tupleWaiterInfos[waitersIndex].item; + tup.formatFlags = item.flags; + tup.data = item.data + BTreeLeafTuphdrSize; + newKeyLen = MAXALIGN(o_btree_len(desc, tup, + OTupleKeyLengthNoVersion)); + newMaxKeyLen = Max(maxKeyLen, newKeyLen); + newLeftSpace = ORIOLEDB_BLCKSZ - + Max(inputItems->hikeysEnd, + MAXALIGN(sizeof(BTreePageHeader)) + newMaxKeyLen); + + candidateTotalSize = totalSize + item.size; + + /* + * Global budget gate. Allow per-page locator overhead twice — + * once for each side of a split — as a conservative upper bound + * on any actual split's locator cost. If the candidate set + * wouldn't fit, drop this waiter and every later one. + */ + if (candidateTotalSize + + 2 * MAXALIGN((totalCount + 1) * sizeof(LocationIndex)) > + newLeftSpace + rightSpace) + { + finished = true; + continue; + } + + tupleWaiterInfos[waitersIndex].inserted = true; + accepted[acceptedTop].outputPos = outputIndex; + accepted[acceptedTop].waiterIdx = waitersIndex; + accepted[acceptedTop].keyLen = newKeyLen; + acceptedTop++; + outputItems->items[outputIndex++] = item; + waitersIndex++; + totalSize = candidateTotalSize; + totalCount++; + maxKeyLen = newMaxKeyLen; + } + else + { + /* + * Inputs are pre-counted into totalSize / totalCount. Just emit + * them in sort order. + */ + outputItems->items[outputIndex++] = inputItems->items[inputIndex++]; + } + + Assert(outputIndex < BTREE_PAGE_MAX_SPLIT_ITEMS); + } + + outputItems->itemsCount = outputIndex; + + /* + * Re-derive maxKeyLen from items actually inserted; rejected waiters' + * wider keys must not leak into outputItems->maxKeyLen. 
+ */ + outputItems->maxKeyLen = inputItems->maxKeyLen; + for (i = 0; i < acceptedTop; i++) + outputItems->maxKeyLen = Max(outputItems->maxKeyLen, + accepted[i].keyLen); + + /* + * Post-pass: the global-budget gate above is a conservative upper bound + * on what fits two pages, but it does not exactly mirror + * btree_page_split_location()'s loop (which is sensitive to the specific + * sequence of items at the boundary). If the merged set still doesn't + * have a valid split, drop the most recently accepted waiter (pop the + * stack), shift its outputItems slot out, and re-check. Repeat until the + * dry-run passes — or the stack empties, in which case the inputs alone + * are guaranteed to be splittable since they came from a valid page. + */ + while (totalSize + + MAXALIGN(totalCount * sizeof(LocationIndex)) > singlePageSpace && + !btree_page_split_can_succeed(outputItems) && + acceptedTop > 0) + { + int pos; + int waiterIdx; + int keyLen; + int itemSize; + int j; + + acceptedTop--; + pos = accepted[acceptedTop].outputPos; + waiterIdx = accepted[acceptedTop].waiterIdx; + keyLen = accepted[acceptedTop].keyLen; + itemSize = outputItems->items[pos].size; + + tupleWaiterInfos[waiterIdx].inserted = false; + totalSize -= itemSize; + totalCount--; + + for (j = pos; j < outputIndex - 1; j++) + outputItems->items[j] = outputItems->items[j + 1]; + outputIndex--; + outputItems->itemsCount = outputIndex; + + /* + * The dropped waiter's key only mattered for outputItems->maxKeyLen + * if it was the maximum; otherwise the existing max still bounds + * everything that's left. + */ + if (outputItems->maxKeyLen == keyLen) + { + outputItems->maxKeyLen = inputItems->maxKeyLen; + for (j = 0; j < acceptedTop; j++) + outputItems->maxKeyLen = Max(outputItems->maxKeyLen, + accepted[j].keyLen); + } + } + + /* + * Split is needed iff the merged set doesn't fit a single page. 
When a + * split is needed, the post-pass above must have driven the merged set + * into a state that btree_page_split_location() can actually partition. + */ + { + bool splitNeeded = totalSize + + MAXALIGN(totalCount * sizeof(LocationIndex)) > singlePageSpace; + + Assert(!splitNeeded || btree_page_split_can_succeed(outputItems)); + return splitNeeded; + } +} + +static void +o_btree_insert_mark_split_finished_if_needed(BTreeInsertStackItem *insert_item) +{ + if (insert_item->rightBlkno != OInvalidInMemoryBlkno) + { + btree_split_mark_finished(insert_item->rightBlkno, true, true); + btree_unregister_inprogress_split(insert_item->rightBlkno); + insert_item->rightBlkno = OInvalidInMemoryBlkno; + } +} + +static bool +o_btree_insert_split(BTreeInsertStackItem *insert_item, + BTreeSplitItems *items, + OffsetNumber offset, + CommitSeqNo csn, + bool needsUndo, + int reserve_kind, + int *waitersWakeupProcnums, + int waitersWakeupCount) +{ + OffsetNumber left_count; + OBTreeFindPageContext *curContext = insert_item->context; + BTreeDescr *desc = curContext->desc; + OInMemoryBlkno blkno, + right_blkno = OInvalidInMemoryBlkno, + root_split_left_blkno = OInvalidInMemoryBlkno; + Page p; + OTuple split_key; + LocationIndex split_key_len; + UndoLocation undoLocation; + BTreeNonLeafTuphdr *internal_header; + bool next; + Jsonb *params = NULL; + + blkno = curContext->items[curContext->index].blkno; + p = O_GET_IN_MEMORY_PAGE(blkno); + + if (STOPEVENTS_ENABLED()) + params = btree_page_stopevent_params(desc, p); + + left_count = btree_get_split_left_count(desc, p, offset, + insert_item->replace, + items, + &split_key, &split_key_len); + + /* Make page-level undo item if needed */ + if (needsUndo) + undoLocation = page_add_image_to_undo(desc, p, csn, + &split_key, split_key_len); + else + undoLocation = InvalidUndoLocation; + + internal_header = palloc(sizeof(BTreeNonLeafTuphdr)); + + START_CRIT_SECTION(); + + if (blkno == desc->rootInfo.rootPageBlkno) + root_split_left_blkno = 
ppool_alloc_page(desc->ppool, reserve_kind); + right_blkno = ppool_alloc_page(desc->ppool, reserve_kind); + + /* + * Move hikeyBlkno of split. This change is atomic, no need to bother + * about change count. + */ + if (checkpoint_state->stack[insert_item->level].hikeyBlkno == blkno) + checkpoint_state->stack[insert_item->level].hikeyBlkno = right_blkno; + + perform_page_split(desc, blkno, right_blkno, items, + left_count, split_key, split_key_len, + csn, undoLocation); + + o_btree_insert_mark_split_finished_if_needed(insert_item); + + unlock_page(right_blkno); + + if (waitersWakeupCount > 0) + mark_waiter_tuples_inserted(waitersWakeupProcnums, + waitersWakeupCount); + + o_btree_split_fill_downlink_item_with_key(insert_item, blkno, false, + split_key, split_key_len, + internal_header); + + if (blkno == desc->rootInfo.rootPageBlkno) + { + Assert(curContext->index == 0); + + insert_item->rightBlkno = right_blkno; + + blkno = o_btree_finish_root_split_internal(desc, + root_split_left_blkno, + insert_item); + + next = true; + END_CRIT_SECTION(); + } + else + { + /* node and leafs split */ + btree_register_inprogress_split(right_blkno); + if (insert_item->level == 0) + pg_atomic_fetch_add_u32(&BTREE_GET_META(desc)->leafPagesNum, 1); + + unlock_page_after_split(blkno); + + curContext->index--; + insert_item->refind = true; + next = false; + END_CRIT_SECTION(); + insert_item->rightBlkno = right_blkno; + + } + + + if (STOPEVENT_CONDITION(STOPEVENT_SPLIT_FAIL, params)) + elog(ERROR, "Debug condition: page has been splitted."); + + STOPEVENT(STOPEVENT_PAGE_SPLIT, params); + + if (!next) + { + /* Split non-rootPageBlkno case. Insert a downlink. 
*/ + insert_item->replace = false; + insert_item->level++; + } + + return next; +} + +static void +tuple_waiters_check_hikey(BTreeDescr *desc, Page p, + TupleWaiterInfo tupleWaiterInfos[BTREE_PAGE_MAX_SPLIT_ITEMS], + int *tupleWaitersCount) +{ + OTuple hikey; + int count = (*tupleWaitersCount); + + if (O_PAGE_IS(p, RIGHTMOST)) + return; + + BTREE_PAGE_GET_HIKEY(hikey, p); + + while (count > 0) + { + OTuple waiterTup; + + waiterTup.formatFlags = tupleWaiterInfos[count - 1].item.flags; + waiterTup.data = tupleWaiterInfos[count - 1].item.data + BTreeLeafTuphdrSize; + + if (o_btree_cmp(desc, + &waiterTup, BTreeKeyLeafTuple, + &hikey, BTreeKeyNonLeafKey) < 0) + break; + count--; + } + + (*tupleWaitersCount) = count; +} + +static bool +o_btree_insert_needs_page_undo(BTreeDescr *desc, Page p) +{ + bool needsUndo = O_PAGE_IS(p, LEAF) && desc->undoType != UndoLogNone; + + if (needsUndo && OXidIsValid(desc->createOxid) && + desc->createOxid == get_current_oxid_if_any()) + needsUndo = false; + + return needsUndo; +} + +static bool +o_btree_insert_item_with_waiters(BTreeInsertStackItem *insert_item, + int reserve_kind, + int tupleWaiterProcnums[BTREE_PAGE_MAX_SPLIT_ITEMS], + int tupleWaitersCount) +{ + BTreeDescr *desc = insert_item->context->desc; + OBTreeFindPageContext *curContext = insert_item->context; + BTreeSplitItems items; + BTreeSplitItems newItems; + int i, + waitersWakeupCount = 0; + CommitSeqNo csn; + bool needsUndo; + OffsetNumber offset; + bool split; + OInMemoryBlkno blkno; + TupleWaiterInfo tupleWaiterInfos[BTREE_PAGE_MAX_SPLIT_ITEMS]; + Page p; + int totalSize; + BTreePageItemLocator loc; + + totalSize = get_tuple_waiter_infos(desc, + tupleWaiterProcnums, + tupleWaiterInfos, + tupleWaitersCount); + + blkno = curContext->items[curContext->index].blkno; + Assert(OInMemoryBlknoIsValid(blkno)); + p = O_GET_IN_MEMORY_PAGE(blkno); + + if (tupleWaitersCount <= BTREE_PAGE_ITEMS_COUNT(p) && + MAXALIGN(insert_item->tuplen) + BTreeLeafTuphdrSize + + totalSize + 
MAXALIGN(sizeof(LocationIndex)) * (tupleWaitersCount + 1) <= + BTREE_PAGE_FREE_SPACE(p)) + { + + page_block_reads(blkno); + + for (i = 0; i <= tupleWaitersCount; i++) + { + LocationIndex tuplen; + LocationIndex keyLen; + BTreePageHeader *header = (BTreePageHeader *) p; + BTreeLeafTuphdr tuphdr; + OTuple tuple; + Pointer ptr; + + if (i == 0) + { + loc = curContext->items[curContext->index].locator; + tuple = insert_item->tuple; + tuplen = insert_item->tuplen; + tuphdr = *((BTreeLeafTuphdr *) insert_item->tupheader); + START_CRIT_SECTION(); + } + else + { + TupleWaiterInfo *waiterInfo = &tupleWaiterInfos[i - 1]; + OPageWaiterShmemState *lockerState = &lockerStates[waiterInfo->pgprocno]; + + tuple.formatFlags = waiterInfo->item.flags; + tuple.data = waiterInfo->item.data + BTreeLeafTuphdrSize; + tuphdr = *((BTreeLeafTuphdr *) waiterInfo->item.data); + tuplen = waiterInfo->item.size - BTreeLeafTuphdrSize; + + if (!O_PAGE_IS(p, RIGHTMOST)) + { + OTuple hikey; + + hikey = page_get_hikey(p); + if (o_btree_cmp(desc, &tuple, BTreeKeyLeafTuple, &hikey, BTreeKeyNonLeafKey) >= 0) + continue; + + } + + btree_page_search(desc, p, (Pointer) &tuple, + BTreeKeyLeafTuple, NULL, &loc); + + if (!page_locator_fits_new_item(p, &loc, waiterInfo->item.size)) + break; + + if (BTREE_PAGE_LOCATOR_IS_VALID(p, &loc)) + { + OTuple existingTup; + + BTREE_PAGE_READ_LEAF_TUPLE(existingTup, p, &loc); + + if (o_btree_cmp(desc, &tuple, BTreeKeyLeafTuple, &existingTup, BTreeKeyLeafTuple) == 0) + continue; + } + + START_CRIT_SECTION(); + if (desc->undoType != UndoLogNone) + { + steal_reserved_undo_size(desc->undoType, + lockerState->reservedUndoSize); + make_waiter_undo_record(desc, blkno, + waiterInfo->pgprocno, + lockerState); + } + lockerState->inserted = true; + } + + page_locator_insert_item(p, &loc, MAXALIGN(tuplen) + BTreeLeafTuphdrSize); + header->prevInsertOffset = BTREE_PAGE_LOCATOR_GET_OFFSET(p, &loc); + keyLen = MAXALIGN(o_btree_len(desc, tuple, OTupleKeyLengthNoVersion)); + 
header->maxKeyLen = Max(header->maxKeyLen, keyLen); + + /* Copy new tuple and header */ + ptr = BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc); + memcpy(ptr, &tuphdr, BTreeLeafTuphdrSize); + ptr += BTreeLeafTuphdrSize; + memcpy(ptr, tuple.data, tuplen); + BTREE_PAGE_SET_ITEM_FLAGS(p, &loc, tuple.formatFlags); + + if (!(tuple.formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT)) + header->chunkDesc[loc.chunkOffset].chunkKeysFixed = 0; + MARK_DIRTY(desc, blkno); + END_CRIT_SECTION(); + } + + unlock_page(blkno); + + + return true; + } + + qsort_arg(tupleWaiterInfos, + tupleWaitersCount, + sizeof(TupleWaiterInfo), + waiter_info_cmp, + desc); + + tuple_waiters_check_hikey(desc, p, + tupleWaiterInfos, + &tupleWaitersCount); + + loc = curContext->items[curContext->index].locator; + offset = BTREE_PAGE_LOCATOR_GET_OFFSET(p, &loc); + + needsUndo = o_btree_insert_needs_page_undo(desc, p); + + /* Get CSN for undo item if needed */ + if (needsUndo) + csn = pg_atomic_read_u64(&TRANSAM_VARIABLES->nextCommitSeqNo); + else + csn = COMMITSEQNO_INPROGRESS; + + make_split_items(desc, p, &items, &offset, + insert_item->tupheader, + insert_item->tuple, + insert_item->tuplen, + insert_item->replace, + csn); + + split = merge_waited_tuples(desc, p, &newItems, &items, + tupleWaiterInfos, + tupleWaitersCount); + + for (i = 0; i < tupleWaitersCount; i++) + { + if (tupleWaiterInfos[i].inserted) + { + OPageWaiterShmemState *lockerState = &lockerStates[tupleWaiterInfos[i].pgprocno]; + + tupleWaiterProcnums[waitersWakeupCount++] = tupleWaiterInfos[i].pgprocno; + + if (desc->undoType != UndoLogNone) + { + steal_reserved_undo_size(desc->undoType, + lockerState->reservedUndoSize); + make_waiter_undo_record(desc, blkno, + tupleWaiterInfos[i].pgprocno, + lockerState); + } + } + } + + Assert(items.itemsCount + waitersWakeupCount == newItems.itemsCount); + + if (!split) + { + START_CRIT_SECTION(); + perform_page_compaction(desc, blkno, &newItems, needsUndo, csn); + MARK_DIRTY(desc, blkno); + + if (waitersWakeupCount > 0) 
+			mark_waiter_tuples_inserted(tupleWaiterProcnums,
+										waitersWakeupCount);
+
+		o_btree_insert_mark_split_finished_if_needed(insert_item);
+		unlock_page(blkno);
+		END_CRIT_SECTION();
+		return true;
+	}
+	else
+	{
+		return o_btree_insert_split(insert_item, &newItems, offset, csn,
+									needsUndo, reserve_kind,
+									tupleWaiterProcnums,
+									waitersWakeupCount);
+	}
+}
+/*
+ * Insert (or replace) the tuple of insert_item on the page its find context
+ * points to.  o_btree_insert_item() takes this path when there are no
+ * waiters' tuples to merge into the page.
+ *
+ * If the item fits as is, it is written in place inside a critical section.
+ * Otherwise the page contents are collected with make_split_items() and the
+ * page is either compacted or handed over to o_btree_insert_split().
+ *
+ * Returns true on the in-place and compaction paths (the page is unlocked
+ * before returning there); otherwise returns the result of
+ * o_btree_insert_split().
+ */
+static bool
+o_btree_insert_item_no_waiters(BTreeInsertStackItem *insert_item,
+							   int reserve_kind)
+{
+	BTreeDescr *desc = insert_item->context->desc;
+	OBTreeFindPageContext *curContext = insert_item->context;
+	OInMemoryBlkno blkno;
+	LocationIndex tupheaderlen;
+	LocationIndex newItemSize;
+	BTreePageItemLocator loc;
+	Page		p;
+	BTreeItemPageFitType fit;
+	BTreePageHeader *header;
+
+	blkno = curContext->items[curContext->index].blkno;
+	loc = curContext->items[curContext->index].locator;
+	tupheaderlen = (insert_item->level > 0) ?
+		BTreeNonLeafTuphdrSize : BTreeLeafTuphdrSize;
+
+	Assert(OInMemoryBlknoIsValid(blkno));
+	p = O_GET_IN_MEMORY_PAGE(blkno);
+	header = (BTreePageHeader *) p;
+	newItemSize = MAXALIGN(insert_item->tuplen) + tupheaderlen;
+
+	/*
+	 * Pass the current value of nextCommitSeqNo to page_locator_fits_item().
+	 * The result could be somewhat pessimistic: it might happen that we
+	 * could actually compact more due to the advance of nextCommitSeqNo.
+	 */
+	fit = page_locator_fits_item(desc,
+								 p,
+								 &loc,
+								 newItemSize,
+								 insert_item->replace,
+								 pg_atomic_read_u64(&TRANSAM_VARIABLES->nextCommitSeqNo));
+
+	if (fit == BTreeItemPageFitAsIs)
+	{
+		Pointer		ptr;
+
+		START_CRIT_SECTION();
+		page_block_reads(blkno);
+
+		if (!insert_item->replace)
+		{
+			LocationIndex keyLen;
+
+			page_locator_insert_item(p, &loc, newItemSize);
+			header->prevInsertOffset = BTREE_PAGE_LOCATOR_GET_OFFSET(p, &loc);
+
+			if (O_PAGE_IS(p, LEAF))
+				keyLen = MAXALIGN(o_btree_len(desc, insert_item->tuple, OTupleKeyLengthNoVersion));
+			else
+				keyLen = MAXALIGN(insert_item->tuplen);
+			header->maxKeyLen = Max(header->maxKeyLen, keyLen);
+		}
+		else
+		{
+			int			prevItemSize;
+			BTreeLeafTuphdr prev;
+
+			prev = *((BTreeLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc));
+			prevItemSize = BTREE_PAGE_GET_ITEM_SIZE(p, &loc);
+			Assert(O_PAGE_IS(p, LEAF));
+
+			if (!prev.deleted)
+			{
+				OTuple		tuple;
+
+				BTREE_PAGE_READ_TUPLE(tuple, p, &loc);
+				PAGE_ADD_N_VACATED(p, BTreeLeafTuphdrSize + MAXALIGN(o_btree_len(desc, tuple, OTupleLength)));
+			}
+
+			/*
+			 * If the new tuple is not larger than the previous one, don't
+			 * resize the page item immediately.  We want to be able to roll
+			 * back this action without page splits.
+			 *
+			 * Page compaction will re-use unoccupied page space when needed.
+			 */
+			if (newItemSize > prevItemSize)
+			{
+				page_locator_resize_item(p, &loc, newItemSize);
+				PAGE_SUB_N_VACATED(p, prevItemSize);
+				header->prevInsertOffset = BTREE_PAGE_LOCATOR_GET_OFFSET(p, &loc);
+			}
+			else
+			{
+				OTuple		tuple pg_attribute_unused();
+
+				BTREE_PAGE_READ_TUPLE(tuple, p, &loc);
+				PAGE_SUB_N_VACATED(p, BTreeLeafTuphdrSize +
+								   MAXALIGN(insert_item->tuplen));
+				header->prevInsertOffset = MaxOffsetNumber;
+			}
+
+			/*
+			 * We replace tuples only in leaves.  Only inserts go to the
+			 * non-leaf pages.
+			 */
+			Assert(insert_item->level == 0);
+		}
+
+		/* Copy new tuple and header */
+		ptr = BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc);
+		memcpy(ptr, insert_item->tupheader, tupheaderlen);
+		ptr += tupheaderlen;
+		memcpy(ptr, insert_item->tuple.data, insert_item->tuplen);
+		BTREE_PAGE_SET_ITEM_FLAGS(p, &loc, insert_item->tuple.formatFlags);
+
+		if (!(insert_item->tuple.formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT))
+			header->chunkDesc[loc.chunkOffset].chunkKeysFixed = 0;
+
+		page_split_chunk_if_needed(desc, p, &loc);
+
+		MARK_DIRTY(desc, blkno);
+
+		o_btree_insert_mark_split_finished_if_needed(insert_item);
+		unlock_page(blkno);
+
+		END_CRIT_SECTION();
+
+		return true;
+	}
+	else
+	{
+		BTreeSplitItems items;
+		OffsetNumber offset;
+		CommitSeqNo csn;
+		bool		needsUndo;
+
+		/*
+		 * No compaction should occur for bridge index: we need to keep the
+		 * entries for VACUUM.
+		 */
+		Assert(fit == BTreeItemPageFitSplitRequired ||
+			   desc->type != oIndexBridge);
+
+		offset = BTREE_PAGE_LOCATOR_GET_OFFSET(p, &loc);
+
+		/*
+		 * Get CSN for undo item if needed.
+		 *
+		 * NOTE(review): this path advances nextCommitSeqNo with
+		 * pg_atomic_fetch_add_u64(), while o_btree_insert_item_with_waiters()
+		 * only reads it with pg_atomic_read_u64() at the same step -- confirm
+		 * that the difference is intended.
+		 */
+		needsUndo = o_btree_insert_needs_page_undo(desc, p);
+		if (needsUndo)
+			csn = pg_atomic_fetch_add_u64(&TRANSAM_VARIABLES->nextCommitSeqNo, 1);
+		else
+			csn = COMMITSEQNO_INPROGRESS;
+
+		make_split_items(desc, p, &items, &offset,
+						 insert_item->tupheader,
+						 insert_item->tuple,
+						 insert_item->tuplen,
+						 insert_item->replace,
+						 csn);
+
+		/*
+		 * After make_split_items() reclaims deleted tuples, the remaining
+		 * items may fit on a single page even if page_locator_fits_item()
+		 * estimated a split was needed.  Check actual total size and do
+		 * compaction instead of split when possible.
+		 */
+		if (fit == BTreeItemPageFitCompactRequired ||
+			(O_PAGE_IS(p, LEAF) && split_items_fit_single_page(&items)))
+		{
+			START_CRIT_SECTION();
+
+			perform_page_compaction(desc, blkno, &items, needsUndo, csn);
+			header->prevInsertOffset = offset;
+
+			MARK_DIRTY(desc, blkno);
+			o_btree_insert_mark_split_finished_if_needed(insert_item);
+			unlock_page(blkno);
+
+			END_CRIT_SECTION();
+
+			return true;
+		}
+
+		return o_btree_insert_split(insert_item, &items, offset, csn,
+									needsUndo, reserve_kind, NULL, 0);
+	}
+}
+/*
+ * Process the stack of pending inserts that starts at insert_item, one item
+ * per loop iteration, until the stack is empty.  The pages reserved under
+ * reserve_kind are released at the end.
+ */
+static void
+o_btree_insert_item(BTreeInsertStackItem *insert_item, int reserve_kind)
+{
+	BTreeKeyType kind;
+	BTreeDescr *desc = insert_item->context->desc;
+	OInMemoryBlkno blkno = OInvalidInMemoryBlkno;
+
+	Assert(insert_item != NULL);
+	/*
+	 * NOTE(review): insert_item is already dereferenced by the initializer
+	 * of desc above, so this assertion cannot catch a NULL argument.
+	 */
+	/*--
+	 * Guarantees that we never have recursive calls of o_btree_insert_item() such
+	 * as:
+	 * o_btree_insert_item()->refind_page()->find_page()
+	 *					  ->o_btree_fix_page_split()->o_btree_insert_item()
+	 *
+	 * Reasons:
+	 *
+	 * 1. o_btree_insert_item() algorithm fixes broken splits itself for pages
+	 * found by refind_page().
+	 * 2. Inner call of ppool_reserve_pages(kind, 2) with the same kind is
+	 * incorrect.
+ */ + Assert(!(insert_item->context->flags & BTREE_PAGE_FIND_FIX_LEAF_SPLIT)); + + while (insert_item != NULL) + { + OBTreeFindPageContext *curContext = insert_item->context; + bool next = false; + int tupleWaiterProcnums[BTREE_PAGE_MAX_SPLIT_ITEMS]; + int tupleWaitersCount; + + Assert(desc->ppool->numPagesReserved[reserve_kind] >= 2); + + if (insert_item->level > 0) + kind = BTreeKeyNonLeafKey; + else + kind = BTreeKeyLeafTuple; + + if (insert_item->level == 0) + { + + Assert(curContext->index >= 0 && curContext->index < ORIOLEDB_MAX_DEPTH); + blkno = curContext->items[curContext->index].blkno; + + /* + * it can be called only from o_btree_insert_tuple_to_leaf() + * o_btree_insert_tuple_to_leaf() can be called only from + * o_btree_normal_modify() + */ + + /* + * we already make incomplete split checks in (re)find_page() + * inside o_btree_normal_modify(). + */ + Assert(insert_item->refind == false); + } + else + { + bool relocked = false; + uint32 pageChangeCount; + + if (insert_item->refind) + { + OFindPageResult result PG_USED_FOR_ASSERTS_ONLY; + + /* + * Re-find appropriate tree page. It might happen that parent + * page is not available in context. That may happen due to + * concurrent rootPageBlkno split or page location using hint. + * Then just find appropriate page from the rootPageBlkno. 
+ */ + BTREE_PAGE_FIND_UNSET(curContext, IMAGE); + if (curContext->index >= 0) + result = refind_page(curContext, &insert_item->tuple, kind, + insert_item->level, + curContext->items[curContext->index].blkno, + curContext->items[curContext->index].pageChangeCount); + else + result = find_page(curContext, &insert_item->tuple, kind, + insert_item->level); + Assert(result == OFindPageResultSuccess); + insert_item->refind = false; + } + + Assert(curContext->index >= 0 && curContext->index < ORIOLEDB_MAX_DEPTH); + blkno = curContext->items[curContext->index].blkno; + pageChangeCount = curContext->items[curContext->index].pageChangeCount; + + if (o_btree_split_is_incomplete(blkno, + pageChangeCount, + &relocked)) + { + /* pushes fix split item to the insert context */ + insert_item = o_btree_insert_stack_push_split_item(insert_item, + blkno); + continue; + } + else if (relocked) + { + /* page is changed, we should refind current tuple */ + unlock_page(blkno); + insert_item->refind = true; + continue; + } + } + + Assert(OInMemoryBlknoIsValid(blkno)); + + if (insert_item->level > 0 && + page_is_under_checkpoint(desc, blkno, false)) + { + /* + * We change a node that is under checkpoint and must mark it as + * autonomous. 
+ */ + backend_set_autonomous_level(checkpoint_state, insert_item->level); + } + + if (insert_item->level == 0 && !insert_item->replace) + { + if (STOPEVENTS_ENABLED()) + { + Page page = O_GET_IN_MEMORY_PAGE(blkno); + Jsonb *params; + + params = btree_page_stopevent_params(desc, page); + STOPEVENT(STOPEVENT_BEFORE_GET_WAITERS_WITH_TUPLES, params); + } + + tupleWaitersCount = get_waiters_with_tuples(desc, blkno, tupleWaiterProcnums); + } + else + tupleWaitersCount = 0; + + if (tupleWaitersCount > 0) + next = o_btree_insert_item_with_waiters(insert_item, + reserve_kind, + tupleWaiterProcnums, + tupleWaitersCount); + else + next = o_btree_insert_item_no_waiters(insert_item, + reserve_kind); + + if (next) + insert_item = insert_item->next; + + if (insert_item != NULL) + ppool_reserve_pages(desc->ppool, reserve_kind, 2); + } + ppool_release_reserved(desc->ppool, PPOOL_KIND_GET_MASK(reserve_kind)); +} + +void +o_btree_insert_tuple_to_leaf(OBTreeFindPageContext *context, + OTuple tuple, LocationIndex tuplen, + BTreeLeafTuphdr *tuphdr, bool replace, + int reserve_kind) +{ + BTreeInsertStackItem insert_item; + MemoryContext prev_context; + bool nested_call; + + nested_call = CurrentMemoryContext == btree_insert_context; + if (!nested_call) + prev_context = MemoryContextSwitchTo(btree_insert_context); + + context->flags &= ~(BTREE_PAGE_FIND_FIX_LEAF_SPLIT); + insert_item.next = NULL; + insert_item.context = context; + insert_item.tuple = tuple; + insert_item.tuplen = tuplen; + insert_item.tupheader = (Pointer) tuphdr; + insert_item.level = 0; + insert_item.replace = replace; + insert_item.rightBlkno = OInvalidInMemoryBlkno; + insert_item.refind = false; + + o_btree_insert_item(&insert_item, reserve_kind); + + if (!nested_call) + { + MemoryContextSwitchTo(prev_context); + MemoryContextResetOnly(btree_insert_context); + } +} diff --git a/contrib/orioledb/src/btree/io.c b/contrib/orioledb/src/btree/io.c new file mode 100644 index 00000000000..37f1449af95 --- /dev/null +++ 
b/contrib/orioledb/src/btree/io.c @@ -0,0 +1,3665 @@ +/*------------------------------------------------------------------------- + * + * io.c + * Routines for orioledb B-tree disk IO. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/src/btree/io.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include +#include +#include + +#include "orioledb.h" + +#include "btree/io.h" +#include "btree/find.h" +#include "btree/merge.h" +#include "btree/page_chunks.h" +#include "btree/scan.h" +#include "btree/undo.h" +#include "checkpoint/checkpoint.h" +#include "catalog/free_extents.h" +#include "catalog/o_sys_cache.h" +#include "recovery/recovery.h" +#include "s3/headers.h" +#include "s3/worker.h" +#include "tableam/descr.h" +#include "tableam/handler.h" +#include "utils/compress.h" +#include "utils/elog.h" +#include "utils/page_pool.h" +#include "utils/seq_buf.h" +#include "utils/stopevent.h" +#include "utils/ucm.h" +#include "workers/bgwriter.h" + +#include "access/transam.h" +#include "access/relation.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "utils/memutils.h" +#include "utils/syscache.h" +#include "funcapi.h" + +typedef struct +{ + pg_atomic_uint64 writesStarted; + pg_atomic_uint64 writesFinished; + ConditionVariable cv[FLEXIBLE_ARRAY_MEMBER]; +} IOShmem; + +typedef struct TreeOffset +{ + OIndexKey key; + int segno; + uint32 chkpNum; + FileExtent fileExtent; + bool compressed; +} TreeOffset; + +typedef struct IOWriteBack +{ + int extentsNumber; + int extentsAllocated; + TreeOffset *extents; +} IOWriteBack; + +static IOWriteBack io_writeback = +{ + 0, 0, NULL +}; +static LWLockPadded *io_locks; +static IOShmem *ioShmem = NULL; +static int num_io_lwlocks; +static bool io_in_progress = false; + +static bool prepare_non_leaf_page(Page p); +static uint64 get_free_disk_offset(BTreeDescr 
*desc);
+static bool get_free_disk_extent(BTreeDescr *desc, uint32 chkpNum,
+								 off_t page_size, FileExtent *extent);
+static bool get_free_disk_extent_copy_blkno(BTreeDescr *desc, off_t page_size,
+											FileExtent *extent, uint32 checkpoint_number);
+
+static bool write_page_to_disk(BTreeDescr *desc, FileExtent *extent,
+							   uint32 curChkpNum,
+							   Pointer page, off_t page_size);
+static void write_page(OBTreeFindPageContext *context,
+					   OInMemoryBlkno blkno, Page img,
+					   uint32 checkpoint_number,
+					   bool evict, bool copy_blkno);
+static int	tree_offsets_cmp(const void *a, const void *b);
+static void writeback_put_extent(IOWriteBack *writeback, BTreeDescr *desc,
+								 uint64 downlink);
+static void perform_writeback(IOWriteBack *writeback);
+
+PG_FUNCTION_INFO_V1(orioledb_evict_pages);
+PG_FUNCTION_INFO_V1(orioledb_write_pages);
+/*
+ * Size of the shared IOShmem area: the fixed header plus one
+ * ConditionVariable per max_procs slot, rounded up to a cache line.
+ */
+Size
+btree_io_shmem_needs(void)
+{
+	return CACHELINEALIGN(offsetof(IOShmem, cv) +
+						  sizeof(ConditionVariable) * max_procs);
+}
+/*
+ * Point ioShmem at buf.  When the area was not found (i.e. it has just been
+ * created), zero both counters and initialize every condition variable.
+ */
+void
+btree_io_shmem_init(Pointer buf, bool found)
+{
+	Pointer		ptr = buf;
+
+	ioShmem = (IOShmem *) ptr;
+	if (!found)
+	{
+		int			i;
+
+		pg_atomic_init_u64(&ioShmem->writesStarted, 0);
+		pg_atomic_init_u64(&ioShmem->writesFinished, 0);
+
+		for (i = 0; i < max_procs; i++)
+			ConditionVariableInit(&ioShmem->cv[i]);
+	}
+}
+/*
+ * Enter an I/O operation, throttled by max_io_concurrency (0 disables the
+ * throttle entirely).
+ *
+ * The caller takes a ticket by incrementing writesStarted and sleeps on the
+ * condition variable slot of that ticket while the ticket is more than
+ * max_io_concurrency ahead of writesFinished.  io_in_progress is raised
+ * before waiting, so btree_io_error_cleanup() can balance the counters if
+ * the operation is abandoned.
+ */
+static void
+io_start(void)
+{
+	uint64		startNum;
+	bool		slept = false;
+
+	if (max_io_concurrency == 0)
+		return;
+
+	startNum = pg_atomic_add_fetch_u64(&ioShmem->writesStarted, 1);
+	io_in_progress = true;
+	while (startNum > pg_atomic_read_u64(&ioShmem->writesFinished) + max_io_concurrency)
+	{
+		ConditionVariableSleep(&ioShmem->cv[startNum % max_procs], WAIT_EVENT_PG_SLEEP);
+		slept = true;
+	}
+	if (slept)
+		ConditionVariableCancelSleep();
+}
+/*
+ * Leave an I/O operation: bump writesFinished, clear io_in_progress and wake
+ * the slot of ticket (finishNum + max_io_concurrency), which is the ticket
+ * that has just become eligible in io_start().
+ */
+static void
+io_finish(void)
+{
+	uint64		finishNum;
+
+	if (max_io_concurrency == 0)
+		return;
+
+	finishNum = pg_atomic_add_fetch_u64(&ioShmem->writesFinished, 1);
+	io_in_progress = false;
+	ConditionVariableBroadcast(&ioShmem->cv[(finishNum +
max_io_concurrency) % max_procs]); +} + +int +OFileRead(File file, char *buffer, int amount, off_t offset, + uint32 wait_event_info) +{ + int result; + + io_start(); + result = FileRead(file, buffer, amount, offset, wait_event_info); + io_finish(); + return result; +} + +int +OFileWrite(File file, char *buffer, int amount, off_t offset, + uint32 wait_event_info) +{ + int result; + + io_start(); + result = FileWrite(file, buffer, amount, offset, wait_event_info); + io_finish(); + return result; +} + +typedef struct +{ + uint32 checkpointNumber; + uint32 segmentNumber; +} FileHashKey; + +typedef struct +{ + FileHashKey key; + File file; + uint32 loadId; + char status; /* for simplehash use */ +} FileHashElement; + +#define SH_PREFIX s3Files +#define SH_ELEMENT_TYPE FileHashElement +#define SH_KEY_TYPE FileHashKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) hash_any((unsigned char *) &key, sizeof(FileHashKey)) +#define SH_EQUAL(tb, a, b) memcmp(&a, &b, sizeof(FileHashKey)) == 0 +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +char * +btree_filename(OIndexKey key, int segno, uint32 chkpNum) +{ + char *result; + char *db_prefix; + + o_get_prefixes_for_tablespace(key.oids.datoid, key.tablespace, + NULL, &db_prefix); + + if (orioledb_s3_mode) + { + if (segno == 0) + result = psprintf("%s/%u-%u", + db_prefix, + key.oids.relnode, + chkpNum); + else + result = psprintf("%s/%u.%u-%u", + db_prefix, + key.oids.relnode, + segno, + chkpNum); + } + else + { + if (segno == 0) + result = psprintf("%s/%u", + db_prefix, + key.oids.relnode); + else + result = psprintf("%s/%u.%u", + db_prefix, + key.oids.relnode, + segno); + } + + pfree(db_prefix); + return result; +} + +char * +btree_smgr_filename(BTreeDescr *desc, off_t offset, uint32 chkpNum) +{ + int segno = offset / ORIOLEDB_SEGMENT_SIZE; + OIndexKey key = {.oids = desc->oids,.tablespace = desc->tablespace}; + + return btree_filename(key, segno, chkpNum); +} + +static File 
+btree_open_smgr_file(BTreeDescr *desc, uint32 num, uint32 chkpNum, + uint32 loadId) +{ + if (orioledb_s3_mode) + { + FileHashElement *hashElem; + FileHashKey key; + bool found; + char *filename; + + key.checkpointNumber = chkpNum; + key.segmentNumber = num; + hashElem = s3Files_insert(desc->smgr.hash, key, &found); + if (found) + { + if (hashElem->loadId == loadId) + return hashElem->file; + else + FileClose(hashElem->file); + } + + filename = btree_smgr_filename(desc, + (off_t) num * ORIOLEDB_SEGMENT_SIZE, + chkpNum); + hashElem->file = PathNameOpenFile(filename, O_RDWR | O_CREAT | PG_BINARY); + hashElem->loadId = loadId; + if (hashElem->file <= 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open data file %s: %m", filename))); + pfree(filename); + return hashElem->file; + } + else + { + char *filename; + + if (num >= desc->smgr.array.filesAllocated) + { + int i = desc->smgr.array.filesAllocated; + + /* + * btree_open_smgr should have been called before, so + * filesAllocated should be greater than 0 + */ + Assert(desc->smgr.array.filesAllocated > 0); + + while (num >= desc->smgr.array.filesAllocated) + desc->smgr.array.filesAllocated *= 2; + + desc->smgr.array.files = (File *) repalloc(desc->smgr.array.files, + sizeof(File) * desc->smgr.array.filesAllocated); + for (; i < desc->smgr.array.filesAllocated; i++) + desc->smgr.array.files[i] = -1; + } + + if (desc->smgr.array.files[num] >= 0) + return desc->smgr.array.files[num]; + + filename = btree_smgr_filename(desc, + (off_t) num * ORIOLEDB_SEGMENT_SIZE, + chkpNum); + desc->smgr.array.files[num] = PathNameOpenFile(filename, O_RDWR | O_CREAT | PG_BINARY); + + if (desc->smgr.array.files[num] <= 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open data file %s: %m", filename))); + pfree(filename); + return desc->smgr.array.files[num]; + } +} + +void +btree_init_smgr(BTreeDescr *descr) +{ + if (orioledb_s3_mode) + { + descr->smgr.hash = NULL; + } + else + { + 
descr->smgr.array.files = NULL;
+		descr->smgr.array.filesAllocated = 0;
+	}
+}
+/*
+ * Prepare the per-tree file bookkeeping.
+ *
+ * S3 mode: create the (checkpoint, segment) -> File hash in TopMemoryContext
+ * and reset buildPartsInfo.  NOTE(review): unlike the array branch there is
+ * no already-open check, so a repeated call replaces descr->smgr.hash without
+ * destroying the previous table -- confirm callers never re-open.
+ *
+ * Otherwise: allocate a 16-slot File array filled with -1 (a no-op when the
+ * array already exists) and open segment 0 right away.
+ */
+void
+btree_open_smgr(BTreeDescr *descr)
+{
+	if (orioledb_s3_mode)
+	{
+		int			i;
+		int			j;
+
+		descr->smgr.hash = s3Files_create(TopMemoryContext, 16, NULL);
+
+		for (i = 0; i < 2; i++)
+		{
+			descr->buildPartsInfo[i].writeMaxLocation = 0;
+			for (j = 0; j < MAX_NUM_DIRTY_PARTS; j++)
+			{
+				descr->buildPartsInfo[i].dirtyParts[j].segNum = -1;
+				descr->buildPartsInfo[i].dirtyParts[j].partNum = -1;
+			}
+		}
+	}
+	else
+	{
+		int			i;
+
+		if (descr->smgr.array.files)
+			return;
+
+		descr->smgr.array.filesAllocated = 16;
+		descr->smgr.array.files = (File *) MemoryContextAlloc(TopMemoryContext,
+															  sizeof(File) * descr->smgr.array.filesAllocated);
+		for (i = 0; i < descr->smgr.array.filesAllocated; i++)
+			descr->smgr.array.files[i] = -1;
+		(void) btree_open_smgr_file(descr, 0, 0, 0);
+	}
+}
+/*
+ * Release the file bookkeeping of the tree.  In S3 mode the dirty parts
+ * still recorded in buildPartsInfo are scheduled for writing first (slots
+ * with a negative segNum or partNum are empty), then every File kept in the
+ * hash is closed.
+ */
+void
+btree_close_smgr(BTreeDescr *descr)
+{
+	int			i;
+
+	if (orioledb_s3_mode)
+	{
+		int			j;
+
+		for (j = 0; j < 2; j++)
+		{
+			for (i = 0; i < MAX_NUM_DIRTY_PARTS; i++)
+			{
+				S3TaskLocation location;
+				uint32		chkpNum;
+				int32		segNum,
+							partNum;
+
+				chkpNum = descr->buildPartsInfo[j].dirtyParts[i].chkpNum;
+				segNum = descr->buildPartsInfo[j].dirtyParts[i].segNum;
+				partNum = descr->buildPartsInfo[j].dirtyParts[i].partNum;
+				if (segNum >= 0 && partNum >= 0)
+				{
+					OIndexKey	key = {.oids = descr->oids,
+					.tablespace = descr->tablespace};
+
+					location = s3_schedule_file_part_write(chkpNum, key, segNum,
+														   partNum);
+					descr->buildPartsInfo[j].writeMaxLocation =
+						Max(descr->buildPartsInfo[j].writeMaxLocation, location);
+				}
+				descr->buildPartsInfo[j].dirtyParts[i].chkpNum = 0;
+				descr->buildPartsInfo[j].dirtyParts[i].segNum = -1;
+				descr->buildPartsInfo[j].dirtyParts[i].partNum = -1;
+			}
+		}
+
+		if (descr->smgr.hash)
+		{
+			s3Files_iterator i;
+			FileHashElement *hashElem;
+
+			s3Files_start_iterate(descr->smgr.hash, &i);
+			while ((hashElem = s3Files_iterate(descr->smgr.hash, &i)) != NULL)
+				FileClose(hashElem->file);
+
+
s3Files_destroy(descr->smgr.hash); + } + } + else if (descr->smgr.array.files) + { + for (i = 0; i < descr->smgr.array.filesAllocated; i++) + { + if (descr->smgr.array.files[i] >= 0) + FileClose(descr->smgr.array.files[i]); + } + pfree(descr->smgr.array.files); + } + descr->smgr.array.filesAllocated = 0; + descr->smgr.array.files = NULL; +} + +static void +btree_s3_flush(BTreeDescr *desc, uint32 chkpNum) +{ + int i; + BTreeMetaPage *meta = BTREE_GET_META(desc); + + for (i = 0; i < MAX_NUM_DIRTY_PARTS; i++) + { + S3TaskLocation location; + int32 segNum, + partNum; + + segNum = meta->partsInfo[chkpNum % 2].dirtyParts[i].segNum; + partNum = meta->partsInfo[chkpNum % 2].dirtyParts[i].partNum; + if (segNum >= 0 && partNum >= 0) + { + OIndexKey key = {.oids = desc->oids,.tablespace = desc->tablespace}; + + Assert(chkpNum == meta->partsInfo[chkpNum % 2].dirtyParts[i].chkpNum); + location = s3_schedule_file_part_write(chkpNum, key, segNum, partNum); + meta->partsInfo[chkpNum % 2].writeMaxLocation = + Max(meta->partsInfo[chkpNum % 2].writeMaxLocation, location); + } + meta->partsInfo[chkpNum % 2].dirtyParts[i].segNum = -1; + meta->partsInfo[chkpNum % 2].dirtyParts[i].partNum = -1; + } +} + +static void +btree_smgr_schedule_s3_write(BTreeDescr *desc, uint32 chkpNum, + int32 segNum, int32 partNum) +{ + int i; + int32 curSegNum, + curPartNum, + curChkpNum, + tmpSegNum, + tmpPartNum, + tmpChkpNum; + BTreeS3PartsInfo *partsInfo = NULL; + + if (OInMemoryBlknoIsValid(desc->rootInfo.metaPageBlkno)) + { + BTreeMetaPage *meta = BTREE_GET_META(desc); + + partsInfo = meta->partsInfo; + } + else + { + partsInfo = desc->buildPartsInfo; + } + + curSegNum = segNum; + curPartNum = partNum; + curChkpNum = chkpNum; + for (i = 0; i < MAX_NUM_DIRTY_PARTS; i++) + { + tmpSegNum = partsInfo[chkpNum % 2].dirtyParts[i].segNum; + tmpPartNum = partsInfo[chkpNum % 2].dirtyParts[i].partNum; + tmpChkpNum = partsInfo[chkpNum % 2].dirtyParts[i].chkpNum; + partsInfo[chkpNum % 2].dirtyParts[i].segNum = 
curSegNum;
+		partsInfo[chkpNum % 2].dirtyParts[i].partNum = curPartNum;
+		partsInfo[chkpNum % 2].dirtyParts[i].chkpNum = curChkpNum;
+		curSegNum = tmpSegNum;
+		curPartNum = tmpPartNum;
+		curChkpNum = tmpChkpNum;
+
+		if ((curSegNum == segNum &&
+			 curPartNum == partNum &&
+			 curChkpNum == chkpNum) ||
+			curSegNum < 0)
+			break;
+
+		if (i == MAX_NUM_DIRTY_PARTS - 1)
+		{
+			S3TaskLocation location;
+			OIndexKey	key = {.oids = desc->oids,.tablespace = desc->tablespace};
+
+			location = s3_schedule_file_part_write(curChkpNum, key, curSegNum,
+												   curPartNum);
+			partsInfo[chkpNum % 2].writeMaxLocation =
+				Max(partsInfo[chkpNum % 2].writeMaxLocation, location);
+		}
+	}
+}
+
+/*
+ * Write `amount` bytes from `buffer` at logical `offset` of the tree's data.
+ *
+ * With use_mmap / use_device the bytes go straight to the mapped region or
+ * to the raw device.  Otherwise the range is cut at segment boundaries (at
+ * S3 part boundaries in S3 mode) and every piece is written with
+ * OFileWrite().  In S3 mode each piece is written between
+ * s3_header_lock_part() and s3_header_unlock_part(), file offsets are
+ * shifted by ORIOLEDB_BLCKSZ, and the first and last touched parts are
+ * recorded through btree_smgr_schedule_s3_write() afterwards.
+ *
+ * Returns the sum of the underlying write results.
+ */
+static int
+btree_smgr_write(BTreeDescr *desc, char *buffer, uint32 chkpNum,
+				 int amount, off_t offset)
+{
+	int			result = 0;
+	int			totalAmount = amount;	/* 'amount' is consumed by the loop */
+	off_t		curOffset = offset,
+				granularity;
+	S3HeaderTag tag = {0};
+
+	if (use_mmap)
+	{
+		Assert(offset + amount <= device_length);
+		memcpy(mmap_data + offset, buffer, amount);
+		return amount;
+	}
+	else if (use_device)
+	{
+		Assert(offset + amount <= device_length);
+		pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_WRITE);
+		result = pg_pwrite(device_fd, buffer, amount, offset);
+		pgstat_report_wait_end();
+		return result;
+	}
+
+	if (orioledb_s3_mode)
+	{
+		granularity = ORIOLEDB_S3_PART_SIZE;
+		tag.key.oids = desc->oids;
+		tag.key.tablespace = desc->tablespace;
+		tag.checkpointNum = chkpNum;
+	}
+	else
+	{
+		granularity = ORIOLEDB_SEGMENT_SIZE;
+	}
+
+	while (amount > 0)
+	{
+		int			segno = curOffset / ORIOLEDB_SEGMENT_SIZE;
+		int			partno = 0;
+		File		file;
+		uint32		loadId = 0;
+
+		if (orioledb_s3_mode)
+		{
+			tag.segNum = segno;
+			partno = (curOffset % ORIOLEDB_SEGMENT_SIZE) / ORIOLEDB_S3_PART_SIZE;
+			s3_header_lock_part(tag, partno, &loadId);
+		}
+
+		file = btree_open_smgr_file(desc, segno, chkpNum, loadId);
+		if ((curOffset + amount) / granularity == curOffset / granularity)
+		{
+			/* The rest fits into the current segment/part: last piece. */
+			result += OFileWrite(file, buffer, amount,
+								 curOffset % ORIOLEDB_SEGMENT_SIZE + (orioledb_s3_mode ? ORIOLEDB_BLCKSZ : 0),
+								 WAIT_EVENT_DATA_FILE_WRITE);
+			if (orioledb_s3_mode)
+				s3_header_unlock_part(tag, partno, true);
+			break;
+		}
+		else
+		{
+			/* Write up to the next boundary and continue from there. */
+			int			stepAmount = granularity - curOffset % granularity;
+
+			Assert(amount >= stepAmount);
+			result += OFileWrite(file, buffer, stepAmount,
+								 curOffset % ORIOLEDB_SEGMENT_SIZE + (orioledb_s3_mode ? ORIOLEDB_BLCKSZ : 0),
+								 WAIT_EVENT_DATA_FILE_WRITE);
+			buffer += stepAmount;
+			curOffset += stepAmount;
+			amount -= stepAmount;
+		}
+
+		if (orioledb_s3_mode)
+			s3_header_unlock_part(tag, partno, true);
+	}
+
+	if (orioledb_s3_mode)
+	{
+		/*
+		 * Record the first and the last touched part.  The last byte has to
+		 * be derived from the original request size: once the loop has run,
+		 * 'amount' only holds what was left for the final piece (or 0 when
+		 * the write ended exactly on a boundary), so using it here would
+		 * miss the second part of a straddling write.
+		 */
+		off_t		lastByte = offset + totalAmount - 1;
+
+		btree_smgr_schedule_s3_write(desc,
+									 chkpNum,
+									 offset / ORIOLEDB_SEGMENT_SIZE,
+									 (offset % ORIOLEDB_SEGMENT_SIZE) / ORIOLEDB_S3_PART_SIZE);
+		if (offset / ORIOLEDB_S3_PART_SIZE != lastByte / ORIOLEDB_S3_PART_SIZE)
+			btree_smgr_schedule_s3_write(desc,
+										 chkpNum,
+										 lastByte / ORIOLEDB_SEGMENT_SIZE,
+										 (lastByte % ORIOLEDB_SEGMENT_SIZE) / ORIOLEDB_S3_PART_SIZE);
+	}
+
+	return result;
+}
+
+int
+btree_smgr_read(BTreeDescr *desc, char *buffer, uint32 chkpNum,
+				int amount, off_t offset)
+{
+	int			result = 0;
+	off_t		granularity;
+	S3HeaderTag tag = {0};
+
+	if (use_mmap)
+	{
+		Assert(offset + amount <= device_length);
+		memcpy(buffer, mmap_data + offset, amount);
+		return amount;
+	}
+	else if (use_device)
+	{
+		Assert(offset + amount <= device_length);
+		pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_READ);
+		result = pg_pread(device_fd, buffer, amount, offset);
+		pgstat_report_wait_end();
+		return result;
+	}
+
+	if (orioledb_s3_mode)
+	{
+		granularity = ORIOLEDB_S3_PART_SIZE;
+		tag.key.oids = desc->oids;
+		tag.key.tablespace = desc->tablespace;
+		tag.checkpointNum = chkpNum;
+	}
+	else
+	{
+		granularity = ORIOLEDB_SEGMENT_SIZE;
+	}
+
+	while (amount > 0)
+	{
+		int			segno = offset / ORIOLEDB_SEGMENT_SIZE;
+		int			partno = 0;
+		File		file;
+		uint32		loadId = 0;
+
+		if (orioledb_s3_mode)
+		{
+			tag.segNum = segno;
+			partno = (offset % ORIOLEDB_SEGMENT_SIZE) /
ORIOLEDB_S3_PART_SIZE; + s3_header_lock_part(tag, partno, &loadId); + } + + file = btree_open_smgr_file(desc, segno, chkpNum, loadId); + if ((offset + amount) / granularity == offset / granularity) + { + result += OFileRead(file, buffer, amount, + offset % ORIOLEDB_SEGMENT_SIZE + (orioledb_s3_mode ? ORIOLEDB_BLCKSZ : 0), + WAIT_EVENT_DATA_FILE_READ); + if (orioledb_s3_mode) + s3_header_unlock_part(tag, partno, false); + break; + } + else + { + int stepAmount = granularity - offset % granularity; + + Assert(amount >= stepAmount); + result += OFileRead(file, buffer, stepAmount, + offset % ORIOLEDB_SEGMENT_SIZE + (orioledb_s3_mode ? ORIOLEDB_BLCKSZ : 0), + WAIT_EVENT_DATA_FILE_READ); + buffer += stepAmount; + offset += stepAmount; + amount -= stepAmount; + } + + if (orioledb_s3_mode) + s3_header_unlock_part(tag, partno, false); + } + + return result; +} + +void +btree_smgr_writeback(BTreeDescr *desc, uint32 chkpNum, + off_t offset, int amount) +{ + if (use_mmap) + { + Assert(offset + amount <= device_length); + msync(mmap_data + offset, amount, MS_ASYNC); + return; + } + else if (use_device) + { + return; + } + + while (amount > 0) + { + int segno = offset / ORIOLEDB_SEGMENT_SIZE; + File file; + uint32 loadId = 0; + + if (orioledb_s3_mode) + { + S3HeaderTag tag = { + .key = {.oids = desc->oids,.tablespace = desc->tablespace}, + .checkpointNum = chkpNum, + .segNum = segno}; + + loadId = s3_header_get_load_id(tag); + } + + file = btree_open_smgr_file(desc, segno, chkpNum, loadId); + if ((offset + amount) / ORIOLEDB_SEGMENT_SIZE == segno) + { + FileWriteback(file, + offset % ORIOLEDB_SEGMENT_SIZE + (orioledb_s3_mode ? ORIOLEDB_BLCKSZ : 0), + amount, WAIT_EVENT_DATA_FILE_FLUSH); + break; + } + else + { + int stepAmount = ORIOLEDB_SEGMENT_SIZE - offset % ORIOLEDB_SEGMENT_SIZE; + + Assert(amount >= stepAmount); + FileWriteback(file, + offset % ORIOLEDB_SEGMENT_SIZE + (orioledb_s3_mode ? 
ORIOLEDB_BLCKSZ : 0),
+						  stepAmount, WAIT_EVENT_DATA_FILE_FLUSH);
+			offset += stepAmount;
+			amount -= stepAmount;
+		}
+	}
+}
+/*
+ * Sync the data files of the tree for checkpoint chkpNum.
+ *
+ * In S3 mode the dirty parts recorded for chkpNum are flushed first via
+ * btree_s3_flush().  Nothing is synced here for the mmap and raw-device
+ * back ends.
+ *
+ * NOTE(review): the loop bound is length / ORIOLEDB_SEGMENT_SIZE, which
+ * rounds down, so a trailing partially filled segment is not passed to
+ * FileSync() here -- confirm what unit and rounding callers use for length.
+ */
+void
+btree_smgr_sync(BTreeDescr *desc, uint32 chkpNum, off_t length)
+{
+	int			num;
+
+	if (orioledb_s3_mode)
+		btree_s3_flush(desc, chkpNum);
+
+	if (use_mmap || use_device)
+		return;
+
+	for (num = 0; num < length / ORIOLEDB_SEGMENT_SIZE; num++)
+	{
+		File		file;
+		uint32		loadId = 0;
+
+		if (orioledb_s3_mode)
+		{
+			S3HeaderTag tag = {
+				.key = {.oids = desc->oids,.tablespace = desc->tablespace},
+				.checkpointNum = chkpNum,
+			.segNum = num};
+
+			loadId = s3_header_get_load_id(tag);
+		}
+
+		file = btree_open_smgr_file(desc, num, chkpNum, loadId);
+		FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
+	}
+}
+
+/*
+ * Punch a hole in a raw OS file descriptor: F_PUNCHHOLE via fcntl() on
+ * macOS, fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) elsewhere.
+ *
+ * A failure is only reported as a WARNING, with errno captured right after
+ * the call.  Nothing is returned to the caller, because the data is being
+ * discarded either way.  fileName is used for the message only.
+ */
+void
+punch_fd_hole(int fd, off_t offset, off_t length, const char *fileName)
+{
+	int			ret;
+
+#ifdef __APPLE__
+	{
+		fpunchhole_t hole;
+
+		memset(&hole, 0, sizeof(hole));
+		hole.fp_offset = offset;
+		hole.fp_length = length;
+		ret = fcntl(fd, F_PUNCHHOLE, &hole);
+	}
+#else
+	ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+					offset, length);
+#endif
+	if (ret < 0)
+	{
+		int			save_errno = errno;
+
+		ereport(WARNING,
+				(errcode_for_file_access(),
+				 errmsg("could not punch hole in file %s offset=%lld length=%lld (%d %s)",
+						fileName, (long long) offset, (long long) length,
+						save_errno, strerror(save_errno))));
+	}
+}
+/*
+ * Punch a hole over [offset, offset + length) of the tree's data, cutting
+ * the range at segment boundaries and calling punch_fd_hole() per segment.
+ * Only valid for plain file storage (asserted below).
+ */
+void
+btree_smgr_punch_hole(BTreeDescr *desc, uint32 chkpNum,
+					  off_t offset, int length)
+{
+	Assert(!orioledb_s3_mode && !use_mmap && !use_device);
+
+	while (length > 0)
+	{
+		File		file;
+		int			segno = offset / ORIOLEDB_SEGMENT_SIZE;
+		off_t		segoffset;
+		int			seglength;
+
+		file = btree_open_smgr_file(desc, segno, chkpNum, 0);
+
+		segoffset = offset % ORIOLEDB_SEGMENT_SIZE;
+		if ((offset + length) / ORIOLEDB_SEGMENT_SIZE
== segno) + { + seglength = length; + length = 0; + } + else + { + seglength = ORIOLEDB_SEGMENT_SIZE - segoffset; + Assert(length >= seglength); + + offset += seglength; + length -= seglength; + } + punch_fd_hole(FileGetRawDesc(file), segoffset, seglength, + FilePathName(file)); + } +} + +void +btree_io_error_cleanup(void) +{ + if (io_in_progress) + io_finish(); +} + +void +request_btree_io_lwlocks(void) +{ + num_io_lwlocks = max_procs * 4; + RequestNamedLWLockTranche("orioledb_btree_io", num_io_lwlocks); +} + +void +init_btree_io_lwlocks(void) +{ + io_locks = GetNamedLWLockTranche("orioledb_btree_io"); +} + +/* + * Assign number of IO operation to particular (blkno; offnum) pair. + */ +int +assign_io_num(OInMemoryBlkno blkno, OffsetNumber offnum) +{ + int locknum; + int i; + pg_crc32c crc; + + INIT_CRC32C(crc); + COMP_CRC32C(crc, &blkno, sizeof(blkno)); + COMP_CRC32C(crc, &offnum, sizeof(offnum)); + FIN_CRC32C(crc); + + locknum = crc % num_io_lwlocks; + + for (i = 0; i < num_io_lwlocks; i++) + { + if (LWLockConditionalAcquire(&io_locks[locknum].lock, LW_EXCLUSIVE)) + return locknum; + locknum = (locknum + 1) % num_io_lwlocks; + } + + LWLockAcquire(&io_locks[locknum].lock, LW_EXCLUSIVE); + return locknum; +} + +/* + * Wait until particular IO operation is completed. + */ +void +wait_for_io_completion(int ionum) +{ + LWLockAcquire(&io_locks[ionum].lock, LW_SHARED); + LWLockRelease(&io_locks[ionum].lock); +} + +/* + * Report given IO operation to be finished. + */ +void +unlock_io(int ionum) +{ + LWLockRelease(&io_locks[ionum].lock); +} + +/* + * Get next disk free offset for uncompressed on disk B-tree. + * Returns InvalidFileExtentOff if fails. 
+ */ +static uint64 +get_free_disk_offset(BTreeDescr *desc) +{ + BTreeMetaPage *metaPage = BTREE_GET_META(desc); + LWLock *metaLock = &metaPage->metaLock; + uint64 result, + numFreeBlocks; + uint32 free_buf_num; + bool gotBlock; + + Assert(!orioledb_s3_mode); + + /* + * Switch to the next sequential buffer with free blocks numbers in + * needed. + */ + numFreeBlocks = pg_atomic_read_u64(&metaPage->numFreeBlocks); + free_buf_num = metaPage->freeBuf.tag.num; + while (numFreeBlocks == 0 && + can_use_checkpoint_extents(desc, free_buf_num + 1)) + { + SeqBufTag tag = {0}, + old_tag = desc->freeBuf.shared->tag; + SeqBufReplaceResult replaceResult; + + if (orioledb_use_sparse_files) + { + try_to_punch_holes(desc); + Assert(free_buf_num + 1 <= metaPage->punchHolesChkpNum); + } + + tag.key.oids = desc->oids; + tag.key.tablespace = desc->tablespace; + tag.num = free_buf_num + 1; + tag.type = 't'; + + LWLockAcquire(metaLock, LW_EXCLUSIVE); + replaceResult = seq_buf_try_replace(&desc->freeBuf, + &tag, + &metaPage->numFreeBlocks, + use_device ? sizeof(FileExtent) : sizeof(uint32)); + if (replaceResult == SeqBufReplaceSuccess) + { + if (old_tag.type == 'm') + { + uint32 chkpNum = o_get_latest_chkp_num(tag.key.oids.datoid, + tag.key.oids.relnode, + checkpoint_state->lastCheckpointNumber, + NULL); + + if (old_tag.num < chkpNum) + seq_buf_remove_file(&old_tag); + } + else + { + Assert(old_tag.type == 't'); + if (!orioledb_use_sparse_files || + old_tag.num <= metaPage->punchHolesChkpNum) + seq_buf_remove_file(&old_tag); + } + } + LWLockRelease(metaLock); + if (replaceResult == SeqBufReplaceError) + { + return InvalidFileExtentOff; + } + /* SeqBufReplaceAlready requires no action, just retry if needed */ + + numFreeBlocks = pg_atomic_read_u64(&metaPage->numFreeBlocks); + free_buf_num = metaPage->freeBuf.tag.num; + } + + /* + * Try to get free block number from the buffer. If not success, then + * extend the file. 
+ */ + LWLockAcquire(metaLock, LW_SHARED); + gotBlock = false; + while (numFreeBlocks > 0) + { + if (pg_atomic_compare_exchange_u64(&metaPage->numFreeBlocks, + &numFreeBlocks, + numFreeBlocks - 1)) + { + gotBlock = true; + break; + } + } + + if (gotBlock) + { + + if (use_device) + { + FileExtent extent; + + if (seq_buf_read_file_extent(&desc->freeBuf, &extent)) + result = extent.off; + else + result = InvalidFileExtentOff; + } + else + { + uint32 offset; + + if (seq_buf_read_u32(&desc->freeBuf, &offset)) + result = offset; + else + result = InvalidFileExtentOff; + } + } + else + { + if (use_device) + result = orioledb_device_alloc(desc, ORIOLEDB_BLCKSZ) / ORIOLEDB_COMP_BLCKSZ; + else + result = pg_atomic_fetch_add_u64(&metaPage->datafileLength[0], 1); + } + LWLockRelease(metaLock); + return result; +} + +/* + * Fills free file extent for B-tree. + * + * FileExtentIsValid(extent) == false if fails. + */ +static bool +get_free_disk_extent(BTreeDescr *desc, uint32 chkpNum, + off_t page_size, FileExtent *extent) +{ + if (orioledb_s3_mode) + { + int len = OCompressIsValid(desc->compress) ? FileExtentLen(page_size) : 1; + int threshold = ORIOLEDB_S3_PART_SIZE / (OCompressIsValid(desc->compress) ? ORIOLEDB_COMP_BLCKSZ : ORIOLEDB_BLCKSZ); + BTreeMetaPage *metaPage = BTREE_GET_META(desc); + + extent->off = pg_atomic_fetch_add_u64(&metaPage->datafileLength[chkpNum % 2], len); + extent->len = len; + + if ((extent->off + threshold - 1) / threshold != + (extent->off + threshold - 1 + len) / threshold) + { + Assert((extent->off + threshold - 1) / threshold + 1 == + (extent->off + threshold - 1 + len) / threshold); + s3_headers_increase_loaded_parts(1); + } + + extent->off |= (uint64) chkpNum << S3_CHKP_NUM_SHIFT; + + return FileExtentIsValid(*extent); + } + + /* + * User temporary trees maintain a pure backend-local free space map. + * Serve the allocation from that list first, falling back to extending + * the data file. 
This avoids any dependency on checkpoint-tagged seq + * bufs. + */ + if (btree_desc_is_local_temp(desc)) + { + BTreeMetaPage *metaPage = BTREE_GET_META(desc); + uint16 len = OCompressIsValid(desc->compress) ? FileExtentLen(page_size) : 1; + + if (!local_free_extents_pop(desc, len, extent)) + { + extent->len = len; + if (use_device) + extent->off = orioledb_device_alloc(desc, len * ORIOLEDB_COMP_BLCKSZ) / ORIOLEDB_COMP_BLCKSZ; + else + extent->off = pg_atomic_fetch_add_u64(&metaPage->datafileLength[0], len); + } + return FileExtentIsValid(*extent); + } + + if (!OCompressIsValid(desc->compress)) + { + Assert(page_size == ORIOLEDB_BLCKSZ); + + extent->off = get_free_disk_offset(desc); + extent->len = 1; + } + else + { + /* Try to add free extents if we didn't manage to do after checkpoint */ + add_free_extents_from_tmp(desc, remove_old_checkpoint_files); + *extent = get_extent(desc, FileExtentLen(page_size)); + } + + return FileExtentIsValid(*extent); +} + +/* + * Fills free file extent for B-tree under copy blkno lock. + * + * FileExtentIsValid(extent) == false if fails. + */ +static bool +get_free_disk_extent_copy_blkno(BTreeDescr *desc, off_t page_size, + FileExtent *extent, uint32 checkpoint_number) +{ + BTreeMetaPage *metaPage = BTREE_GET_META(desc); + + LWLockAcquire(&metaPage->copyBlknoLock, LW_SHARED); + + if (!get_free_disk_extent(desc, checkpoint_number, page_size, extent)) + { + LWLockRelease(&metaPage->copyBlknoLock); + return false; + } + + if ((desc->storageType == BTreeStoragePersistence || desc->storageType == BTreeStorageUnlogged) && + checkpoint_state->treeType == desc->type && + checkpoint_state->datoid == desc->oids.datoid && + checkpoint_state->relnode == desc->oids.relnode && + checkpoint_state->curKeyType != CurKeyFinished) + { + /* + * We're writing to the next checkpoint, while current checkpoint is + * concurrently taking. So, indicate this page is free in the + * checkpoint currently taking. 
We have to take a lock in order to be + * sure that checkpoint map file will be finishing concurrently. + * Otherwise we might loose this block number. + */ + int prev_chkp_index = (checkpoint_number - 1) % 2; + bool success; + + if (OCompressIsValid(desc->compress) || use_device) + { + success = seq_buf_write_file_extent(&desc->nextChkp[prev_chkp_index], *extent); + } + else + { + uint32 offset = extent->off; + + Assert(offset < UINT32_MAX); + success = seq_buf_write_u32(&desc->nextChkp[prev_chkp_index], offset); + } + + if (!success) + { + LWLockRelease(&metaPage->copyBlknoLock); + return false; + } + } + + LWLockRelease(&metaPage->copyBlknoLock); + + return FileExtentIsValid(*extent); +} + +/* Functions for eviction_page_checkpoint_numbers test included under IS_DEV */ +PG_FUNCTION_INFO_V1(reset_read_page_checkpoint_stats); +PG_FUNCTION_INFO_V1(fetch_read_page_checkpoint_stats); + +Datum +reset_read_page_checkpoint_stats(PG_FUNCTION_ARGS) +{ + min_read_page_checkpoint = UINT32_MAX; + max_read_page_checkpoint = 0; + PG_RETURN_VOID(); +} + +Datum +fetch_read_page_checkpoint_stats(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + bool nulls[2] = {false}; + Datum values[2]; + + InitMaterializedSRF(fcinfo, 0); + + values[0] = UInt32GetDatum(min_read_page_checkpoint); + values[1] = UInt32GetDatum(max_read_page_checkpoint); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + + return (Datum) 0; +} + +/* Store checkpoint statistics for page reads for eviction_page_checkpoint_numbers test */ +static void +store_read_page_checkpoint_stats(uint32 checkpointNum) +{ + /* Remember for checkpoint read test only */ + max_read_page_checkpoint = Max(max_read_page_checkpoint, checkpointNum); + min_read_page_checkpoint = Min(min_read_page_checkpoint, checkpointNum); + elog(DEBUG1, "Remember read_page_checkpoin: min %u max %u", min_read_page_checkpoint, max_read_page_checkpoint); +} + +/* + * Now we have only one page 
version (1). When we have + * different versions we'll need to bump + * ORIOLEDB_PAGE_VERSION and implement on-the-fly conversion + * function from all previous page versions to use _after_ + * decompression. + */ +static bool +check_orioledb_page_version(OrioleDBOndiskPageHeader ondisk_page_header) +{ + if (ondisk_page_header.page_version != ORIOLEDB_PAGE_VERSION) + elog(FATAL, "Page version %u of OrioleDB cluster is not among supported for conversion %u", ondisk_page_header.page_version, ORIOLEDB_PAGE_VERSION); + + return false; +} + +static void +convert_orioledb_page_version(Pointer img) +{ + Assert(ORIOLEDB_PAGE_VERSION == 1); + elog(FATAL, "Page version conversion is not implemented"); +} + +/* + * Now we have only one compresss version (1). When we have + * different versions we'll need to bump + * ORIOLEDB_COMPRESS_VERSION and add other variants of + * decompress function from all previous page versions in + * this function + */ +static bool +check_orioledb_compress_version(OrioleDBOndiskPageHeader ondisk_page_header) +{ + if (ondisk_page_header.compress_version != ORIOLEDB_COMPRESS_VERSION) + elog(FATAL, "Page version %u of OrioleDB cluster is not among supported for conversion %u", ondisk_page_header.compress_version, ORIOLEDB_PAGE_VERSION); + + return false; +} + +/* + * Reads a page from disk to the img from a valid downlink. It's fills an empty + * array of offsets for the page. 
 */
bool
read_page_from_disk(BTreeDescr *desc, Pointer img, uint64 downlink,
					FileExtent *extent)
{
	off_t		byte_offset,
				read_size;
	uint64		offset = DOWNLINK_GET_DISK_OFF(downlink);
	uint32		chkpNum = 0;
	uint16		len = DOWNLINK_GET_DISK_LEN(downlink);
	bool		err = false;
	OrioleDBOndiskPageHeader ondisk_page_header = {0};
	bool		needs_page_version_convert;

	Assert(FileExtentOffIsValid(offset));
	Assert(FileExtentLenIsValid(len));

	/* The caller gets the extent exactly as encoded in the downlink */
	extent->off = offset;
	extent->len = len;

	/* In S3 mode the offset also carries the checkpoint number: split them */
	if (orioledb_s3_mode)
	{
		chkpNum = S3_GET_CHKP_NUM(offset);
		offset &= S3_OFFSET_MASK;
	}

	if (!OCompressIsValid(desc->compress))
	{
		/* easy case, read page from uncompressed index */
		Assert(len == 1);

		/* Offsets count ORIOLEDB_COMP_BLCKSZ units on a device, pages in files */
		if (use_device)
			byte_offset = (off_t) offset * (off_t) ORIOLEDB_COMP_BLCKSZ;
		else
			byte_offset = (off_t) offset * (off_t) ORIOLEDB_BLCKSZ;
		read_size = ORIOLEDB_BLCKSZ;

		err = btree_smgr_read(desc, img, chkpNum, read_size, byte_offset) != read_size;
		if (err)
			return false;

		/* The on-disk header occupies the start of the image just read */
		ondisk_page_header = *((OrioleDBOndiskPageHeader *) img);
		needs_page_version_convert = check_orioledb_page_version(ondisk_page_header);

		elog(DEBUG1, "Read plain disk page: checkpoint %u", ondisk_page_header.checkpointNum);
	}
	else
	{
		char		buf[ORIOLEDB_BLCKSZ];

		/* An extent as long as a whole page means the page was stored as is */
		bool		compressed = len != (ORIOLEDB_BLCKSZ / ORIOLEDB_COMP_BLCKSZ);

		if (compressed)
		{
			bool		needs_compress_version_convert PG_USED_FOR_ASSERTS_ONLY;

			byte_offset = (off_t) offset * (off_t) ORIOLEDB_COMP_BLCKSZ;
			read_size = len * ORIOLEDB_COMP_BLCKSZ;

			/* Read header and compressed data together into the local buffer */
			err = btree_smgr_read(desc, buf, chkpNum, read_size, byte_offset) != read_size;
			if (err)
				return false;

			ondisk_page_header = *((OrioleDBOndiskPageHeader *) buf);
			needs_page_version_convert = check_orioledb_page_version(ondisk_page_header);

			needs_compress_version_convert = check_orioledb_compress_version(ondisk_page_header);
			Assert(!needs_compress_version_convert);
			o_decompress_page(buf + O_PAGE_HEADER_SIZE, ondisk_page_header.compress_page_size, img);
			elog(DEBUG1, "Read disk page: checkpoint %u size %d", ondisk_page_header.checkpointNum, ondisk_page_header.compress_page_size);

			/*
			 * Decompressed page has its own OrioleDBPageHeader with the same
			 * checkpointNum as is external OrioleDBOndiskPageHeader. It is
			 * redundant and unused, just check it.
			 */
			Assert(((BTreePageHeader *) img)->o_header.checkpointNum == ondisk_page_header.checkpointNum);
		}
		else
		{
			byte_offset = (off_t) offset * (off_t) ORIOLEDB_COMP_BLCKSZ;
			read_size = O_PAGE_HEADER_SIZE;

			/* details about written image parts are in write_page_to_disk */
			err = btree_smgr_read(desc, (Pointer) &ondisk_page_header, chkpNum, read_size, byte_offset) != read_size;
			byte_offset += read_size;

			if (err)
				return false;

			/* The rest of the page goes straight into img, after the header */
			read_size = ORIOLEDB_BLCKSZ - O_PAGE_HEADER_SIZE;
			err = btree_smgr_read(desc, img + O_PAGE_HEADER_SIZE, chkpNum, read_size, byte_offset) != read_size;
			if (err)
				return false;

			needs_page_version_convert = check_orioledb_page_version(ondisk_page_header);
			elog(DEBUG1, "Read disk page: checkpoint %u size %d", ondisk_page_header.checkpointNum, ORIOLEDB_BLCKSZ);
		}
	}

	/*
	 * At this point, page is fully read and decompressed. Do conversion of
	 * needed data from OrioleDBOndiskPageHeader to OrioleDBPageHeader. Do
	 * conversion of page version (not implemented yet);
	 */
	Assert(!err);

	if (needs_page_version_convert)
		convert_orioledb_page_version(img);

	/*
	 * Convert needed data from OrioleDBOndiskPageHeader to
	 * OrioleDBPageHeader. Erase what's unused to be safe.
	 */
	memset(img, 0, O_PAGE_HEADER_SIZE);
	((BTreePageHeader *) img)->o_header.checkpointNum = ondisk_page_header.checkpointNum;

	/* For eviction/page checkpoint number test */
	store_read_page_checkpoint_stats(((BTreePageHeader *) img)->o_header.checkpointNum);

	return true;
}

/*
 * Writes a page to the disk. An array of file offsets must be valid.
+ */ +static bool +write_page_to_disk(BTreeDescr *desc, FileExtent *extent, uint32 curChkpNum, + Pointer page, off_t page_size) +{ + + off_t byte_offset, + write_size; + bool err = false; + uint32 chkpNum = 0; + char buf[ORIOLEDB_BLCKSZ]; + + Assert(FileExtentOffIsValid(extent->off)); + + byte_offset = (off_t) extent->off; + + if (orioledb_s3_mode) + { + chkpNum = S3_GET_CHKP_NUM(byte_offset); + byte_offset &= S3_OFFSET_MASK; + } + + if (!OCompressIsValid(desc->compress)) + { + OrioleDBOndiskPageHeader *ondisk_page_header; + + /* + * Easy case, write whole page to uncompressed index. + */ + Assert(extent->len == 1); + Assert(page_size == ORIOLEDB_BLCKSZ); + + if (use_device) + byte_offset *= (off_t) ORIOLEDB_COMP_BLCKSZ; + else + byte_offset *= (off_t) ORIOLEDB_BLCKSZ; + write_size = ORIOLEDB_BLCKSZ; + + memset(buf, 0, O_PAGE_HEADER_SIZE); + ondisk_page_header = (OrioleDBOndiskPageHeader *) buf; + ondisk_page_header->checkpointNum = curChkpNum; + ondisk_page_header->page_version = ORIOLEDB_PAGE_VERSION; + memcpy(&buf[O_PAGE_HEADER_SIZE], page + O_PAGE_HEADER_SIZE, ORIOLEDB_BLCKSZ - O_PAGE_HEADER_SIZE); + + err = btree_smgr_write(desc, buf, chkpNum, write_size, byte_offset) != write_size; + + elog(DEBUG1, "Wrote plain disk page: checkpoint %u", curChkpNum); + } + else + { + OrioleDBOndiskPageHeader ondisk_page_header = {0}; + + byte_offset *= (off_t) ORIOLEDB_COMP_BLCKSZ; + + /* + * overflow protection + */ + Assert(sizeof(((OrioleDBOndiskPageHeader *) 0)->compress_page_size) == sizeof(uint16)); + Assert(ORIOLEDB_BLCKSZ < UINT16_MAX); + + /* Write header first */ + ondisk_page_header.compress_page_size = page_size; + ondisk_page_header.checkpointNum = curChkpNum; + ondisk_page_header.compress_version = ORIOLEDB_COMPRESS_VERSION; + ondisk_page_header.page_version = ORIOLEDB_PAGE_VERSION; + + write_size = O_PAGE_HEADER_SIZE; + err = btree_smgr_write(desc, (char *) &ondisk_page_header, chkpNum, write_size, byte_offset) != write_size; + byte_offset += write_size; + + if 
(err) + return false; + + /* Write everything left except header, which is already written */ + if (page_size != ORIOLEDB_BLCKSZ) + { + /* + * Compressed chunks don't have external header, just make up for + * length + */ + write_size = extent->len * ORIOLEDB_COMP_BLCKSZ - O_PAGE_HEADER_SIZE; + err = btree_smgr_write(desc, page, chkpNum, write_size, byte_offset) != write_size; + } + else + { + /* + * For non-compresses page cut already written header and make up + * for length + */ + page += O_PAGE_HEADER_SIZE; + write_size = ORIOLEDB_BLCKSZ - O_PAGE_HEADER_SIZE; + err = btree_smgr_write(desc, page, chkpNum, write_size, byte_offset) != write_size; + } + + elog(DEBUG1, "Wrote disk page: checkpoint %u size %d", curChkpNum, (int) page_size); + + } + + return !err; +} + +/* + * Load the page where context is pointing from disk to memory, assuming parent + * page is locked. + */ +void +load_page(OBTreeFindPageContext *context) +{ + OrioleDBPageDesc *parent_page_desc, + *page_desc; + BTreeDescr *desc = context->desc; + OInMemoryBlkno parent_blkno; + Page parent_page; + BTreePageItemLocator *parent_loc; + CommitSeqNo csn; + uint64 downlink; + int context_index, + ionum; + uint32 parent_change_count; + BTreeNonLeafTuphdr *int_hdr; + OInMemoryBlkno blkno; + OFixedKey target_hikey; + int target_level; + Page page; + char buf[ORIOLEDB_BLCKSZ]; + bool was_modify; + bool was_downlink_location; + bool was_fetch = false; + bool was_image = false; + bool was_keep_lokey = false; + uint32 chkpNum = 0; + + context_index = context->index; + parent_blkno = context->items[context_index].blkno; + parent_loc = &context->items[context_index].locator; + parent_change_count = context->items[context_index].pageChangeCount; + parent_page = O_GET_IN_MEMORY_PAGE(parent_blkno); + + ionum = assign_io_num(parent_blkno, BTREE_PAGE_LOCATOR_GET_OFFSET(parent_page, parent_loc)); + + /* Modify parent downlink: indicate that IO is in-progress */ + page_block_reads(parent_blkno); + int_hdr = 
(BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(parent_page, parent_loc); + Assert(DOWNLINK_IS_ON_DISK(int_hdr->downlink)); + + downlink = int_hdr->downlink; + + int_hdr->downlink = MAKE_IO_DOWNLINK(ionum); + Assert(PAGE_GET_N_ONDISK(parent_page) > 0); + PAGE_DEC_N_ONDISK(parent_page); + + BTREE_PAGE_LOCATOR_NEXT(parent_page, parent_loc); + if (BTREE_PAGE_LOCATOR_IS_VALID(parent_page, parent_loc)) + copy_fixed_page_key(desc, &target_hikey, parent_page, parent_loc); + else if (!O_PAGE_IS(parent_page, RIGHTMOST)) + copy_fixed_hikey(desc, &target_hikey, parent_page); + else + clear_fixed_key(&target_hikey); + target_level = PAGE_GET_LEVEL(parent_page) - 1; + + unlock_page(parent_blkno); + + /* Prepare new page metaPage-data */ + ppool_reserve_pages(desc->ppool, PPOOL_RESERVE_FIND, 1); + blkno = ppool_alloc_page(desc->ppool, PPOOL_RESERVE_FIND); + lock_page(blkno); + page_block_reads(blkno); + + Assert(OInMemoryBlknoIsValid(blkno)); + page = O_GET_IN_MEMORY_PAGE(blkno); + parent_page_desc = O_GET_IN_MEMORY_PAGEDESC(parent_blkno); + page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno); + + page_desc->flags = 0; + + /* Read page data and put it to the page */ + if (!read_page_from_disk(desc, buf, downlink, &page_desc->fileExtent)) + { + int_hdr->downlink = downlink; + PAGE_INC_N_ONDISK(parent_page); + unlock_io(ionum); + if (orioledb_s3_mode) + chkpNum = S3_GET_CHKP_NUM(page_desc->fileExtent.off); + + ereport(ERROR, (errcode_for_file_access(), + errmsg("could not read page with file offset " UINT64_FORMAT " from %s: %m", + DOWNLINK_GET_DISK_OFF(downlink), + btree_smgr_filename(desc, DOWNLINK_GET_DISK_OFF(downlink), chkpNum)))); + } + + put_page_image(blkno, buf); + ppool_ucm_init(desc->ppool, blkno); + page_desc->type = parent_page_desc->type; + page_desc->oids = parent_page_desc->oids; + + Assert(O_PAGE_IS(page, LEAF) || + (PAGE_GET_N_ONDISK(page) == BTREE_PAGE_ITEMS_COUNT(page))); + + if (orioledb_s3_mode && !O_PAGE_IS(page, LEAF)) + { + BTreePageItemLocator loc; + + /* + * 
In S3 mode schedule load of all the page children for faster + * warmup. + */ + BTREE_PAGE_FOREACH_ITEMS(page, &loc) + { + BTreeNonLeafTuphdr *tupHdr; + + tupHdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(page, &loc); + (void) s3_schedule_downlink_load(desc, tupHdr->downlink); + } + } + + unlock_page(blkno); + + EA_LOAD_INC(blkno); + + if (STOPEVENTS_ENABLED()) + { + Jsonb *params; + + params = btree_page_stopevent_params(desc, page); + STOPEVENT(STOPEVENT_LOAD_PAGE_REFIND, params); + } + + /* re-find parent page (it might be changed due to concurrent operations) */ + csn = context->csn; + was_modify = BTREE_PAGE_FIND_IS(context, MODIFY); + was_image = BTREE_PAGE_FIND_IS(context, IMAGE); + BTREE_PAGE_FIND_UNSET(context, IMAGE); + if (!was_modify) + { + was_fetch = BTREE_PAGE_FIND_IS(context, FETCH); + Assert(was_fetch || was_image); + BTREE_PAGE_FIND_UNSET(context, FETCH); + BTREE_PAGE_FIND_SET(context, MODIFY); + } + was_keep_lokey = BTREE_PAGE_FIND_IS(context, KEEP_LOKEY); + if (was_keep_lokey) + BTREE_PAGE_FIND_UNSET(context, KEEP_LOKEY); + was_downlink_location = BTREE_PAGE_FIND_IS(context, DOWNLINK_LOCATION); + if (!was_downlink_location) + BTREE_PAGE_FIND_SET(context, DOWNLINK_LOCATION); + context->csn = COMMITSEQNO_INPROGRESS; + if (PAGE_GET_LEVEL(page) != target_level) + ereport(PANIC, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("error reading downlink %X/%X in relfile (%u, %u)", + (uint32) (downlink >> 32), (uint32) (downlink), + desc->oids.datoid, desc->oids.relnode), + errdetail("Level mismatch, expected: %d, found: %d", + PAGE_GET_LEVEL(page), target_level))); + + if (O_PAGE_IS(page, RIGHTMOST)) + { + OFindPageResult result PG_USED_FOR_ASSERTS_ONLY; + + if (!O_TUPLE_IS_NULL(target_hikey.tuple)) + ereport(PANIC, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("error reading downlink %X/%X in relfile (%u, %u)", + (uint32) (downlink >> 32), (uint32) (downlink), + desc->oids.datoid, desc->oids.relnode), + 
errdetail("Hikeys don't match."))); + result = refind_page(context, NULL, BTreeKeyRightmost, + PAGE_GET_LEVEL(page) + 1, + parent_blkno, parent_change_count); + Assert(result == OFindPageResultSuccess); + } + else + { + OTuple hikey; + OFindPageResult result PG_USED_FOR_ASSERTS_ONLY; + + BTREE_PAGE_GET_HIKEY(hikey, page); + + if (O_TUPLE_IS_NULL(target_hikey.tuple) || + o_btree_cmp(desc, &hikey, BTreeKeyNonLeafKey, &target_hikey, BTreeKeyNonLeafKey) != 0) + ereport(PANIC, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("error reading downlink %X/%X in relfile (%u, %u)", + (uint32) (downlink >> 32), (uint32) (downlink), + desc->oids.datoid, desc->oids.relnode), + errdetail("Hikeys don't match."))); + result = refind_page(context, &hikey, BTreeKeyPageHiKey, + PAGE_GET_LEVEL(page) + 1, parent_blkno, + parent_change_count); + Assert(result == OFindPageResultSuccess); + } + + /* restore context state */ + context->csn = csn; + if (!was_modify) + { + if (was_fetch) + BTREE_PAGE_FIND_SET(context, FETCH); + BTREE_PAGE_FIND_UNSET(context, MODIFY); + } + if (was_image) + BTREE_PAGE_FIND_SET(context, IMAGE); + if (was_keep_lokey) + BTREE_PAGE_FIND_SET(context, KEEP_LOKEY); + if (!was_downlink_location) + BTREE_PAGE_FIND_UNSET(context, DOWNLINK_LOCATION); + + context_index = context->index; + parent_blkno = context->items[context_index].blkno; + parent_loc = &context->items[context_index].locator; + parent_change_count = context->items[context_index].pageChangeCount; + + /* Replace parent downlink with orioledb downlink */ + page_block_reads(parent_blkno); + parent_page = O_GET_IN_MEMORY_PAGE(parent_blkno); + int_hdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(parent_page, parent_loc); + Assert(int_hdr->downlink == MAKE_IO_DOWNLINK(ionum)); + int_hdr->downlink = MAKE_IN_MEMORY_DOWNLINK(blkno, O_PAGE_HEADER(page)->pageChangeCount); + + unlock_io(ionum); +} + +/* + * Returns pointer to writable image. It compresses page if needed. 
 */
static inline Pointer
get_write_img(BTreeDescr *desc, Page page, size_t *size)
{
	/*
	 * *size receives the number of bytes to write: the compressed size, or
	 * ORIOLEDB_BLCKSZ when the page itself is returned.
	 */
	Pointer		result;

	if (OCompressIsValid(desc->compress))
	{
		result = o_compress_page(page, size, desc->compress);
		if (*size > (ORIOLEDB_BLCKSZ - ORIOLEDB_COMP_BLCKSZ - O_PAGE_HEADER_SIZE))
		{
			/*
			 * No sense to write compressed page
			 */
			result = page;
			*size = ORIOLEDB_BLCKSZ;
		}
	}
	else
	{
		result = page;
		*size = ORIOLEDB_BLCKSZ;
	}
	return result;
}

#ifdef USE_ASSERT_CHECKING
/*
 * Assert-only sanity check: every downlink of a non-leaf image about to be
 * written must already be an on-disk downlink.
 */
static void
prewrite_image_check(Page p)
{
	if (!O_PAGE_IS(p, LEAF))
	{
		BTreePageItemLocator loc;

		BTREE_PAGE_FOREACH_ITEMS(p, &loc)
		{
			BTreeNonLeafTuphdr *tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc);

			Assert(DOWNLINK_IS_ON_DISK(tuphdr->downlink));
		}
	}
}
#endif

/*
 * Returns downlink to the page or InvalidDiskDownlink if fails.
 *
 * Writes image img of in-memory page blkno for checkpoint checkpoint_number.
 * If the page was last written under an older checkpoint number it gets a new
 * file extent; otherwise it is rewritten in place, unless compression changed
 * the number of blocks it needs.  *dirty_parent is set to tell whether the
 * extent of the page changed.  copy_blkno selects
 * get_free_disk_extent_copy_blkno() instead of get_free_disk_extent() for
 * allocations outside S3 mode.
 *
 * Allocation and write failures are reported with PANIC.
 */
uint64
perform_page_io(BTreeDescr *desc, OInMemoryBlkno blkno,
				Page img, uint32 checkpoint_number, bool copy_blkno,
				bool *dirty_parent)
{
	Page		page = O_GET_IN_MEMORY_PAGE(blkno);
	BTreePageHeader *header = (BTreePageHeader *) page;
	OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno);
	Pointer		write_img;
	size_t		write_size;
	int			chkp_index;
	bool		less_num,
				err = false;

#ifdef USE_ASSERT_CHECKING
	prewrite_image_check(img);
#endif

	EA_WRITE_INC(blkno);

	less_num = header->o_header.checkpointNum < checkpoint_number;
	if (less_num)
	{
		/*
		 * Page wasn't yet written during given checkpoint, so we have to
		 * relocate it in order to implement copy-on-write checkpointing.
		 */
		if ((uintptr_t) page != (uintptr_t) img)
		{
			/*
			 * we need to update the written checkpoint number for the img too
			 */
			header = (BTreePageHeader *) img;
			header->o_header.checkpointNum = checkpoint_number;
			header = (BTreePageHeader *) page;
		}
		header->o_header.checkpointNum = checkpoint_number;
	}
	else
	{
		Assert(header->o_header.checkpointNum == checkpoint_number);
	}

	write_img = get_write_img(desc, img, &write_size);

	/*
	 * Determine the file position to write this page.
	 */
	chkp_index = checkpoint_number % 2;
	if (orioledb_s3_mode)
	{
		/* S3 mode: extents are only allocated or shrunk here, never freed */
		if (less_num)
		{
			err = !get_free_disk_extent(desc, checkpoint_number, write_size, &page_desc->fileExtent);
			*dirty_parent = true;
		}
		else
		{
			if (!OCompressIsValid(desc->compress))
			{
				/* easy case: no compression */
				*dirty_parent = false;
			}
			else
			{
				uint16		old_len = page_desc->fileExtent.len,
							new_len = FileExtentLen(write_size);

				if (old_len < new_len)
				{
					err = !get_free_disk_extent(desc, checkpoint_number, write_size, &page_desc->fileExtent);
					*dirty_parent = true;
				}
				else if (old_len > new_len)
				{
					page_desc->fileExtent.len = new_len;
					*dirty_parent = true;
				}
				else
				{
					*dirty_parent = false;
				}
			}
		}
	}
	else if (less_num)
	{
		/*
		 * Page wasn't yet written during given checkpoint, so we have to
		 * relocate it in order to implement copy-on-write checkpointing.
		 */

		if (FileExtentIsValid(page_desc->fileExtent))
		{
#ifdef USE_ASSERT_CHECKING

			/*
			 * Shared seq_bufs should be initialized by checkpointer.  User
			 * temporary trees keep their own backend-local free space map and
			 * do not use these shared buffers at all; system trees that
			 * happen to be BTreeStorageTemporary still share a pool and only
			 * skip the nextChkp assertion (no .map file).
			 */
			if (!btree_desc_is_local_temp(desc))
			{
				if (desc->storageType != BTreeStorageTemporary)
				{
					SpinLockAcquire(&desc->nextChkp[chkp_index].shared->lock);
					Assert(desc->nextChkp[chkp_index].shared->tag.num == checkpoint_number);
					SpinLockRelease(&desc->nextChkp[chkp_index].shared->lock);
				}
				SpinLockAcquire(&desc->tmpBuf[chkp_index].shared->lock);
				Assert(desc->tmpBuf[chkp_index].shared->tag.num == checkpoint_number);
				SpinLockRelease(&desc->tmpBuf[chkp_index].shared->lock);
			}
#endif
			/* Hand the previous location of the page back as free space */
			free_extent_for_checkpoint(desc, &page_desc->fileExtent, checkpoint_number);
		}

		/* Get free disk page to locate new page image */
		if (copy_blkno)
		{
			err = !get_free_disk_extent_copy_blkno(desc, write_size,
												   &page_desc->fileExtent,
												   checkpoint_number);
		}
		else
		{
			err = !get_free_disk_extent(desc, checkpoint_number, write_size, &page_desc->fileExtent);
		}

		*dirty_parent = true;
	}
	else
	{
		/*
		 * Has been already written during given checkpoint, so rewrite page
		 * in-place.
		 */
		Assert(FileExtentIsValid(page_desc->fileExtent));
		if (!OCompressIsValid(desc->compress))
		{
			/* easy case: no compression */
			*dirty_parent = false;
		}
		else
		{
			uint16		old_len = page_desc->fileExtent.len,
						new_len = FileExtentLen(write_size);

			/*
			 * check: does the current image take as much space as the
			 * previously written page?
			 */
			if (old_len < new_len)
			{
				free_extent_for_checkpoint(desc, &page_desc->fileExtent, checkpoint_number);
				/* allocate more file blocks */
				if (copy_blkno)
				{
					err = !get_free_disk_extent_copy_blkno(desc, write_size,
														   &page_desc->fileExtent,
														   checkpoint_number);
				}
				else
				{
					err = !get_free_disk_extent(desc, checkpoint_number,
												write_size, &page_desc->fileExtent);
				}
			}
			else if (old_len > new_len)
			{
				/*
				 * free space: record the no longer needed tail of the extent
				 * in both the nextChkp and the tmpBuf buffers
				 */
				FileExtent	free_extent;

				free_extent.len = page_desc->fileExtent.len - new_len;
				free_extent.off = page_desc->fileExtent.off + new_len;

				if (!seq_buf_write_file_extent(&desc->nextChkp[chkp_index], free_extent) ||
					!seq_buf_write_file_extent(&desc->tmpBuf[chkp_index], free_extent))
				{
					err = true;
				}
				page_desc->fileExtent.len = new_len;
			}

			*dirty_parent = old_len != new_len;
		}
	}

	if (err)
	{
		ereport(PANIC, (errcode_for_file_access(),
						errmsg("could not (re) allocate file blocks for page %d to file %s: %m",
							   blkno, btree_smgr_filename(desc, 0, checkpoint_number))));
	}

	Assert(FileExtentIsValid(page_desc->fileExtent));

	if (!write_page_to_disk(desc, &page_desc->fileExtent, checkpoint_number, write_img, write_size))
	{
		ereport(PANIC, (errcode_for_file_access(),
						errmsg("could not write page %d to file %s with offset %lu: %m",
							   blkno,
							   btree_smgr_filename(desc, page_desc->fileExtent.off, checkpoint_number),
							   (unsigned long) page_desc->fileExtent.off)));

		return InvalidDiskDownlink;
	}

	Assert(FileExtentIsValid(page_desc->fileExtent));
	return MAKE_ON_DISK_DOWNLINK(page_desc->fileExtent);
}

/*
 * Performs page write for autonomous checkpoint images.
 *
 * Returns downlink to the page.
+ */ +uint64 +perform_page_io_autonomous(BTreeDescr *desc, uint32 chkpNum, Page img, FileExtent *extent) +{ + Pointer write_img; + size_t write_size; + +#ifdef USE_ASSERT_CHECKING + prewrite_image_check(img); +#endif + + write_img = get_write_img(desc, img, &write_size); + + if (!get_free_disk_extent(desc, chkpNum, write_size, extent)) + { + ereport(PANIC, (errcode_for_file_access(), + errmsg("could not get free file offset for write page to file %s: %m", + btree_smgr_filename(desc, 0, 0)))); + + return InvalidDiskDownlink; + } + + Assert(FileExtentIsValid(*extent)); + + if (!write_page_to_disk(desc, extent, chkpNum, write_img, write_size)) + { + uint64 offset; + + if (orioledb_s3_mode) + { + offset = extent->off & S3_OFFSET_MASK; + chkpNum = S3_GET_CHKP_NUM(extent->off); + } + else + { + offset = extent->off; + chkpNum = 0; + } + + ereport(PANIC, (errcode_for_file_access(), + errmsg("could not write autonomous page to file %s with offset %lu: %m", + btree_smgr_filename(desc, offset, chkpNum), + (unsigned long) offset))); + + return InvalidDiskDownlink; + } + + Assert(FileExtentIsValid(*extent)); + return MAKE_ON_DISK_DOWNLINK(*extent); +} + +/* + * Performs page write for tree build. + * + * Returns downlink to the page. 
+ */
+uint64
+perform_page_io_build(BTreeDescr *desc, Page img,
+					  FileExtent *extent, BTreeMetaPage *metaPage)
+{
+	Pointer		write_img;
+	size_t		write_size;
+	uint32		chkpNum;
+
+	btree_page_update_max_key_len(desc, img);
+
+#ifdef USE_ASSERT_CHECKING
+	prewrite_image_check(img);
+#endif
+
+	/* Buffer that is actually written, and its size in bytes. */
+	write_img = get_write_img(desc, img, &write_size);
+
+	/* Only S3 mode tracks a checkpoint number for build writes. */
+	if (orioledb_s3_mode)
+		chkpNum = checkpoint_state->lastCheckpointNumber;
+	else
+		chkpNum = 0;
+
+	/*
+	 * Allocate the extent: either from the block device or by advancing the
+	 * data file length kept in the meta page.
+	 */
+	if (!OCompressIsValid(desc->compress))
+	{
+		Assert(write_size == ORIOLEDB_BLCKSZ);
+
+		extent->len = 1;
+		if (use_device)
+			extent->off = orioledb_device_alloc(desc, ORIOLEDB_BLCKSZ) / ORIOLEDB_COMP_BLCKSZ;
+		else
+			extent->off = pg_atomic_fetch_add_u64(&metaPage->datafileLength[chkpNum % 2], 1);
+	}
+	else
+	{
+		extent->len = FileExtentLen(write_size);
+		if (use_device)
+			extent->off = orioledb_device_alloc(desc, ORIOLEDB_BLCKSZ) / ORIOLEDB_COMP_BLCKSZ;
+		else
+			extent->off = pg_atomic_fetch_add_u64(&metaPage->datafileLength[chkpNum % 2], extent->len);
+	}
+
+	if (orioledb_s3_mode)
+	{
+		/* Number of blocks that make up one S3 part. */
+		int			threshold = ORIOLEDB_S3_PART_SIZE / (OCompressIsValid(desc->compress) ? ORIOLEDB_COMP_BLCKSZ : ORIOLEDB_BLCKSZ);
+
+		/*
+		 * When the rounded-up part index changes across this extent, mark
+		 * the part holding the extent's last block as loading and then
+		 * loaded, and bump the loaded-parts counter.
+		 */
+		if ((extent->off + threshold - 1) / threshold !=
+			(extent->off + threshold - 1 + extent->len) / threshold)
+		{
+			S3HeaderTag tag;
+			uint64		offset = (extent->off + extent->len - 1) * (OCompressIsValid(desc->compress) ? ORIOLEDB_COMP_BLCKSZ : ORIOLEDB_BLCKSZ);
+			int			index;
+
+			Assert((extent->off + threshold - 1) / threshold + 1 ==
+				   (extent->off + threshold - 1 + extent->len) / threshold);
+
+			tag.key.oids = desc->oids;
+			tag.key.tablespace = desc->tablespace;
+			tag.checkpointNum = chkpNum;
+			tag.segNum = offset / ORIOLEDB_SEGMENT_SIZE;
+			index = (offset % ORIOLEDB_SEGMENT_SIZE) / ORIOLEDB_S3_PART_SIZE;
+			s3_header_mark_part_loading(tag, index);
+			s3_header_mark_part_loaded(tag, index);
+			s3_headers_increase_loaded_parts(1);
+		}
+
+		/* From here on extent->off also carries the checkpoint number. */
+		extent->off |= (uint64) chkpNum << S3_CHKP_NUM_SHIFT;
+	}
+
+	Assert(FileExtentIsValid(*extent));
+
+	if (!write_page_to_disk(desc, extent, 0, write_img, write_size))
+	{
+		uint64		offset = extent->off;
+
+		/*
+		 * In S3 mode the checkpoint number was OR-ed into extent->off above.
+		 * Strip it before naming the file and printing the offset, the same
+		 * way perform_page_io_autonomous() does; otherwise the diagnostics
+		 * are computed from a bogus offset.
+		 */
+		if (orioledb_s3_mode)
+			offset &= S3_OFFSET_MASK;
+
+		ereport(PANIC, (errcode_for_file_access(),
+						errmsg("could not write autonomous page to file %s with offset %lu: %m",
+							   btree_smgr_filename(desc, offset, chkpNum),
+							   (unsigned long) offset)));
+
+		return InvalidDiskDownlink;
+	}
+
+	Assert(FileExtentIsValid(*extent));
+	return MAKE_ON_DISK_DOWNLINK(*extent);
+}
+
+/*
+ * Prepare internal page for writing to disk.
+ */
+static bool
+prepare_non_leaf_page(Page p)
+{
+	BTreePageItemLocator loc;
+
+	BTREE_PAGE_FOREACH_ITEMS(p, &loc)
+	{
+		BTreeNonLeafTuphdr *tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc);
+
+		if (DOWNLINK_IS_IN_IO(tuphdr->downlink))
+			return false;
+
+		if (DOWNLINK_IS_IN_MEMORY(tuphdr->downlink))
+		{
+			OInMemoryBlkno child = DOWNLINK_GET_IN_MEMORY_BLKNO(tuphdr->downlink);
+			OrioleDBPageDesc *desc = O_GET_IN_MEMORY_PAGEDESC(child);
+
+			if (!try_lock_page(child))
+				return false;
+
+			/*
+			 * It's worth less to write non-leaf page, if it's going to anyway
+			 * become dirty after writing of child.
+			 */
+			if (IS_DIRTY(child) || desc->ionum >= 0)
+			{
+				unlock_page(child);
+				return false;
+			}
+
+			/* XXX: should we also consider checkpoint number of child page?
+			 */
+			Assert(FileExtentIsValid(desc->fileExtent));
+			tuphdr->downlink = MAKE_ON_DISK_DOWNLINK(desc->fileExtent);
+			unlock_page(child);
+		}
+	}
+
+	PAGE_SET_N_ONDISK(p, BTREE_PAGE_ITEMS_COUNT(p));
+	return true;
+}
+
+/*
+ * Evict the page, assuming target page and its parent are locked.
+ *
+ * The same routine also serves the plain write-out case (evict == false), in
+ * which the page stays in memory and only its image is written.
+ *
+ * context - find context.  For a non-root page,
+ *			 context->items[context->index] supplies the parent block number,
+ *			 the downlink locator and the parent's change count.  For the
+ *			 root page only context->desc is read.
+ * blkno - in-memory block number of the page being written.
+ * img - buffer the page is written from when !evict.  Leaf pages are copied
+ *		 into it here; non-leaf pages must already have been copied by the
+ *		 caller.  Not used when evict is true (the page itself is written).
+ * checkpoint_number, copy_blkno - passed through to perform_page_io().
+ * evict - when true the parent downlink is switched to the on-disk location
+ *		   and the page is released with ppool_free_page().  Asserted to be
+ *		   false for the root page.
+ *
+ * Raises ERROR when perform_page_io() returns an invalid downlink.  On every
+ * normal return perform_writeback(&io_writeback) has been called.
+ */
+static void
+write_page(OBTreeFindPageContext *context, OInMemoryBlkno blkno, Page img,
+		   uint32 checkpoint_number,
+		   bool evict, bool copy_blkno)
+{
+	BTreeDescr *desc = context->desc;
+	OInMemoryBlkno parent_blkno = OInvalidInMemoryBlkno;
+	Page		parent_page = NULL;
+	Page		p = O_GET_IN_MEMORY_PAGE(blkno);
+	BTreePageItemLocator *parent_loc;
+	int			ionum = -1,
+				context_index;
+	BTreeNonLeafTuphdr *int_hdr = NULL;	/* parent's downlink tuple; stays NULL for the root */
+	uint32		parent_change_count = 0;
+	OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno);
+	bool		is_root = desc->rootInfo.rootPageBlkno == blkno;
+
+	/* rootPageBlkno can not be evicted here */
+	Assert(!evict || !is_root);
+	Assert(OInMemoryBlknoIsValid(desc->rootInfo.rootPageBlkno));
+	Assert(page_is_locked(blkno) || O_PAGE_IS_LOCAL(blkno));
+	EA_EVICT_INC(blkno);
+
+	if (!is_root)
+	{
+		context_index = context->index;
+		parent_blkno = context->items[context_index].blkno;
+		parent_loc = &context->items[context_index].locator;
+		parent_change_count = context->items[context_index].pageChangeCount;
+
+		parent_page = O_GET_IN_MEMORY_PAGE(parent_blkno);
+
+		ionum = assign_io_num(parent_blkno, BTREE_PAGE_LOCATOR_GET_OFFSET(parent_page, parent_loc));
+
+		/* Prepare to modify downlink in parent page */
+		page_block_reads(parent_blkno);
+		int_hdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(parent_page, parent_loc);
+	}
+	else
+	{
+		/*
+		 * Root page still need ionum to prevent changing of checkpoint
+		 * number.
+		 */
+		ionum = assign_io_num(blkno, MaxOffsetNumber);
+	}
+
+	if (!IS_DIRTY(blkno))
+	{
+		Assert(evict);
+
+		/*
+		 * Easy case: page isn't dirty and doesn't need to be written to the
+		 * disk. Then we just have to change downlink in the parent.
+		 *
+		 * Since evict is asserted above and evict excludes the root, int_hdr
+		 * and parent_page were set in the !is_root branch.
+		 */
+		Assert(FileExtentIsValid(page_desc->fileExtent));
+		int_hdr->downlink = MAKE_ON_DISK_DOWNLINK(page_desc->fileExtent);
+		PAGE_INC_N_ONDISK(parent_page);
+
+		/* Concurrent readers should give up when we release the lock... */
+		O_PAGE_CHANGE_COUNT_INC(p);
+		unlock_page(blkno);
+		unlock_io(ionum);
+	}
+	else
+	{
+		uint64		new_downlink,
+					old_downlink = 0;	/* saved so a failed evict can be rolled back */
+		bool		dirty_parent;	/* output of perform_page_io() */
+
+		/* Mark parent downlink as IO in-progress. */
+		if (evict)
+		{
+			old_downlink = int_hdr->downlink;
+			int_hdr->downlink = MAKE_IO_DOWNLINK(ionum);
+			O_PAGE_CHANGE_COUNT_INC(p);
+		}
+		/* Caller (walk_page()) ensured that there is no IO in progress */
+		Assert(page_desc->ionum < 0);
+		page_desc->ionum = ionum;
+		if (!is_root)
+			unlock_page(parent_blkno);	/* parent is re-found after the IO */
+
+		/* Perform actual IO */
+		if (evict)
+		{
+			unlock_page(blkno);
+			new_downlink = perform_page_io(desc, blkno, p,
+										   checkpoint_number, copy_blkno, &dirty_parent);
+
+			if (DiskDownlinkIsValid(new_downlink))
+				writeback_put_extent(&io_writeback, desc, new_downlink);
+
+			/* Page is not dirty anymore */
+			CLEAN_DIRTY(desc->ppool, blkno);
+		}
+		else
+		{
+			/* Non-leaf pages are already copied by caller */
+			if (O_PAGE_IS(p, LEAF))
+				memcpy(img, p, ORIOLEDB_BLCKSZ);
+
+			CLEAN_DIRTY_CONCURRENT(blkno);
+			unlock_page(blkno);
+
+			if (STOPEVENTS_ENABLED())
+			{
+				Jsonb	   *params;
+
+				params = btree_page_stopevent_params(desc, p);
+				STOPEVENT(STOPEVENT_AFTER_IONUM_SET, params);
+			}
+			new_downlink = perform_page_io(desc, blkno, img,
+										   checkpoint_number, copy_blkno, &dirty_parent);
+
+			if (DiskDownlinkIsValid(new_downlink))
+				writeback_put_extent(&io_writeback, desc, new_downlink);
+
+			/*
+			 * Clean dirty only if there are no concurrent writes.  The page
+			 * was unlocked during the IO, so it is re-locked for the check.
+			 */
+			lock_page(blkno);
+			if (!IS_DIRTY_CONCURRENT(blkno))
+				CLEAN_DIRTY(desc->ppool, blkno);
+			unlock_page(blkno);
+
+			if (!DiskDownlinkIsValid(new_downlink))
+			{
+				page_desc->ionum = -1;
+				unlock_io(ionum);
+				ereport(ERROR, (errcode_for_file_access(),
+								errmsg("could not evict page %d to disk: %m", blkno)));
+			}
+			else if (!dirty_parent)
+			{
+				page_desc->ionum = -1;
+				unlock_io(ionum);
+				perform_writeback(&io_writeback);
+				return;			/* write-only, parent needs no update */
+			}
+		}
+
+		if (!is_root)
+		{
+			OFindPageResult result PG_USED_FOR_ASSERTS_ONLY;
+
+			/*
+			 * Refind parent.  It was unlocked before the IO, so the saved
+			 * block number and change count are handed to refind_page().
+			 * NOTE(review): the code below unlocks parent_blkno, so
+			 * refind_page() presumably returns with the parent locked --
+			 * confirm.
+			 */
+			BTREE_PAGE_FIND_SET(context, DOWNLINK_LOCATION);
+			if (O_PAGE_IS(p, RIGHTMOST))
+			{
+				result = refind_page(context, NULL, BTreeKeyRightmost,
+									 PAGE_GET_LEVEL(p) + 1,
+									 parent_blkno, parent_change_count);
+			}
+			else
+			{
+				OTuple		hikey;
+
+				BTREE_PAGE_GET_HIKEY(hikey, p);
+				result = refind_page(context, &hikey, BTreeKeyPageHiKey,
+									 PAGE_GET_LEVEL(p) + 1,
+									 parent_blkno, parent_change_count);
+			}
+			Assert(result == OFindPageResultSuccess);
+
+			BTREE_PAGE_FIND_UNSET(context, DOWNLINK_LOCATION);
+
+			context_index = context->index;
+			parent_blkno = context->items[context_index].blkno;
+			parent_loc = &context->items[context_index].locator;
+			parent_change_count = context->items[context_index].pageChangeCount;
+
+			/* Replace parent downlink with on-disk link */
+			parent_page = O_GET_IN_MEMORY_PAGE(parent_blkno);
+			page_block_reads(parent_blkno);
+			int_hdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(parent_page, parent_loc);
+
+			if (!DiskDownlinkIsValid(new_downlink))
+			{
+				/* error happens on write, rollback changes in shared memory */
+				if (evict)
+					int_hdr->downlink = old_downlink;
+				page_desc->ionum = -1;
+				unlock_io(ionum);
+				unlock_page(parent_blkno);
+				ereport(ERROR, (errcode_for_file_access(),
+								errmsg("could not evict page %d to disk: %m", blkno)));
+			}
+			else
+			{
+				if (dirty_parent)
+					MARK_DIRTY(desc, parent_blkno);
+
+				if (evict)
+				{
+					int_hdr->downlink = new_downlink;
+					PAGE_INC_N_ONDISK(parent_page);
+				}
+			}
+		}
+		page_desc->ionum = -1;
+		unlock_io(ionum);
+	}
+
+	if (!is_root)
+		unlock_page(parent_blkno);
+
+	if (evict)
+		ppool_free_page(desc->ppool, blkno, false);
+
+	perform_writeback(&io_writeback);
+}
+
+static void
+btree_finalize_private_seq_bufs(BTreeDescr *desc, EvictedTreeData *evicted_data)
+{
+	int			chkp_index;
+	bool
is_compressed = OCompressIsValid(desc->compress); + + Assert(desc->storageType == BTreeStorageTemporary || + desc->storageType == BTreeStoragePersistence || + desc->storageType == BTreeStorageUnlogged); + + /* we must not evict BTree under checkpoint */ + + if (desc->storageType == BTreeStoragePersistence || desc->storageType == BTreeStorageUnlogged) + { + chkp_index = SEQ_BUF_SHARED_EXIST(desc->nextChkp[0].shared) ? 0 : 1; + + Assert(!SEQ_BUF_SHARED_EXIST(desc->nextChkp[1 - chkp_index].shared)); + Assert(!SEQ_BUF_SHARED_EXIST(desc->tmpBuf[1 - chkp_index].shared)); + Assert(is_compressed || SEQ_BUF_SHARED_EXIST(desc->freeBuf.shared)); + Assert(SEQ_BUF_SHARED_EXIST(desc->nextChkp[chkp_index].shared)); + Assert(SEQ_BUF_SHARED_EXIST(desc->tmpBuf[chkp_index].shared)); + } + else + { + chkp_index = SEQ_BUF_SHARED_EXIST(desc->tmpBuf[0].shared) ? 0 : 1; + + Assert(!SEQ_BUF_SHARED_EXIST(desc->tmpBuf[1 - chkp_index].shared)); + Assert(is_compressed || SEQ_BUF_SHARED_EXIST(desc->freeBuf.shared)); + Assert(SEQ_BUF_SHARED_EXIST(desc->tmpBuf[chkp_index].shared)); + } + + if (is_compressed) + { + evicted_data->freeBuf.tag = desc->freeBuf.tag; + evicted_data->freeBuf.offset = 0; + } + else + { + evicted_data->freeBuf.tag = desc->freeBuf.shared->tag; + evicted_data->freeBuf.offset = seq_buf_finalize(&desc->freeBuf); + FREE_PAGE_IF_VALID(desc->ppool, desc->freeBuf.shared->pages[0]); + FREE_PAGE_IF_VALID(desc->ppool, desc->freeBuf.shared->pages[1]); + } + + /* + * We must always finalize seq bufs (not just close them) to save the + * correct offset into evicted data. On restore, init_seq_buf() uses a + * non-NULL evicted pointer to skip the skip_len reservation (e.g. + * CheckpointFileHeader). If the offset is left at 0, the header space + * won't be reserved, and seq_buf_finalize() at checkpoint time will + * return a size smaller than sizeof(CheckpointFileHeader). 
+ */ + if (desc->storageType == BTreeStoragePersistence || desc->storageType == BTreeStorageUnlogged) + { + evicted_data->nextChkp.tag = desc->nextChkp[chkp_index].shared->tag; + evicted_data->nextChkp.offset = seq_buf_finalize(&desc->nextChkp[chkp_index]); + FREE_PAGE_IF_VALID(desc->ppool, desc->nextChkp[chkp_index].shared->pages[0]); + FREE_PAGE_IF_VALID(desc->ppool, desc->nextChkp[chkp_index].shared->pages[1]); + + evicted_data->tmpBuf.tag = desc->tmpBuf[chkp_index].shared->tag; + evicted_data->tmpBuf.offset = seq_buf_finalize(&desc->tmpBuf[chkp_index]); + FREE_PAGE_IF_VALID(desc->ppool, desc->tmpBuf[chkp_index].shared->pages[0]); + FREE_PAGE_IF_VALID(desc->ppool, desc->tmpBuf[chkp_index].shared->pages[1]); + } + else + { + evicted_data->tmpBuf.tag = desc->tmpBuf[chkp_index].shared->tag; + evicted_data->tmpBuf.offset = seq_buf_finalize(&desc->tmpBuf[chkp_index]); + FREE_PAGE_IF_VALID(desc->ppool, desc->tmpBuf[chkp_index].shared->pages[0]); + FREE_PAGE_IF_VALID(desc->ppool, desc->tmpBuf[chkp_index].shared->pages[1]); + } +} + +/* + * Evict the tree, assuming rootPageBlkno page is locked. 
+ */
+static bool
+evict_btree(BTreeDescr *desc, uint32 checkpoint_number)
+{
+	OInMemoryBlkno root_blkno = desc->rootInfo.rootPageBlkno;
+	Page		rootPageBlkno = O_GET_IN_MEMORY_PAGE(root_blkno);
+	OrioleDBPageDesc *root_desc = O_GET_IN_MEMORY_PAGEDESC(root_blkno);
+	BTreeMetaPage *metaPage = BTREE_GET_META(desc);
+	CheckpointFileHeader file_header = {0};
+	EvictedTreeData evicted_tree_data = {{0}};
+	uint64		new_downlink;
+	char		img[ORIOLEDB_BLCKSZ];
+	bool		was_dirty;
+	uint32		chkpNum = 0;
+	bool		notModified;
+	bool		hasMetaLock = LWLockHeldByMe(&checkpoint_state->oTablesMetaLock);
+	SharedRootInfoKey evict_key;
+	int			evict_lockNo;
+
+	Assert(ORootPageIsValid(desc) && OMetaPageIsValid(desc) &&
+		   (O_PAGE_STATE_IS_LOCKED(pg_atomic_read_u64(&(O_PAGE_HEADER(rootPageBlkno)->state))) || O_PAGE_IS_LOCAL(root_blkno)));
+
+	/*
+	 * Try to acquire oSharedRootInfoInsertLocks early to avoid deadlocks. If
+	 * we can't get it, bail out — the page will be evicted later.
+	 */
+	evict_key.datoid = desc->oids.datoid;
+	evict_key.relnode = desc->oids.relnode;
+	evict_lockNo = tag_hash(&evict_key, sizeof(evict_key)) % SHARED_ROOT_INFO_INSERT_NUM_LOCKS;
+	if (!LWLockConditionalAcquire(&checkpoint_state->oSharedRootInfoInsertLocks[evict_lockNo],
+								  LW_EXCLUSIVE))
+	{
+		unlock_page(root_blkno);
+		return false;
+	}
+
+	/*
+	 * Additional protection: don't evict the tree root page if the resource
+	 * owner hasn't released its seq scans yet. According to the locks they
+	 * must be already finished, but not yet released from shmem.
+	 */
+	if (meta_page_get_num_seq_scans(desc->rootInfo.metaPageBlkno) != 0)
+	{
+		LWLockRelease(&checkpoint_state->oSharedRootInfoInsertLocks[evict_lockNo]);
+		unlock_page(root_blkno);
+		return false;
+	}
+
+	/* we check it before */
+	Assert(!RightLinkIsValid(BTREE_PAGE_GET_RIGHTLINK(rootPageBlkno)));
+	if (orioledb_s3_mode)
+	{
+		btree_s3_flush(desc, checkpoint_number);
+	}
+
+	was_dirty = IS_DIRTY(root_blkno);
+
+	/*
+	 * Checking FileExtentIsValid() is essential for just created temporary
+	 * trees which aren't dirty, but don't have fileExtent initialized.
+	 */
+	if (was_dirty || !FileExtentIsValid(root_desc->fileExtent))
+	{
+		bool		not_used;
+
+		CLEAN_DIRTY(desc->ppool, root_blkno);
+
+		/* Code above ensured there is no IO in progress */
+		Assert(root_desc->ionum < 0);
+		root_desc->ionum = assign_io_num(root_blkno, InvalidOffsetNumber);
+
+		/* Write from a private copy so the page lock can be dropped first. */
+		memcpy(img, rootPageBlkno, ORIOLEDB_BLCKSZ);
+		unlock_page(root_blkno);
+
+		/* The dirty-parent output is meaningless for the root: ignore it. */
+		new_downlink = perform_page_io(desc, root_blkno, img, checkpoint_number,
+									   false, &not_used);
+		if (!DiskDownlinkIsValid(new_downlink))
+		{
+			elog(FATAL, "Can not evict rootPageBlkno page on disk.");
+		}
+
+		writeback_put_extent(&io_writeback, desc, new_downlink);
+		unlock_io(root_desc->ionum);
+		root_desc->ionum = -1;
+	}
+	else
+	{
+		/* Clean root that already has a disk location: nothing to write. */
+		Assert(FileExtentIsValid(root_desc->fileExtent));
+		new_downlink = MAKE_ON_DISK_DOWNLINK(root_desc->fileExtent);
+		unlock_page(root_blkno);
+	}
+
+	/*
+	 * From this point the root page lock has been released on every path.
+	 * Take oTablesMetaLock unless the caller already holds it; if it cannot
+	 * be taken without waiting, give up and report failure.
+	 */
+	if (!hasMetaLock)
+	{
+		if (!LWLockConditionalAcquire(&checkpoint_state->oTablesMetaLock,
+									  LW_SHARED))
+		{
+			LWLockRelease(&checkpoint_state->oSharedRootInfoInsertLocks[evict_lockNo]);
+			return false;
+		}
+	}
+
+	file_header.rootDownlink = new_downlink;
+
+	ppool_free_page(desc->ppool, root_blkno, false);
+
+	if (orioledb_s3_mode)
+		chkpNum = S3_GET_CHKP_NUM(DOWNLINK_GET_DISK_OFF(new_downlink));
+
+	/* Snapshot the meta page counters into the header kept for the tree. */
+	file_header.datafileLength = pg_atomic_read_u64(&metaPage->datafileLength[chkpNum % 2]);
+	file_header.leafPagesNum = pg_atomic_read_u32(&metaPage->leafPagesNum);
+	file_header.ctid = pg_atomic_read_u64(&metaPage->ctid);
+	file_header.bridgeCtid = pg_atomic_read_u64(&metaPage->bridge_ctid);
+	file_header.numFreeBlocks = pg_atomic_read_u64(&metaPage->numFreeBlocks);
+	Assert(meta_page_get_num_seq_scans(desc->rootInfo.metaPageBlkno) == 0);
+
+	evicted_tree_data.key.datoid = desc->oids.datoid;
+	evicted_tree_data.key.relnode = desc->oids.relnode;
+	evicted_tree_data.file_header = file_header;
+	evicted_tree_data.maxLocation[0] = metaPage->partsInfo[0].writeMaxLocation;
+	evicted_tree_data.maxLocation[1] = metaPage->partsInfo[1].writeMaxLocation;
+	evicted_tree_data.dirtyFlag1 = metaPage->dirtyFlag1;
+	evicted_tree_data.dirtyFlag2 = metaPage->dirtyFlag2;
+	evicted_tree_data.punchHolesChkpNum = metaPage->punchHolesChkpNum;
+
+	notModified = (!metaPage->dirtyFlag1 && !metaPage->dirtyFlag2);
+
+	/*
+	 * Free all private seq buf pages and get their offsets
+	 */
+	if (!orioledb_s3_mode || desc->storageType == BTreeStorageTemporary)
+		btree_finalize_private_seq_bufs(desc, &evicted_tree_data);
+
+	/* The meta page goes last; metaPage must not be used after this. */
+	ppool_free_page(desc->ppool, desc->rootInfo.metaPageBlkno, false);
+
+	desc->rootInfo.rootPageBlkno = OInvalidInMemoryBlkno;
+	desc->rootInfo.metaPageBlkno = OInvalidInMemoryBlkno;
+
+	perform_writeback(&io_writeback);
+
+	/*
+	 * Check if we can skip the evicted data if tree has no modification after
+	 * writing the last *.map file.
+	 *
+	 * For compressed trees we must always store evicted data. Otherwise, on
+	 * reload was_evicted will be false and o_tree_init_free_extents() will
+	 * try to re-insert free extents that are already present in the in-memory
+	 * system trees (they are not cleaned up on eviction), causing assertion
+	 * failures in free_extent().
+ */ + if (desc->storageType != BTreeStoragePersistence || !notModified || + OCompressIsValid(desc->compress)) + insert_evicted_data(&evicted_tree_data); + + elog(DEBUG1, "evict_btree: (%u, %u) chkpNum=%u notModified=%d", + desc->oids.datoid, desc->oids.relnode, + chkpNum, notModified); + + /* + * Shared descr drops to signalize other backends that tree is evicted. + * Backends and workers can create a new SharedRootInfo* after this. + */ + o_drop_shared_root_info(desc->oids.datoid, desc->oids.relnode); + + LWLockRelease(&checkpoint_state->oSharedRootInfoInsertLocks[evict_lockNo]); + + if (!hasMetaLock) + LWLockRelease(&checkpoint_state->oTablesMetaLock); + + return true; +} + +BTreeDescr * +index_oids_get_btree_descr(ORelOids oids, OIndexType type) +{ + OIndexDescr *indexDescr = NULL; + BTreeDescr *desc; + bool nested; + + /* Check is this table is visible for us */ + indexDescr = o_fetch_index_descr(oids, type, false, &nested); + + if (indexDescr == NULL) + return NULL; + + desc = &indexDescr->desc; + + if (!o_btree_try_use_shmem(desc)) + return NULL; + + return desc; +} + +typedef struct +{ + bool indexRegularLock; + bool indexCheckpointerLock; + bool tableRegularLock; + bool tableCheckpointerLock; + ORelOids tableOids; +} EvictBtreeLocksState; + +/* + * Acquire all the locks required to completely evict the tree. We need to + * take both regular and checkpointer locks. Also, for PK we need to lock + * the table as well, because a concurrent seq scan can lock only the table. 
+ */
+static BTreeDescr *
+get_evict_btree_locks(OInMemoryBlkno blkno, ORelOids oids, OIndexType type,
+					  EvictBtreeLocksState *state)
+{
+	bool		inRecovery = is_recovery_in_progress();
+	bool		nested = false;
+	BTreeDescr *desc;
+	OIndexDescr *indexDescr;
+
+	/* Regular lock on the index; it is not attempted during recovery. */
+	if (!inRecovery)
+	{
+		state->indexRegularLock = o_tables_rel_try_lock_extended(&oids, AccessExclusiveLock, &nested, false);
+		if (!state->indexRegularLock)
+			return NULL;
+	}
+	if (nested)
+		return NULL;
+
+	/* Checkpointer lock on the index. */
+	state->indexCheckpointerLock = o_tables_rel_try_lock_extended(&oids, AccessExclusiveLock, &nested, true);
+	if (!state->indexCheckpointerLock || nested)
+		return NULL;
+
+	/* The tree must still be available and rooted at the page we came for. */
+	desc = index_oids_get_btree_descr(oids, type);
+	if (desc == NULL || desc->rootInfo.rootPageBlkno != blkno)
+		return NULL;
+
+	/* Only a primary index additionally needs the table locked. */
+	if (desc->type != oIndexPrimary)
+		return desc;
+
+	indexDescr = (OIndexDescr *) desc->arg;
+	state->tableOids = indexDescr->tableOids;
+
+	/*
+	 * if primary index is ctid, then we don't need to lock the table, because
+	 * ctid is the table itself
+	 */
+	if (indexDescr->primaryIsCtid)
+		return desc;
+
+	/* Same pair of locks, this time on the table. */
+	if (!inRecovery)
+	{
+		state->tableRegularLock = o_tables_rel_try_lock_extended(&state->tableOids, AccessExclusiveLock, &nested, false);
+		if (!state->tableRegularLock)
+			return NULL;
+	}
+	if (nested)
+		return NULL;
+
+	state->tableCheckpointerLock = o_tables_rel_try_lock_extended(&state->tableOids, AccessExclusiveLock, &nested, true);
+	if (!state->tableCheckpointerLock || nested)
+		return NULL;
+
+	/* Look the tree up once more now that the table locks are held. */
+	desc = index_oids_get_btree_descr(oids, type);
+	if (desc == NULL || desc->rootInfo.rootPageBlkno != blkno)
+		return NULL;
+
+	return desc;
+}
+
+static void
+release_evict_btree_locks(ORelOids oids, EvictBtreeLocksState *state)
+{
+	if (state->indexRegularLock)
+		o_tables_rel_unlock_extended(&oids, AccessExclusiveLock, false);
+	if (state->indexCheckpointerLock)
+		o_tables_rel_unlock_extended(&oids, AccessExclusiveLock, true);
+	if (state->tableRegularLock)
+		o_tables_rel_unlock_extended(&state->tableOids, AccessExclusiveLock, false);
+	if (state->tableCheckpointerLock)
+		o_tables_rel_unlock_extended(&state->tableOids,
AccessExclusiveLock, true); +} + +/* + * Pre-lock checks for walk_page(). Validates page state and resolves the + * btree descriptor before the page lock is acquired. + * + * Returns the BTreeDescr pointer on success (caller should proceed to lock), + * or NULL if the page should be skipped. Sets *oids as a side effect. + */ +static BTreeDescr * +walk_page_prelock_check(OInMemoryBlkno blkno, bool evict, + OrioleDBPageDesc *page_desc, Page p, + ORelOids *oids) +{ + BTreeDescr *desc; + + if (!ORelOidsIsValid(page_desc->oids) || page_desc->type == oIndexInvalid) + return NULL; + + /* + * Read field2 directly rather than via PAGE_GET_N_ONDISK(): we don't hold + * the page lock here, so a concurrent leaf/non-leaf transition could fire + * the macro's debug assert even though the outer flag check just passed. + * The result of this comparison is racy by design and gets re-validated + * once the page is locked. + */ + if (!O_PAGE_IS(p, LEAF) && evict && + ((BTreePageHeader *) p)->field2 != BTREE_PAGE_ITEMS_COUNT(p)) + return NULL; + + if (!evict && !IS_DIRTY(blkno)) + return NULL; + + /* Important to access the shared memory once */ + *oids = *((volatile ORelOids *) &page_desc->oids); + + /* + * index_oids_get_btree_descr() might imply page eviction. We shouldn't + * do this while holding a page lock. So, we need to do this before + * locking the page. + */ + if (IS_SYS_TREE_OIDS(*oids)) + { + if (sys_tree_get_storage_type(oids->relnode) != BTreeStorageInMemory) + desc = get_sys_tree(oids->relnode); + else + return NULL; + } + else + { + /* Check is this index is visible for us */ + desc = index_oids_get_btree_descr(*oids, page_desc->type); + + if (desc == NULL) + return NULL; + } + + return desc; +} + +typedef enum WalkPageCheckResult +{ + WalkPageCheckPassed, + WalkPageCheckFailed, + WalkPageCheckWaitIO +} WalkPageCheckResult; + +/* + * Locked-page validity checks for walk_page(). Must be called with the page + * lock held. 
+ *
+ * Returns WalkPageCheckPassed if all checks pass (page remains locked).
+ * Returns WalkPageCheckFailed if a check fails (page is unlocked).
+ * Returns WalkPageCheckWaitIO if IO is in progress (page is unlocked,
+ * *ionum is set for the caller to wait on).
+ *
+ * When !evict, also prepares the non-leaf page image into img.
+ */
+static WalkPageCheckResult
+walk_page_check_locked(OInMemoryBlkno blkno, bool evict,
+					   OrioleDBPageDesc *page_desc, Page p,
+					   ORelOids oids, char *img, int *ionum)
+{
+	WalkPageCheckResult verdict = WalkPageCheckFailed;
+
+	/* The page must still belong to the tree resolved before locking. */
+	if (!ORelOidsIsValid(page_desc->oids) ||
+		page_desc->type == oIndexInvalid ||
+		!ORelOidsIsEqual(oids, page_desc->oids))
+		goto reject;
+
+	/* Write-only pass over a clean page: nothing to do. */
+	if (!evict && !IS_DIRTY(blkno))
+		goto reject;
+
+	if (O_PAGE_IS(p, PRE_CLEANUP))
+		goto reject;
+
+	/* On concurrent IO, unlock and let the caller decide to wait or skip */
+	*ionum = page_desc->ionum;
+	if (*ionum >= 0)
+	{
+		verdict = WalkPageCheckWaitIO;
+		goto reject;
+	}
+
+	if (!O_PAGE_IS(p, LEAF))
+	{
+		if (evict)
+		{
+			/* Evicting requires every item to be counted as on-disk. */
+			if (PAGE_GET_N_ONDISK(p) != BTREE_PAGE_ITEMS_COUNT(p))
+				goto reject;
+		}
+		else
+		{
+			/* Writing: work on a copy that prepare_non_leaf_page() may edit. */
+			memcpy(img, p, ORIOLEDB_BLCKSZ);
+			if (!prepare_non_leaf_page(img))
+				goto reject;
+		}
+	}
+
+	if (RightLinkIsValid(BTREE_PAGE_GET_RIGHTLINK(p)))
+		goto reject;
+
+	return WalkPageCheckPassed;
+
+reject:
+	unlock_page(blkno);
+	return verdict;
+}
+
+/*
+ * Handle root page eviction in walk_page(). Called with the page lock held.
+ * Manages all lock/unlock internally, including the two-pass protocol:
+ * release page lock, acquire evict btree locks, re-lock and re-validate.
+ * Guarantees release_evict_btree_locks() is called after get_evict_btree_locks().
+ */ +static OWalkPageResult +walk_page_evict_root(BTreeDescr *desc, OInMemoryBlkno blkno, + OrioleDBPageDesc *page_desc, Page p, + ORelOids oids) +{ + EvictBtreeLocksState locksState; + uint32 checkpoint_number; + bool copy_blkno; + bool result = false; + int ionum; + + if (tree_is_under_checkpoint(desc)) + { + unlock_page(blkno); + return OWalkPageSkipped; + } + + /* Release page lock before acquiring evict btree locks */ + unlock_page(blkno); + + memset(&locksState, 0, sizeof(locksState)); + + desc = get_evict_btree_locks(blkno, oids, page_desc->type, &locksState); + + if (!desc) + { + release_evict_btree_locks(oids, &locksState); + return OWalkPageSkipped; + } + + /* + * Re-lock the page and re-validate all checks after acquiring evict btree + * locks. + */ + if (!try_lock_page(blkno)) + { + release_evict_btree_locks(oids, &locksState); + return OWalkPageSkipped; + } + + if (walk_page_check_locked(blkno, true, page_desc, p, + oids, NULL, &ionum) != WalkPageCheckPassed) + { + release_evict_btree_locks(oids, &locksState); + return OWalkPageSkipped; + } + + if (tree_is_under_checkpoint(desc)) + { + unlock_page(blkno); + release_evict_btree_locks(oids, &locksState); + return OWalkPageSkipped; + } + + if (desc->rootInfo.rootPageBlkno != blkno) + { + unlock_page(blkno); + release_evict_btree_locks(oids, &locksState); + return OWalkPageSkipped; + } + + if (!get_checkpoint_number(desc, blkno, &checkpoint_number, ©_blkno)) + { + unlock_page(blkno); + release_evict_btree_locks(oids, &locksState); + return OWalkPageSkipped; + } + + result = evict_btree(desc, checkpoint_number); + o_invalidate_oids(oids); + + release_evict_btree_locks(oids, &locksState); + + return result ? OWalkPageEvicted : OWalkPageSkipped; +} + +/* + * Examine single page and evict it if possible. + * + * Note that here we skip seq buf pages, as we will evict them together with the + * tree in evict_btree() when we evict the root page. 
+ */
+OWalkPageResult
+walk_page(OInMemoryBlkno blkno, bool evict)
+{
+	OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno);
+	OBTreeFindPageContext context;
+	BTreeDescr *desc;
+	Page		p = O_GET_IN_MEMORY_PAGE(blkno),
+				parent_page;
+	ORelOids	oids;
+	BTreeNonLeafTuphdr *int_hdr;
+	uint32		checkpoint_number;
+	bool		copy_blkno,
+				merge_tried = false;
+	OFindPageResult findResult;
+	int			ionum;
+	char		img[ORIOLEDB_BLCKSZ];
+	bool		is_root;
+	WalkPageCheckResult checkResult;
+
+retry:
+
+	/* Cheap unlocked checks; this also resolves the tree descriptor. */
+	desc = walk_page_prelock_check(blkno, evict, page_desc, p, &oids);
+	if (!desc)
+		return OWalkPageSkipped;
+
+	if (!try_lock_page(blkno))
+		return OWalkPageSkipped;
+
+	/* Repeat the checks under the lock; on failure the page is unlocked. */
+	checkResult = walk_page_check_locked(blkno, evict, page_desc, p,
+										 oids, img, &ionum);
+	if (checkResult == WalkPageCheckWaitIO)
+	{
+		wait_for_io_completion(ionum);
+		goto retry;
+	}
+	if (checkResult == WalkPageCheckFailed)
+		return OWalkPageSkipped;
+
+	/* Try to merge sparse page instead of eviction */
+	if (!merge_tried && is_page_too_sparse(desc, p))
+	{
+		bool		result;
+
+		result = btree_try_merge_and_unlock(desc, blkno, true, false);
+
+		/* Merge shouldn't leave us with locked pages. */
+		Assert(!have_locked_pages());
+
+		if (result)
+		{
+			return OWalkPageMerged;
+		}
+		else
+		{
+			/* Attempt the merge only once, then start over. */
+			merge_tried = true;
+			goto retry;
+		}
+	}
+
+	Assert(desc != NULL);
+	Assert(ORootPageIsValid(desc) && OMetaPageIsValid(desc));
+	is_root = desc->rootInfo.rootPageBlkno == blkno;
+
+	/* If page is rootPageBlkno, we don't need to search parent page. */
+	context.desc = desc;
+	context.index = 0;
+	if (!is_root)
+	{
+		init_page_find_context(&context, desc, COMMITSEQNO_INPROGRESS, BTREE_PAGE_FIND_MODIFY
+							   | BTREE_PAGE_FIND_TRY_LOCK
+							   | BTREE_PAGE_FIND_DOWNLINK_LOCATION
+							   | BTREE_PAGE_FIND_NO_FIX_SPLIT);
+
+		/* Locate the parent one level up, by this page's hikey. */
+		if (O_PAGE_IS(p, RIGHTMOST))
+		{
+			findResult = find_page(&context, NULL, BTreeKeyRightmost, PAGE_GET_LEVEL(p) + 1);
+		}
+		else
+		{
+			OTuple		hikey;
+
+			BTREE_PAGE_GET_HIKEY(hikey, p);
+			findResult = find_page(&context, &hikey, BTreeKeyPageHiKey, PAGE_GET_LEVEL(p) + 1);
+		}
+
+		if (findResult != OFindPageResultSuccess)
+		{
+			Assert(findResult == OFindPageResultFailure);
+			unlock_page(blkno);
+			Assert(!have_locked_pages());
+			return OWalkPageSkipped;
+		}
+
+		BTREE_PAGE_FIND_UNSET(&context, TRY_LOCK);
+		parent_page = O_GET_IN_MEMORY_PAGE(context.items[context.index].blkno);
+
+		int_hdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(parent_page, &context.items[context.index].locator);
+
+		if (!DOWNLINK_IS_IN_MEMORY(int_hdr->downlink) ||
+			DOWNLINK_GET_IN_MEMORY_BLKNO(int_hdr->downlink) != blkno)
+		{
+			/*
+			 * We didn't find downlink pointing to this page. This could
+			 * happen because of concurrent split. Give up then...
+			 */
+			unlock_page(blkno);
+			unlock_page(context.items[context.index].blkno);
+			return OWalkPageSkipped;
+		}
+	}
+	else if (IS_SYS_TREE_OIDS(oids))
+	{
+		/* The root page of a system tree is never written or evicted here. */
+		Assert(is_root);
+		unlock_page(blkno);
+		return OWalkPageSkipped;
+	}
+
+	if (!get_checkpoint_number(desc, blkno, &checkpoint_number, &copy_blkno))
+	{
+		unlock_page(blkno);
+
+		if (!is_root)
+		{
+			unlock_page(context.items[context.index].blkno);
+		}
+		return OWalkPageSkipped;
+	}
+
+	/* Root eviction evicts the whole tree and follows its own lock protocol. */
+	if (evict && is_root)
+		return walk_page_evict_root(desc, blkno, page_desc, p, oids);
+
+	STOPEVENT(STOPEVENT_BEFORE_WRITE_PAGE, NULL);
+
+	write_page(&context, blkno, img, checkpoint_number, evict, copy_blkno);
+
+	STOPEVENT(STOPEVENT_AFTER_WRITE_PAGE, NULL);
+
+	return evict ?
OWalkPageEvicted : OWalkPageWritten;
+}
+
+/*
+ * Walk the subtree rooted at blkno depth-first: children first, then the
+ * page itself through walk_page() when its level does not exceed maxLevel.
+ *
+ * loadId is the change count the page is expected to carry.  Returns false
+ * when blkno is invalid, its local slot is gone, or the change count differs;
+ * true otherwise.
+ */
+static bool
+write_tree_pages_recursive(UndoLogType undoType,
+						   OInMemoryBlkno blkno, uint32 loadId,
+						   int maxLevel, bool evict)
+{
+	OInMemoryBlkno kids[BTREE_PAGE_MAX_CHUNK_ITEMS];
+	uint32		kidChangeCounts[BTREE_PAGE_MAX_CHUNK_ITEMS];
+	int			nkids = 0;
+	int			pageLevel;
+	int			k;
+	Page		page;
+	BTreePageItemLocator it;
+
+	if (!OInMemoryBlknoIsValid(blkno))
+		return false;
+
+	lock_page(blkno);
+	page = O_GET_IN_MEMORY_PAGE(blkno);
+
+	/*
+	 * A local pool slot may have been reclaimed by a reentrant eviction that
+	 * ran while a sibling downlink collected earlier was being processed; a
+	 * NULL slot is treated as a missing page.  A change count other than the
+	 * expected one is rejected the same way.
+	 */
+	if ((O_PAGE_IS_LOCAL(blkno) && page == NULL) ||
+		O_PAGE_GET_CHANGE_COUNT(page) != loadId)
+	{
+		unlock_page(blkno);
+		return false;
+	}
+	pageLevel = PAGE_GET_LEVEL(page);
+
+	/* Remember the in-memory children while the page is still locked. */
+	if (!O_PAGE_IS(page, LEAF))
+	{
+		BTREE_PAGE_FOREACH_ITEMS(page, &it)
+		{
+			BTreeNonLeafTuphdr *hdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(page, &it);
+
+			if (DOWNLINK_IS_IN_MEMORY(hdr->downlink))
+			{
+				kids[nkids] = DOWNLINK_GET_IN_MEMORY_BLKNO(hdr->downlink);
+				kidChangeCounts[nkids] = DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(hdr->downlink);
+				nkids++;
+			}
+		}
+	}
+
+	unlock_page(blkno);
+
+	/* Descend without holding the lock; a child's failure is ignored. */
+	for (k = 0; k < nkids; k++)
+		(void) write_tree_pages_recursive(undoType, kids[k],
+										  kidChangeCounts[k],
+										  maxLevel, evict);
+
+	if (pageLevel <= maxLevel)
+	{
+		/* Undo space is reserved before each attempt; repeat after a merge. */
+		do
+		{
+			reserve_undo_size(GET_PAGE_LEVEL_UNDO_TYPE(undoType),
+							  2 * O_MERGE_UNDO_IMAGE_SIZE);
+		} while (walk_page(blkno, evict) == OWalkPageMerged);
+
+		release_undo_size(GET_PAGE_LEVEL_UNDO_TYPE(undoType));
+	}
+
+	return true;
+}
+
+static void
+write_tree_pages(BTreeDescr *desc, int maxLevel, bool evict)
+{
+	o_btree_load_shmem(desc);
+	if (!write_tree_pages_recursive(desc->undoType,
desc->rootInfo.rootPageBlkno,
+									desc->rootInfo.rootPageChangeCount,
+									maxLevel, evict))
+	{
+		desc->rootInfo.rootPageBlkno = OInvalidInMemoryBlkno;
+		desc->rootInfo.metaPageBlkno = OInvalidInMemoryBlkno;
+		desc->rootInfo.rootPageChangeCount = 0;
+		o_btree_load_shmem(desc);
+		(void) write_tree_pages_recursive(desc->undoType,
+										  desc->rootInfo.rootPageBlkno,
+										  desc->rootInfo.rootPageChangeCount,
+										  maxLevel, evict);
+	}
+}
+
+/*
+ * Write (or, with evict, evict) the pages of every index tree of relation
+ * relid, followed by its TOAST tree, limited to pages whose level does not
+ * exceed maxLevel.
+ */
+static void
+write_relation_pages(Oid relid, int maxLevel, bool evict)
+{
+	Relation	rel;
+	OTableDescr *descr;
+	int			idx = 0;
+
+	orioledb_check_shmem();
+
+	rel = relation_open(relid, AccessShareLock);
+
+	/*
+	 * NOTE(review): relation_open() presumably raises an error itself rather
+	 * than returning NULL, which would make this branch dead -- confirm.
+	 */
+	if (rel == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation oid %u does not exists", relid)));
+
+	/*
+	 * NOTE(review): descr is dereferenced unchecked; verify whether
+	 * relation_get_descr() can return NULL for a relation of another kind.
+	 */
+	descr = relation_get_descr(rel);
+
+	while (idx < descr->nIndices)
+	{
+		write_tree_pages(&descr->indices[idx]->desc, maxLevel, evict);
+		idx++;
+	}
+	write_tree_pages(&descr->toast->desc, maxLevel, evict);
+
+	relation_close(rel, AccessShareLock);
+}
+
+/*
+ * SQL-callable: evict the pages of relation arg 0 up to the level in arg 1.
+ */
+Datum
+orioledb_evict_pages(PG_FUNCTION_ARGS)
+{
+	write_relation_pages(PG_GETARG_OID(0), PG_GETARG_INT32(1), true);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: write out the pages of relation arg 0 at every level, without
+ * evicting them.
+ */
+Datum
+orioledb_write_pages(PG_FUNCTION_ARGS)
+{
+	write_relation_pages(PG_GETARG_OID(0), ORIOLEDB_MAX_DEPTH, false);
+
+	PG_RETURN_VOID();
+}
+
+static int
+tree_offsets_cmp(const void *a, const void *b)
+{
+	TreeOffset	val1 = *(TreeOffset *) a;
+	TreeOffset	val2 = *(TreeOffset *) b;
+
+	if (val1.key.oids.datoid != val2.key.oids.datoid)
+		return (val1.key.oids.datoid < val2.key.oids.datoid) ? -1 : 1;
+	else if (val1.key.oids.relnode != val2.key.oids.relnode)
+		return (val1.key.oids.relnode < val2.key.oids.relnode) ? -1 : 1;
+	else if (val1.chkpNum != val2.chkpNum)
+		return (val1.chkpNum < val2.chkpNum) ?
-1 : 1; + else if (val1.segno != val2.segno) + return (val1.segno < val2.segno) ? -1 : 1; + else if (val1.fileExtent.off != val2.fileExtent.off) + return val1.fileExtent.off < val2.fileExtent.off ? -1 : 1; + else if (val1.fileExtent.len != val2.fileExtent.len) + { + /* + * an extent with bigger length will be placed first, it helps to + * simplify process this case in perform_writeback() + */ + return val1.fileExtent.len > val2.fileExtent.len ? -1 : 1; + } + + return 0; +} + +static void +writeback_put_extent(IOWriteBack *writeback, BTreeDescr *desc, + uint64 downlink) +{ + TreeOffset offset; + off_t blcksz = 0; + int last_segno; + FileExtent extent; + + Assert(DOWNLINK_IS_ON_DISK(downlink)); + extent.len = DOWNLINK_GET_DISK_LEN(downlink); + extent.off = DOWNLINK_GET_DISK_OFF(downlink); + + if (!ORelOidsIsValid(desc->oids) || desc->type == oIndexInvalid) + return; + + if (orioledb_s3_mode) + { + offset.chkpNum = S3_GET_CHKP_NUM(extent.off); + extent.off &= S3_OFFSET_MASK; + } + else + { + offset.chkpNum = 0; + } + + Assert(extent.len > 0); + Assert(extent.len <= (ORIOLEDB_BLCKSZ / ORIOLEDB_COMP_BLCKSZ)); + + offset.key.oids = desc->oids; + offset.key.tablespace = desc->tablespace; + offset.compressed = OCompressIsValid(desc->compress); + blcksz = offset.compressed ? 
ORIOLEDB_COMP_BLCKSZ : ORIOLEDB_BLCKSZ; + offset.segno = blcksz * extent.off / ORIOLEDB_SEGMENT_SIZE; + last_segno = blcksz * (extent.off + extent.len - 1) / ORIOLEDB_SEGMENT_SIZE; + + while (offset.segno <= last_segno) + { + if (writeback->extents == NULL) + { + writeback->extentsNumber = 0; + writeback->extentsAllocated = 16; + writeback->extents = (TreeOffset *) MemoryContextAlloc(TopMemoryContext, + sizeof(TreeOffset) * writeback->extentsAllocated); + } + else if (writeback->extentsNumber >= writeback->extentsAllocated) + { + writeback->extentsAllocated *= 2; + writeback->extents = (TreeOffset *) repalloc(writeback->extents, + sizeof(TreeOffset) * writeback->extentsAllocated); + } + + offset.fileExtent = extent; + if (offset.segno != last_segno) + offset.fileExtent.len = ORIOLEDB_SEGMENT_SIZE / blcksz - extent.off % (ORIOLEDB_SEGMENT_SIZE / blcksz); + writeback->extents[writeback->extentsNumber] = offset; + writeback->extentsNumber++; + offset.segno++; + extent.off += offset.fileExtent.len; + extent.len -= offset.fileExtent.len; + } +} + +static void +perform_writeback(IOWriteBack *writeback) +{ + int i, + len = 0, + flushAfter; + uint64 offset = InvalidFileExtentOff - 1; + off_t blcksz = 0; + ORelOids oids = {0}; + File file = -1; + int segno = 0; + int chkpNum = 0; + + if (use_device && !use_mmap) + { + writeback->extentsNumber = 0; + return; + } + + flushAfter = IsBGWriter ? 
bgwriter_flush_after : backend_flush_after; + flushAfter *= BLCKSZ / ORIOLEDB_BLCKSZ; + + /* PG defaults: flushAfter == 0 turns off writeback */ + if (flushAfter == 0) + { + writeback->extentsNumber = 0; + return; + } + + if (writeback->extentsNumber < flushAfter) + return; + + pg_qsort(writeback->extents, writeback->extentsNumber, + sizeof(TreeOffset), tree_offsets_cmp); + + for (i = 0; i < writeback->extentsNumber; i++) + { + TreeOffset cur = writeback->extents[i]; + + if (oids.datoid != cur.key.oids.datoid || + oids.relnode != cur.key.oids.relnode || + segno != cur.segno || chkpNum != cur.chkpNum) + { + /* + * The extent belongs to another file/segment: flush the range + * accumulated so far before switching to the new one. + */ + if (use_mmap) + { + if (len > 0) + msync(mmap_data + (off_t) segno * ORIOLEDB_SEGMENT_SIZE + (off_t) offset * blcksz, (off_t) len * blcksz, MS_ASYNC); + } + else + { + if (len > 0) + { + FileWriteback(file, (off_t) offset * blcksz, + (off_t) len * blcksz, + WAIT_EVENT_DATA_FILE_FLUSH); + } + if (file >= 0) + FileClose(file); + } + + blcksz = cur.compressed ? ORIOLEDB_COMP_BLCKSZ : ORIOLEDB_BLCKSZ; + oids = cur.key.oids; + segno = cur.segno; + chkpNum = cur.chkpNum; + if (!use_mmap) + { + char *filename; + + filename = btree_filename(cur.key, segno, chkpNum); + file = PathNameOpenFile(filename, O_RDWR | O_CREAT | PG_BINARY); + pfree(filename); + } + + /* + * Start a new pending range with this extent. This must happen in + * mmap mode as well: otherwise the first extent of every group is + * never flushed and the stale range of the previous group is + * msync'ed against the new segment. + */ + offset = cur.fileExtent.off; + len = cur.fileExtent.len; + } + else + { + if (cur.fileExtent.off == offset) + { + /* + * Same start offset as the pending range: the sort order puts + * the longest extent first, so this one is already covered. + */ + continue; + } + else if (cur.fileExtent.off == offset + len) + { + len += cur.fileExtent.len; + } + else + { + if (use_mmap) + msync(mmap_data + (off_t) segno * ORIOLEDB_SEGMENT_SIZE + (off_t) offset * blcksz, (off_t) len * blcksz, MS_ASYNC); + else + FileWriteback(file, (off_t) offset * blcksz, + (off_t) len * blcksz, + WAIT_EVENT_DATA_FILE_FLUSH); + offset = cur.fileExtent.off; + len = cur.fileExtent.len; + } + } + } + + if (len > 0) + { + Assert(blcksz != 0); + if (use_mmap) + msync(mmap_data + (off_t) segno * ORIOLEDB_SEGMENT_SIZE + (off_t) offset * blcksz, (off_t) len * blcksz, MS_ASYNC); + else + 
FileWriteback(file, (off_t) offset * blcksz, + (off_t) len * blcksz, + WAIT_EVENT_DATA_FILE_FLUSH); + } + + if (!use_mmap && file >= 0) + FileClose(file); + + writeback->extentsNumber = 0; +} + +typedef void (*RelnodeFileCallback) (const char *filename, uint32 segno, + char *ext, void *arg); + +/* + * Iterate all the files belonging to given (datoid, relnode) pair and call + * the callback for each filename. + * + * Guarantees that at first we process the first data file. + * + * Returns false when the database directory cannot be opened (errno is left + * as set by opendir()), true otherwise. + */ +static bool +iterate_relnode_files(OIndexKey key, RelnodeFileCallback callback, void *arg) +{ + struct dirent *file; + DIR *dir; + char *filename; + bool first_file_deleted = false; + char *db_prefix; + + o_get_prefixes_for_tablespace(key.oids.datoid, key.tablespace, + NULL, &db_prefix); + + dir = opendir(db_prefix); + + if (dir == NULL) + { + /* Don't leak the prefix; keep opendir()'s errno for the caller. */ + int save_errno = errno; + + pfree(db_prefix); + errno = save_errno; + return false; + } + + while (errno = 0, (file = readdir(dir)) != NULL) + { + uint32 file_relnode, + file_chkp = 0, + file_segno = 0; + char file_ext[5]; + char *file_ext_p = NULL; + + if ((sscanf(file->d_name, "%10u-%10u.%4s", + &file_relnode, &file_chkp, file_ext) == 3 && + (!strcmp(file_ext, "tmp") || !strcmp(file_ext, "map") || + !strcmp(file_ext, "evt")) && + (file_ext_p = file_ext)) || + sscanf(file->d_name, "%10u.%10u", &file_relnode, &file_segno) == 2 || + sscanf(file->d_name, "%10u", &file_relnode) == 1) + { + if (key.oids.relnode == file_relnode) + { + if (!orioledb_s3_mode && !first_file_deleted) + { + filename = psprintf("%s/%u", db_prefix, key.oids.relnode); + + /* + * The first-file callback exists for callers that care + * about ordering the base file relative to its segments + * (e.g. durable unlink, precommit fsync). Skip it when + * the base file is absent: after a crash, a secondary + * file like ".1" or "-.map" can + * exist on disk while the base file was never durably + * created, and fsync/unlink of a missing path would + * ereport ERROR (PANIC during startup recovery). 
+ */ + if (access(filename, F_OK) == 0) + callback(filename, 0, NULL, arg); + pfree(filename); + first_file_deleted = true; + } + + if (file_segno != 0 || file_ext_p != NULL) + { + filename = psprintf("%s/%s", db_prefix, file->d_name); + callback(filename, file_segno, file_ext_p, arg); + pfree(filename); + } + } + } + } + + closedir(dir); + pfree(db_prefix); + return true; +} + +static void +unlink_callback(const char *filename, uint32 segno, char *ext, void *arg) +{ + /* + * Recovery determines relation data presence by presence of the first + * data file. So, we durably delete the first data file to avoid + * situation when partially deleted file data is visible. + */ + bool fsync = *(bool *) arg; + + if (segno == 0 && ext == NULL && fsync) + durable_unlink(filename, ERROR); + else + unlink(filename); +} + +bool +cleanup_btree_files(OIndexKey key, bool fsync) +{ + return iterate_relnode_files(key, unlink_callback, (void *) &fsync); +} + +static void +fsync_callback(const char *filename, uint32 segno, char *ext, void *arg) +{ + if (ext == NULL || strcmp(ext, "tmp") != 0) + fsync_fname(filename, false); +} + +bool +fsync_btree_files(OIndexKey key) +{ + return iterate_relnode_files(key, fsync_callback, NULL); +} + +void +try_to_punch_holes(BTreeDescr *desc) +{ + BTreeMetaPage *metaPage; + File file; + uint64 file_size; + char *filename, + buf[ORIOLEDB_BLCKSZ]; + uint64 len = 0, + i, + buf_len; + uint32 chkp_num; + LWLock *metaLock; + LWLock *punchHolesLock; + + Assert(orioledb_use_sparse_files); + Assert(!OCompressIsValid(desc->compress)); + + o_btree_load_shmem(desc); + metaPage = BTREE_GET_META(desc); + metaLock = &metaPage->metaLock; + punchHolesLock = &metaPage->punchHolesLock; + + chkp_num = metaPage->punchHolesChkpNum + 1; + while (can_use_checkpoint_extents(desc, chkp_num)) + { + SeqBufTag tag; + bool removeFile = false; + + LWLockAcquire(punchHolesLock, LW_EXCLUSIVE); + + if (chkp_num == metaPage->punchHolesChkpNum + 1) + { + if (chkp_num < 
metaPage->freeBuf.tag.num) + removeFile = true; + } + else + { + chkp_num = metaPage->punchHolesChkpNum + 1; + /* Try for next checkpoint number */ + LWLockRelease(punchHolesLock); + continue; + } + + tag.key.oids = desc->oids; + tag.key.tablespace = desc->tablespace; + tag.type = 't'; + tag.num = chkp_num; + if (!seq_buf_file_exist(&tag)) + { + /* table may be deleted or *.tmp file not created */ + LWLockAcquire(metaLock, LW_EXCLUSIVE); + Assert(chkp_num == metaPage->punchHolesChkpNum + 1); + metaPage->punchHolesChkpNum = chkp_num; + LWLockRelease(metaLock); + LWLockRelease(punchHolesLock); + chkp_num++; + continue; + } + + /* free extents from *.tmp file */ + filename = get_seq_buf_filename(&tag); + file = PathNameOpenFile(filename, O_RDONLY | PG_BINARY); + if (file < 0) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not open file %s: %m", filename))); + file_size = FileSize(file); + + /* + * Every checkpoint has its own *.tmp file, so the read position has to + * start from zero for each of them. Without the reset the offset left + * over from the previous file would skip data and trip the size check + * below. + */ + len = 0; + while (true) + { + BlockNumber *cur_off; + int64 nread; + + /* + * Keep the read result signed: buf_len is unsigned, so a negative + * return could never satisfy a "<= 0" test on it. + * NOTE(review): assumes OFileRead() reports failure with a + * negative value - confirm. + */ + nread = OFileRead(file, buf, ORIOLEDB_BLCKSZ, len, WAIT_EVENT_DATA_FILE_READ); + if (nread <= 0) + break; + buf_len = (uint64) nread; + + cur_off = (BlockNumber *) buf; + for (i = 0; i < buf_len; i += sizeof(BlockNumber)) + { + btree_smgr_punch_hole(desc, chkp_num, + (off_t) (*cur_off) * (off_t) ORIOLEDB_BLCKSZ, + ORIOLEDB_BLCKSZ); + cur_off++; + } + len += buf_len; + } + /* The whole file must have been consumed. */ + if (file_size != len) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not read data from checkpoint tmp file: %s %llu %llu: %m", + filename, + (unsigned long long) len, + (unsigned long long) file_size))); + + pfree(filename); + FileClose(file); + + if (removeFile) + seq_buf_remove_file(&tag); + + LWLockAcquire(metaLock, LW_EXCLUSIVE); + Assert(chkp_num == metaPage->punchHolesChkpNum + 1); + metaPage->punchHolesChkpNum = chkp_num; + LWLockRelease(metaLock); + + LWLockRelease(punchHolesLock); + + /* Try for next checkpoint number */ + chkp_num++; + } +} diff --git a/contrib/orioledb/src/btree/iterator.c b/contrib/orioledb/src/btree/iterator.c new file mode 100644 index 00000000000..4af52772470 --- /dev/null +++ 
b/contrib/orioledb/src/btree/iterator.c @@ -0,0 +1,1332 @@ +/*------------------------------------------------------------------------- + * + * iterator.c + * Implementation of orioledb B-tree iterator. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/src/btree/iterator.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/btree.h" +#include "btree/find.h" +#include "btree/iterator.h" +#include "btree/page_chunks.h" +#include "btree/undo.h" +#include "catalog/sys_trees.h" +#include "tableam/descr.h" +#include "transam/oxid.h" +#include "transam/undo.h" +#include "utils/page_pool.h" + +#include "access/transam.h" +#include "miscadmin.h" +#include "utils/memutils.h" +#include "utils/resowner.h" + +/* Iterates through undo images */ +typedef struct +{ + BTreeIterator *it; + /* a current page image from undo log */ + char image[ORIOLEDB_BLCKSZ]; + /* a lokey of the image for backward scan */ + OFixedKey lokey; + /* a base undo location, from the data page header */ + UndoLocation baseLoc; + /* undo location of the `image` in this struct */ + UndoLocation imageUndoLoc; + /* is the image leftmost on the base location */ + bool leftmost; + /* is the image rightmost on the base location */ + bool rightmost; +} UndoIterator; + +struct BTreeIterator +{ + OBTreeFindPageContext context; + OIndexDescr *oidescr; + OSnapshot oSnapshot; + UndoIterator undoIt; + /* scan direction of current iterator: forward or backward */ + ScanDirection scanDir; + /* current tuple location in UndoIterator */ + BTreePageItemLocator undoLoc; + /* do we have to combine results from both current and undo pages? */ + bool combinedResult; + /* do we need to combine results in the current page? 
*/ + bool combinedPage; + /* memory context for returned tuples */ + MemoryContext tupleCxt; + /* callback for fetching tuple version */ + TupleFetchCallback fetchCallback; + void *fetchCallbackArg; +#ifdef USE_ASSERT_CHECKING + /* additional check for iteration order */ + OFixedTuple prevTuple; +#endif +}; + +static void get_next_combined_location(BTreeIterator *it); +static void load_page_from_undo(BTreeIterator *it, void *key, BTreeKeyType kind); +static bool btree_iterator_check_load_next_page(BTreeIterator *it); +static OTuple o_btree_iterator_fetch_internal(BTreeIterator *it, + CommitSeqNo *tupleCsn); +static bool o_btree_interator_can_fetch_from_undo(BTreeDescr *desc, BTreeIterator *it); +static bool can_fetch_from_undo(BTreeIterator *it); +static void undo_it_create(UndoIterator *undoIt, BTreeIterator *it); +static void undo_it_init(UndoIterator *undoIt, UndoLocation location, void *key, BTreeKeyType kind); +static bool undo_it_next_page(BTreeDescr *desc, UndoIterator *undoIt); +static bool undo_it_switch(BTreeDescr *desc, UndoIterator *undoIt, UndoLocation location); +static void undo_it_find_internal(UndoIterator *undoIt, void *key, BTreeKeyType kind); + +#define IT_IS_BACKWARD(it) ((it)->scanDir == BackwardScanDirection) +#define IT_IS_FORWARD(it) ((it)->scanDir == ForwardScanDirection) + +#define IS_LAST_PAGE(page, it) ((IT_IS_FORWARD((it)) && O_PAGE_IS(page, RIGHTMOST)) \ + || (IT_IS_BACKWARD((it)) && O_PAGE_IS(page, LEFTMOST))) + +#define IT_NEXT_OFFSET(it, loc) \ + do { \ + if (IT_IS_FORWARD(it)) \ + BTREE_PAGE_LOCATOR_NEXT((it)->context.img, (loc)); \ + else if (IT_IS_BACKWARD(it)) \ + BTREE_PAGE_LOCATOR_PREV((it)->context.img, (loc)); \ + } while (0); \ + +#define UNDO_IT_NEXT_OFFSET(undoIt, loc) \ + do { \ + if (IT_IS_FORWARD(it)) \ + BTREE_PAGE_LOCATOR_NEXT((undoIt)->image, (loc)); \ + else if (IT_IS_BACKWARD(it)) \ + BTREE_PAGE_LOCATOR_PREV((undoIt)->image, (loc)); \ + } while (0); \ + +/* + * Fetches tuple from the tree with given CSN snapshot. 
Tuple is allocated + * in the given context. Leaf page is found using the given hint (if provided). + * Given hint is adjusted with relevant leaf page. + */ +OTuple +o_btree_find_tuple_by_key_cb(BTreeDescr *desc, void *key, + BTreeKeyType kind, OSnapshot *read_o_snapshot, + CommitSeqNo *out_csn, MemoryContext mcxt, + BTreeLocationHint *hint, + bool *deleted, + TupleFetchCallback cb, + void *arg) +{ + BTreePageItemLocator loc; + OBTreeFindPageContext context; + char *img; + BTreePageHeader *header; + bool combinedResult = false; + OTuple result; + OFindPageResult findResult PG_USED_FOR_ASSERTS_ONLY; + + /* + * If we need to get the result from given snapshot in the past, and in + * the same time we might have modifications in this tree, then we might + * need to combine results from the data page and the page image in undo + * log. + */ + if (COMMITSEQNO_IS_NORMAL(read_o_snapshot->csn)) + combinedResult = have_current_undo(desc->undoType); + + /* + * If we don't need to combine results, then ask find_page() to load the + * relevant page item from undo log for us by passing our snapshot csn. + */ + init_page_find_context(&context, desc, + combinedResult ? COMMITSEQNO_INPROGRESS : read_o_snapshot->csn, + BTREE_PAGE_FIND_FETCH); + + /* Use page location hint if provided */ + if (hint && OInMemoryBlknoIsValid(hint->blkno)) + findResult = refind_page(&context, key, kind, 0, hint->blkno, hint->pageChangeCount); + else + findResult = find_page(&context, key, kind, 0); + + Assert(findResult == OFindPageResultSuccess); + + loc = context.items[context.index].locator; + img = context.img; + header = (BTreePageHeader *) img; + + /* Adjust hint if given */ + if (hint) + { + hint->blkno = context.items[context.index].blkno; + hint->pageChangeCount = context.items[context.index].pageChangeCount; + } + + if (combinedResult && header->csn >= read_o_snapshot->csn) + { + /* + * Have to combine the results. First look for a matching tuple on + * the data page modified by us. 
+ */ + if (BTREE_PAGE_LOCATOR_IS_VALID(img, &loc)) + { + BTreeLeafTuphdr *tupHdrPtr; + OTuple curTuple; + int result_size; + + BTREE_PAGE_READ_LEAF_ITEM(tupHdrPtr, curTuple, img, &loc); + tupHdrPtr = (BTreeLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(img, &loc); + + if (o_btree_cmp(desc, key, kind, &curTuple, BTreeKeyLeafTuple) == 0) + { + BTreeLeafTuphdr tupHdr = *tupHdrPtr; + + /* + * We found the matching tuple. Now check if it is modified + * by us. Even if tuple is modified by us, there might be FOR + * KEY SHARE locks placed by concurrent transactions. Find the + * first non-lock-only undo record in the chain and check if + * it belongs to our transaction. + */ + (void) find_non_lock_only_undo_record(desc->undoType, &tupHdr); + + if (!XACT_INFO_IS_LOCK_ONLY(tupHdr.xactInfo) && + XACT_INFO_OXID_IS_CURRENT(tupHdr.xactInfo)) + { + /* + * OK, we found the tuple modified by us. It overrides + * whatever we could have from the undo log page image. + * Return it right away. + */ + if (out_csn) + *out_csn = COMMITSEQNO_INPROGRESS; + + if (deleted) + *deleted = (tupHdrPtr->deleted != BTreeLeafTupleNonDeleted); + + if (tupHdrPtr->deleted == BTreeLeafTupleNonDeleted) + { + result_size = o_btree_len(desc, curTuple, OTupleLength); + result.data = (Pointer) MemoryContextAlloc(mcxt, result_size); + memcpy(result.data, curTuple.data, result_size); + result.formatFlags = curTuple.formatFlags; + return result; + } + else + { + O_TUPLE_SET_NULL(result); + /* cppcheck-suppress uninitvar */ + return result; + } + } + } + } + + /* + * There is no matching tuple modified by us. So, we have to fetch + * the page image from undo log as we didn't ask find_page() to do + * this for us. 
+ */ + read_page_from_undo(desc, img, header->undoLocation, read_o_snapshot->csn, + key, kind, NULL); + btree_page_search(desc, img, key, kind, NULL, &loc); + page_locator_find_real_item(img, NULL, &loc); + } + + if (BTREE_PAGE_LOCATOR_IS_VALID(img, &loc)) + { + BTreeLeafTuphdr *tupHdr; + OTuple curTuple; + int cmp; + + BTREE_PAGE_READ_LEAF_ITEM(tupHdr, curTuple, img, &loc); + cmp = o_btree_cmp(desc, key, kind, &curTuple, BTreeKeyLeafTuple); + + if (deleted) + *deleted = (tupHdr->deleted != BTreeLeafTupleNonDeleted); + + if (cmp == 0) + { + /* + * The matching tuple is found. Traverse the row-level undo chain + * for the relevant version and return it. + */ + return o_find_tuple_version(desc, img, &loc, read_o_snapshot, + out_csn, mcxt, cb, arg); + } + } + + /* Tuple isn't found */ + O_TUPLE_SET_NULL(result); + return result; +} + +OTuple +o_btree_find_tuple_by_key(BTreeDescr *desc, void *key, BTreeKeyType kind, + OSnapshot *read_o_snapshot, CommitSeqNo *out_csn, + MemoryContext mcxt, BTreeLocationHint *hint) +{ + return o_btree_find_tuple_by_key_cb(desc, key, kind, read_o_snapshot, + out_csn, mcxt, hint, NULL, NULL, NULL); +} + + +/* + * Finds appropriate tuple version by traversing the undo chain. + * + * Starts with the on-page tuple, then walks historical versions from undo + * records. Visibility is determined by oSnapshot. + * + * When a TupleFetchCallback (cb) is provided, it is called for each version + * to choose among multiple versions that share the same csn/xlogptr. This is + * useful when there are several uncommitted versions within a single + * in-progress transaction. The callback controls the iteration. + * + * The result's OTuple.data is allocated in mctx and is to be freed by the + * caller. + * + * Note on COMMITSEQNO_NON_DELETED: this CSN is treated as in-progress + * (COMMITSEQNO_IS_INPROGRESS returns true for it), so it returns data from + * uncommitted transactions just like COMMITSEQNO_INPROGRESS. 
The difference + * is that NON_DELETED also returns tuples that are marked as deleted but are + * still physically present on the page, which is needed when accessing trees + * that may be deleted in uncommitted (sub-)transactions — on rollback those + * trees become visible again. + */ +OTuple +o_find_tuple_version(BTreeDescr *desc, Page p, BTreePageItemLocator *loc, + OSnapshot *oSnapshot, CommitSeqNo *tupleCsn, + MemoryContext mcxt, TupleFetchCallback cb, + void *arg) +{ + BTreeLeafTuphdr tupHdr, + *tupHdrPtr; + OTuple curTuple; + OTuple result; + int result_size; + UndoLocation undoLocation = InvalidUndoLocation; + bool curTupleAllocated = false; + MemoryContext prevMctx; + bool txIsFinished = false; + + prevMctx = MemoryContextSwitchTo(mcxt); + + BTREE_PAGE_READ_LEAF_ITEM(tupHdrPtr, curTuple, p, loc); + tupHdr = *tupHdrPtr; + (void) find_non_lock_only_undo_record(desc->undoType, &tupHdr); + + Assert(COMMITSEQNO_IS_NORMAL(oSnapshot->csn) || + COMMITSEQNO_IS_INPROGRESS(oSnapshot->csn)); + + while (true) + { + OTupleXactInfo xactInfo = tupHdr.xactInfo; + CommitSeqNo tupcsn; + XLogRecPtr tupptr = InvalidXLogRecPtr; + + oxid_match_snapshot(XACT_INFO_GET_OXID(xactInfo), oSnapshot, &tupcsn, + XLogRecPtrIsInvalid(oSnapshot->xlogptr) ? NULL : &tupptr); + + txIsFinished = COMMITSEQNO_IS_COMMITTED(tupcsn); + + if (tupleCsn) + { + if (COMMITSEQNO_IS_NORMAL(tupcsn)) + *tupleCsn = COMMITSEQNO_IS_NORMAL(oSnapshot->csn) ? Max(oSnapshot->csn, tupcsn + 1) : COMMITSEQNO_MAX_NORMAL - 1; + else if (COMMITSEQNO_IS_FROZEN(tupcsn)) + *tupleCsn = COMMITSEQNO_IS_NORMAL(oSnapshot->csn) ? 
oSnapshot->csn : COMMITSEQNO_MAX_NORMAL - 1; + else + *tupleCsn = COMMITSEQNO_INPROGRESS; + } + + if (cb) + { + TupleFetchCallbackResult cbResult; + + /* + * Fetch from undo chain if txn is in progress OR historical + * version + */ + OXid tupOxid = XACT_INFO_GET_OXID(xactInfo); + + cbResult = cb(curTuple, tupOxid, oSnapshot, arg, txIsFinished); + + if (cbResult == OTupleFetchMatch) + break; + if (cbResult == OTupleFetchNotMatch) + { + if (curTupleAllocated) + pfree(curTuple.data); + O_TUPLE_SET_NULL(result); + MemoryContextSwitchTo(prevMctx); + return result; + } + } + + if (!txIsFinished) + { + if (!cb) + { + if (COMMITSEQNO_IS_INPROGRESS(oSnapshot->csn)) + break; + + /* + * We see the changes made by our transaction. Exception are + * changes made by current command unless we're dealing with + * system tree. + */ + if (!XACT_INFO_IS_LOCK_ONLY(xactInfo) && + XACT_INFO_OXID_IS_CURRENT(xactInfo) && + oSnapshot->csn != COMMITSEQNO_MAX_NORMAL) + { + CommandId tupleCid; + + if (IS_SYS_TREE_OIDS(desc->oids)) + break; + + /* + * Use cached UndoLocation if we have undo records in this + * command or below. MaxUndoLocation means there are no + * undo records yet, so we need to recheck. 
+ */ + tupleCid = undo_location_get_command(UndoLocationGetValue(tupHdr.undoLocation)); + + if (tupleCid < oSnapshot->cid) + break; + } + } + } + else + { + if (COMMITSEQNO_IS_INPROGRESS(oSnapshot->csn)) + break; + } + + if (!COMMITSEQNO_IS_INPROGRESS(tupcsn) && + !COMMITSEQNO_IS_ABORTED(tupcsn)) + { + if (COMMITSEQNO_IS_INPROGRESS(oSnapshot->csn)) + { + Assert(XLogRecPtrIsInvalid(oSnapshot->xlogptr)); + break; + } + + if (XLogRecPtrIsInvalid(oSnapshot->xlogptr)) + { + if (tupcsn < oSnapshot->csn) + break; + } + else + { + if (tupptr <= oSnapshot->xlogptr) + break; + } + } + + undoLocation = tupHdr.undoLocation; + + if (!UndoLocationIsValid(undoLocation)) + { + if (curTupleAllocated) + pfree(curTuple.data); + O_TUPLE_SET_NULL(result); + MemoryContextSwitchTo(prevMctx); + return result; + } + + if (tupHdr.deleted != BTreeLeafTupleNonDeleted || + XACT_INFO_IS_LOCK_ONLY(tupHdr.xactInfo)) + { + get_prev_leaf_header_from_undo(desc->undoType, &tupHdr, true); + } + else + { + if (curTupleAllocated) + pfree(curTuple.data); + get_prev_leaf_header_and_tuple_from_undo(desc->undoType, &tupHdr, + &curTuple, 0); + curTupleAllocated = true; + } + + Assert(UNDO_REC_EXISTS(desc->undoType, undoLocation)); + } + + if (COMMITSEQNO_IS_NON_DELETED(oSnapshot->csn)) + { + if (tupHdr.deleted != BTreeLeafTupleNonDeleted && + txIsFinished) + { + if (curTupleAllocated) + pfree(curTuple.data); + O_TUPLE_SET_NULL(result); + MemoryContextSwitchTo(prevMctx); + return result; + } + } + else if (tupHdr.deleted != BTreeLeafTupleNonDeleted && !cb) + { + if (curTupleAllocated) + pfree(curTuple.data); + O_TUPLE_SET_NULL(result); + MemoryContextSwitchTo(prevMctx); + return result; + } + + if (!curTupleAllocated) + { + result_size = o_btree_len(desc, curTuple, OTupleLength); + /* TODO: check result tuple size */ + result.data = (Pointer) MemoryContextAlloc(mcxt, result_size); + memcpy(result.data, curTuple.data, result_size); + result.formatFlags = curTuple.formatFlags; + } + else + { + result = 
curTuple; + } + + Assert(!UndoLocationIsValid(undoLocation) || UNDO_REC_EXISTS(desc->undoType, undoLocation)); + MemoryContextSwitchTo(prevMctx); + return result; +} + +BTreeIterator * +o_btree_iterator_create(BTreeDescr *desc, void *key, BTreeKeyType kind, + OSnapshot *o_snapshot, ScanDirection scanDir) +{ + BTreeIterator *it; + uint16 findFlags = BTREE_PAGE_FIND_IMAGE; + OFindPageResult findResult PG_USED_FOR_ASSERTS_ONLY; + + it = (BTreeIterator *) palloc(sizeof(BTreeIterator)); + + if (!IS_SYS_TREE_OIDS(desc->oids)) + { + it->oidescr = (OIndexDescr *) desc->arg; + ResourceOwnerRememberOIndexDescr(CurrentResourceOwner, it->oidescr); + } + else + it->oidescr = NULL; + + it->combinedResult = have_current_undo(desc->undoType) && COMMITSEQNO_IS_NORMAL(o_snapshot->csn); + it->oSnapshot = *o_snapshot; + it->scanDir = scanDir; + it->tupleCxt = CurrentMemoryContext; + it->fetchCallback = NULL; + it->fetchCallbackArg = NULL; + BTREE_PAGE_LOCATOR_SET_INVALID(&it->undoLoc); +#ifdef USE_ASSERT_CHECKING + O_TUPLE_SET_NULL(it->prevTuple.tuple); +#endif + + undo_it_create(&it->undoIt, it); + + if (IT_IS_BACKWARD(it)) + findFlags |= BTREE_PAGE_FIND_KEEP_LOKEY; + + init_page_find_context(&it->context, desc, + it->combinedResult ? COMMITSEQNO_INPROGRESS : o_snapshot->csn, findFlags); + + if (key == NULL) + { + if (IT_IS_FORWARD(it)) + kind = BTreeKeyNone; + else + kind = BTreeKeyRightmost; + } + + findResult = find_page(&it->context, key, kind, 0); + Assert(findResult == OFindPageResultSuccess); + + if (key != NULL && IT_IS_BACKWARD(it)) + { + BTreePageItemLocator *loc = &it->context.items[it->context.index].locator; + bool make_dec = false; + + /* + * From btree_page_binary_search(): "When nextkey is false (this + * case), we are looking for the first item >= scankey." + * + * If it's next item than decrement item offset. In case item == + * search key no need to do this. 
*/ + Assert(BTREE_PAGE_LOCATOR_GET_OFFSET(it->context.img, loc) <= BTREE_PAGE_ITEMS_COUNT(it->context.img)); + + if (BTREE_PAGE_LOCATOR_GET_OFFSET(it->context.img, loc) == BTREE_PAGE_ITEMS_COUNT(it->context.img)) + make_dec = true; + else + { + OTuple tup; + + BTREE_PAGE_READ_TUPLE(tup, it->context.img, loc); + if (o_btree_cmp(desc, key, kind, &tup, BTreeKeyLeafTuple) < 0) + make_dec = true; + } + + if (make_dec) + BTREE_PAGE_LOCATOR_PREV(it->context.img, loc); + } + + load_page_from_undo(it, key, + kind != BTreeKeyRightmost ? kind : BTreeKeyNone); + + return it; +} + +/* + * Sets the memory context in which the iterator allocates returned tuples. + */ +void +o_btree_iterator_set_tuple_ctx(BTreeIterator *it, MemoryContext tupleCxt) +{ + it->tupleCxt = tupleCxt; +} + +/* + * Sets the tuple-version fetch callback (and its argument) that the iterator + * passes to o_find_tuple_version(). + */ +void +o_btree_iterator_set_callback(BTreeIterator *it, + TupleFetchCallback callback, + void *arg) +{ + it->fetchCallback = callback; + it->fetchCallbackArg = arg; +} + +/* + * Fetches the next tuple from the iterator. Returns null tuple when there + * are no more tuples before the end boundary (defined by `end`, `endType`, + * and `endIsIncluded`). + * + * The result's OTuple.data is allocated in it->tupleCxt memory context. It's + * the caller's responsibility to free this memory. + */ +OTuple +o_btree_iterator_fetch(BTreeIterator *it, CommitSeqNo *tupleCsn, + void *end, BTreeKeyType endType, + bool endIsIncluded, BTreeLocationHint *hint) +{ + BTreeDescr *desc = it->context.desc; + OTuple result; + + ASAN_UNPOISON_MEMORY_REGION(&result, sizeof(result)); + + result = o_btree_iterator_fetch_internal(it, tupleCsn); + + if (!O_TUPLE_IS_NULL(result) && end != NULL) + { + int cmp = o_btree_cmp(desc, &result, BTreeKeyLeafTuple, end, endType); + + if (IT_IS_BACKWARD(it)) + cmp *= -1; + + if (cmp >= (endIsIncluded ? 
1 : 0)) + { + pfree(result.data); + O_TUPLE_SET_NULL(result); + return result; + } + } + +#ifdef USE_ASSERT_CHECKING + if (!O_TUPLE_IS_NULL(result)) + { + if (!O_TUPLE_IS_NULL(it->prevTuple.tuple)) + { + int cmp; + + cmp = o_btree_cmp(desc, &it->prevTuple.tuple, BTreeKeyLeafTuple, + &result, BTreeKeyLeafTuple); + + /* + * Tuples must come out strictly ascending on a forward scan and + * strictly descending otherwise. + */ + Assert(IT_IS_FORWARD(it) ? cmp < 0 : cmp > 0); + } + copy_fixed_tuple(desc, &it->prevTuple, result); + } +#endif + + if (hint) + { + hint->blkno = it->context.items[it->context.index].blkno; + hint->pageChangeCount = it->context.items[it->context.index].pageChangeCount; + } + + return result; +} + +/* + * Free resources associated with iterator. + */ +void +btree_iterator_free(BTreeIterator *it) +{ + if (it->oidescr) + ResourceOwnerForgetOIndexDescr(CurrentResourceOwner, it->oidescr); + pfree(it); +} + +/* + * Load page from undo for combined result. + */ +static void +load_page_from_undo(BTreeIterator *it, void *key, BTreeKeyType kind) +{ + OBTreeFindPageContext *context = &it->context; + BTreePageHeader *header = (BTreePageHeader *) context->img; + BTreeDescr *desc = context->desc; + + if (it->combinedResult && header->csn >= it->oSnapshot.csn) + { + undo_it_init(&it->undoIt, header->undoLocation, key, kind); + + if (key) + { + btree_page_search(desc, + it->undoIt.image, + key, kind, NULL, + &it->undoLoc); + page_locator_find_real_item(it->undoIt.image, NULL, + &it->undoLoc); + + if (IT_IS_BACKWARD(it)) + { + OTuple founded; + OffsetNumber undoOffset; + + BTREE_PAGE_READ_TUPLE(founded, it->undoIt.image, &it->undoLoc); + + /* + * From btree_page_binary_search(): "When nextkey is false + * (this case), we are looking for the first item >= scankey." + * + * If it's next item then decrement item offset. In case item + * == key bound no need to do this. 
+ */ + undoOffset = BTREE_PAGE_LOCATOR_GET_OFFSET(it->undoIt.image, &it->undoLoc); + Assert(undoOffset <= BTREE_PAGE_ITEMS_COUNT(it->undoIt.image)); + if (undoOffset == BTREE_PAGE_ITEMS_COUNT(it->undoIt.image) || + o_btree_cmp(desc, key, kind, &founded, BTreeKeyLeafTuple)) + BTREE_PAGE_LOCATOR_PREV(it->undoIt.image, &it->undoLoc); + } + + } + else if (IT_IS_FORWARD(it)) + { + BTREE_PAGE_LOCATOR_FIRST(it->undoIt.image, &it->undoLoc); + } + else + { + BTREE_PAGE_LOCATOR_LAST(it->undoIt.image, &it->undoLoc); + } + + it->combinedPage = true; + get_next_combined_location(it); + } + else + { + it->combinedPage = false; + } +} + +/* + * Find the next tuple location for result combination. It should have + * current oxid. + */ +static void +get_next_combined_location(BTreeIterator *it) +{ + OBTreeFindPageContext *context = &it->context; + OXid oxid = get_current_oxid_if_any(); + BTreePageItemLocator *loc = &context->items[context->index].locator; + Page img = context->img; + + if (!BTREE_PAGE_LOCATOR_IS_VALID(img, loc)) + return; + + while (BTREE_PAGE_LOCATOR_IS_VALID(img, loc)) + { + BTreeLeafTuphdr *tupHdr; + + tupHdr = (BTreeLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(img, loc); + + if (XACT_INFO_OXID_EQ(tupHdr->xactInfo, oxid)) + break; + + IT_NEXT_OFFSET(it, loc); + } +} + +/* + * Fetch next tuple without checking for end condition. 
+ */ +static OTuple +o_btree_iterator_fetch_internal(BTreeIterator *it, CommitSeqNo *tupleCsn) +{ + BTreeDescr *desc = it->context.desc; + OBTreeFindPageContext *context = &it->context; + OBtreePageFindItem *leaf_item; + Page img = context->img, + hImg = it->undoIt.image; + OTuple result, + itup, + htup; + int cmp; + + while (true) + { + if (!btree_iterator_check_load_next_page(it)) + { + O_TUPLE_SET_NULL(result); + return result; + } + + leaf_item = &context->items[context->index]; + + if (it->combinedPage) + { + if (!BTREE_PAGE_LOCATOR_IS_VALID(hImg, &it->undoLoc)) + cmp = -1; + else if (!BTREE_PAGE_LOCATOR_IS_VALID(img, &leaf_item->locator)) + cmp = 1; + else + { + BTREE_PAGE_READ_LEAF_TUPLE(itup, img, &leaf_item->locator); + BTREE_PAGE_READ_LEAF_TUPLE(htup, hImg, &it->undoLoc); + cmp = o_btree_cmp(desc, &itup, BTreeKeyLeafTuple, &htup, BTreeKeyLeafTuple); + if (IT_IS_BACKWARD(it)) + cmp *= -1; /* mirror compare logic */ + } + + if (cmp <= 0) + { + result = o_find_tuple_version(desc, img, + &leaf_item->locator, + &it->oSnapshot, tupleCsn, + it->tupleCxt, + it->fetchCallback, + it->fetchCallbackArg); + + IT_NEXT_OFFSET(it, &leaf_item->locator); + + get_next_combined_location(it); + + if (cmp == 0) + UNDO_IT_NEXT_OFFSET(&it->undoIt, &it->undoLoc); + + if (!O_TUPLE_IS_NULL(result)) + return result; + } + else + { + result = o_find_tuple_version(desc, hImg, + &it->undoLoc, + &it->oSnapshot, tupleCsn, + it->tupleCxt, + it->fetchCallback, + it->fetchCallbackArg); + + UNDO_IT_NEXT_OFFSET(&it->undoIt, &it->undoLoc); + + if (!O_TUPLE_IS_NULL(result)) + return result; + } + } + else + { + result = o_find_tuple_version(desc, context->img, + &leaf_item->locator, + &it->oSnapshot, tupleCsn, + it->tupleCxt, + it->fetchCallback, + it->fetchCallbackArg); + + IT_NEXT_OFFSET(it, &leaf_item->locator); + + if (!O_TUPLE_IS_NULL(result)) + return result; + } + } + + O_TUPLE_SET_NULL(result); + return result; /* unreachable */ +} + +/* + * Check and load the next tree page if needed. 
Works with both normal and undo
+ * pages. Return true on success. False means there is nothing more to read.
+ *
+ * On return with true either the tree page locator points to a valid item
+ * or the undo image still has a tuple belonging to the current page key
+ * range.  it->combinedPage is updated for every newly loaded tree page.
+ */
+static bool
+btree_iterator_check_load_next_page(BTreeIterator *it)
+{
+	OBTreeFindPageContext *context = &it->context;
+	BTreeDescr *desc = context->desc;
+	Page		img = context->img;
+	Page		hImg = it->undoIt.image;
+	OFixedKey	key_buf;
+
+	/* the undo side can still serve the current key range */
+	if (o_btree_interator_can_fetch_from_undo(context->desc, it))
+		return true;
+
+	while (!BTREE_PAGE_LOCATOR_IS_VALID(img, &context->items[context->index].locator))
+	{
+		BTreePageHeader *header;
+		bool		moved;
+
+		if (IS_LAST_PAGE(img, it))
+			return false;
+
+		/* step to the sibling page in scan direction */
+		moved = IT_IS_FORWARD(it) ? find_right_page(context, &key_buf)
+			: find_left_page(context, &key_buf);
+		if (!moved)
+			return false;
+
+		header = (BTreePageHeader *) context->img;
+
+		if (!(it->combinedResult && header->csn >= it->oSnapshot.csn))
+		{
+			/* the new page needs no merging with undo images */
+			it->combinedPage = false;
+		}
+		else
+		{
+			/*
+			 * The undo position is re-established unless we already were on
+			 * a combined page and its undo iterator can still be used.
+			 */
+			if (!it->combinedPage ||
+				!o_btree_interator_can_fetch_from_undo(context->desc, it))
+			{
+				/*
+				 * We can not use the current undo images iterator: find a
+				 * tuple to resume from.
+				 */
+				undo_it_init(&it->undoIt,
+							 header->undoLocation,
+							 &key_buf,
+							 (IT_IS_FORWARD(it) ? BTreeKeyNonLeafKey : BTreeKeyPageHiKey));
+
+				btree_page_search(desc, hImg, (Pointer) &key_buf.tuple,
+								  BTreeKeyNonLeafKey, NULL,
+								  &it->undoLoc);
+				page_locator_find_real_item(hImg, NULL,
+											&it->undoLoc);
+
+				if (IT_IS_BACKWARD(it))
+					BTREE_PAGE_LOCATOR_PREV(hImg, &it->undoLoc);
+			}
+			get_next_combined_location(it);
+			it->combinedPage = true;
+		}
+
+		/* an empty tree page is fine as long as undo has tuples for it */
+		if (can_fetch_from_undo(it))
+			return true;
+	}
+
+	return true;
+}
+
+/*
+ * Can we fetch more pages from undo page image? 
+ */
+static bool
+o_btree_interator_can_fetch_from_undo(BTreeDescr *desc, BTreeIterator *it)
+{
+	Page		img = it->context.img;
+	Page		hImg = it->undoIt.image;
+	BTreePageHeader *header = (BTreePageHeader *) img;
+
+	/*
+	 * Undo items are only merged in while the iterator stands on a combined
+	 * page: everything belonging to the data page key range has to be
+	 * consumed before leaving that data page.
+	 */
+	if (!it->combinedPage)
+		return false;
+
+	Assert(it->combinedResult && header->csn >= it->oSnapshot.csn);
+
+	/*
+	 * While the undo locator is exhausted, try to move on: first to the next
+	 * page of the same undo chain, then to the chain referenced by the
+	 * current data page header.
+	 */
+	while (!BTREE_PAGE_LOCATOR_IS_VALID(hImg, &it->undoLoc))
+	{
+		bool		advanced;
+
+		advanced = undo_it_next_page(it->context.desc, &it->undoIt) ||
+			undo_it_switch(desc, &it->undoIt, header->undoLocation);
+		if (!advanced)
+			break;
+
+		/* start from the edge of the new image matching the scan direction */
+		if (IT_IS_FORWARD(it))
+			BTREE_PAGE_LOCATOR_FIRST(hImg, &it->undoLoc);
+		else
+			BTREE_PAGE_LOCATOR_LAST(hImg, &it->undoLoc);
+	}
+
+	return can_fetch_from_undo(it);
+}
+
+/*
+ * Check if `historicalImg` still contains more tuples corresponding to
+ * the `img` key range. 
+ */
+static bool
+can_fetch_from_undo(BTreeIterator *it)
+{
+	BTreeDescr *desc = it->context.desc;
+	OBTreeFindPageContext *context = &it->context;
+	OTuple		htup;
+	int			cmp;
+
+	/* False if the undo locator has no tuple to fetch */
+	if (!BTREE_PAGE_LOCATOR_IS_VALID(it->undoIt.image, &it->undoLoc))
+		return false;
+
+	/* True if `img` key range is infinity in the required direction */
+	if (IS_LAST_PAGE(context->img, it))
+		return true;
+
+	/* Compare the next undo tuple with the corresponding key range bound */
+	BTREE_PAGE_READ_LEAF_TUPLE(htup, it->undoIt.image, &it->undoLoc);
+	if (IT_IS_FORWARD(it))
+	{
+		OTuple		hikey;
+
+		BTREE_PAGE_GET_HIKEY(hikey, context->img);
+		cmp = o_btree_cmp(desc, &hikey, BTreeKeyNonLeafKey, &htup, BTreeKeyLeafTuple);
+		return cmp > 0;			/* undo tuple is below the page hikey */
+	}
+	else						/* backward iterator case */
+	{
+		OTuple		lokey = btree_find_context_lokey(context);
+
+		cmp = o_btree_cmp(desc, &lokey, BTreeKeyNonLeafKey, &htup, BTreeKeyLeafTuple);
+		return cmp <= 0;		/* undo tuple is not below the page lokey */
+	}
+}
+/*
+ * Common implementation of btree_iterate_raw() and btree_iterate_all().
+ *
+ * Reads the next leaf item directly from the page image in the iterator's
+ * find context, stepping to the right or left sibling page when the current
+ * locator is exhausted.
+ *
+ * If `end` is not NULL and `endKind` is not BTreeKeyNone, the item is compared
+ * with `end`: an item greater than `end` (or equal to it when `endInclude` is
+ * false) sets *scanEnd and yields a NULL tuple.  *scanEnd is also set when the
+ * last page is exhausted.  A NULL tuple with *scanEnd left false is returned
+ * when stepping to the sibling page fails and, if `deleted_as_null` is true,
+ * for an item whose header is not marked BTreeLeafTupleNonDeleted.
+ *
+ * `tupHdr` may be NULL; otherwise it receives the header of the item read.
+ * `hint`, when given, is filled only when a non-NULL tuple is returned.
+ */
+static OTuple
+btree_iterate_raw_internal(BTreeIterator *it, void *end, BTreeKeyType endKind,
+						   bool endInclude, bool *scanEnd,
+						   BTreeLocationHint *hint, bool deleted_as_null,
+						   BTreeLeafTuphdr **tupHdr)
+{
+	BTreeLeafTuphdr *localTupHdr;
+	OBTreeFindPageContext *context = &it->context;
+	Page		img = context->img;
+	OTuple		result;
+	OFixedKey	key_buf;
+
+	if (!tupHdr)
+		tupHdr = &localTupHdr;	/* caller is not interested in the header */
+
+	*scanEnd = false;
+
+	while (true)
+	{
+		BTreePageItemLocator *loc = &context->items[context->index].locator;
+
+		if (BTREE_PAGE_LOCATOR_IS_VALID(img, loc))
+		{
+			BTREE_PAGE_READ_LEAF_ITEM(*tupHdr, result, context->img, loc);
+			IT_NEXT_OFFSET(it, loc);
+
+			if (end != NULL && endKind != BTreeKeyNone)
+			{
+				BTreeDescr *desc = it->context.desc;
+				int			cmp;
+
+				cmp = o_btree_cmp(desc, &result, BTreeKeyLeafTuple, end, endKind);
+				if (cmp > 0 || (cmp == 0 && !endInclude))
+				{
+					*scanEnd = true;
+					O_TUPLE_SET_NULL(result);
+					return result;
+				}
+			}
+
+			if (!deleted_as_null ||
+				(*tupHdr)->deleted == BTreeLeafTupleNonDeleted)
+			{
+				if (hint)
+				{
+					hint->blkno = it->context.items[it->context.index].blkno;
+					hint->pageChangeCount = it->context.items[it->context.index].pageChangeCount;
+				}
+				return result;
+			}
+			else
+			{
+				O_TUPLE_SET_NULL(result);	/* deleted item reported as NULL */
+				return result;
+			}
+		}
+
+		if (IS_LAST_PAGE(img, it))
+		{
+			*scanEnd = true;
+			O_TUPLE_SET_NULL(result);
+			return result;
+		}
+
+		if (IT_IS_FORWARD(it))
+		{
+			if (!find_right_page(context, &key_buf))
+			{
+				O_TUPLE_SET_NULL(result);
+				return result;
+			}
+		}
+		else
+		{
+			if (!find_left_page(context, &key_buf))
+			{
+				O_TUPLE_SET_NULL(result);
+				return result;
+			}
+		}
+	}
+	O_TUPLE_SET_NULL(result);
+	return result;				/* unreachable */
+}
+
+/*
+ * Iterate over leaf page tuples without considering undo log. Deleted tuples
+ * are reported as NULLs. So, the separate `*scanEnd` flag indicates finish of
+ * iterations.
+ */
+OTuple
+btree_iterate_raw(BTreeIterator *it, void *end, BTreeKeyType endKind,
+				  bool endInclude, bool *scanEnd, BTreeLocationHint *hint)
+{
+	return btree_iterate_raw_internal(it, end, endKind, endInclude, scanEnd,
+									  hint, true, NULL);
+}
+
+/*
+ * Iterate over leaf page tuples without considering undo log. Deleted tuples
+ * are also returned.  If `tupHdr` is not NULL, it receives the leaf tuple
+ * header of the returned item.
+ */
+OTuple
+btree_iterate_all(BTreeIterator *it, void *end, BTreeKeyType endKind,
+				  bool endInclude, bool *scanEnd, BTreeLocationHint *hint,
+				  BTreeLeafTuphdr **tupHdr)
+{
+	return btree_iterate_raw_internal(it, end, endKind, endInclude, scanEnd,
+									  hint, false, tupHdr);
+}
+
+/*
+ * Fills basic fields of undo iterator: binds it to the owning tree iterator
+ * and resets it to the "no undo chain loaded" state.
+ */
+static void
+undo_it_create(UndoIterator *undoIt, BTreeIterator *it)
+{
+	Assert(it->scanDir != NoMovementScanDirection);
+
+	undoIt->it = it;
+	undoIt->rightmost = true;
+	undoIt->leftmost = true;
+	undoIt->baseLoc = InvalidUndoLocation;
+	undoIt->imageUndoLoc = InvalidUndoLocation;
+}
+
+/*
+ * Initializes the undo iterator: makes `location` the base of the undo chain
+ * and loads the page image corresponding to the given key.
+ */
+static void
+undo_it_init(UndoIterator *undoIt, UndoLocation location, void *key, BTreeKeyType kind)
+{
+	undoIt->baseLoc = location;
+	undo_it_find_internal(undoIt, key, kind);
+}
+
+/*
+ * Tries to switch to next undo page from the same baseLoc.
+ *
+ * Returns true if a different page image was loaded.  Returns false when no
+ * undo chain is loaded, when the current image is already the outermost one
+ * in scan direction, or when the lookup ends up at the same image location.
+ */
+static bool
+undo_it_next_page(BTreeDescr *desc, UndoIterator *undoIt)
+{
+	BTreeKeyType kind;
+	OFixedKey	key;
+	UndoLocation prevLoc;
+
+	if (!UndoLocationIsValid(undoIt->baseLoc))
+		return false;
+
+	/* Get bound key from the current undo page */
+	if (IT_IS_FORWARD(undoIt->it))
+	{
+		if (undoIt->rightmost)
+			return false;		/* no more pages */
+		copy_fixed_hikey(desc, &key, undoIt->image);
+		kind = BTreeKeyNonLeafKey;
+	}
+	else
+	{
+		if (undoIt->leftmost)
+			return false;		/* no more pages */
+		copy_fixed_key(desc, &key, undoIt->lokey.tuple);
+		kind = BTreeKeyPageHiKey;
+	}
+
+	Assert(!IS_LAST_PAGE(undoIt->image, undoIt->it));
+
+	prevLoc = undoIt->imageUndoLoc;
+
+	/* Find undo page corresponding to the key from the undoIt->baseLoc */
+	undo_it_find_internal(undoIt, &key.tuple, kind);
+
+	/* Did we manage to find another page? */
+	if (prevLoc != undoIt->imageUndoLoc)
+	{
+		return true;
+	}
+	else
+	{
+		Assert(undoIt->rightmost || undoIt->leftmost);
+		return false;
+	}
+}
+
+/*
+ * Tries to switch to the next baseLoc.
+ *
+ * `location` is the undo location of the new chain.  Returns false if it is
+ * invalid, equal to the current baseLoc, or if the current image is already
+ * the last page in scan direction.  Note that baseLoc is updated even in the
+ * latter case.
+ */
+static bool
+undo_it_switch(BTreeDescr *desc, UndoIterator *undoIt, UndoLocation location)
+{
+	bool		is_forward = IT_IS_FORWARD(undoIt->it);
+
+	if (!UndoLocationIsValid(location) || undoIt->baseLoc == location)
+		return false;
+
+	if (!UndoLocationIsValid(undoIt->baseLoc))
+	{
+		BTreeKeyType kind = is_forward ? BTreeKeyNone : BTreeKeyRightmost;
+
+		/* nothing was loaded before: start from the edge of the new chain */
+		undoIt->baseLoc = location;
+		undo_it_find_internal(undoIt, NULL, kind);
+		return true;
+	}
+	/* else we must find next page in undo */
+
+	undoIt->baseLoc = location;
+	if (IS_LAST_PAGE(undoIt->image, undoIt->it))
+	{
+		/* there is no more pages expected */
+		return false;
+	}
+	else
+	{
+		/*
+		 * We need to find the page, which hikey moves to the required
+		 * direction in comparison with previous undo page.
+		 */
+		if (O_PAGE_IS(undoIt->image, RIGHTMOST))
+		{
+			Assert(!is_forward);
+
+			undo_it_find_internal(undoIt, NULL, BTreeKeyRightmost);
+
+			if (O_PAGE_IS(undoIt->image, RIGHTMOST))
+			{
+				/*
+				 * we expect that a loaded rightmost page is equal to the
+				 * previous one, so step further within the new chain
+				 */
+				return undo_it_next_page(desc, undoIt);
+			}
+			return true;
+		}
+		else
+		{
+			OFixedKey	prev_hikey;
+			int			cmp;
+
+			/* copy the previous page hikey */
+			copy_fixed_hikey(desc, &prev_hikey, undoIt->image);
+			undo_it_find_internal(undoIt, &prev_hikey.tuple, BTreeKeyPageHiKey);
+
+			if (O_PAGE_IS(undoIt->image, RIGHTMOST))
+			{
+				cmp = 1;		/* a rightmost page is above any hikey */
+			}
+			else
+			{
+				OTuple		image_hikey;
+
+				BTREE_PAGE_GET_HIKEY(image_hikey, undoIt->image);
+				cmp = o_btree_cmp(desc, &image_hikey, BTreeKeyNonLeafKey,
+								  &prev_hikey.tuple, BTreeKeyNonLeafKey);
+			}
+
+			/* same hikey as before: no progress yet, step to the next page */
+			if (cmp == 0)
+				return undo_it_next_page(desc, undoIt);
+
+			/* in forward case we must load next page */
+			Assert(!is_forward || cmp > 0);
+			/* in backward case we must load previous page */
+			Assert(is_forward || cmp < 0);
+
+			return true;
+		}
+	}
+}
+
+/*
+ * Find in undo log page corresponding to the given key.
+ *
+ * Starting from undoIt->baseLoc, follows the chain of page-level undo items
+ * (via the undoLocation stored in each loaded image header) while the loaded
+ * image has a normal csn that is not less than the iterator snapshot csn.
+ * The image is left in undoIt->image; the leftmost/rightmost flags and
+ * imageUndoLoc are updated along the way.
+ */
+static void
+undo_it_find_internal(UndoIterator *undoIt, void *key, BTreeKeyType kind)
+{
+	BTreePageHeader *header;
+	CommitSeqNo rec_csn;
+	BTreeDescr *desc = undoIt->it->context.desc;
+	UndoLogType undoType PG_USED_FOR_ASSERTS_ONLY = GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType);
+	UndoLocation rec_undo_loc,
+				undoLocation;
+	bool		left,
+				right;
+
+	undoLocation = undoIt->baseLoc;
+	undoIt->leftmost = true;
+	undoIt->rightmost = true;
+
+	while (true)
+	{
+		/*
+		 * Load the next page item from page-level undo item.  The lokey is
+		 * requested only for non-forward scans.
+		 */
+		if (undoIt->it->scanDir == ForwardScanDirection)
+			get_page_from_undo(desc, undoLocation, key, kind,
+							   undoIt->image, &left, &right, NULL, NULL, NULL);
+		else
+			get_page_from_undo(desc, undoLocation, key, kind,
+							   undoIt->image, &left, &right, &undoIt->lokey, NULL, NULL);
+
+		undoIt->rightmost = (undoIt->rightmost && right) || O_PAGE_IS(undoIt->image, RIGHTMOST);
+		undoIt->leftmost = (undoIt->leftmost && left) || O_PAGE_IS(undoIt->image, LEFTMOST);
+		undoIt->imageUndoLoc = O_UNDO_GET_IMAGE_LOCATION(undoLocation, left);
+		Assert(UNDO_REC_EXISTS(undoType, undoLocation));
+
+		header = (BTreePageHeader *) undoIt->image;
+		rec_csn = header->csn;
+		rec_undo_loc = header->undoLocation;
+
+		/* Check if we need to visit next page-level undo item */
+		if (COMMITSEQNO_IS_NORMAL(rec_csn) && rec_csn >= undoIt->it->oSnapshot.csn)
+		{
+			undoLocation = rec_undo_loc;
+			continue;
+		}
+		else
+		{
+			break;
+		}
+	}
+
+	/* if O_PAGE_IS(undoIt->image, RIGHTMOST) then undoIt->rightmost == true */
+	Assert(!O_PAGE_IS(undoIt->image, RIGHTMOST) || undoIt->rightmost);
+	/* if O_PAGE_IS(undoIt->image, LEFTMOST) then undoIt->leftmost == true */
+	Assert(!O_PAGE_IS(undoIt->image, LEFTMOST) || undoIt->leftmost);
+}
diff --git a/contrib/orioledb/src/btree/merge.c b/contrib/orioledb/src/btree/merge.c
new file mode 100644
index 00000000000..04f44870334
--- /dev/null
+++ b/contrib/orioledb/src/btree/merge.c @@ -0,0 +1,727 @@ +/*------------------------------------------------------------------------- + * + * merge.c + * Routines for implementation of B-tree pages merge. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/src/btree/merge.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/find.h" +#include "btree/io.h" +#include "btree/merge.h" +#include "btree/page_chunks.h" +#include "btree/undo.h" +#include "checkpoint/checkpoint.h" +#include "utils/page_pool.h" +#include "transam/undo.h" + +#include "miscadmin.h" + +/* + * If the ratio of free to total space on a leaf page is greater than the value + * then we will try to merge the node page. + */ +#define O_MERGE_LEAF_FREE_RATIO (0.7) +/* + * If the ratio of free to total space on a node page is greater than the value + * then we will try to merge the node page. + */ +#define O_MERGE_NODE_FREE_RATIO (0.7) + +static bool can_be_merged(BTreeDescr *desc, Page left, Page right, + CommitSeqNo csn); +static void merge_pages(BTreeDescr *desc, OInMemoryBlkno left_blkno, + Page right, CommitSeqNo csn); + + +/* + * Try to merge right page to the left page. Returns true iff succeed. + * + * On success, all pages are unlocked. On failure, all locks are held. 
+ */ +bool +btree_try_merge_pages(BTreeDescr *desc, + OInMemoryBlkno parent_blkno, OFixedKey *parent_hikey, + bool *merge_parent, + OInMemoryBlkno left_blkno, + BTreePageItemLocator *right_loc, + OInMemoryBlkno right_blkno, + bool checkpoint) +{ + Page parent = O_GET_IN_MEMORY_PAGE(parent_blkno), + left = O_GET_IN_MEMORY_PAGE(left_blkno), + right = O_GET_IN_MEMORY_PAGE(right_blkno); + OrioleDBPageDesc *right_desc; + BTreePageHeader *left_header = (BTreePageHeader *) left; + FileExtent right_extent; + CommitSeqNo csn; + UndoLocation undo_loc; + uint32 checkpoint_number; + bool copy_blkno; + bool needsUndo; + int level = PAGE_GET_LEVEL(right); + + if (RightLinkIsValid(BTREE_PAGE_GET_RIGHTLINK(right))) + { + /* concurrent split in progress */ + return false; + } + + if (!get_checkpoint_number(desc, right_blkno, + &checkpoint_number, ©_blkno)) + { + /* + * page is concurrent to in progress checkpoint and can not be merged + */ + return false; + } + + needsUndo = O_PAGE_IS(left, LEAF) && desc->undoType != UndoLogNone; + if (needsUndo && OXidIsValid(desc->createOxid) && + !XACT_INFO_IS_FINISHED(desc->createOxid)) + needsUndo = false; + + if (needsUndo) + csn = pg_atomic_fetch_add_u64(&TRANSAM_VARIABLES->nextCommitSeqNo, 1); + else + csn = COMMITSEQNO_INPROGRESS; + + if (!can_be_merged(desc, left, right, csn)) + { + return false; + } + + /* all checks are done, errors do not expected after this line */ + START_CRIT_SECTION(); + + /* deletes downlink to right page from the parent node */ + page_block_reads(parent_blkno); + + page_locator_delete_item(parent, right_loc); + MARK_DIRTY_EXTENDED(desc, parent_blkno, checkpoint); + + /* unlocks the parent page */ + if (*merge_parent && is_page_too_sparse(desc, parent)) + { + /* + * We can try to merge thr parent page in the loop. No undo is + * required for non-leaf pages. 
+ */ + if (!O_PAGE_IS(parent, RIGHTMOST)) + copy_fixed_hikey(desc, parent_hikey, parent); + else + O_TUPLE_SET_NULL(parent_hikey->tuple); + unlock_page(parent_blkno); + } + else + { + /* no need to merge parent page */ + unlock_page(parent_blkno); + parent_blkno = OInvalidInMemoryBlkno; + *merge_parent = false; + } + + /* Make a page-level undo item if needed */ + if (needsUndo) + { + undo_loc = make_merge_undo_image(desc, left, right, csn); + Assert(UndoLocationIsValid(undo_loc)); + + /* + * Memory barrier between making undo image and setting the undo + * location. + */ + pg_write_barrier(); + } + else + { + undo_loc = InvalidUndoLocation; + } + + /* + * Merge the pages and remove rightlink to the right page. + * + * It contains the required memory barrier between making undo image and + * setting the undo location. + */ + merge_pages(desc, left_blkno, right, csn); + btree_page_update_max_key_len(desc, left); + MARK_DIRTY_EXTENDED(desc, left_blkno, checkpoint); + + /* the right page can not be found in B-Tree after this line */ + + left_header->undoLocation = undo_loc; + + /* + * Memory barrier between write undo location and csn. See comment in the + * o_btree_read_page() for details. 
+ */ + pg_write_barrier(); + left_header->csn = csn; + + Assert(checkpoint_state->stack[level].hikeyBlkno != left_blkno); + if (checkpoint_state->stack[level].hikeyBlkno == right_blkno) + checkpoint_state->stack[level].hikeyBlkno = left_blkno; + unlock_page(left_blkno); + left_blkno = OInvalidInMemoryBlkno; + + right_desc = O_GET_IN_MEMORY_PAGEDESC(right_blkno); + right_extent = right_desc->fileExtent; + + CLEAN_DIRTY(desc->ppool, right_blkno); + O_PAGE_CHANGE_COUNT_INC(right); + + ppool_free_page(desc->ppool, right_blkno, true); + + if (O_PAGE_IS(left, LEAF)) + pg_atomic_fetch_sub_u32(&BTREE_GET_META(desc)->leafPagesNum, 1); + + END_CRIT_SECTION(); + + if (FileExtentIsValid(right_extent)) + { + free_extent_for_checkpoint(desc, &right_extent, + checkpoint_number); + } + + return true; +} + + +/* + * Returns true if page is successfully merged to the left or to the right. + */ +bool +btree_try_merge_and_unlock(BTreeDescr *desc, OInMemoryBlkno blkno, + bool nested, bool wait_io) +{ + BTreePageItemLocator target_loc, + left_loc, + right_loc; + Page target = O_GET_IN_MEMORY_PAGE(blkno), + parent, + right, + left; + OFixedKey key; + int level; + OBTreeFindPageContext find_context; + OInMemoryBlkno parent_blkno, + target_blkno = OInvalidInMemoryBlkno, + right_blkno, + left_blkno; + uint32 parent_change_count; + bool success = false; + bool needsUndo = desc->undoType != UndoLogNone; + + /* + * Reserve the required undo size. We are holding the page lock, so we + * can only do this with 'wait == false'. 
+ */ + if (needsUndo && !reserve_undo_size_extended(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType), + 2 * O_MERGE_UNDO_IMAGE_SIZE, + false)) + { + /* unable to reserve undo location, no opportunity to resume */ + unlock_page(blkno); + Assert(!have_locked_pages()); + return false; + } + + /* Step 1: get all the information from the parent page */ + level = PAGE_GET_LEVEL(target); + + Assert(page_is_locked(blkno) || O_PAGE_IS_LOCAL(blkno)); + Assert(desc->rootInfo.rootPageBlkno != blkno); + Assert(is_page_too_sparse(desc, target)); + Assert(!O_PAGE_IS(target, LEFTMOST) || !O_PAGE_IS(target, RIGHTMOST)); + Assert(!RightLinkIsValid(BTREE_PAGE_GET_RIGHTLINK(target))); + + page_block_reads(blkno); + + /* copy hikey of current page */ + if (!O_PAGE_IS(target, RIGHTMOST)) + copy_fixed_hikey(desc, &key, target); + else + O_TUPLE_SET_NULL(key.tuple); + + /* unlock current page */ + unlock_page(blkno); + + /* + * Step 2: refind the parent. We did release the target lock first: locks + * shouldn't go bottom-up. 
+ */ + init_page_find_context(&find_context, desc, + COMMITSEQNO_INPROGRESS, + BTREE_PAGE_FIND_MODIFY | + BTREE_PAGE_FIND_NO_FIX_SPLIT | + BTREE_PAGE_FIND_DOWNLINK_LOCATION); + + /* get a full find context for parent page and lock it */ + if (!O_TUPLE_IS_NULL(key.tuple)) + find_page(&find_context, &key.tuple, BTreeKeyPageHiKey, level + 1); + else + find_page(&find_context, NULL, BTreeKeyRightmost, level + 1); + parent_blkno = find_context.items[find_context.index].blkno; + parent_change_count = find_context.items[find_context.index].pageChangeCount; + + while (true) + { + BTreeNonLeafTuphdr *target_tuph, + *right_tuph, + *left_tuph; + bool merge_parent, + merged = false; + + if (!page_is_locked(parent_blkno)) + { + OFindPageResult result PG_USED_FOR_ASSERTS_ONLY; + + /* refind parent page if needed */ + if (!O_TUPLE_IS_NULL(key.tuple)) + result = refind_page(&find_context, &key.tuple, + BTreeKeyPageHiKey, level + 1, + parent_blkno, parent_change_count); + else + result = refind_page(&find_context, NULL, BTreeKeyRightmost, + level + 1, + parent_blkno, parent_change_count); + Assert(result == OFindPageResultSuccess); + } + + /* Step 3: do all the checks with parent and target */ + parent_change_count = find_context.items[find_context.index].pageChangeCount; + parent_blkno = find_context.items[find_context.index].blkno; + Assert(page_is_locked(parent_blkno) || O_PAGE_IS_LOCAL(parent_blkno)); + parent = O_GET_IN_MEMORY_PAGE(parent_blkno); + + target_loc = find_context.items[find_context.index].locator; + target_tuph = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(parent, &target_loc); + + if (!DOWNLINK_IS_IN_MEMORY(target_tuph->downlink)) + { + /* + * Page with O_BTREE_FLAG_UNDER_MERGE can not be evicted. But it + * can be split or merged and evicted. 
+ */ + unlock_page(parent_blkno); + break; + } + + target_blkno = DOWNLINK_GET_IN_MEMORY_BLKNO(target_tuph->downlink); + + /* all ok, lock target page */ + lock_page(target_blkno); + target = O_GET_IN_MEMORY_PAGE(target_blkno); + Assert((level == 0) == O_PAGE_IS(target, LEAF)); + + if (BTREE_PAGE_ITEMS_COUNT(parent) == 1 || + RightLinkIsValid(BTREE_PAGE_GET_RIGHTLINK(target))) + { + /* + * The target page is a single child of parent node or concurrent + * split in progress. + */ + unlock_page(parent_blkno); + unlock_page(target_blkno); + break; + } + + if (page_is_under_checkpoint(desc, parent_blkno, true) + || (level > 0 && page_is_under_checkpoint(desc, target_blkno, true))) + { + /* pages merge is concurrent to in progress checkpoint */ + unlock_page(parent_blkno); + unlock_page(target_blkno); + break; + } + + merge_parent = (nested && find_context.index > 0); + + /* + * Step 4: try to merge to the right. On success, all page lock are + * released. On failure, target and parent page locks are held. 
+ */ + if (BTREE_PAGE_LOCATOR_GET_OFFSET(parent, &target_loc) + 1 < + BTREE_PAGE_ITEMS_COUNT(parent)) + { + right_loc = target_loc; + BTREE_PAGE_LOCATOR_NEXT(parent, &right_loc); + right_tuph = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(parent, &right_loc); + if (DOWNLINK_IS_IN_IO(right_tuph->downlink) && wait_io) + { + /* Wait till IO completion and retry */ + uint32 io_num = DOWNLINK_GET_IO_LOCKNUM(right_tuph->downlink); + + unlock_page(parent_blkno); + unlock_page(target_blkno); + target_blkno = OInvalidInMemoryBlkno; + wait_for_io_completion(io_num); + continue; + } + else if (DOWNLINK_IS_IN_MEMORY(right_tuph->downlink)) + { + int io_num; + + Assert(DOWNLINK_IS_IN_MEMORY(right_tuph->downlink)); + right_blkno = DOWNLINK_GET_IN_MEMORY_BLKNO(right_tuph->downlink); + lock_page(right_blkno); + right = O_GET_IN_MEMORY_PAGE(right_blkno); + + io_num = O_GET_IN_MEMORY_PAGEDESC(right_blkno)->ionum; + if (io_num >= 0 && wait_io) + { + unlock_page(parent_blkno); + unlock_page(target_blkno); + unlock_page(right_blkno); + target_blkno = OInvalidInMemoryBlkno; + wait_for_io_completion(io_num); + continue; + } + + if (!O_PAGE_IS(right, PRE_CLEANUP) && + !RightLinkIsValid(BTREE_PAGE_GET_RIGHTLINK(right)) && + !page_is_under_checkpoint(desc, right_blkno, true) && + io_num < 0) + { + merged = btree_try_merge_pages(desc, parent_blkno, &key, + &merge_parent, target_blkno, + &right_loc, right_blkno, + false); + if (!merged) + unlock_page(right_blkno); + } + else + { + merged = false; + unlock_page(right_blkno); + } + } + } + + /* + * Step 5: try to merge to the left. On success, all page lock are + * released. On failure, target and parent page locks are held. 
+ */ + if (!merged && BTREE_PAGE_LOCATOR_GET_OFFSET(parent, &target_loc) > 0) + { + left_loc = target_loc; + BTREE_PAGE_LOCATOR_PREV(parent, &left_loc); + left_tuph = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(parent, &left_loc); + if (DOWNLINK_IS_IN_IO(left_tuph->downlink) && wait_io) + { + /* Wait till IO completion and retry */ + uint32 io_num = DOWNLINK_GET_IO_LOCKNUM(left_tuph->downlink); + + unlock_page(parent_blkno); + unlock_page(target_blkno); + target_blkno = OInvalidInMemoryBlkno; + wait_for_io_completion(io_num); + continue; + } + else if (DOWNLINK_IS_IN_MEMORY(left_tuph->downlink)) + { + int io_num; + + Assert(DOWNLINK_IS_IN_MEMORY(left_tuph->downlink)); + left_blkno = DOWNLINK_GET_IN_MEMORY_BLKNO(left_tuph->downlink); + + /* + * Lock order right => left is dangerous for deadlocks. So, + * we don't wait here, but just optimistically try to lock. + * Other lock waiters would need some time to wake-up and grab + * the lock. So, give up and give them a chance. + */ + if (!try_lock_page(left_blkno)) + { + unlock_page(parent_blkno); + unlock_page(target_blkno); + target_blkno = OInvalidInMemoryBlkno; + break; + } + left = O_GET_IN_MEMORY_PAGE(left_blkno); + + io_num = O_GET_IN_MEMORY_PAGEDESC(target_blkno)->ionum; + if (io_num >= 0 && wait_io) + { + unlock_page(parent_blkno); + unlock_page(left_blkno); + unlock_page(target_blkno); + target_blkno = OInvalidInMemoryBlkno; + wait_for_io_completion(io_num); + continue; + } + + if (!O_PAGE_IS(left, PRE_CLEANUP) && + !RightLinkIsValid(BTREE_PAGE_GET_RIGHTLINK(left)) && + !page_is_under_checkpoint(desc, left_blkno, true) && + io_num < 0) + { + merged = btree_try_merge_pages(desc, parent_blkno, + &key, &merge_parent, + left_blkno, + &target_loc, target_blkno, + false); + if (!merged) + unlock_page(left_blkno); + } + else + { + merged = false; + unlock_page(left_blkno); + } + } + } + + if (!merged) + { + unlock_page(parent_blkno); + unlock_page(target_blkno); + break; + } + else + { + success = true; + } + + 
if (merge_parent) + { + blkno = parent_blkno; + find_context.index--; + parent_blkno = find_context.items[find_context.index].blkno; + parent_change_count = find_context.items[find_context.index].pageChangeCount; + level++; + } + else + { + break; + } + /* else we will try to merge the parent page in the loop */ + } + + if (needsUndo) + release_undo_size(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType)); + + Assert(!have_locked_pages()); + return success; +} + +/* + * Checks is pages can be merged. + */ +static bool +can_be_merged(BTreeDescr *desc, Page left, Page right, CommitSeqNo csn) +{ + LocationIndex space_free, + space_needed; + bool is_leaf = O_PAGE_IS(left, LEAF); + + Assert(O_PAGE_IS(left, LEAF) == O_PAGE_IS(right, LEAF)); + Assert(!O_PAGE_IS(left, RIGHTMOST)); + + space_free = BTREE_PAGE_FREE_SPACE(left); + space_needed = ORIOLEDB_BLCKSZ - BTREE_PAGE_FREE_SPACE(right); + + /* we can not compact a node */ + if (!is_leaf) + return space_free >= space_needed; + + /* no need to compact page */ + if (space_free >= space_needed) + return true; + + /* we can merge pages after the pages compaction */ + if (space_free + PAGE_GET_N_VACATED(left) + + PAGE_GET_N_VACATED(right) < space_needed) + return false; + + if (space_free + page_get_vacated_space(desc, left, csn) + + page_get_vacated_space(desc, right, csn) >= space_needed) + return true; + + /* we can not merge this pages */ + return false; +} + +/* + * Merges pages and writes result to the left page. 
+ */
+static void
+merge_pages(BTreeDescr *desc, OInMemoryBlkno left_blkno,
+			Page right, CommitSeqNo csn)
+{
+	Page		left = O_GET_IN_MEMORY_PAGE(left_blkno);
+	BTreePageHeader *left_header = (BTreePageHeader *) left;
+	BTreePageHeader *right_header = (BTreePageHeader *) right;
+	bool		leaf = O_PAGE_IS(left, LEAF);
+	BTreePageItem items[BTREE_PAGE_MAX_CHUNK_ITEMS];
+	char		newItem[Max(BTreeLeafTuphdrSize, BTreeNonLeafTuphdrSize) + O_BTREE_MAX_TUPLE_SIZE];
+	BTreePageItemLocator loc;
+	OTuple		leftHikey;
+	OTuple		rightHikey;
+	LocationIndex leftHikeySize;
+	LocationIndex rightHikeySize;
+	int			i = 0;
+
+	Assert(O_PAGE_IS(left, LEAF) == O_PAGE_IS(right, LEAF));
+	Assert(!O_PAGE_IS(left, RIGHTMOST));
+
+	/* the hikey of the right page becomes the hikey of the merged page */
+	leftHikeySize = BTREE_PAGE_GET_HIKEY_SIZE(left);
+	BTREE_PAGE_GET_HIKEY(leftHikey, left);
+	if (!O_PAGE_IS(right, RIGHTMOST))
+	{
+		rightHikeySize = BTREE_PAGE_GET_HIKEY_SIZE(right);
+		BTREE_PAGE_GET_HIKEY(rightHikey, right);
+	}
+	else
+	{
+		rightHikeySize = 0;
+		O_TUPLE_SET_NULL(rightHikey);
+	}
+
+	if (leaf)
+	{
+		/*
+		 * Leaf pages: collect items of the left page, then of the right one.
+		 * Deleted tuples of transactions finished for everybody are dropped
+		 * when csn is in-progress or their csn precedes it.
+		 */
+		BTREE_PAGE_FOREACH_ITEMS(left, &loc)
+		{
+			BTreeLeafTuphdr *tupHdr;
+			OTuple		tup;
+			bool		finished;
+
+			BTREE_PAGE_READ_LEAF_ITEM(tupHdr, tup, left, &loc);
+			finished = XACT_INFO_FINISHED_FOR_EVERYBODY(tupHdr->xactInfo);
+
+			if (finished && tupHdr->deleted &&
+				(COMMITSEQNO_IS_INPROGRESS(csn) ||
+				 XACT_INFO_MAP_CSN(tupHdr->xactInfo) < csn))
+				continue;
+
+			items[i].data = (Pointer) tupHdr;
+			items[i].flags = tup.formatFlags;
+			/* finished tuples are stored with their exact aligned length */
+			items[i].size = finished ?
+				(BTreeLeafTuphdrSize + MAXALIGN(o_btree_len(desc, tup, OTupleLength))) :
+				BTREE_PAGE_GET_ITEM_SIZE(left, &loc);
+			i++;
+		}
+
+		BTREE_PAGE_FOREACH_ITEMS(right, &loc)
+		{
+			BTreeLeafTuphdr *tupHdr;
+			OTuple		tup;
+			bool		finished;
+
+			BTREE_PAGE_READ_LEAF_ITEM(tupHdr, tup, right, &loc);
+			finished = XACT_INFO_FINISHED_FOR_EVERYBODY(tupHdr->xactInfo);
+
+			if (finished && tupHdr->deleted &&
+				(COMMITSEQNO_IS_INPROGRESS(csn) ||
+				 XACT_INFO_MAP_CSN(tupHdr->xactInfo) < csn))
+				continue;
+
+			items[i].data = (Pointer) tupHdr;
+			items[i].flags = tup.formatFlags;
+			items[i].size = finished ?
+				(BTreeLeafTuphdrSize + MAXALIGN(o_btree_len(desc, tup, OTupleLength))) :
+				BTREE_PAGE_GET_ITEM_SIZE(right, &loc);
+			i++;
+		}
+	}
+	else
+	{
+		bool		first = true;
+
+		/* Non-leaf pages: items of the left page are taken as they are. */
+		BTREE_PAGE_FOREACH_ITEMS(left, &loc)
+		{
+			items[i].data = BTREE_PAGE_LOCATOR_GET_ITEM(left, &loc);
+			items[i].flags = BTREE_PAGE_GET_ITEM_FLAGS(left, &loc);
+			items[i].size = BTREE_PAGE_GET_ITEM_SIZE(left, &loc);
+			i++;
+		}
+
+		BTREE_PAGE_FOREACH_ITEMS(right, &loc)
+		{
+			if (first)
+			{
+				/*
+				 * The first item of the right page is rebuilt in newItem:
+				 * its non-leaf header followed by the left page hikey as
+				 * the key.
+				 */
+				first = false;
+				memcpy(newItem,
+					   BTREE_PAGE_LOCATOR_GET_ITEM(right, &loc),
+					   BTreeNonLeafTuphdrSize);
+				memcpy(&newItem[BTreeNonLeafTuphdrSize],
+					   leftHikey.data,
+					   leftHikeySize);
+				items[i].data = newItem;
+				items[i].flags = leftHikey.formatFlags;
+				items[i].size = MAXALIGN(BTreeNonLeafTuphdrSize + leftHikeySize);
+			}
+			else
+			{
+				items[i].data = BTREE_PAGE_LOCATOR_GET_ITEM(right, &loc);
+				items[i].flags = BTREE_PAGE_GET_ITEM_FLAGS(right, &loc);
+				items[i].size = BTREE_PAGE_GET_ITEM_SIZE(right, &loc);
+			}
+			i++;
+		}
+	}
+
+	page_block_reads(left_blkno);
+
+	/* the merged page carries the flags of both source pages */
+	left_header->flags |= right_header->flags;
+
+	btree_page_reorg(desc, left, items, i, rightHikeySize, rightHikey);
+
+	o_btree_page_calculate_statistics(desc, left);
+
+	left_header->rightLink = InvalidRightLink;
+	left_header->prevInsertOffset = InvalidOffsetNumber;
+}
+
+/*
+ * Returns true if page is too sparse and we can try to 
merge it. + */ +bool +is_page_too_sparse(BTreeDescr *desc, Page p) +{ + LocationIndex space_free; + + /* we can not merge rootPageBlkno page */ + if (O_PAGE_IS(p, RIGHTMOST) && O_PAGE_IS(p, LEFTMOST)) + return false; + + /* page should not be under split */ + if (RightLinkIsValid(BTREE_PAGE_GET_RIGHTLINK(p))) + return false; + + if (O_PAGE_IS(p, LEAF)) + { + /* if leaf have no items */ + if (BTREE_PAGE_ITEMS_COUNT(p) == 0) + return true; + + space_free = BTREE_PAGE_FREE_SPACE(p) + PAGE_GET_N_VACATED(p); + if (((double) space_free / ORIOLEDB_BLCKSZ) < O_MERGE_LEAF_FREE_RATIO) + return false; + + space_free = BTREE_PAGE_FREE_SPACE(p) + page_get_vacated_space(desc, p, 0); + return ((double) space_free / ORIOLEDB_BLCKSZ) >= O_MERGE_LEAF_FREE_RATIO; + } + else + { + /* if node have only one downlink */ + if (BTREE_PAGE_ITEMS_COUNT(p) == 1) + return true; + + space_free = BTREE_PAGE_FREE_SPACE(p); + return ((double) space_free / ORIOLEDB_BLCKSZ) >= O_MERGE_NODE_FREE_RATIO; + } +} diff --git a/contrib/orioledb/src/btree/modify.c b/contrib/orioledb/src/btree/modify.c new file mode 100644 index 00000000000..56a41a11b9a --- /dev/null +++ b/contrib/orioledb/src/btree/modify.c @@ -0,0 +1,1640 @@ +/*------------------------------------------------------------------------- + * + * modify.c + * Routines for OrioleDB B-tree modification. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/src/btree/modify.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/find.h" +#include "btree/insert.h" +#include "btree/io.h" +#include "btree/merge.h" +#include "btree/modify.h" +#include "btree/page_chunks.h" +#include "btree/undo.h" +#include "catalog/o_tables.h" +#include "recovery/recovery.h" +#include "recovery/wal.h" +#include "transam/undo.h" +#include "transam/oxid.h" +#include "utils/page_pool.h" +#include "utils/stopevent.h" + +#include "miscadmin.h" + +#define IsRelationTree(desc) (ORelOidsIsValid(desc->oids) && !IS_SYS_TREE_OIDS(desc->oids)) + +/* + * Context for o_btree_modify_internal() + */ +typedef struct +{ + OBTreeFindPageContext *pageFindContext; + OTuple tuple; + BTreeKeyType tupleType; + BTreeLeafTuphdr leafTuphdr; + BTreeLeafTuphdr conflictTupHdr; + bool replace; + UndoLocation conflictUndoLocation; + OXid opOxid; + CommitSeqNo opCsn; + RowLockMode lockMode; + LOCKTAG hwLockTag; + LOCKMODE hwLockMode; + bool needsUndo; + int pageReserveKind; + int cmp; + BTreeModifyLockStatus lockStatus; + bool pagesAreReserved; + bool undoIsReserved; + BTreeOperationType action; + Pointer key; + BTreeKeyType keyType; + UndoLocation savepointUndoLocation; + BTreeModifyCallbackInfo *callbackInfo; +} BTreeModifyInternalContext; + +typedef enum ConflictResolution +{ + ConflictResolutionOK, + ConflictResolutionRetry, + ConflictResolutionFound +} ConflictResolution; + +BTreeModifyCallbackInfo nullCallbackInfo = +{ + .waitCallback = NULL, + .modifyCallback = NULL, + .modifyDeletedCallback = NULL, + .needsUndoForSelfCreated = false, + .arg = NULL +}; + +static const LOCKMODE hwLockModes[] = {AccessShareLock, RowShareLock, ExclusiveLock, AccessExclusiveLock}; + +static void unlock_release(BTreeModifyInternalContext *context, bool unlock); +static ConflictResolution 
o_btree_modify_handle_conflicts(BTreeModifyInternalContext *context); +static OBTreeModifyResult o_btree_modify_handle_tuple_not_found(BTreeModifyInternalContext *context); +static bool o_btree_modify_item_rollback(BTreeModifyInternalContext *context); +static void o_btree_modify_insert_update(BTreeModifyInternalContext *context); +static void o_btree_modify_add_undo_record(BTreeModifyInternalContext *context); +static OBTreeModifyResult o_btree_modify_delete(BTreeModifyInternalContext *context); +static OBTreeModifyResult o_btree_modify_lock(BTreeModifyInternalContext *context); +static Jsonb *prepare_modify_start_params(BTreeDescr *desc); +static OBTreeModifyResult o_btree_normal_modify(BTreeDescr *desc, + BTreeOperationType action, + OTuple tuple, BTreeKeyType tupleType, + Pointer key, BTreeKeyType keyType, + OXid opOxid, + CommitSeqNo opCsn, + RowLockMode lockMode, + BTreeLocationHint *hint, + BTreeLeafTupleDeletedStatus deleted, + BTreeModifyCallbackInfo *callbackInfo); + +/* + * Perform modification of btree leaf tuple, when page is already located + * and locked, all reservations are done. + * + * key/keyType are compared against the tuple at the located position. + * deleted becomes the deleted status of the written tuple header and is + * asserted to be BTreeLeafTupleNonDeleted unless action is + * BTreeOperationDelete. callbackInfo must not be NULL (it is asserted). + * + * Returns the OBTreeModifyResult that describes what happened to the tuple. + */ +static OBTreeModifyResult +o_btree_modify_internal(OBTreeFindPageContext *pageFindContext, + BTreeOperationType action, + OTuple _tuple, BTreeKeyType tupleType, + Pointer key, BTreeKeyType keyType, + OXid opOxid, CommitSeqNo opCsn, + RowLockMode _lockMode, + BTreeLeafTupleDeletedStatus deleted, + int pageReserveKind, + BTreeModifyCallbackInfo *callbackInfo) +{ + BTreeDescr *desc = pageFindContext->desc; + Page page; + BTreePageItemLocator loc; + OInMemoryBlkno blkno; + OBTreeModifyResult result = OBTreeModifyResultInserted; + OTuple curTuple; + BTreeLeafTuphdr *tuphdr; + BTreeModifyInternalContext context; + OXid tupleOxid = OXidIsValid(opOxid) ? 
opOxid : BootstrapTransactionId; + + context.tuple = _tuple; + context.tupleType = tupleType; + context.pageFindContext = pageFindContext; + context.replace = false; + context.opOxid = opOxid; + context.opCsn = opCsn; + context.lockMode = _lockMode; + context.hwLockMode = NoLock; + context.lockStatus = BTreeModifyNoLock; + context.action = action; + context.key = key; + context.keyType = keyType; + context.savepointUndoLocation = get_subxact_undo_location(desc->undoType); + context.pageReserveKind = pageReserveKind; + context.callbackInfo = callbackInfo; + + Assert(callbackInfo); + Assert((action != BTreeOperationInsert) || (tupleType == BTreeKeyLeafTuple)); + Assert((action == BTreeOperationLock) || (context.lockMode >= RowLockNoKeyUpdate)); + Assert((deleted == BTreeLeafTupleNonDeleted) || (action == BTreeOperationDelete)); + /* matches o_btree_normal_modify(), which reserves pool pages for every action except delete */ + context.pagesAreReserved = (action != BTreeOperationDelete); + context.undoIsReserved = (desc->undoType != UndoLogNone); + + /* Undo should be reserved for transactional operations */ + Assert(OXidIsValid(opOxid) == context.undoIsReserved); + +retry: + /* Per-attempt state. Undo is skipped when the tree was created by this very transaction, no savepoint undo location is set and the callbacks do not ask for undo on self-created trees. */ + context.needsUndo = desc->undoType != UndoLogNone; + if (!(callbackInfo && callbackInfo->needsUndoForSelfCreated) && + OXidIsValid(desc->createOxid) && + desc->createOxid == opOxid && + !UndoLocationIsValid(context.savepointUndoLocation)) + context.needsUndo = false; + context.leafTuphdr.deleted = deleted; + context.leafTuphdr.undoLocation = InvalidUndoLocation; + context.leafTuphdr.formatFlags = 0; + context.leafTuphdr.chainHasLocks = false; + context.leafTuphdr.xactInfo = OXID_GET_XACT_INFO(tupleOxid, context.lockMode, false); + + blkno = pageFindContext->items[pageFindContext->index].blkno; + loc = pageFindContext->items[pageFindContext->index].locator; + page = O_GET_IN_MEMORY_PAGE(blkno); + Assert(page_is_locked(blkno) || O_PAGE_IS_LOCAL(blkno)); + /* locator past the last item: there is no tuple to compare with */ + if (!BTREE_PAGE_LOCATOR_IS_VALID(page, &loc)) + return o_btree_modify_handle_tuple_not_found(&context); + + 
BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, page, &loc); + Assert(tuphdr != NULL); + context.cmp = o_btree_cmp(desc, key, keyType, &curTuple, BTreeKeyLeafTuple); + + /* Trees without undo cannot have row locks */ + if (desc->undoType == UndoLogNone) + { + context.conflictTupHdr = *tuphdr; + context.conflictUndoLocation = InvalidUndoLocation; + } + else if (context.cmp == 0) + { + ConflictResolution resolution; + + resolution = o_btree_modify_handle_conflicts(&context); + /* NOTE(review): the Found path returns without unlock_release(); o_btree_modify_handle_conflicts() has already unlocked the page, but the undo and page reservations and any heavyweight lock are not released here. Confirm this is handled elsewhere. */ + if (resolution == ConflictResolutionFound) + return OBTreeModifyResultFound; + else if (resolution == ConflictResolutionRetry) + goto retry; + } + + Assert(page_is_locked(blkno) || O_PAGE_IS_LOCAL(blkno)); + /* cmp is also forced to -1 by o_btree_modify_handle_conflicts() when its rollback reports false */ + if (context.cmp != 0) + return o_btree_modify_handle_tuple_not_found(&context); + + if (tuphdr->deleted == BTreeLeafTupleNonDeleted) + { + /* Existing (non-deleted) tuple is found */ + OBTreeModifyCallbackAction cbAction = OBTreeCallbackActionDoNothing; + RowLockMode prev_lock_mode = context.lockMode; + + /* + * We should have set conflictTupHdr in the (cmp == 0) branch above. 
+ */ + if (callbackInfo->modifyCallback) + { + BTreeLocationHint cbHint; + + cbHint.blkno = pageFindContext->items[pageFindContext->index].blkno; + cbHint.pageChangeCount = pageFindContext->items[pageFindContext->index].pageChangeCount; + cbAction = callbackInfo->modifyCallback(desc, curTuple, + &context.tuple, opOxid, context.conflictTupHdr.xactInfo, + context.conflictTupHdr.undoLocation, + &context.lockMode, &cbHint, callbackInfo->arg); + context.leafTuphdr.xactInfo = OXID_GET_XACT_INFO(tupleOxid, context.lockMode, false); + } + /* the callback asked to undo the existing version first: roll it back and start over */ + if (cbAction == OBTreeCallbackActionUndo) + { + (void) o_btree_modify_item_rollback(&context); + goto retry; + } + + Assert(page_is_locked(blkno) || O_PAGE_IS_LOCAL(blkno)); + + if (callbackInfo->modifyCallback || (action == BTreeOperationInsert || + action == BTreeOperationUpdate || + action == BTreeOperationLock)) + { + if (cbAction == OBTreeCallbackActionDoNothing) + { + unlock_release(&context, true); + return OBTreeModifyResultFound; + } + else + { + /* the callback raised the lock mode: unlock, re-find the page and redo conflict handling with the new mode */ + if (context.lockMode > prev_lock_mode) + { + /* NB: this local result shadows the function-level result variable */ + OFindPageResult result PG_USED_FOR_ASSERTS_ONLY; + + unlock_page(blkno); + + result = refind_page(pageFindContext, + key, + keyType, + 0, + pageFindContext->items[pageFindContext->index].blkno, + pageFindContext->items[pageFindContext->index].pageChangeCount); + Assert(result == OFindPageResultSuccess); + goto retry; + } + /* below, only the local action variable changes; context.action keeps the value passed by the caller */ + if (cbAction == OBTreeCallbackActionUpdate) + { + Assert(tupleType == BTreeKeyLeafTuple); + context.replace = true; + result = OBTreeModifyResultUpdated; + } + else if (cbAction == OBTreeCallbackActionLock) + { + action = BTreeOperationLock; + } + else + { + Assert(cbAction == OBTreeCallbackActionDelete); + action = BTreeOperationDelete; + } + } + } + + Assert((action == BTreeOperationLock) || (context.lockMode >= RowLockNoKeyUpdate)); + + if (action == BTreeOperationDelete) + return o_btree_modify_delete(&context); + else if (action == BTreeOperationLock) + return o_btree_modify_lock(&context); + } + else if 
(tuphdr->deleted != BTreeLeafTupleNonDeleted) + { + /* + * We should have set conflictTupHdr in the (cmp == 0) branch above. + */ + + if (action == BTreeOperationInsert && callbackInfo->modifyDeletedCallback) + { + OBTreeModifyCallbackAction cbAction = OBTreeCallbackActionDoNothing; + BTreeLocationHint cbHint; + + cbHint.blkno = pageFindContext->items[pageFindContext->index].blkno; + cbHint.pageChangeCount = pageFindContext->items[pageFindContext->index].pageChangeCount; + cbAction = callbackInfo->modifyDeletedCallback(desc, curTuple, + &context.tuple, opOxid, + context.conflictTupHdr.xactInfo, + context.conflictTupHdr.deleted, + context.conflictTupHdr.undoLocation, + &context.lockMode, &cbHint, callbackInfo->arg); + context.leafTuphdr.xactInfo = OXID_GET_XACT_INFO(tupleOxid, context.lockMode, false); + + if (cbAction == OBTreeCallbackActionUndo) + { + (void) o_btree_modify_item_rollback(&context); + goto retry; + } + + if (cbAction == OBTreeCallbackActionDoNothing) + { + unlock_release(&context, true); + return OBTreeModifyResultNotFound; + } + Assert(cbAction == OBTreeCallbackActionUpdate); + } + + /* + * Deleted tuple found, we only can handle insert at this point. This + * insert essentially becomes update. + */ + if (action == BTreeOperationInsert) + { + /* + * There is nothing to undo for UndoLogNone trees, so just + * proceed with replacing while the page is still locked. + */ + if (!context.needsUndo && desc->undoType != UndoLogNone) + { + /* + * If we don't need undo, just revert the deletion and then + * continue with normal insert (with undo). 
+ */ + (void) o_btree_modify_item_rollback(&context); + context.needsUndo = true; + } + else if (IsolationUsesXactSnapshot() && IsRelationTree(desc)) + { + if (XACT_INFO_MAP_CSN(context.conflictTupHdr.xactInfo) >= opCsn) + { + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } + } + context.replace = true; + } + else + { + /* NOTE(review): curTuple was read from the page while it was locked, and the callback below runs after unlock_release() has unlocked it. Confirm the callback may still dereference curTuple. The hint argument is NULL on this path. */ + unlock_release(&context, true); + if (callbackInfo->modifyDeletedCallback) + callbackInfo->modifyDeletedCallback(desc, curTuple, + &context.tuple, opOxid, + context.conflictTupHdr.xactInfo, + context.conflictTupHdr.deleted, + context.conflictTupHdr.undoLocation, + &context.lockMode, NULL, + callbackInfo->arg); + return OBTreeModifyResultNotFound; + } + } + + Assert(tupleType == BTreeKeyLeafTuple); + /* insert or replace; the page is not unlocked by unlock_release() here (unlock is false), presumably because o_btree_insert_tuple_to_leaf() does it; confirm */ + o_btree_modify_insert_update(&context); + unlock_release(&context, false); + return result; +} +/* + * Finish a modification: unlock the leaf page when unlock is true, then + * release whatever the context says is held: the undo reservation (plus + * the page-level undo reservation when that is a different undo type), the + * reserved pool pages of pageReserveKind, and the heavyweight tuple lock. + */ +static void +unlock_release(BTreeModifyInternalContext *context, bool unlock) +{ + OBTreeFindPageContext *pageFindContext = context->pageFindContext; + BTreeDescr *desc = pageFindContext->desc; + OInMemoryBlkno blkno; + + blkno = pageFindContext->items[pageFindContext->index].blkno; + + if (unlock) + unlock_page(blkno); + if (context->undoIsReserved) + { + release_undo_size(desc->undoType); + if (GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType) != desc->undoType) + release_undo_size(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType)); + } + if (context->pagesAreReserved) + ppool_release_reserved(desc->ppool, + PPOOL_KIND_GET_MASK(context->pageReserveKind)); + if (context->hwLockMode != NoLock) + LockRelease(&context->hwLockTag, context->hwLockMode, false); +} +/* + * Wait for transaction oxid to finish. The heavyweight tuple lock tag is + * built from the tree oids and the hash of the tuple; the tag and mode are + * stored through hwLockTag and hwLockMode so that unlock_release() can + * release the lock later. + */ +static void +wait_for_tuple(BTreeDescr *desc, OTuple tuple, OXid oxid, + RowLockMode lockMode, BTreeModifyLockStatus lockStatus, + LOCKTAG *hwLockTag, LOCKMODE *hwLockMode) +{ + uint32 hash; + + /* + * Acquire the lock, if necessary (but skip it when we're requesting a + * lock and already have one; avoids deadlock). 
+ */ + if (*hwLockMode == NoLock && lockStatus == BTreeModifyNoLock) + { + hash = o_btree_hash(desc, tuple, BTreeKeyLeafTuple); + + SET_LOCKTAG_TUPLE(*hwLockTag, + desc->oids.datoid, + desc->oids.reloid, + hash, + 0); + /* NOTE(review): lockMode indexes the four-entry hwLockModes[] array without a range check; confirm RowLockMode has exactly four values */ + *hwLockMode = hwLockModes[lockMode]; + + (void) LockAcquire(hwLockTag, *hwLockMode, false, false); + } + + wait_for_oxid(oxid, false); +} +/* + * Deal with row-level conflicts on the located tuple (the key compared + * equal). Fills context->conflictTupHdr and conflictUndoLocation through + * row_lock_conflicts() and may change needsUndo, lockMode, cmp and + * leafTuphdr. + * + * Returns ConflictResolutionOK when the caller may go on with the page + * still locked, ConflictResolutionRetry when the caller must start over, + * and ConflictResolutionFound when the wait callback asked to exit (the + * page is unlocked in that case). + */ +static ConflictResolution +o_btree_modify_handle_conflicts(BTreeModifyInternalContext *context) +{ + bool haveRedundantRowLocks = false; + OBTreeFindPageContext *pageFindContext = context->pageFindContext; + BTreeDescr *desc = pageFindContext->desc; + OInMemoryBlkno blkno; + BTreePageItemLocator *loc; + Page page; + OTuple curTuple; + BTreeLeafTuphdr *tuphdr; + + blkno = pageFindContext->items[pageFindContext->index].blkno; + loc = &pageFindContext->items[pageFindContext->index].locator; + page = O_GET_IN_MEMORY_PAGE(blkno); + + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, page, loc); + + if (row_lock_conflicts(tuphdr, + &context->conflictTupHdr, + desc->undoType, + &context->conflictUndoLocation, + context->lockMode, context->opOxid, context->opCsn, + blkno, context->savepointUndoLocation, + &haveRedundantRowLocks, &context->lockStatus)) + { + OTupleXactInfo xactInfo = context->conflictTupHdr.xactInfo; + OXid oxid = XACT_INFO_GET_OXID(xactInfo); + /* the conflicting version is our own: decide whether a new undo record is needed or our previous one can be reused */ + if (oxid == context->opOxid) + { + if (context->action == BTreeOperationLock || + (UndoLocationIsValid(context->savepointUndoLocation) && + (!UndoLocationIsValid(context->conflictTupHdr.undoLocation) || + context->conflictTupHdr.undoLocation < context->savepointUndoLocation)) || + o_btree_needs_undo(desc, context->action, curTuple, xactInfo, + tuphdr->deleted != BTreeLeafTupleNonDeleted, + context->tuple, context->opOxid)) + { + context->needsUndo = true; + } + else + { + if (XACT_INFO_GET_LOCK_MODE(xactInfo) > context->lockMode) + { + /* + * Upgrade our lock mode if we're going to replace our own + * undo item. 
+ */ + Assert(OXidIsValid(context->opOxid)); + context->lockMode = XACT_INFO_GET_LOCK_MODE(xactInfo); + context->leafTuphdr.xactInfo = OXID_GET_XACT_INFO(context->opOxid, + context->lockMode, + false); + } + context->needsUndo = false; + } + } + else + { + CommitSeqNo csn; + + /* + * Test hook: parks the backend here, with the leaf-page-content + * lock held, so a concurrent aborter that has stamped the + * COMMITTING bit on its oxid can deadlock with our oxid_get_csn() + * spin (the page-lock vs. apply_undo_stack() cycle described in + * undo_xact_callback's XACT_EVENT_ABORT block). + */ + STOPEVENT(STOPEVENT_BEFORE_MODIFY_OXID_GET_CSN, NULL); + + csn = oxid_get_csn(oxid, false); + + if (XACT_INFO_IS_LOCK_ONLY(xactInfo) && (COMMITSEQNO_IS_ABORTED(csn) || + COMMITSEQNO_IS_NORMAL(csn) || + COMMITSEQNO_IS_FROZEN(csn))) + { + /* + * Normally row_lock_conflicts() should have already removed + * the lock-only records of committed and aborted transactions + * from the undo chain. But if the locker transaction commits + * or aborts concurrently, just retry. + */ + return ConflictResolutionRetry; + } + + if (COMMITSEQNO_IS_ABORTED(csn)) + { + /* + * Transaction changes should be undone by the transaction + * owner. But we roll back those changes ourselves instead of + * waiting. When page_item_rollback() returns false, cmp is + * set to -1 so that the caller takes the not-found path. + */ + START_CRIT_SECTION(); + page_block_reads(blkno); + if (!page_item_rollback(desc, page, loc, true, + &context->conflictTupHdr, + context->conflictUndoLocation)) + context->cmp = -1; + MARK_DIRTY(desc, blkno); + END_CRIT_SECTION(); + } + else if (COMMITSEQNO_IS_NORMAL(csn) || COMMITSEQNO_IS_FROZEN(csn)) + { + /* + * Check for serialization conflicts. + * + * TODO: check for such conflicts in page-level undo as well. + */ + if (csn >= context->opCsn && IsolationUsesXactSnapshot() && + IsRelationTree(desc)) + { + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } + } + else + { + /* + * Conflicting transaction is in-progress. 
If the callback is + * provided, ask it what to do. Just wait otherwise. + */ + OBTreeWaitCallbackAction cbAction = OBTreeCallbackActionXidWait; + OFindPageResult result PG_USED_FOR_ASSERTS_ONLY; + + Assert(COMMITSEQNO_IS_INPROGRESS(csn)); + + if (context->callbackInfo->waitCallback) + { + BTreeLocationHint cbHint; + + cbHint.blkno = pageFindContext->items[pageFindContext->index].blkno; + cbHint.pageChangeCount = pageFindContext->items[pageFindContext->index].pageChangeCount; + cbAction = context->callbackInfo->waitCallback(desc, + curTuple, &context->tuple, oxid, + context->conflictTupHdr.xactInfo, + context->conflictTupHdr.undoLocation, + &context->lockMode, &cbHint, + context->callbackInfo->arg); + } + /* the page lock is dropped before any waiting; every path below either exits or re-finds the page */ + unlock_page(blkno); + + Assert(cbAction <= OBTreeCallbackActionXidExit); + /* NOTE(review): the XidExit path returns with the page unlocked but without releasing reservations or a heavyweight lock taken on an earlier attempt; confirm the caller is expected to cope. */ + if (cbAction == OBTreeCallbackActionXidWait) + wait_for_tuple(desc, curTuple, oxid, + context->lockMode, + context->lockStatus, + &context->hwLockTag, + &context->hwLockMode); + else if (cbAction == OBTreeCallbackActionXidExit) + return ConflictResolutionFound; + else + { + Assert(cbAction == OBTreeCallbackActionXidNoWait); + } + + result = refind_page(pageFindContext, + context->key, + context->keyType, + 0, + pageFindContext->items[pageFindContext->index].blkno, + pageFindContext->items[pageFindContext->index].pageChangeCount); + Assert(result == OFindPageResultSuccess); + return ConflictResolutionRetry; + } + + /* Update tuple and header pointer after page_item_rollback() */ + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, page, loc); + } + } + else if (IsolationUsesXactSnapshot() && IsRelationTree(desc)) + { + /* + * Check for serialization conflicts. + * + * TODO: check for such conflicts in page-level undo as well. 
+ */ + CommitSeqNo csn = XACT_INFO_MAP_CSN(context->conflictTupHdr.xactInfo); + + if (csn >= context->opCsn) + { + if (tuphdr->deleted == BTreeLeafTupleDeleted || + tuphdr->deleted == BTreeLeafTupleMovedPartitions) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + else + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } + } + + /* + * Remove redundant row-level locks if any, except when this is a lock + * request and we already hold the same or a stronger lock. + */ + if (haveRedundantRowLocks && + !(context->action == BTreeOperationLock && + context->lockStatus == BTreeModifySameOrStrongerLock)) + { + remove_redundant_row_locks(tuphdr, &context->conflictTupHdr, + desc->undoType, + &context->conflictUndoLocation, + context->lockMode, + context->opOxid, blkno, + context->savepointUndoLocation); + } + /* without a new undo record the written header inherits the undo location of the existing tuple */ + if (!context->needsUndo) + context->leafTuphdr.undoLocation = tuphdr->undoLocation; + return ConflictResolutionOK; +} +/* + * Finish the operation when no tuple matches the key: update, delete and + * lock report OBTreeModifyResultNotFound, any other action inserts the tuple. + */ +static OBTreeModifyResult +o_btree_modify_handle_tuple_not_found(BTreeModifyInternalContext *context) +{ + /* + * Matching tuple is not found. + * + * Ideally, for IsolationUsesXactSnapshot() we should also check + * page-level undo for conflicting tuples. But it's not implemented so + * far. 
+ */ + if (context->action == BTreeOperationUpdate || + context->action == BTreeOperationDelete || + context->action == BTreeOperationLock) + { + unlock_release(context, true); + return OBTreeModifyResultNotFound; + } + else + { + Assert(context->tupleType == BTreeKeyLeafTuple); + /* insert path: the tuple is added to the leaf and unlock_release() is called with unlock false */ + o_btree_modify_insert_update(context); + unlock_release(context, false); + return OBTreeModifyResultInserted; + } +} +/* + * Roll back the located leaf item with page_item_rollback(), inside a + * critical section and with the page marked dirty. Returns the result of + * page_item_rollback(); when that is false the page is searched for the key + * again and the locator stored in the find context is replaced. + */ +static bool +o_btree_modify_item_rollback(BTreeModifyInternalContext *context) +{ + OBTreeFindPageContext *pageFindContext = context->pageFindContext; + BTreeDescr *desc = pageFindContext->desc; + OInMemoryBlkno blkno; + BTreePageItemLocator loc; + Page page; + bool applyResult; + + blkno = pageFindContext->items[pageFindContext->index].blkno; + loc = pageFindContext->items[pageFindContext->index].locator; + page = O_GET_IN_MEMORY_PAGE(blkno); + + START_CRIT_SECTION(); + page_block_reads(blkno); + applyResult = page_item_rollback(desc, page, &loc, false, + &context->conflictTupHdr, + context->conflictUndoLocation); + MARK_DIRTY(desc, blkno); + END_CRIT_SECTION(); + /* loc is a local copy, so the refreshed position has to be written back to the find context */ + if (!applyResult) + { + btree_page_search(desc, page, context->key, + context->keyType, NULL, &loc); + pageFindContext->items[pageFindContext->index].locator = loc; + } + + return applyResult; +} + /* Insert context->tuple into the located leaf, or replace the existing item when context->replace is set, after making the undo record when one is both needed and reserved. */ + +static void +o_btree_modify_insert_update(BTreeModifyInternalContext *context) +{ + OBTreeFindPageContext *pageFindContext = context->pageFindContext; + BTreeDescr *desc = pageFindContext->desc; + int tuplen; + + if (context->undoIsReserved && context->needsUndo) + { + o_btree_modify_add_undo_record(context); + } + else if (!context->needsUndo) + { + BTreeLeafTuphdr *leafTuphdr = &context->leafTuphdr; + /* NOTE(review): for UndoLogRegular this resets the undoLocation that o_btree_modify_handle_conflicts() copies from the existing tuple when needsUndo is false; confirm that dropping it is intended. */ + if (desc->undoType == UndoLogRegular) + { + leafTuphdr->undoLocation = InvalidUndoLocation; + if (!is_recovery_process()) + leafTuphdr->undoLocation |= current_command_get_undo_location(); + } + + /* + * Self-created shortcut: no undo record was made. 
Fire the post-undo + * hook with WaitingSkUndoLoc so the table AM can install a "wait for + * me" marker before this page lock drops. + */ + if (context->callbackInfo && context->callbackInfo->postUndoRecorded) + context->callbackInfo->postUndoRecorded(WaitingSkUndoLoc, + context->callbackInfo->arg); + } + /* outside recovery, a regular-undo tuple header must carry the current command in its undo location */ + if (desc->undoType == UndoLogRegular && !is_recovery_process()) + { + Assert(undo_location_get_command(UndoLocationGetValue(context->leafTuphdr.undoLocation)) == o_get_current_command()); + } + + tuplen = o_btree_len(desc, context->tuple, OTupleLength); + Assert(tuplen <= O_BTREE_MAX_TUPLE_SIZE); + + /* the FIX_LEAF_SPLIT find flag is cleared before the insert; it makes no more sense at this point */ + BTREE_PAGE_FIND_UNSET(pageFindContext, FIX_LEAF_SPLIT); + o_btree_insert_tuple_to_leaf(pageFindContext, + context->tuple, tuplen, + &context->leafTuphdr, + context->replace, + context->pageReserveKind); +} +/* + * Make the undo record for an insert or a replace. For a replace the record + * is built from the tuple currently on the page and its location is stored + * in the new tuple header; for a plain insert it is built from the new + * tuple. In both cases callbackInfo->postUndoRecorded, when set, is called + * with the location of the new record. + */ +static void +o_btree_modify_add_undo_record(BTreeModifyInternalContext *context) +{ + OBTreeFindPageContext *pageFindContext = context->pageFindContext; + BTreeDescr *desc = pageFindContext->desc; + BTreeLeafTuphdr *leafTuphdr = &context->leafTuphdr; + UndoLocation undoLocation = InvalidUndoLocation; + OInMemoryBlkno blkno; + BTreePageItemLocator loc; + Page page; + + blkno = pageFindContext->items[pageFindContext->index].blkno; + loc = pageFindContext->items[pageFindContext->index].locator; + page = O_GET_IN_MEMORY_PAGE(blkno); + + if (context->replace) + { + /* Make undo item and connect it with page tuple */ + OTuple curTuple; + BTreeLeafTuphdr *tuphdr; + + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, page, &loc); + + undoLocation = make_undo_record(desc, curTuple, true, + BTreeOperationUpdate, blkno, + O_PAGE_GET_CHANGE_COUNT(page), + tuphdr); + leafTuphdr->undoLocation = undoLocation; + leafTuphdr->chainHasLocks = tuphdr->chainHasLocks || + XACT_INFO_IS_LOCK_ONLY(tuphdr->xactInfo); + } + else + { + /* Still need the undo item to deal with transaction rollback */ + undoLocation = make_undo_record(desc, context->tuple, true, + 
BTreeOperationInsert, blkno, + O_PAGE_GET_CHANGE_COUNT(page), + NULL); + /* the header of an inserted tuple does not point at the new record; for regular undo it carries the current command undo location only */ + if (desc->undoType == UndoLogRegular) + { + leafTuphdr->undoLocation = InvalidUndoLocation; + leafTuphdr->undoLocation |= current_command_get_undo_location(); + } + } + + /* + * Fire post-undo hook with the freshly created undo location, while the + * leaf page is still locked. Used by the table AM to install the + * PK-applied/SK-pending marker before unlock. + */ + if (context->callbackInfo && context->callbackInfo->postUndoRecorded) + context->callbackInfo->postUndoRecorded(undoLocation, + context->callbackInfo->arg); +} +/* + * Mark the located tuple as deleted in place: rewrite its header inside a + * critical section (undo location, xactInfo, deleted status) and account + * its space as vacated. Always returns OBTreeModifyResultDeleted. + */ +static OBTreeModifyResult +o_btree_modify_delete(BTreeModifyInternalContext *context) +{ + OBTreeFindPageContext *pageFindContext = context->pageFindContext; + BTreeDescr *desc = pageFindContext->desc; + uint32 pageChangeCount; + UndoLocation undoLocation; + OInMemoryBlkno blkno; + BTreePageItemLocator loc; + Page page; + OTuple curTuple; + BTreeLeafTuphdr *tuphdr; + + blkno = pageFindContext->items[pageFindContext->index].blkno; + loc = pageFindContext->items[pageFindContext->index].locator; + page = O_GET_IN_MEMORY_PAGE(blkno); + + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, page, &loc); + /* no undo needed: first roll back the existing version, then see whether anything is left to delete */ + if (!context->needsUndo) + { + bool stillExists; + + stillExists = o_btree_modify_item_rollback(context); + + if (stillExists) + { + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, page, &loc); + Assert(tuphdr != NULL); + stillExists = (tuphdr->deleted == BTreeLeafTupleNonDeleted); + } + + if (!stillExists) + { + /* Already deleted */ + unlock_release(context, true); + + return OBTreeModifyResultDeleted; + } + else + { + /* + * We rolled back our own changes to the version that existed + * before. Thus, we need an undo record to modify it. 
+ */ + context->needsUndo = true; + } + } + + if (context->undoIsReserved && context->needsUndo) + { + OTuple key; + bool key_is_tuple; + /* the undo record is keyed by the caller-supplied non-leaf key when one was given, otherwise by the tuple on the page */ + if (context->tupleType == BTreeKeyNonLeafKey) + { + key = context->tuple; + key_is_tuple = false; + } + else + { + key = curTuple; + key_is_tuple = true; + } + + pageChangeCount = O_PAGE_GET_CHANGE_COUNT(page); + undoLocation = make_undo_record(desc, key, key_is_tuple, + BTreeOperationDelete, blkno, + pageChangeCount, tuphdr); + + /* + * Fire post-undo hook with the freshly created undo location, while + * the leaf page is still locked. Used by the table AM to install the + * PK-applied/SK-pending marker before unlock. + */ + if (context->callbackInfo && context->callbackInfo->postUndoRecorded) + context->callbackInfo->postUndoRecorded(undoLocation, + context->callbackInfo->arg); + } + else + { + undoLocation = InvalidUndoLocation; + } + + START_CRIT_SECTION(); + page_block_reads(blkno); + /* chainHasLocks is computed from the old xactInfo, so it must be set before xactInfo is overwritten below */ + tuphdr->chainHasLocks = tuphdr->chainHasLocks || + XACT_INFO_IS_LOCK_ONLY(tuphdr->xactInfo); + tuphdr->undoLocation = undoLocation; + tuphdr->xactInfo = context->leafTuphdr.xactInfo; + if (context->leafTuphdr.deleted == BTreeLeafTupleNonDeleted) + tuphdr->deleted = BTreeLeafTupleDeleted; + else + tuphdr->deleted = context->leafTuphdr.deleted; + + /* Bridge index deleted tuples not treated as vacated */ + if (desc->type != oIndexBridge) + PAGE_ADD_N_VACATED(page, + BTreeLeafTuphdrSize + + MAXALIGN(o_btree_len(desc, curTuple, OTupleLength))); + + MARK_DIRTY(desc, blkno); + + END_CRIT_SECTION(); + /* A delete without a valid oxid tries to merge a page that became too sparse. unlock_release() is then called with unlock false, presumably because btree_try_merge_and_unlock() unlocks the page itself; confirm. */ + if (!OXidIsValid(context->opOxid) && is_page_too_sparse(desc, page)) + { + (void) btree_try_merge_and_unlock(desc, blkno, false, false); + unlock_release(context, false); + } + else + { + unlock_release(context, true); + } + + return OBTreeModifyResultDeleted; +} +/* + * Lock the located tuple without changing it: unless the same or a stronger + * lock is already held, make a BTreeOperationLock undo record and rewrite + * the tuple header with our oxid and lock mode. Always returns + * OBTreeModifyResultLocked. + */ +static OBTreeModifyResult +o_btree_modify_lock(BTreeModifyInternalContext *context) +{ + OBTreeFindPageContext *pageFindContext = context->pageFindContext; + BTreeDescr *desc = 
pageFindContext->desc; + UndoLocation undoLocation; + uint32 pageChangeCount; + OTuple key; + bool key_is_tuple; + OInMemoryBlkno blkno; + BTreePageItemLocator loc; + Page page; + OTuple curTuple; + BTreeLeafTuphdr *tuphdr; + + blkno = pageFindContext->items[pageFindContext->index].blkno; + loc = pageFindContext->items[pageFindContext->index].locator; + page = O_GET_IN_MEMORY_PAGE(blkno); + + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, page, &loc); + /* nothing to write when row_lock_conflicts() reported that we already hold the same or a stronger lock */ + if (context->lockStatus == BTreeModifySameOrStrongerLock) + { + unlock_release(context, true); + return OBTreeModifyResultLocked; + } + + Assert(context->needsUndo); + Assert(context->undoIsReserved); + Assert(OXidIsValid(context->opOxid)); + + if (context->tupleType == BTreeKeyNonLeafKey) + { + key = context->tuple; + key_is_tuple = false; + } + else + { + key = curTuple; + key_is_tuple = true; + } + + pageChangeCount = O_PAGE_GET_CHANGE_COUNT(page); + undoLocation = make_undo_record(desc, key, key_is_tuple, + BTreeOperationLock, blkno, + pageChangeCount, tuphdr); + + START_CRIT_SECTION(); + page_block_reads(blkno); + /* the last argument of OXID_GET_XACT_INFO() is true only here; presumably it marks the header as lock-only; confirm */ + tuphdr->chainHasLocks = tuphdr->chainHasLocks || + XACT_INFO_IS_LOCK_ONLY(tuphdr->xactInfo); + tuphdr->undoLocation = undoLocation; + tuphdr->xactInfo = OXID_GET_XACT_INFO(context->opOxid, + context->lockMode, + true); + tuphdr->deleted = BTreeLeafTupleNonDeleted; + + MARK_DIRTY(desc, blkno); + END_CRIT_SECTION(); + unlock_release(context, true); + + return OBTreeModifyResultLocked; +} +/* + * Build the Jsonb parameters passed to STOPEVENT_MODIFY_START: one object + * filled by btree_desc_stopevent_params_internal(), built while + * stopevents_cxt is the current memory context. + */ +static Jsonb * +prepare_modify_start_params(BTreeDescr *desc) +{ + JsonbParseState *state = NULL; + Jsonb *res; + + MemoryContext mctx = MemoryContextSwitchTo(stopevents_cxt); + + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + btree_desc_stopevent_params_internal(desc, &state); + res = JsonbValueToJsonb(pushJsonbValue(&state, WJB_END_OBJECT, NULL)); + MemoryContextSwitchTo(mctx); + + return res; +} +/* + * Reserve undo space for one modification. Nothing is reserved for + * UndoLogNone. When the page-level undo type differs from undoType, space + * is reserved in both undo logs. + */ +static void +reserve_undo_for_modification(UndoLogType undoType) +{ + if (undoType == UndoLogNone) + return; + + 
if (GET_PAGE_LEVEL_UNDO_TYPE(undoType) == undoType) + { + (void) reserve_undo_size(undoType, O_MODIFY_UNDO_RESERVE_SIZE); + } + else + { + (void) reserve_undo_size(undoType, 2 * O_UPDATE_MAX_UNDO_SIZE); + (void) reserve_undo_size(GET_PAGE_LEVEL_UNDO_TYPE(undoType), 2 * O_MAX_SPLIT_UNDO_IMAGE_SIZE); + } +} +/* + * Common entry for a modification: fire the MODIFY_START stop event, + * reserve undo space and (except for deletes) two pool pages, locate the + * leaf page for the key, through the hint when it carries a valid block + * number, and hand over to o_btree_modify_internal(). When the page search + * itself reports that the tuple was inserted, the reservations are released + * here and OBTreeModifyResultInserted is returned. + */ +static OBTreeModifyResult +o_btree_normal_modify(BTreeDescr *desc, BTreeOperationType action, + OTuple tuple, BTreeKeyType tupleType, + Pointer key, BTreeKeyType keyType, + OXid opOxid, CommitSeqNo opCsn, + RowLockMode lockMode, BTreeLocationHint *hint, + BTreeLeafTupleDeletedStatus deleted, + BTreeModifyCallbackInfo *callbackInfo) +{ + OBTreeFindPageContext pageFindContext; + int pageReserveKind; + Jsonb *params = NULL; + OFindPageResult findResult; + + if (STOPEVENTS_ENABLED()) + params = prepare_modify_start_params(desc); + STOPEVENT(STOPEVENT_MODIFY_START, params); + + /* If no key is given separately, use the tuple itself */ + if (key == NULL) + { + key = (Pointer) &tuple; + keyType = tupleType; + } + + reserve_undo_for_modification(desc->undoType); + /* the shared root info system tree reserves pages under its own reservation kind */ + if (OIDS_EQ_SYS_TREE(desc->oids, SYS_TREES_SHARED_ROOT_INFO)) + pageReserveKind = PPOOL_RESERVE_SHARED_INFO_INSERT; + else + pageReserveKind = PPOOL_RESERVE_INSERT; + + if (action != BTreeOperationDelete) + ppool_reserve_pages(desc->ppool, pageReserveKind, 2); + + init_page_find_context(&pageFindContext, desc, COMMITSEQNO_INPROGRESS, + BTREE_PAGE_FIND_MODIFY | BTREE_PAGE_FIND_FIX_LEAF_SPLIT); + /* for a leaf-tuple insert the tuple and its xactInfo are handed to the page search, which may then report OFindPageResultInserted */ + if (action == BTreeOperationInsert && tupleType == BTreeKeyLeafTuple) + { + pageFindContext.insertTuple = tuple; + if (OXidIsValid(opOxid)) + pageFindContext.insertXactInfo = OXID_GET_XACT_INFO(opOxid, lockMode, false); + else + pageFindContext.insertXactInfo = OXID_GET_XACT_INFO(BootstrapTransactionId, lockMode, false); + } + + if (hint && OInMemoryBlknoIsValid(hint->blkno)) + findResult = refind_page(&pageFindContext, key, keyType, 0, hint->blkno, hint->pageChangeCount); + else + findResult = 
find_page(&pageFindContext, key, keyType, 0); + + if (findResult == OFindPageResultInserted) + { + Assert(action == BTreeOperationInsert); + Assert(tupleType == BTreeKeyLeafTuple); + /* NOTE(review): unlock_release() releases pages with PPOOL_KIND_GET_MASK(pageReserveKind), while the call below passes PPOOL_RESERVE_INSERT directly and ignores pageReserveKind; confirm which argument form ppool_release_reserved() expects. */ + if (desc->undoType != UndoLogNone) + { + release_undo_size(desc->undoType); + if (GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType) != desc->undoType) + release_undo_size(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType)); + } + ppool_release_reserved(desc->ppool, PPOOL_RESERVE_INSERT); + Assert(!have_locked_pages()); + return OBTreeModifyResultInserted; + } + Assert(findResult == OFindPageResultSuccess); + + return o_btree_modify_internal(&pageFindContext, action, tuple, tupleType, + key, keyType, opOxid, opCsn, + lockMode, deleted, pageReserveKind, + callbackInfo); +} + +#include "tableam/descr.h" +#include "tableam/key_range.h" +#include "tableam/toast.h" + +#include "utils/lsyscache.h" +/* + * Scan page p from *locator for a tuple that conflicts with the unique key. + * Returns false when the page ends or a tuple above the unique upper bound + * of the key is reached. Returns true, with *xactInfo set, for the first + * tuple that counts as a conflict. Tuples whose last non-lock-only version + * is deleted by our own or by a finished transaction are skipped. With + * UNIQUE_CHECK_EXISTING a tuple equal to key compared as BTreeKeyBound is + * skipped too (NOTE(review): presumably the row being checked; confirm). + */ +static bool +page_unique_check(BTreeDescr *desc, Page p, BTreePageItemLocator *locator, + Pointer key, OXid opOxid, OTupleXactInfo *xactInfo, + IndexUniqueCheck checkUnique) +{ + (void) page_locator_find_real_item(p, NULL, locator); + + while (BTREE_PAGE_LOCATOR_IS_VALID(p, locator)) + { + int cmp; + OTuple tuple; + BTreeLeafTuphdr *pageTuphdr, + tuphdr; + + BTREE_PAGE_READ_LEAF_ITEM(pageTuphdr, tuple, p, locator); + cmp = o_btree_cmp(desc, &tuple, BTreeKeyLeafTuple, + key, BTreeKeyUniqueUpperBound); + if (cmp > 0) + return false; + else if (cmp < 0 && checkUnique == UNIQUE_CHECK_EXISTING) + { + cmp = o_btree_cmp(desc, &tuple, BTreeKeyLeafTuple, + key, BTreeKeyBound); + if (cmp == 0) + { + BTREE_PAGE_LOCATOR_NEXT(p, locator); + continue; + } + } + /* work on a copy of the header so that find_non_lock_only_undo_record() does not modify the page */ + tuphdr = *pageTuphdr; + (void) find_non_lock_only_undo_record(desc->undoType, &tuphdr); + if (XACT_INFO_OXID_EQ(tuphdr.xactInfo, opOxid) || XACT_INFO_IS_FINISHED(tuphdr.xactInfo)) + { + if (tuphdr.deleted != BTreeLeafTupleNonDeleted) + { + BTREE_PAGE_LOCATOR_NEXT(p, locator); + continue; + } + *xactInfo = tuphdr.xactInfo; + return true; + } + /* a version written by another unfinished transaction is reported as well, deleted or not */ + *xactInfo = 
tuphdr.xactInfo; + return true; + } + return false; +} +/* + * Unique check that may span several pages. The find context is switched + * from modify to read mode, then page_unique_check() is run on the page + * image of the context, moving to the right sibling until a conflict is + * found, the rightmost page is reached, or the hikey of the page is above + * the unique upper bound of the key. Returns true on conflict, with + * *xactInfo set by page_unique_check(). + */ +static bool +slowpath_unique_check(BTreeDescr *desc, OBTreeFindPageContext *pageFindContext, + Pointer key, OXid opOxid, OTupleXactInfo *xactInfo, + IndexUniqueCheck checkUnique) +{ + Page p; + OFixedKey hikey_buf; + + btree_find_context_from_modify_to_read(pageFindContext, + key, BTreeKeyUniqueLowerBound, 0); + + p = pageFindContext->img; + + while (true) + { + int cmp; + OTuple hikey; + + if (page_unique_check(desc, p, &pageFindContext->items[pageFindContext->index].locator, + key, opOxid, xactInfo, checkUnique)) + return true; + + if (O_PAGE_IS(p, RIGHTMOST)) + break; + + BTREE_PAGE_GET_HIKEY(hikey, p); + /* stop once the hikey of this page is already above the unique upper bound of the key */ + cmp = o_btree_cmp(desc, &hikey, BTreeKeyNonLeafKey, + key, BTreeKeyUniqueUpperBound); + if (cmp > 0) + break; + + (void) find_right_page(pageFindContext, &hikey_buf); + + /* + * Due to concurrent merges, some tuples might be lower than the + * unique key. So, we can't just start from the beginning, but have + * to find the right position on the page. 
+ */ + btree_page_search(desc, p, key, BTreeKeyUniqueLowerBound, + NULL, &pageFindContext->items[pageFindContext->index].locator); + } + return false; +} + +OBTreeModifyResult +o_btree_insert_unique(BTreeDescr *desc, OTuple tuple, BTreeKeyType tupleType, + Pointer key, BTreeKeyType keyType, + OXid opOxid, CommitSeqNo opCsn, + RowLockMode lockMode, BTreeLocationHint *hint, + BTreeModifyCallbackInfo *callbackInfo, + IndexUniqueCheck checkUnique) +{ + OBTreeFindPageContext pageFindContext; + int pageReserveKind; + bool fastpath; + Page p; + OInMemoryBlkno blkno; + uint32 pageChangeCount; + LWLock *uniqueLock; + OBTreeModifyResult result; + Jsonb *params = NULL; + OFindPageResult findResult PG_USED_FOR_ASSERTS_ONLY; + bool found_but_insert; + + Assert(checkUnique == UNIQUE_CHECK_YES || checkUnique == UNIQUE_CHECK_EXISTING || checkUnique == UNIQUE_CHECK_PARTIAL); + + if (STOPEVENTS_ENABLED()) + params = prepare_modify_start_params(desc); + STOPEVENT(STOPEVENT_MODIFY_START, params); + + Assert(key != NULL && keyType == BTreeKeyBound); + + reserve_undo_for_modification(desc->undoType); + + if (OIDS_EQ_SYS_TREE(desc->oids, SYS_TREES_SHARED_ROOT_INFO)) + pageReserveKind = PPOOL_RESERVE_SHARED_INFO_INSERT; + else + pageReserveKind = PPOOL_RESERVE_INSERT; + + ppool_reserve_pages(desc->ppool, pageReserveKind, 2); + + init_page_find_context(&pageFindContext, desc, COMMITSEQNO_INPROGRESS, + BTREE_PAGE_FIND_MODIFY | + BTREE_PAGE_FIND_IMAGE | + BTREE_PAGE_FIND_FIX_LEAF_SPLIT); + + if (hint && OInMemoryBlknoIsValid(hint->blkno)) + findResult = refind_page(&pageFindContext, key, + BTreeKeyUniqueLowerBound, 0, + hint->blkno, hint->pageChangeCount); + else + findResult = find_page(&pageFindContext, key, + BTreeKeyUniqueLowerBound, 0); + + Assert(findResult == OFindPageResultSuccess); + +retry: + + fastpath = false; + found_but_insert = false; + blkno = pageFindContext.items[pageFindContext.index].blkno; + pageChangeCount = pageFindContext.items[pageFindContext.index].pageChangeCount; 
+ p = O_GET_IN_MEMORY_PAGE(blkno); + if (O_PAGE_IS(p, RIGHTMOST)) + { + fastpath = true; + } + else + { + OTuple hikey; + + BTREE_PAGE_GET_HIKEY(hikey, p); + fastpath = (o_btree_cmp(desc, &hikey, BTreeKeyNonLeafKey, + key, BTreeKeyUniqueUpperBound) >= 0); + } + + uniqueLock = &unique_locks[o_btree_unique_hash(desc, tuple) % num_unique_locks].lock; + + /*--- + * We can do fast path unique check if we know that the required key range + * resides the single page, and we managed to take a unique lwlock + * simultaneusly. + * + * It might seem that we don't need unique lwlock as soon as we see all the + * key range in the locked page. However, consider the following example. + * + * s1: Unique lwlock acquire + * s1: Slow path check + * Page merge + * s2: Fast patch check + * s2: Insert + * s1: Insert + * + * Due to page merge, we might end up with double insert. This even fast + * path check requires unique lwlock. + */ + if (fastpath && LWLockConditionalAcquire(uniqueLock, LW_EXCLUSIVE)) + { + OTupleXactInfo xactInfo; + bool refind = false; + + if (page_unique_check(desc, p, &pageFindContext.items[pageFindContext.index].locator, + key, opOxid, &xactInfo, checkUnique)) + { + OTuple curTuple; + BTreeLocationHint cbHint = {pageFindContext.items[pageFindContext.index].blkno, pageFindContext.items[pageFindContext.index].pageChangeCount}; + BTreeLeafTuphdr *tuphdr; + + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, p, &pageFindContext.items[pageFindContext.index].locator); + + if (XACT_INFO_OXID_EQ(xactInfo, opOxid) || XACT_INFO_IS_FINISHED(xactInfo)) + { + OBTreeModifyCallbackAction cbAction PG_USED_FOR_ASSERTS_ONLY; + + if (callbackInfo->modifyCallback) + { + cbAction = callbackInfo->modifyCallback(desc, + curTuple, &tuple, opOxid, + xactInfo, tuphdr->undoLocation, + &lockMode, &cbHint, callbackInfo->arg); + + /* + * We could support other callback actions, but it's not + * yet needed. 
+ */ + Assert(cbAction == OBTreeCallbackActionDoNothing); + } + if (checkUnique == UNIQUE_CHECK_YES) + { + unlock_page(blkno); + LWLockRelease(uniqueLock); + return OBTreeModifyResultFound; + } + else + { + found_but_insert = true; + refind = true; + } + } + else + { + OBTreeWaitCallbackAction cbAction; + + LWLockRelease(uniqueLock); + if (callbackInfo->waitCallback) + { + cbAction = callbackInfo->waitCallback(desc, + curTuple, &tuple, XACT_INFO_GET_OXID(xactInfo), + xactInfo, tuphdr->undoLocation, + &lockMode, &cbHint, callbackInfo->arg); + Assert(cbAction != OBTreeCallbackActionXidNoWait); + if (cbAction == OBTreeCallbackActionXidExit) + { + if (checkUnique == UNIQUE_CHECK_YES) + { + unlock_page(blkno); + return OBTreeModifyResultFound; + } + else + { + found_but_insert = true; + refind = true; + } + } + } + unlock_page(blkno); + wait_for_oxid(XACT_INFO_GET_OXID(xactInfo), false); + findResult = refind_page(&pageFindContext, key, + BTreeKeyUniqueLowerBound, 0, + blkno, pageChangeCount); + Assert(findResult == OFindPageResultSuccess); + goto retry; + } + } + else + refind = true; + + if (refind) + { + /* + * We've to find approprivate offset for the new tuple. It should + * be within the page, but can not match current offset, because + * we've searched for BTreeUniqueMinBound. + */ + btree_page_search(desc, p, key, BTreeKeyBound, + NULL, &pageFindContext.items[pageFindContext.index].locator); + } + } + else + { + OTupleXactInfo xactInfo; + bool refind = false; + + /* + * Evade deadlock: unlock the page before taking an unique lwlock. 
+ */ + unlock_page(blkno); + + LWLockAcquire(uniqueLock, LW_EXCLUSIVE); + + if (slowpath_unique_check(desc, &pageFindContext, key, + opOxid, &xactInfo, checkUnique)) + { + BTreePageItemLocator *loc = &pageFindContext.items[pageFindContext.index].locator; + OTuple curTuple; + BTreeLocationHint cbHint = {pageFindContext.items[pageFindContext.index].blkno, pageFindContext.items[pageFindContext.index].pageChangeCount}; + BTreeLeafTuphdr *tuphdr; + + p = O_GET_IN_MEMORY_PAGE(pageFindContext.items[pageFindContext.index].blkno); + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, curTuple, p, loc); + if (XACT_INFO_OXID_EQ(xactInfo, opOxid) || XACT_INFO_IS_FINISHED(xactInfo)) + { + OBTreeModifyCallbackAction cbAction PG_USED_FOR_ASSERTS_ONLY; + + if (callbackInfo->modifyCallback) + { + cbAction = callbackInfo->modifyCallback(desc, + curTuple, &tuple, opOxid, + xactInfo, tuphdr->undoLocation, + &lockMode, &cbHint, callbackInfo->arg); + + /* + * We could support other callback actions, but it's not + * yet needed. 
+ */ + Assert(cbAction == OBTreeCallbackActionDoNothing); + } + LWLockRelease(uniqueLock); + if (checkUnique == UNIQUE_CHECK_YES) + return OBTreeModifyResultFound; + else + { + found_but_insert = true; + refind = true; + } + } + else + { + OBTreeWaitCallbackAction cbAction; + + LWLockRelease(uniqueLock); + + if (callbackInfo->waitCallback) + { + cbAction = callbackInfo->waitCallback(desc, + curTuple, &tuple, XACT_INFO_GET_OXID(xactInfo), + tuphdr->undoLocation, + xactInfo, &lockMode, &cbHint, callbackInfo->arg); + Assert(cbAction != OBTreeCallbackActionXidNoWait); + if (cbAction == OBTreeCallbackActionXidExit) + { + if (checkUnique == UNIQUE_CHECK_YES) + return OBTreeModifyResultFound; + else + { + found_but_insert = true; + refind = true; + } + } + } + wait_for_oxid(XACT_INFO_GET_OXID(xactInfo), false); + BTREE_PAGE_FIND_SET(&pageFindContext, MODIFY); + findResult = refind_page(&pageFindContext, key, + BTreeKeyUniqueLowerBound, 0, + blkno, pageChangeCount); + Assert(findResult == OFindPageResultSuccess); + goto retry; + } + } + else + refind = true; + + if (refind) + { + BTREE_PAGE_FIND_SET(&pageFindContext, MODIFY); + findResult = find_page(&pageFindContext, key, BTreeKeyBound, 0); + Assert(findResult == OFindPageResultSuccess); + } + } + + if (checkUnique != UNIQUE_CHECK_EXISTING) + { + result = o_btree_modify_internal(&pageFindContext, BTreeOperationInsert, + tuple, tupleType, key, + keyType, opOxid, opCsn, lockMode, + BTreeLeafTupleNonDeleted, pageReserveKind, + callbackInfo); + } + else + { + unlock_page(blkno); + result = found_but_insert ? 
OBTreeModifyResultFound : OBTreeModifyResultNotFound; + } + if (result == OBTreeModifyResultInserted && found_but_insert) + result = OBTreeModifyResultFound; + + LWLockRelease(uniqueLock); + return result; +} + +OBTreeModifyResult +o_btree_modify(BTreeDescr *desc, BTreeOperationType action, + OTuple tuple, BTreeKeyType tupleType, + Pointer key, BTreeKeyType keyType, + OXid oxid, CommitSeqNo csn, RowLockMode lockMode, + BTreeLocationHint *hint, BTreeModifyCallbackInfo *callbackInfo) +{ + return o_btree_normal_modify(desc, action, tuple, tupleType, + key, keyType, oxid, csn, lockMode, + hint, BTreeLeafTupleNonDeleted, callbackInfo); +} + +OBTreeModifyResult +o_btree_delete_moved_partitions(BTreeDescr *desc, Pointer key, + BTreeKeyType keyType, OXid oxid, + CommitSeqNo csn, + BTreeLocationHint *hint, + BTreeModifyCallbackInfo *callbackInfo) +{ + OTuple nullTup; + + O_TUPLE_SET_NULL(nullTup); + + return o_btree_normal_modify(desc, BTreeOperationDelete, + nullTup, BTreeKeyNone, + key, keyType, oxid, csn, RowLockUpdate, + hint, BTreeLeafTupleMovedPartitions, + callbackInfo); +} + +OBTreeModifyResult +o_btree_delete_pk_changed(BTreeDescr *desc, Pointer key, + BTreeKeyType keyType, OXid oxid, + CommitSeqNo csn, + BTreeLocationHint *hint, + BTreeModifyCallbackInfo *callbackInfo) +{ + OTuple nullTup; + + O_TUPLE_SET_NULL(nullTup); + + return o_btree_normal_modify(desc, BTreeOperationDelete, + nullTup, BTreeKeyNone, + key, keyType, oxid, csn, RowLockUpdate, + hint, BTreeLeafTuplePKChanged, + callbackInfo); +} + +bool +o_btree_autonomous_insert(BTreeDescr *desc, OTuple tuple) +{ + OAutonomousTxState state; + OBTreeModifyResult result; + + if (desc->undoType != UndoLogNone) + { + start_autonomous_transaction(&state); + PG_TRY(); + { + result = o_btree_normal_modify(desc, BTreeOperationInsert, + tuple, BTreeKeyLeafTuple, + NULL, BTreeKeyNone, + get_current_oxid(), + COMMITSEQNO_INPROGRESS, + RowLockUpdate, + NULL, BTreeLeafTupleNonDeleted, + &nullCallbackInfo); + /* no version 
is necessary here for system trees other than OTable */ + if (result == OBTreeModifyResultInserted) + o_wal_insert(desc, tuple, REPLICA_IDENTITY_DEFAULT, O_TABLE_INVALID_VERSION); + } + PG_CATCH(); + { + abort_autonomous_transaction(&state); + PG_RE_THROW(); + } + PG_END_TRY(); + finish_autonomous_transaction(&state); + } + else + { + result = o_btree_normal_modify(desc, BTreeOperationInsert, + tuple, BTreeKeyLeafTuple, + NULL, BTreeKeyNone, + InvalidOXid, + COMMITSEQNO_INPROGRESS, + RowLockUpdate, + NULL, BTreeLeafTupleNonDeleted, + &nullCallbackInfo); + } + + return (result == OBTreeModifyResultInserted); +} + +bool +o_btree_autonomous_delete(BTreeDescr *desc, OTuple key, BTreeKeyType keyType, + BTreeLocationHint *hint) +{ + OAutonomousTxState state; + OBTreeModifyResult result; + + Assert(keyType == BTreeKeyLeafTuple || keyType == BTreeKeyNonLeafKey); + + if (desc->undoType != UndoLogNone) + { + start_autonomous_transaction(&state); + PG_TRY(); + { + result = o_btree_normal_modify(desc, BTreeOperationDelete, + key, keyType, + NULL, BTreeKeyNone, + get_current_oxid(), COMMITSEQNO_INPROGRESS, + RowLockUpdate, + hint, BTreeLeafTupleNonDeleted, + &nullCallbackInfo); + Assert(IS_SYS_TREE_OIDS(desc->oids)); + /* no version is necessary here for system trees other than OTable */ + if (result == OBTreeModifyResultDeleted) + { + if (keyType == BTreeKeyLeafTuple) + o_wal_delete(desc, key, REPLICA_IDENTITY_DEFAULT, O_TABLE_INVALID_VERSION); + else if (keyType == BTreeKeyNonLeafKey) + o_wal_delete_key(desc, key, false, O_TABLE_INVALID_VERSION); + } + } + PG_CATCH(); + { + abort_autonomous_transaction(&state); + PG_RE_THROW(); + } + PG_END_TRY(); + finish_autonomous_transaction(&state); + } + else + { + result = o_btree_normal_modify(desc, BTreeOperationDelete, + key, keyType, + NULL, BTreeKeyNone, + InvalidOXid, COMMITSEQNO_INPROGRESS, + RowLockUpdate, + hint, BTreeLeafTupleNonDeleted, + &nullCallbackInfo); + } + + return (result == OBTreeModifyResultDeleted); +} diff --git 
a/contrib/orioledb/src/btree/page_chunks.c b/contrib/orioledb/src/btree/page_chunks.c new file mode 100644 index 00000000000..a7e5fe64afb --- /dev/null +++ b/contrib/orioledb/src/btree/page_chunks.c @@ -0,0 +1,1448 @@ +/*------------------------------------------------------------------------- + * + * page_chunks.c + * Internals of OrioleDB page chunks: routines for working with chunks + * and their items. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/src/btree/page_chunks.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/find.h" +#include "btree/insert.h" +#include "btree/page_chunks.h" +#include "btree/undo.h" +#include "recovery/recovery.h" +#include "transam/undo.h" +#include "tuple/format.h" +#include "utils/page_pool.h" +#include "utils/ucm.h" + +#include "access/transam.h" +#include "miscadmin.h" +#include "utils/memdebug.h" + +/* + * Load chunk to the partial page. 
+ */ +bool +partial_load_hikeys_chunk(PartialPageState *partial, Page img) +{ + uint64 imgState, + srcState; + Page src = partial->src; + LocationIndex chunkBegin, + chunkEnd; + BTreePageHeader *header = (BTreePageHeader *) img; + + if (!partial->isPartial || partial->hikeysChunkIsLoaded) + return true; + + chunkBegin = offsetof(BTreePageHeader, chunkDesc); + chunkEnd = header->hikeysEnd; + + Assert(chunkBegin >= 0 && chunkBegin <= ORIOLEDB_BLCKSZ); + Assert(chunkEnd >= 0 && chunkEnd <= ORIOLEDB_BLCKSZ); + + memcpy((Pointer) img + chunkBegin, + (Pointer) src + chunkBegin, + chunkEnd - chunkBegin); + + pg_read_barrier(); + + imgState = pg_atomic_read_u64(&(O_PAGE_HEADER(img)->state)); + srcState = pg_atomic_read_u64(&(O_PAGE_HEADER(src)->state)); + if ((imgState & PAGE_STATE_CHANGE_COUNT_MASK) != (srcState & PAGE_STATE_CHANGE_COUNT_MASK) || + O_PAGE_STATE_READ_IS_BLOCKED(srcState)) + return false; + + if (O_PAGE_GET_CHANGE_COUNT(img) != O_PAGE_GET_CHANGE_COUNT(src)) + return false; + + partial->hikeysChunkIsLoaded = true; + return true; +} + +/* + * Load chunk to the partial page. 
+ */ +bool +partial_load_chunk(PartialPageState *partial, Page img, + OffsetNumber chunkOffset, BTreePageItemLocator *loc) +{ + uint64 imgState = pg_atomic_read_u64(&(O_PAGE_HEADER(img)->state)), + srcState; + Page src = partial->src; + LocationIndex chunkBegin, + chunkEnd; + BTreePageHeader *header; + + if (!partial->isPartial || partial->chunkIsLoaded[chunkOffset]) + return true; + + if (partial->hikeysChunkIsLoaded) + { + header = (BTreePageHeader *) img; + chunkBegin = SHORT_GET_LOCATION(header->chunkDesc[chunkOffset].shortLocation); + if (chunkOffset + 1 < header->chunksCount) + { + if (loc) + loc->chunkItemsCount = header->chunkDesc[chunkOffset + 1].offset - + header->chunkDesc[chunkOffset].offset; + chunkEnd = SHORT_GET_LOCATION(header->chunkDesc[chunkOffset + 1].shortLocation); + } + else + { + if (loc) + loc->chunkItemsCount = header->itemsCount - + header->chunkDesc[chunkOffset].offset; + chunkEnd = header->dataSize; + } + } + else + { + header = (BTreePageHeader *) src; + chunkBegin = SHORT_GET_LOCATION(header->chunkDesc[chunkOffset].shortLocation); + if (chunkOffset + 1 < header->chunksCount) + { + if (loc) + loc->chunkItemsCount = header->chunkDesc[chunkOffset + 1].offset - + header->chunkDesc[chunkOffset].offset; + chunkEnd = SHORT_GET_LOCATION(header->chunkDesc[chunkOffset + 1].shortLocation); + } + else + { + if (loc) + loc->chunkItemsCount = header->itemsCount - + header->chunkDesc[chunkOffset].offset; + chunkEnd = header->dataSize; + } + + pg_read_barrier(); + + srcState = pg_atomic_read_u64(&(O_PAGE_HEADER(src)->state)); + if ((imgState & PAGE_STATE_CHANGE_COUNT_MASK) != (srcState & PAGE_STATE_CHANGE_COUNT_MASK) || + O_PAGE_STATE_READ_IS_BLOCKED(srcState)) + return false; + } + + Assert(chunkBegin >= 0 && chunkBegin <= ORIOLEDB_BLCKSZ); + Assert(chunkEnd >= 0 && chunkEnd <= ORIOLEDB_BLCKSZ); + + VALGRIND_CHECK_MEM_IS_DEFINED((Pointer) src + chunkBegin, + chunkEnd - chunkBegin); + + memcpy((Pointer) img + chunkBegin, + (Pointer) src + chunkBegin, 
+ chunkEnd - chunkBegin); + + pg_read_barrier(); + + srcState = pg_atomic_read_u64(&(O_PAGE_HEADER(src)->state)); + if ((imgState & PAGE_STATE_CHANGE_COUNT_MASK) != (srcState & PAGE_STATE_CHANGE_COUNT_MASK) || + O_PAGE_STATE_READ_IS_BLOCKED(srcState)) + return false; + + if (O_PAGE_GET_CHANGE_COUNT(img) != O_PAGE_GET_CHANGE_COUNT(src)) + return false; + + partial->chunkIsLoaded[chunkOffset] = true; + if (loc) + { + loc->chunkOffset = chunkOffset; + loc->itemOffset = 0; + loc->chunk = (BTreePageChunk *) ((Pointer) img + chunkBegin); + loc->chunkSize = chunkEnd - chunkBegin; + } + return true; +} + +BTreeItemPageFitType +page_locator_fits_item(BTreeDescr *desc, Page p, BTreePageItemLocator *locator, + LocationIndex size, bool replace, CommitSeqNo csn) +{ + int freeSpace = BTREE_PAGE_FREE_SPACE(p); + int spaceNeeded = size; + int compactedFreeSpace; + int oldItemSize; + + Assert(spaceNeeded == MAXALIGN(spaceNeeded)); + + if (!replace) + { + /* + * During insert of new item, take into account extension of chunk + * items array. + */ + spaceNeeded += + MAXALIGN((locator->chunkItemsCount + 1) * sizeof(LocationIndex)) - + MAXALIGN(locator->chunkItemsCount * sizeof(LocationIndex)); + } + else + { + /* + * During tuple replacement, take into account change in the item + * size. We can replace tuples only on leafs. + */ + Assert(O_PAGE_IS(p, LEAF)); + + oldItemSize = BTREE_PAGE_GET_ITEM_SIZE(p, locator); + + spaceNeeded -= oldItemSize; + Assert(spaceNeeded == MAXALIGN(spaceNeeded)); + } + + if (freeSpace >= spaceNeeded) + { + /* Already have enough of free space on the page */ + return BTreeItemPageFitAsIs; + } + + /* + * Tuple didn't fit "as is". Page needs compaction or split. + */ + Assert(spaceNeeded >= 0); + + /* + * Compaction is only possible on leafs, and not possible for bridge + * indexes. 
+ */ + if (!O_PAGE_IS(p, LEAF) || desc->type == oIndexBridge) + return BTreeItemPageFitSplitRequired; + + /* Start with optimistic estimate of free space after compaction */ + compactedFreeSpace = freeSpace + PAGE_GET_N_VACATED(p); + + if (replace) + { + /* Correct the estimation according to our tuple replacement */ + BTreeLeafTuphdr *tupHdr; + + tupHdr = (BTreeLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, locator); + + if (tupHdr->deleted) + { + /* + * If the current tuple is deleted, then the item size is already + * in out estimation. However, we also took into account for + * spaceNeeded. Correct our calculation of free space after + * compaction. + */ + compactedFreeSpace -= BTREE_PAGE_GET_ITEM_SIZE(p, locator); + } + else + { + OTuple tuple; + int oldItemCompactedSize; + + /* + * Similarly to the previous case, the possible tuple shrinking is + * both in our estimation of free space after compaction and in + * the spaceNeeded. + */ + BTREE_PAGE_READ_LEAF_TUPLE(tuple, p, locator); + oldItemCompactedSize = (BTreeLeafTuphdrSize + MAXALIGN(o_btree_len(desc, tuple, OTupleLength))); + + Assert(oldItemSize >= oldItemCompactedSize); + compactedFreeSpace -= (oldItemSize - oldItemCompactedSize); + } + } + + /* + * We have a chance to do a compation on leaf. Check if at least + * optimistic esimate will work. + */ + if (compactedFreeSpace < spaceNeeded) + return BTreeItemPageFitSplitRequired; + + /* + * Switch to real estimate. Real estimate is much slower, but there is a + * good chance to avoid a page split. For the tuple replacement case, + * skip item to be replaced from the calculation, as it's already taken + * into account for spaceNeeded. 
+ */ + if (!replace) + compactedFreeSpace = freeSpace + page_get_vacated_space(desc, p, csn); + else + compactedFreeSpace = freeSpace + page_get_vacated_skip_item(desc, p, csn, BTREE_PAGE_LOCATOR_GET_OFFSET(p, locator)); + + if (compactedFreeSpace >= spaceNeeded) + return BTreeItemPageFitCompactRequired; + else + return BTreeItemPageFitSplitRequired; +} + +void +init_page_first_chunk(BTreeDescr *desc, Page p, LocationIndex hikeySize) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + + Assert(hikeySize == MAXALIGN(hikeySize)); + + if (hikeySize == 0) + header->flags |= O_BTREE_FLAG_HIKEYS_FIXED; + + header->chunksCount = 1; + header->itemsCount = 0; + + header->hikeysEnd = MAXALIGN(sizeof(BTreePageHeader)) + hikeySize; + if (header->hikeysEnd > BTREE_PAGE_HIKEYS_END(desc, p)) + header->dataSize = header->hikeysEnd; + else + header->dataSize = BTREE_PAGE_HIKEYS_END(desc, p); + + header->chunkDesc[0].hikeyShortLocation = LOCATION_GET_SHORT(MAXALIGN(sizeof(BTreePageHeader))); + header->chunkDesc[0].shortLocation = LOCATION_GET_SHORT(header->dataSize); + header->chunkDesc[0].offset = 0; + header->chunkDesc[0].hikeyFlags = 0; + header->chunkDesc[0].chunkKeysFixed = 1; +} + +void +page_chunk_fill_locator(Page p, OffsetNumber chunkOffset, + BTreePageItemLocator *locator) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + + if (chunkOffset + 1 < header->chunksCount) + { + locator->chunkItemsCount = header->chunkDesc[chunkOffset + 1].offset - + header->chunkDesc[chunkOffset].offset; + locator->chunkSize = SHORT_GET_LOCATION(header->chunkDesc[chunkOffset + 1].shortLocation) - + SHORT_GET_LOCATION(header->chunkDesc[chunkOffset].shortLocation); + } + else + { + locator->chunkItemsCount = header->itemsCount - + header->chunkDesc[chunkOffset].offset; + locator->chunkSize = header->dataSize - + SHORT_GET_LOCATION(header->chunkDesc[chunkOffset].shortLocation); + } + locator->chunkOffset = chunkOffset; + locator->itemOffset = 0; + locator->chunk = (BTreePageChunk *) (p + 
SHORT_GET_LOCATION(header->chunkDesc[chunkOffset].shortLocation)); +} + +void +page_item_fill_locator(Page p, OffsetNumber itemOffset, + BTreePageItemLocator *locator) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + OffsetNumber chunkOffset; + + chunkOffset = 0; + while (chunkOffset < header->chunksCount - 1 && + itemOffset >= header->chunkDesc[chunkOffset + 1].offset) + chunkOffset++; + + page_chunk_fill_locator(p, chunkOffset, locator); + locator->itemOffset = itemOffset - header->chunkDesc[chunkOffset].offset; +} + +void +page_item_fill_locator_backwards(Page p, OffsetNumber itemOffset, + BTreePageItemLocator *locator) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + OffsetNumber chunkOffset; + + chunkOffset = header->chunksCount - 1; + while (itemOffset < header->chunkDesc[chunkOffset].offset) + { + Assert(chunkOffset > 0); + chunkOffset--; + } + + page_chunk_fill_locator(p, chunkOffset, locator); + locator->itemOffset = itemOffset - header->chunkDesc[chunkOffset].offset; +} + +/* + * Locate the next page item. + */ +bool +page_locator_next_chunk(Page p, BTreePageItemLocator *locator) +{ + while (locator->itemOffset >= locator->chunkItemsCount) + { + BTreePageHeader *header = (BTreePageHeader *) p; + + if (locator->chunkOffset + 1 < header->chunksCount) + { + page_chunk_fill_locator(p, locator->chunkOffset + 1, locator); + } + else + { + return false; + } + } + return true; +} + +/* + * Locate the next page item. + */ +bool +page_locator_prev_chunk(Page p, BTreePageItemLocator *locator) +{ + do + { + if (locator->chunkOffset > 0) + { + page_chunk_fill_locator(p, locator->chunkOffset - 1, locator); + } + else + { + locator->chunk = NULL; + return false; + } + } + while (locator->chunkItemsCount == 0); + locator->itemOffset = locator->chunkItemsCount - 1; + return true; +} + +/* + * Insert a new item of given size at the given location. 
+ */ +void +page_locator_insert_item(Page p, BTreePageItemLocator *locator, + LocationIndex itemsize) +{ + int itemsShift, + dataShift; + Pointer firstItemPtr, + itemPtr, + endPtr; + BTreePageHeader *header = (BTreePageHeader *) p; + OffsetNumber i; + + Assert(itemsize == MAXALIGN(itemsize)); + + /* Calculate the change of (maxaligned) item array size */ + itemsShift = MAXALIGN(sizeof(LocationIndex) * (locator->chunkItemsCount + 1)) - + MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount); + + /* Calculate the shift of the data after new item inserted */ + dataShift = itemsShift + itemsize; + + firstItemPtr = (Pointer) locator->chunk + + MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount); + + if (locator->itemOffset < locator->chunkItemsCount) + { + itemPtr = (Pointer) locator->chunk + + ITEM_GET_OFFSET(locator->chunk->items[locator->itemOffset]); + } + else + { + Assert(locator->itemOffset == locator->chunkItemsCount); + itemPtr = (Pointer) locator->chunk + locator->chunkSize; + } + endPtr = (Pointer) p + header->dataSize; + + /* Data should still fit to the page */ + Assert(endPtr + dataShift <= (Pointer) p + ORIOLEDB_BLCKSZ); + + /* Shift the data after insert location */ + Assert(itemPtr <= endPtr); + memmove(itemPtr + dataShift, itemPtr, endPtr - itemPtr); + + /* Adjust chunks parameters */ + for (i = locator->chunkOffset + 1; i < header->chunksCount; i++) + { + header->chunkDesc[i].shortLocation += LOCATION_GET_SHORT(dataShift); + header->chunkDesc[i].offset++; + } + header->itemsCount++; + header->dataSize += dataShift; + + if (itemsShift != 0) + { + /* + * If items array size is changed, then we have to also move the items + * before insert location and adjust those locations in the items + * array. 
+ */ + memmove(firstItemPtr + itemsShift, firstItemPtr, itemPtr - firstItemPtr); + for (i = 0; i < locator->itemOffset; i++) + locator->chunk->items[i] += itemsShift; + } + + /* Add new element to the items array */ + for (i = locator->chunkItemsCount; i > locator->itemOffset; i--) + locator->chunk->items[i] = locator->chunk->items[i - 1] + dataShift; + locator->chunk->items[locator->itemOffset] = (Pointer) itemPtr - (Pointer) locator->chunk + itemsShift; + + /* Adjust the locator */ + locator->chunkItemsCount++; + locator->chunkSize += dataShift; +} + +bool +page_locator_fits_new_item(Page p, BTreePageItemLocator *locator, + LocationIndex itemsize) +{ + LocationIndex sizeDiff; + + sizeDiff = MAXALIGN(sizeof(LocationIndex) * (locator->chunkItemsCount + 1)) - + MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount); + + sizeDiff += MAXALIGN(itemsize); + + return BTREE_PAGE_FREE_SPACE(p) >= sizeDiff; +} + +/* + * Get size of the item at given location. + */ +LocationIndex +page_locator_get_item_size(Page p, BTreePageItemLocator *locator) +{ + LocationIndex itemOffset, + nextItemOffset; + BTreePageHeader *header = (BTreePageHeader *) p; + + /* Calculate offset form the beginning of the chunk */ + itemOffset = ITEM_GET_OFFSET(locator->chunk->items[locator->itemOffset]); + if (locator->itemOffset + 1 < locator->chunkItemsCount) + { + /* Next item is in the same chunk */ + nextItemOffset = ITEM_GET_OFFSET(locator->chunk->items[locator->itemOffset + 1]); + } + else + { + /* + * Next item is in the next chunk. Recalculate offsets from the + * beginning of the page. + */ + itemOffset += (Pointer) locator->chunk - (Pointer) p; + if (locator->chunkOffset + 1 < header->chunksCount) + nextItemOffset = SHORT_GET_LOCATION(header->chunkDesc[locator->chunkOffset + 1].shortLocation); + else + nextItemOffset = header->dataSize; + } + return (nextItemOffset - itemOffset); +} + +/* + * Resizes page item under given locator. 
+ */ +void +page_locator_resize_item(Page p, BTreePageItemLocator *locator, + LocationIndex newsize) +{ + int dataShift; + Pointer nextItemPtr, + endPtr; + BTreePageHeader *header = (BTreePageHeader *) p; + OffsetNumber i; + + /* Calculate data shift */ + Assert(newsize == MAXALIGN(newsize)); + dataShift = newsize - page_locator_get_item_size(p, locator); + Assert(dataShift == MAXALIGN(dataShift)); + + if (dataShift == 0) + return; + + Assert(locator->itemOffset < locator->chunkSize); + if (locator->itemOffset + 1 < locator->chunkItemsCount) + nextItemPtr = (Pointer) locator->chunk + + ITEM_GET_OFFSET(locator->chunk->items[locator->itemOffset + 1]); + else + nextItemPtr = (Pointer) locator->chunk + locator->chunkSize; + endPtr = (Pointer) p + header->dataSize; + + Assert(endPtr + dataShift <= (Pointer) p + ORIOLEDB_BLCKSZ); + + /* Shift the data after the item */ + memmove(nextItemPtr + dataShift, nextItemPtr, endPtr - nextItemPtr); + + /* Adjust chunk positions */ + for (i = locator->chunkOffset + 1; i < header->chunksCount; i++) + header->chunkDesc[i].shortLocation += LOCATION_GET_SHORT(dataShift); + header->dataSize += dataShift; + + /* Adjust the items array */ + for (i = locator->itemOffset + 1; i < locator->chunkItemsCount; i++) + locator->chunk->items[i] += dataShift; + + /* Adjust the locator */ + locator->chunkSize += dataShift; +} + +/* + * Merge two chunks into one. 
+ */ +static void +page_merge_chunks(Page p, OffsetNumber index) +{ + LocationIndex tmpItems[BTREE_PAGE_MAX_CHUNK_ITEMS], + hikeyShift, + hikeyShift2, + shift1, + shift2; + OffsetNumber i, + count1, + count2; + BTreePageHeader *header = (BTreePageHeader *) p; + BTreePageItemLocator loc1, + loc2; + Pointer chunk1DataPtr, + chunk1EndPtr, + chunk2DataPtr, + endPtr, + p1_1, + p1_2, + p2_1, + p2_2; + int len1, + len2; + + Assert(index + 1 < header->chunksCount); + + page_chunk_fill_locator(p, index, &loc1); + page_chunk_fill_locator(p, index + 1, &loc2); + + count1 = loc1.chunkItemsCount; + count2 = loc2.chunkItemsCount; + + chunk1DataPtr = (Pointer) loc1.chunk + + MAXALIGN(sizeof(LocationIndex) * count1); + chunk1EndPtr = (Pointer) loc1.chunk + loc1.chunkSize; + + chunk2DataPtr = (Pointer) loc2.chunk + + MAXALIGN(sizeof(LocationIndex) * count2); + endPtr = (Pointer) p + header->dataSize; + + shift1 = MAXALIGN(sizeof(LocationIndex) * (count1 + count2)) - + MAXALIGN(sizeof(LocationIndex) * count1); /* growth of the first chunk's items array after the merge */ + shift2 = MAXALIGN(sizeof(LocationIndex) * count1) + + MAXALIGN(sizeof(LocationIndex) * count2) - + MAXALIGN(sizeof(LocationIndex) * (count1 + count2)); /* bytes saved by merging both items arrays */ + + tmpItems[0] = 0; + for (i = 0; i < count2; i++) + { + tmpItems[i] = loc2.chunk->items[i] - + MAXALIGN(sizeof(LocationIndex) * count2) + + MAXALIGN(sizeof(LocationIndex) * (count1 + count2)) + + (chunk1EndPtr - chunk1DataPtr); + } + + if (shift1 != 0) + { + for (i = 0; i < count1; i++) + loc1.chunk->items[i] += shift1; + memmove(chunk1DataPtr + shift1, + chunk1DataPtr, + chunk1EndPtr - chunk1DataPtr); + } + + if (shift2 != 0) + { + memmove(chunk2DataPtr - shift2, + chunk2DataPtr, + endPtr - chunk2DataPtr); + header->dataSize -= shift2; + } + + memcpy((Pointer) loc1.chunk + sizeof(LocationIndex) * count1, + tmpItems, + sizeof(LocationIndex) * count2); + + hikeyShift = MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + sizeof(BTreePageChunkDesc) * header->chunksCount) - + MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + 
sizeof(BTreePageChunkDesc) * (header->chunksCount - 1)); + hikeyShift2 = hikeyShift + + SHORT_GET_LOCATION(header->chunkDesc[index + 1].hikeyShortLocation) - + SHORT_GET_LOCATION(header->chunkDesc[index].hikeyShortLocation); + + p1_1 = (Pointer) p + SHORT_GET_LOCATION(header->chunkDesc[0].hikeyShortLocation) - hikeyShift; + p1_2 = (Pointer) p + SHORT_GET_LOCATION(header->chunkDesc[0].hikeyShortLocation); + len1 = SHORT_GET_LOCATION(header->chunkDesc[index].hikeyShortLocation) - + SHORT_GET_LOCATION(header->chunkDesc[0].hikeyShortLocation); + + p2_1 = (Pointer) p + SHORT_GET_LOCATION(header->chunkDesc[index].hikeyShortLocation) - hikeyShift; + p2_2 = (Pointer) p + SHORT_GET_LOCATION(header->chunkDesc[index + 1].hikeyShortLocation); + len2 = header->hikeysEnd - SHORT_GET_LOCATION(header->chunkDesc[index + 1].hikeyShortLocation); + + header->chunkDesc[index].hikeyFlags = header->chunkDesc[index + 1].hikeyFlags; + header->chunkDesc[index].chunkKeysFixed &= header->chunkDesc[index + 1].chunkKeysFixed; + for (i = index + 2; i < header->chunksCount; i++) + { + header->chunkDesc[i - 1].offset = header->chunkDesc[i].offset; + header->chunkDesc[i - 1].hikeyFlags = header->chunkDesc[i].hikeyFlags; + header->chunkDesc[i - 1].chunkKeysFixed = header->chunkDesc[i].chunkKeysFixed; + header->chunkDesc[i - 1].hikeyShortLocation = header->chunkDesc[i].hikeyShortLocation - LOCATION_GET_SHORT(hikeyShift2); + header->chunkDesc[i - 1].shortLocation = header->chunkDesc[i].shortLocation - LOCATION_GET_SHORT(shift2); + } + + if (hikeyShift > 0) + { + for (i = 0; i <= index; i++) + header->chunkDesc[i].hikeyShortLocation = header->chunkDesc[i].hikeyShortLocation - LOCATION_GET_SHORT(hikeyShift); + } + + if (hikeyShift > 0) + memmove(p1_1, p1_2, len1); + + memmove(p2_1, p2_2, len2); + + header->hikeysEnd -= hikeyShift2; + header->chunksCount--; + + header->flags |= O_BTREE_FLAG_HIKEYS_FIXED; /* cleared again below if any remaining hikey is not fixed-format */ + for (i = 0; i < header->chunksCount; i++) + { + if (!(header->chunkDesc[i].hikeyFlags & 
O_TUPLE_FLAGS_FIXED_FORMAT)) + header->flags &= ~O_BTREE_FLAG_HIKEYS_FIXED; + } + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); +} + +/* + * Deletes the page item under the given locator; if its chunk becomes empty, the chunk is merged with a neighbour. + */ +void +page_locator_delete_item(Page p, BTreePageItemLocator *locator) +{ + int itemsShift, + dataShift, + itemsize; + Pointer firstItemPtr, + itemPtr, + endPtr; + BTreePageHeader *header = (BTreePageHeader *) p; + OffsetNumber i; + + /* Get item size */ + itemsize = page_locator_get_item_size(p, locator); + Assert(itemsize == MAXALIGN(itemsize)); + + itemsShift = MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount) - + MAXALIGN(sizeof(LocationIndex) * (locator->chunkItemsCount - 1)); + dataShift = itemsShift + itemsize; + + firstItemPtr = (Pointer) locator->chunk + + MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount); + Assert(locator->itemOffset < locator->chunkSize); + itemPtr = (Pointer) locator->chunk + + ITEM_GET_OFFSET(locator->chunk->items[locator->itemOffset]); + endPtr = (Pointer) p + header->dataSize; + + Assert(endPtr - dataShift >= itemPtr - itemsShift); + + /* + * Adjust the items array. We should do this first to prevent it from being + * overwritten by the data when the array is shortened. + */ + for (i = locator->itemOffset; i < locator->chunkItemsCount - 1; i++) + locator->chunk->items[i] = locator->chunk->items[i + 1] - dataShift; + + if (itemsShift != 0) + { + /* Shift the data before the deleted item when the items array is shortened. 
*/ + memmove(firstItemPtr - itemsShift, firstItemPtr, itemPtr - firstItemPtr); + + /* Shift item pointers of those items */ + for (i = 0; i < locator->itemOffset; i++) + locator->chunk->items[i] -= itemsShift; + } + + /* Move the data after the deleted item */ + memmove(itemPtr - itemsShift, itemPtr + itemsize, endPtr - itemPtr - itemsize); + + /* Adjust position of following chunks */ + for (i = locator->chunkOffset + 1; i < header->chunksCount; i++) + { + header->chunkDesc[i].shortLocation -= LOCATION_GET_SHORT(dataShift); + header->chunkDesc[i].offset--; + } + header->itemsCount--; + header->dataSize -= dataShift; + + /* Adjust the locator */ + locator->chunkItemsCount--; + locator->chunkSize -= dataShift; + + if (locator->chunkItemsCount == 0) /* the chunk became empty: merge it with a neighbour, if there is one */ + { + if (locator->chunkOffset > 0) + { + page_merge_chunks(p, locator->chunkOffset - 1); + page_chunk_fill_locator(p, locator->chunkOffset - 1, locator); + locator->itemOffset = locator->chunkItemsCount; + } + else if (locator->chunkOffset + 1 < header->chunksCount) + { + page_merge_chunks(p, locator->chunkOffset); + page_chunk_fill_locator(p, locator->chunkOffset, locator); + } + } +} + +/* + * Split the given page chunk into two. 
+ */ +static void +page_split_chunk(Page p, BTreePageItemLocator *locator, + LocationIndex hikeysEnd, LocationIndex hikeySize) +{ + LocationIndex tmpItems[BTREE_PAGE_MAX_CHUNK_ITEMS], + leftItemsShift, + rightItemsShift, + dataShift, + chunkDescShift, + hikeyShift; + Pointer firstItemPtr, + itemPtr, + endPtr, + rightChunkPtr, + firstHikeyPtr, + hikeyPtr, + hikeyEndPtr; + bool leftChunkKeysFixed = true, + rightChunkKeysFixed = true; + OffsetNumber i, + leftItemsCount, + rightItemsCount; + BTreePageHeader *header = (BTreePageHeader *) p; + + + Assert(hikeySize == MAXALIGN(hikeySize)); + + leftItemsCount = locator->itemOffset; + rightItemsCount = locator->chunkItemsCount - locator->itemOffset; + firstItemPtr = (Pointer) locator->chunk + + MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount); + itemPtr = (Pointer) locator->chunk + + ITEM_GET_OFFSET(locator->chunk->items[locator->itemOffset]); + endPtr = (Pointer) p + header->dataSize; + Assert(firstItemPtr >= p && itemPtr >= firstItemPtr && endPtr >= itemPtr); + Assert(endPtr <= (Pointer) p + ORIOLEDB_BLCKSZ); + + /* + * Save positions of the items which go to the right chunk. We have to + * do this so that these items are not overwritten while the data is + * moved. Positions are counted from the beginning of the new chunk. 
+ */ + leftItemsShift = MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount) - + MAXALIGN(sizeof(LocationIndex) * leftItemsCount); + rightItemsShift = ITEM_GET_OFFSET(locator->chunk->items[locator->itemOffset]) - + MAXALIGN(sizeof(LocationIndex) * rightItemsCount); + for (i = locator->itemOffset; i < locator->chunkItemsCount; i++) + { + if (!(ITEM_GET_FLAGS(locator->chunk->items[i]) & O_TUPLE_FLAGS_FIXED_FORMAT)) + rightChunkKeysFixed = false; + tmpItems[i - locator->itemOffset] = locator->chunk->items[i] - rightItemsShift; + } + + VALGRIND_CHECK_MEM_IS_DEFINED(tmpItems, sizeof(tmpItems[0]) * rightItemsCount); + + /* + * Move the data items belonging to the left chunk according to the new size of + * the items array. + */ + for (i = 0; i < locator->itemOffset; i++) + { + if (!(ITEM_GET_FLAGS(locator->chunk->items[i]) & O_TUPLE_FLAGS_FIXED_FORMAT)) + leftChunkKeysFixed = false; + locator->chunk->items[i] -= leftItemsShift; + } + memmove(firstItemPtr - leftItemsShift, + firstItemPtr, + itemPtr - firstItemPtr); + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + + /* Shift the data items belonging to the right chunk */ + dataShift = MAXALIGN(sizeof(LocationIndex) * rightItemsCount) + + MAXALIGN(sizeof(LocationIndex) * leftItemsCount) - + MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount); + Assert(itemPtr + dataShift + (endPtr - itemPtr) <= (Pointer) p + ORIOLEDB_BLCKSZ); + memmove(itemPtr + dataShift, itemPtr, endPtr - itemPtr); + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + + /* Place the right chunk items array */ + rightChunkPtr = itemPtr - + MAXALIGN(sizeof(LocationIndex) * locator->chunkItemsCount) + + MAXALIGN(sizeof(LocationIndex) * leftItemsCount); + memcpy(rightChunkPtr, tmpItems, sizeof(LocationIndex) * rightItemsCount); + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + + /* Calculate shift of hikeys before the new hikey */ + chunkDescShift = MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + sizeof(BTreePageChunkDesc) * 
(header->chunksCount + 1)) - + MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + sizeof(BTreePageChunkDesc) * header->chunksCount); + /* Calculate shift of hikeys after the new hikey */ + hikeyShift = chunkDescShift + hikeySize; + + firstHikeyPtr = (Pointer) p + SHORT_GET_LOCATION(header->chunkDesc[0].hikeyShortLocation); + hikeyPtr = (Pointer) p + SHORT_GET_LOCATION(header->chunkDesc[locator->chunkOffset].hikeyShortLocation); + hikeyEndPtr = (Pointer) p + header->hikeysEnd; + Assert(firstHikeyPtr >= p && hikeyPtr >= firstHikeyPtr && hikeyEndPtr >= hikeyPtr); + + /* Move hikeys: first those from the split chunk's hikey onwards, then the ones before it */ + Assert(hikeyEndPtr + hikeyShift <= (Pointer) p + hikeysEnd); + memmove(hikeyPtr + hikeyShift, hikeyPtr, hikeyEndPtr - hikeyPtr); + memmove(firstHikeyPtr + chunkDescShift, firstHikeyPtr, hikeyPtr - firstHikeyPtr); + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + + /* Adjust chunk descs */ + for (i = 0; i <= locator->chunkOffset; i++) + header->chunkDesc[i].hikeyShortLocation += LOCATION_GET_SHORT(chunkDescShift); + + for (i = header->chunksCount; i > locator->chunkOffset; i--) + { + header->chunkDesc[i].hikeyShortLocation = header->chunkDesc[i - 1].hikeyShortLocation + LOCATION_GET_SHORT(hikeyShift); + header->chunkDesc[i].hikeyFlags = header->chunkDesc[i - 1].hikeyFlags; + header->chunkDesc[i].offset = header->chunkDesc[i - 1].offset; + header->chunkDesc[i].chunkKeysFixed = header->chunkDesc[i - 1].chunkKeysFixed; + header->chunkDesc[i].shortLocation = header->chunkDesc[i - 1].shortLocation + LOCATION_GET_SHORT(dataShift); + } + + i = locator->chunkOffset + 1; + header->chunkDesc[i].hikeyShortLocation = header->chunkDesc[i - 1].hikeyShortLocation + + LOCATION_GET_SHORT(hikeySize); + header->chunkDesc[i].offset = header->chunkDesc[i - 1].offset + leftItemsCount; + header->chunkDesc[i].shortLocation = LOCATION_GET_SHORT(rightChunkPtr - (Pointer) p); + header->chunkDesc[i].hikeyFlags = header->chunkDesc[i - 1].hikeyFlags; + header->chunkDesc[i].chunkKeysFixed = rightChunkKeysFixed ? 
1 : 0; + header->chunkDesc[i - 1].chunkKeysFixed = leftChunkKeysFixed ? 1 : 0; + header->chunksCount++; + header->hikeysEnd += hikeyShift; + header->dataSize += dataShift; + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + + page_chunk_fill_locator(p, i, locator); /* i is the index of the new right chunk */ +} + +#define MAXALIGN_WASTE(s) \ + ((MAXIMUM_ALIGNOF - 1) - ((s) + (MAXIMUM_ALIGNOF - 1)) % (MAXIMUM_ALIGNOF)) + +void +page_split_chunk_if_needed(BTreeDescr *desc, Page p, BTreePageItemLocator *locator) +{ + OffsetNumber i, + chunkOffset; + LocationIndex hikeysFreeSpace, + dataFreeSpace, + newChunkDescSize; + BTreePageHeader *header = (BTreePageHeader *) p; + int bestOffset = -1; + float4 bestScore = 0.0f; + LocationIndex bestHiKeySize = 0, + bestHiKeySizeUnaligned = 0, + hikeysEnd = BTREE_PAGE_HIKEYS_END(desc, p); + OFixedKey newHikey; + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + VALGRIND_MAKE_MEM_DEFINED(p, ORIOLEDB_BLCKSZ); + + if (header->hikeysEnd >= hikeysEnd) + return; /* the hikeys area has no free space left */ + + chunkOffset = locator->chunkOffset; + + if ((float4) locator->chunkSize / (float4) (ORIOLEDB_BLCKSZ - hikeysEnd) < + (float4) MAXALIGN(header->maxKeyLen) * 2.0f / (float4) (hikeysEnd - offsetof(BTreePageHeader, chunkDesc))) + return; + + hikeysFreeSpace = hikeysEnd - header->hikeysEnd; + + /* + * Reserve some hikeys space on rightmost page to protect from the + * overflow of hikeyLocation. 
+ */ + if (O_PAGE_IS(p, RIGHTMOST) && + hikeysEnd == HIKEY_SHORT_LOCATION_LIMIT && + hikeysFreeSpace >= SHORT_LOCATION_MULTIPLIER) + hikeysFreeSpace -= SHORT_LOCATION_MULTIPLIER; + + newChunkDescSize = MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + + sizeof(BTreePageChunkDesc) * + (header->chunksCount + 1)) - + MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + + sizeof(BTreePageChunkDesc) * + header->chunksCount); + /* Also don't split if there is no space for the new chunkDesc */ + if (hikeysFreeSpace > newChunkDescSize) + hikeysFreeSpace -= newChunkDescSize; + else + hikeysFreeSpace = 0; /* Just to be more explicit about not having + * enough space */ + dataFreeSpace = ORIOLEDB_BLCKSZ - header->dataSize; + + for (i = 1; i < locator->chunkItemsCount; i++) + { + LocationIndex hikeySize, + hikeySizeUnaligned, + dataSize, + leftDataSize, + rightDataSize; + float4 score; + + locator->itemOffset = i; + if (O_PAGE_IS(p, LEAF)) + { + OTuple tuple; + + tuple.data = BTREE_PAGE_LOCATOR_GET_ITEM(p, locator) + BTreeLeafTuphdrSize; + tuple.formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locator); + hikeySizeUnaligned = o_btree_len(desc, tuple, OTupleKeyLengthNoVersion); + hikeySize = MAXALIGN(hikeySizeUnaligned); + } + else + { + hikeySize = BTREE_PAGE_GET_ITEM_SIZE(p, locator) - + BTreeNonLeafTuphdrSize; + hikeySizeUnaligned = hikeySize; + } + if (hikeySize > hikeysFreeSpace) + continue; + + dataSize = MAXALIGN(i * sizeof(LocationIndex)) + + MAXALIGN((locator->chunkItemsCount - i) * sizeof(LocationIndex)) - + MAXALIGN(locator->chunkItemsCount * sizeof(LocationIndex)); + + if (dataSize > dataFreeSpace) + continue; + + leftDataSize = ITEM_GET_OFFSET(locator->chunk->items[locator->itemOffset]); + rightDataSize = locator->chunkSize - leftDataSize; + leftDataSize -= MAXALIGN(locator->chunkItemsCount * sizeof(LocationIndex)); + + score = (float4) Min(leftDataSize, rightDataSize) / (float4) hikeySize; /* higher for balanced halves and for shorter hikeys */ + + if (score > bestScore) + { + bestOffset = i; + bestHiKeySize = hikeySize; + 
bestHiKeySizeUnaligned = hikeySizeUnaligned; + bestScore = score; + } + } + + if (bestOffset < 0) + return; /* no acceptable split point was found */ + + locator->itemOffset = bestOffset; + if (O_PAGE_IS(p, LEAF)) + { + OTuple tuple; + bool allocated; + + tuple.data = BTREE_PAGE_LOCATOR_GET_ITEM(p, locator) + BTreeLeafTuphdrSize; + tuple.formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locator); + newHikey.tuple = o_btree_tuple_make_key(desc, tuple, newHikey.fixedData, + false, &allocated); + if (bestHiKeySize != bestHiKeySizeUnaligned) + memset(newHikey.fixedData + bestHiKeySizeUnaligned, + 0, + bestHiKeySize - bestHiKeySizeUnaligned); /* zero the alignment padding after the key */ + Assert(allocated == false); + VALGRIND_CHECK_MEM_IS_DEFINED(newHikey.fixedData, bestHiKeySizeUnaligned); + VALGRIND_CHECK_MEM_IS_DEFINED(newHikey.fixedData, bestHiKeySize); + } + else + { + OTuple key; + + key.data = BTREE_PAGE_LOCATOR_GET_ITEM(p, locator) + BTreeNonLeafTuphdrSize; + key.formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locator); + copy_fixed_key(desc, &newHikey, key); + VALGRIND_CHECK_MEM_IS_DEFINED(newHikey.fixedData, bestHiKeySize); + } + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + + page_split_chunk(p, locator, hikeysEnd, bestHiKeySize); + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + + memcpy(p + SHORT_GET_LOCATION(header->chunkDesc[chunkOffset].hikeyShortLocation), + newHikey.fixedData, + bestHiKeySize); + header->chunkDesc[chunkOffset].hikeyFlags = newHikey.tuple.formatFlags; + if (!(newHikey.tuple.formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT)) + header->flags &= ~O_BTREE_FLAG_HIKEYS_FIXED; + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); +} + +#ifdef NOT_USED +static void +check_page(BTreeDescr *desc, Page p) +{ + BTreePageItemLocator loc; + OTuple prev, + tup; + BTreeKeyType kind = O_PAGE_IS(p, LEAF) ? 
BTreeTuple : BTreeKey; + BTreePageHeader *header = (BTreePageHeader *) p; + + O_TUPLE_SET_NULL(prev); + + BTREE_PAGE_LOCATOR_FIRST(p, &loc); + + if (!O_PAGE_IS(p, LEAF)) + BTREE_PAGE_LOCATOR_NEXT(p, &loc); /* the first item of a non-leaf page is not checked */ + + while (BTREE_PAGE_LOCATOR_IS_VALID(p, &loc)) + { + BTREE_PAGE_READ_TUPLE(tup, p, &loc); + + if (!O_TUPLE_IS_NULL(prev)) + { + Assert(o_btree_cmp(desc, &prev, kind, &tup, kind) < 0); + } + + if (loc.chunkOffset < header->chunksCount - 1 || !O_PAGE_IS(p, RIGHTMOST)) + { + OTuple chunkHikey; + + chunkHikey.data = p + SHORT_GET_LOCATION(header->chunkDesc[loc.chunkOffset].hikeyShortLocation); + chunkHikey.formatFlags = header->chunkDesc[loc.chunkOffset].hikeyFlags; + Assert(o_btree_cmp(desc, &tup, kind, &chunkHikey, BTreeKey) < 0); + } + + prev = tup; + BTREE_PAGE_LOCATOR_NEXT(p, &loc); + } +} +#endif + +static LocationIndex +item_get_key_size(BTreeDescr *desc, bool leaf, BTreePageItem *item) +{ + OTuple tuple; + + if (leaf) + { + tuple.data = item->data + BTreeLeafTuphdrSize; + tuple.formatFlags = item->flags; + return MAXALIGN(o_btree_len(desc, tuple, OTupleKeyLengthNoVersion)); + } + else + { + tuple.data = item->data + BTreeNonLeafTuphdrSize; + tuple.formatFlags = item->flags; + return MAXALIGN(o_btree_len(desc, tuple, OKeyLength)); + } +} + +/* + * Split the page containing the single chunk into multiple chunks: lays out `items`, fills the chunk descs and chunk hikeys, then places `hikey` unless the page is rightmost. + */ +void +btree_page_reorg(BTreeDescr *desc, Page p, BTreePageItem *items, + OffsetNumber count, LocationIndex hikeySize, OTuple hikey) +{ + int chunksCount; + LocationIndex totalDataSize, + itemHeaderSize = O_PAGE_IS(p, LEAF) ? 
BTreeLeafTuphdrSize : BTreeNonLeafTuphdrSize; + BTreePageChunk *chunk; + BTreePageHeader *header = (BTreePageHeader *) p; + Pointer ptr, + hikeysPtr; + bool chunkFixedKeys[BTREE_PAGE_MAX_CHUNKS]; + bool fixedKeys = true; + OffsetNumber chunkOffsets[BTREE_PAGE_MAX_CHUNKS + 1]; + LocationIndex itemsArray[BTREE_PAGE_MAX_CHUNK_ITEMS]; + int i, + j; + LocationIndex hikeysFreeSpace, + hikeysFreeSpaceLeft; + LocationIndex dataFreeSpace, + dataFreeSpaceLeft, + hikeysEnd; + bool isRightmost = O_PAGE_IS(p, RIGHTMOST); + LocationIndex chunkDataSize; + LocationIndex maxKeyLen; + + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); + VALGRIND_MAKE_MEM_DEFINED(p, ORIOLEDB_BLCKSZ); + + hikeysEnd = Max(BTREE_PAGE_HIKEYS_END(desc, p), MAXALIGN(sizeof(BTreePageHeader)) + MAXALIGN(hikeySize)); + + totalDataSize = 0; + for (i = 0; i < count; i++) + totalDataSize += items[i].size; + + hikeysFreeSpaceLeft = hikeysFreeSpace = hikeysEnd - (MAXALIGN(sizeof(BTreePageHeader)) + MAXALIGN(hikeySize)); + + /* + * Reserve some hikeys space on the rightmost page to protect from the + * overflow of hikeyLocation. + */ + if (isRightmost && + hikeysEnd == HIKEY_SHORT_LOCATION_LIMIT && + hikeysFreeSpace >= SHORT_LOCATION_MULTIPLIER) + hikeysFreeSpace -= SHORT_LOCATION_MULTIPLIER; /* NOTE(review): only hikeysFreeSpace is reduced; hikeysFreeSpaceLeft, which gates new chunks further down, keeps the full value -- confirm this is intended */ + + dataFreeSpaceLeft = dataFreeSpace = (ORIOLEDB_BLCKSZ - hikeysEnd) - totalDataSize - MAXALIGN(sizeof(LocationIndex) * count); + + /* + * Calculate the chunks count to fit both chunks area and data area. 
+ */ + maxKeyLen = MAXALIGN(hikeySize); + + /* Calculate chunk boundaries: chunkOffsets[k] is the index of the first item of chunk k */ + chunkOffsets[0] = 0; + j = 1; + chunkDataSize = 0; + if (count >= 1) + { + chunkDataSize += items[0].size; + if (O_PAGE_IS(p, LEAF) && !(items[0].flags & O_TUPLE_FLAGS_FIXED_FORMAT)) + fixedKeys = false; + } + if (O_PAGE_IS(p, LEAF) && count > 0) + maxKeyLen = Max(maxKeyLen, item_get_key_size(desc, O_PAGE_IS(p, LEAF), &items[0])); + + for (i = 1; i < count; i++) + { + LocationIndex nextKeySize, + hikeySizeDiff, + dataSpaceDiff; + float4 dataSizeRatio; + + nextKeySize = item_get_key_size(desc, O_PAGE_IS(p, LEAF), &items[i]); + maxKeyLen = Max(maxKeyLen, nextKeySize); + hikeySizeDiff = nextKeySize + + (MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + sizeof(BTreePageChunkDesc) * (j + 1)) - + MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + sizeof(BTreePageChunkDesc) * j)); + dataSpaceDiff = MAXALIGN_WASTE(sizeof(LocationIndex) * (i - chunkOffsets[j - 1])); + + if (hikeySizeDiff > hikeysFreeSpaceLeft || + dataSpaceDiff > dataFreeSpaceLeft) + { + if (!(items[i].flags & O_TUPLE_FLAGS_FIXED_FORMAT)) + fixedKeys = false; + chunkDataSize += items[i].size; + continue; /* no room to start a new chunk here: the item stays in the current chunk */ + } + + dataSizeRatio = (float4) chunkDataSize / (float4) totalDataSize; + if (dataSizeRatio >= (float4) (nextKeySize + sizeof(BTreePageChunkDesc)) / (float4) hikeysFreeSpace && + dataSizeRatio >= (float4) dataSpaceDiff / (float4) dataFreeSpace) + { + hikeysFreeSpaceLeft -= hikeySizeDiff; + dataFreeSpaceLeft -= dataSpaceDiff; + chunkOffsets[j] = i; + chunkFixedKeys[j - 1] = fixedKeys; + fixedKeys = true; + chunkDataSize = 0; + j++; + } + + if (!(items[i].flags & O_TUPLE_FLAGS_FIXED_FORMAT)) + fixedKeys = false; + chunkDataSize += items[i].size; + } + Assert(j <= BTREE_PAGE_MAX_CHUNKS); + chunkOffsets[j] = count; + chunkFixedKeys[j - 1] = fixedKeys; + chunksCount = j; + + /* Calculate chunk items: offsets are counted from the start of the item's chunk */ + ptr = (Pointer) p + hikeysEnd; + for (j = 0; j < chunksCount; j++) + { + OffsetNumber chunkItemsCount; + LocationIndex itemShift; + + 
chunkItemsCount = chunkOffsets[j + 1] - chunkOffsets[j]; + itemShift = MAXALIGN(sizeof(LocationIndex) * chunkItemsCount); + + for (i = chunkOffsets[j]; i < chunkOffsets[j + 1]; i++) + { + itemsArray[i] = ITEM_SET_FLAGS(itemShift, items[i].flags); + itemShift += items[i].size; + } + + ptr += itemShift; + } + + header->maxKeyLen = maxKeyLen; + header->dataSize = ptr - (Pointer) p; + header->chunksCount = chunksCount; + + /* + * Place the chunks data. This pass goes backwards and only moves in-page + * items forwards, so that data is not overwritten before it is moved. + */ + for (j = chunksCount - 1; j >= 0; j--) + { + OffsetNumber chunkItemsCount; + + chunkItemsCount = chunkOffsets[j + 1] - chunkOffsets[j]; + + for (i = chunkOffsets[j + 1] - 1; i >= chunkOffsets[j]; i--) + { + ptr -= items[i].size; + + if (items[i].data >= p && items[i].data < p + ORIOLEDB_BLCKSZ && + ptr > items[i].data) + memmove(ptr, items[i].data, items[i].size); + } + + ptr -= MAXALIGN(sizeof(LocationIndex) * chunkItemsCount); + } + + /* Place chunks item arrays, fill chunk descs and place the items not moved by the backward pass */ + Assert(ptr == (Pointer) p + hikeysEnd); + hikeysPtr = (Pointer) p + MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + sizeof(BTreePageChunkDesc) * chunksCount); + fixedKeys = true; + for (j = 0; j < chunksCount; j++) + { + OffsetNumber chunkItemsCount; + + chunkItemsCount = chunkOffsets[j + 1] - chunkOffsets[j]; + i = chunkOffsets[j]; + memmove(ptr, &itemsArray[i], sizeof(LocationIndex) * chunkItemsCount); + chunk = (BTreePageChunk *) ptr; + header->chunkDesc[j].shortLocation = LOCATION_GET_SHORT(ptr - (Pointer) p); + header->chunkDesc[j].offset = chunkOffsets[j]; + header->chunkDesc[j].chunkKeysFixed = chunkFixedKeys[j]; + ptr += MAXALIGN(sizeof(LocationIndex) * chunkItemsCount); + + for (i = chunkOffsets[j]; i < chunkOffsets[j + 1]; i++) + { + if (!(items[i].data >= p && items[i].data < p + ORIOLEDB_BLCKSZ) || + ptr < items[i].data) + memmove(ptr, items[i].data, items[i].size); + ptr += items[i].size; + } + + if (j > 0) + { + OTuple 
chunkHikeyTuple; + LocationIndex chunkHikeySize; + + chunkHikeyTuple.formatFlags = ITEM_GET_FLAGS(chunk->items[0]); + chunkHikeyTuple.data = (Pointer) chunk + ITEM_GET_OFFSET(chunk->items[0]) + itemHeaderSize; + if (O_PAGE_IS(p, LEAF)) + { + bool shouldFree; + + chunkHikeyTuple = o_btree_tuple_make_key(desc, chunkHikeyTuple, hikeysPtr, false, &shouldFree); + Assert(chunkHikeyTuple.data == hikeysPtr); + Assert(!shouldFree); + } + + chunkHikeySize = MAXALIGN(o_btree_len(desc, chunkHikeyTuple, OKeyLength)); + if (!(chunkHikeyTuple.formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT)) + fixedKeys = false; + if (chunkHikeyTuple.data != hikeysPtr) + memcpy(hikeysPtr, chunkHikeyTuple.data, chunkHikeySize); + header->chunkDesc[j - 1].hikeyFlags = chunkHikeyTuple.formatFlags; + header->chunkDesc[j - 1].hikeyShortLocation = LOCATION_GET_SHORT(hikeysPtr - (Pointer) p); + hikeysPtr += chunkHikeySize; + Assert((hikeysPtr - (Pointer) p) <= hikeysEnd); + } + } + + /* Place the page hikey; here j == chunksCount, so chunkDesc[j - 1] is the last chunk */ + if (!isRightmost) + { + if (!(hikey.formatFlags & O_TUPLE_FLAGS_FIXED_FORMAT)) + fixedKeys = false; + memcpy(hikeysPtr, hikey.data, hikeySize); + if (hikeySize != MAXALIGN(hikeySize)) + memset(hikeysPtr + hikeySize, 0, MAXALIGN(hikeySize) - hikeySize); + header->chunkDesc[j - 1].hikeyFlags = hikey.formatFlags; + header->chunkDesc[j - 1].hikeyShortLocation = LOCATION_GET_SHORT(hikeysPtr - (Pointer) p); + hikeysPtr += MAXALIGN(hikeySize); + Assert((hikeysPtr - (Pointer) p) <= hikeysEnd); + } + else + { + header->chunkDesc[j - 1].hikeyFlags = 0; + header->chunkDesc[j - 1].hikeyShortLocation = LOCATION_GET_SHORT(hikeysPtr - (Pointer) p); + } + header->hikeysEnd = hikeysPtr - (Pointer) p; + header->itemsCount = count; + header->flags |= O_BTREE_FLAG_HIKEYS_FIXED; + if (!fixedKeys) + header->flags &= ~O_BTREE_FLAG_HIKEYS_FIXED; + VALGRIND_CHECK_MEM_IS_DEFINED(p, ORIOLEDB_BLCKSZ); +} + +void +split_page_by_chunks(BTreeDescr *desc, Page p) +{ + BTreePageItemLocator loc; + BTreePageItem 
items[BTREE_PAGE_MAX_CHUNK_ITEMS]; + int i = 0; + OFixedKey hikey; + LocationIndex hikeySize; + + BTREE_PAGE_FOREACH_ITEMS(p, &loc) + { + items[i].data = BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc); + items[i].flags = BTREE_PAGE_GET_ITEM_FLAGS(p, &loc); + items[i].size = BTREE_PAGE_GET_ITEM_SIZE(p, &loc); + i++; + } + + if (O_PAGE_IS(p, RIGHTMOST)) + { + O_TUPLE_SET_NULL(hikey.tuple); + hikeySize = 0; + } + else + { + copy_fixed_hikey(desc, &hikey, p); + hikeySize = BTREE_PAGE_GET_HIKEY_SIZE(p); + } + + btree_page_reorg(desc, p, items, i, hikeySize, hikey.tuple); /* i == number of items collected above */ +} + +bool +page_locator_find_real_item(Page p, PartialPageState *partial, + BTreePageItemLocator *locator) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + OffsetNumber offset; + + while (locator->itemOffset >= locator->chunkItemsCount) + { + if (locator->chunkOffset >= header->chunksCount - 1) + return true; /* already in the last chunk: nothing to advance to */ + + offset = locator->itemOffset - locator->chunkItemsCount; + if (partial) + { + if (!partial_load_chunk(partial, p, locator->chunkOffset + 1, locator)) + return false; /* partial_load_chunk() reported failure */ + } + else + { + page_chunk_fill_locator(p, locator->chunkOffset + 1, locator); + } + locator->itemOffset = offset; + } + return true; +} + +OffsetNumber +page_locator_get_offset(Page p, BTreePageItemLocator *locator) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + + return header->chunkDesc[locator->chunkOffset].offset + locator->itemOffset; +} diff --git a/contrib/orioledb/src/btree/page_contents.c b/contrib/orioledb/src/btree/page_contents.c new file mode 100644 index 00000000000..4efc09b1144 --- /dev/null +++ b/contrib/orioledb/src/btree/page_contents.c @@ -0,0 +1,866 @@ +/*------------------------------------------------------------------------- + * + * page_contents.c + * Low-level routines for working with b-tree page contents. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/src/btree/page_contents.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/find.h" +#include "btree/page_chunks.h" +#include "btree/undo.h" +#include "recovery/recovery.h" +#include "tableam/descr.h" +#include "transam/oxid.h" +#include "transam/undo.h" +#include "utils/page_pool.h" +#include "utils/ucm.h" + +#include "access/transam.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/proclist.h" +#include "storage/s_lock.h" +#include "utils/memdebug.h" + +/* + * Navigates and reads the page image from the undo log into `img` according to `key` of + * `keyType` and `csn`. Saves lokey of the page to lokey if *lokey != NULL. Returns O_UNDO_GET_IMAGE_LOCATION() of the last undo record read. + */ +UndoLocation +read_page_from_undo(BTreeDescr *desc, Page img, UndoLocation undo_loc, + CommitSeqNo csn, void *key, BTreeKeyType keyType, + OFixedKey *lokey) +{ + BTreePageHeader *header; + CommitSeqNo page_csn; + UndoLocation rec_undo_location; + bool is_left = true; + UndoLogType undoType PG_USED_FOR_ASSERTS_ONLY = GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType); + + Assert(UndoLocationIsValid(undo_loc)); + + while (true) + { + /* Read page image from page-level undo item */ + get_page_from_undo(desc, undo_loc, key, keyType, img, + &is_left, NULL, lokey, NULL, NULL); + + header = (BTreePageHeader *) img; + page_csn = header->csn; + rec_undo_location = header->undoLocation; + + /* Page-level undo item should be retained */ + Assert(UNDO_REC_EXISTS(undoType, undo_loc)); + + /* Continue traversing the undo chain while the image read has a normal csn that is >= `csn` */ + if (COMMITSEQNO_IS_NORMAL(page_csn) && page_csn >= csn) + { + undo_loc = rec_undo_location; + continue; + } + else + { + break; + } + } + + /* Page-level undo item should be retained */ + Assert(UNDO_REC_EXISTS(undoType, undo_loc)); + + return O_UNDO_GET_IMAGE_LOCATION(undo_loc, is_left); +} + +/* + * Try to copy consistent image of page with page number = 
blkno to dest. + */ +static inline ReadPageResult +try_copy_page(OInMemoryBlkno blkno, uint32 pageChangeCount, Page dest, + PartialPageState *partial, bool loadHikeysChunk, + CommitSeqNo *readCsn) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + uint64 state1, + state2; + bool hiKeysEndOK PG_USED_FOR_ASSERTS_ONLY = true; + PagePool *ppool; + + state1 = pg_atomic_read_u64(&(O_PAGE_HEADER(p)->state)); + if (O_PAGE_STATE_READ_IS_BLOCKED(state1)) + return ReadPageResultFailed; /* reads of this page are blocked right now */ + + pg_read_barrier(); + + if (partial) + { + BTreePageHeader *header = (BTreePageHeader *) p; + LocationIndex hikeysEnd = loadHikeysChunk ? header->hikeysEnd : offsetof(BTreePageHeader, chunkDesc); + + pg_read_barrier(); + + if (!loadHikeysChunk || (hikeysEnd >= sizeof(BTreePageHeader) && hikeysEnd < ORIOLEDB_BLCKSZ)) + memcpy(dest, p, hikeysEnd); /* copy only the page prefix up to hikeysEnd */ + else + hiKeysEndOK = false; /* out-of-range hikeysEnd; only asserted after the state re-check below */ + + partial->isPartial = true; + partial->hikeysChunkIsLoaded = loadHikeysChunk; + partial->src = p; + memset(&partial->chunkIsLoaded, 0, sizeof(bool) * BTREE_PAGE_MAX_CHUNKS); + } + else + memcpy(dest, p, ORIOLEDB_BLCKSZ); + + if (readCsn) + *readCsn = pg_atomic_read_u64(&TRANSAM_VARIABLES->nextCommitSeqNo); + + pg_read_barrier(); + state2 = pg_atomic_read_u64(&(O_PAGE_HEADER(p)->state)); + + if ((state1 & PAGE_STATE_CHANGE_COUNT_MASK) != (state2 & PAGE_STATE_CHANGE_COUNT_MASK) || + O_PAGE_STATE_READ_IS_BLOCKED(state2)) + return ReadPageResultFailed; /* the page state changed (or reads got blocked) during the copy */ + + if (O_PAGE_GET_CHANGE_COUNT(p) != pageChangeCount) + return ReadPageResultWrongPageChangeCount; + + Assert(hiKeysEndOK); + + ppool = get_ppool_by_blkno(blkno); + ppool_ucm_inc_usage(ppool, blkno); + + return ReadPageResultOK; +} + +/* + * Copy consistent image of page with page number = blkno to dest. 
+ */ +static inline bool +copy_page(OInMemoryBlkno blkno, uint32 pageChangeCount, Page dest, + PartialPageState *partial, bool loadHikeysChunk, + CommitSeqNo *readCsn) +{ + while (true) + { + ReadPageResult result; + + result = try_copy_page(blkno, pageChangeCount, dest, + partial, loadHikeysChunk, readCsn); + + if (result == ReadPageResultOK) + return true; + else if (result == ReadPageResultWrongPageChangeCount) + return false; + (void) page_wait_for_read_enable(blkno); /* any other result: wait, then retry the copy */ + } +} + +/* + * Read in-memory page number `blkno` into `img`. Check expected + * `pageChangeCount`. Look up the undo page image according to `csn`, `key` and + * `keyType`. Returns false when the page slot is gone or the change count does not match. + */ +bool +o_btree_read_page(BTreeDescr *desc, OInMemoryBlkno blkno, + uint32 pageChangeCount, Page img, + CommitSeqNo csn, void *key, BTreeKeyType keyType, + OFixedKey *lokey, PartialPageState *partial, + bool loadHikeysChunk, UndoLocation *undoLocation, + CommitSeqNo *readCsn) +{ + Page p; + BTreePageHeader *header; + CommitSeqNo headerCsn; + UndoLocation headerUndoLocation; + bool read_undo; + + Assert(pageChangeCount != InvalidOPageChangeCount); + + /* + * For local pool pages, the slot may have been reclaimed by a reentrant + * eviction that ran between the caller capturing this downlink and now. + * Treat a NULL slot as a read failure so the caller can refetch the + * downlink from the parent (which now points to disk). + */ + if (O_PAGE_IS_LOCAL(blkno) && + local_ppool_pages[blkno & O_BLKNO_MASK] == NULL) + return false; + + p = O_GET_IN_MEMORY_PAGE(blkno); + header = (BTreePageHeader *) p; + read_undo = O_PAGE_IS(p, LEAF); + + EA_READ_INC(blkno); + + /*--- + * Do we need to load the page image from undo? + * + * We do this check without holding a page lock or even using the state + * protocol. Instead we ensure correctness of this check in the following + * way. + * + * 1. We read csn before undo location (ensured with memory barriers). + * We write csn after undo location (also ensured with memory barriers). 
+ * Thus, the undo location we read is probably more recent than the csn. That could + * lead to traversing an extra step of the undo chain, which is not a problem. + * It could also lead to missing the need to read undo, but that would + * be caught by the subsequent check. + * 2. We check page change count after reading csn and undo location. That + * ensures the page wasn't reused for something while reading csn and undo + * location. Note that there is at least one memory barrier between + * increasing page change count and reusing the page during page unlock. + */ + headerCsn = header->csn; + + if (read_undo && COMMITSEQNO_IS_NORMAL(csn) && headerCsn >= csn) + { + UndoLocation pageUndoLoc; + + pg_read_barrier(); + headerUndoLocation = header->undoLocation; + pg_read_barrier(); + if (header->o_header.pageChangeCount != pageChangeCount) + return false; + + pageUndoLoc = read_page_from_undo(desc, img, headerUndoLocation, csn, + key, keyType, lokey); + header = (BTreePageHeader *) img; + header->o_header.pageChangeCount = pageChangeCount; + if (partial) + partial->isPartial = false; /* img was filled from undo, not by a partial copy */ + if (undoLocation) + *undoLocation = pageUndoLoc; + if (readCsn) + *readCsn = header->csn; + return true; + } + + if (!copy_page(blkno, pageChangeCount, img, partial, + loadHikeysChunk, readCsn)) + return false; + header = (BTreePageHeader *) img; + + /* Re-try reading page-level undo item due to concurrent changes */ + if (read_undo && COMMITSEQNO_IS_NORMAL(csn) && header->csn >= csn) + { + UndoLocation pageUndoLoc; + + pageUndoLoc = read_page_from_undo(desc, img, header->undoLocation, csn, + key, keyType, lokey); + header = (BTreePageHeader *) img; + header->o_header.pageChangeCount = pageChangeCount; + if (partial) + partial->isPartial = false; + if (undoLocation) + *undoLocation = pageUndoLoc; + if (readCsn) + *readCsn = header->csn; + return true; + } + + if (undoLocation) + *undoLocation = InvalidUndoLocation; + + return true; +} + +/* + * Try to read page with concurrent changes. 
Returns ReadPageResultOK on success, or the failing ReadPageResult otherwise. + */ +ReadPageResult +o_btree_try_read_page(BTreeDescr *desc, OInMemoryBlkno blkno, uint32 pageChangeCount, Page img, + CommitSeqNo csn, Pointer key, BTreeKeyType keyType, + PartialPageState *partial, bool loadHikeysChunk, + CommitSeqNo *readCsn) +{ + Page p; + BTreePageHeader *header; + bool read_undo; + ReadPageResult result; + + Assert(pageChangeCount != InvalidOPageChangeCount); + + /* + * For local pool pages, the slot may have been reclaimed by a reentrant + * eviction that ran between the caller capturing this downlink and now. + * Treat a NULL slot as a read failure so the caller can refetch the + * downlink from the parent (which now points to disk). + */ + if (O_PAGE_IS_LOCAL(blkno) && + local_ppool_pages[blkno & O_BLKNO_MASK] == NULL) + return ReadPageResultFailed; + + p = O_GET_IN_MEMORY_PAGE(blkno); + header = (BTreePageHeader *) p; + read_undo = O_PAGE_IS(p, LEAF); + + EA_READ_INC(blkno); + + /* Check pointer to page-level undo item */ + if (read_undo && COMMITSEQNO_IS_NORMAL(csn) && header->csn >= csn) + { + UndoLocation undoLoc; + + pg_read_barrier(); + undoLoc = header->undoLocation; + pg_read_barrier(); + + if (header->o_header.pageChangeCount != pageChangeCount) + return ReadPageResultWrongPageChangeCount; + + read_page_from_undo(desc, img, undoLoc, csn, + key, keyType, NULL); /* NOTE(review): o_btree_read_page() resets partial->isPartial after an undo read; neither undo path of this function does -- confirm callers never rely on `partial` here */ + header = (BTreePageHeader *) img; + header->o_header.pageChangeCount = pageChangeCount; + if (readCsn) + *readCsn = header->csn; + return ReadPageResultOK; + } + + result = try_copy_page(blkno, pageChangeCount, img, partial, + loadHikeysChunk, readCsn); + if (result != ReadPageResultOK) + return result; + + /* Re-try reading page-level undo item due to concurrent changes */ + header = (BTreePageHeader *) img; + if (read_undo && COMMITSEQNO_IS_NORMAL(csn) && header->csn >= csn) + { + read_page_from_undo(desc, img, header->undoLocation, csn, + key, keyType, NULL); + header = (BTreePageHeader *) img; + header->o_header.pageChangeCount = 
pageChangeCount; + if (readCsn) + *readCsn = header->csn; + } + + return ReadPageResultOK; +} + +void +init_new_btree_page(BTreeDescr *desc, OInMemoryBlkno blkno, uint16 flags, + uint16 level, bool noLock) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno); + BTreePageHeader *header = (BTreePageHeader *) p; + + if (!noLock) + { + lock_page(blkno); + page_block_reads(blkno); + } + + page_desc->oids = desc->oids; + page_desc->type = desc->type; + page_desc->fileExtent.len = InvalidFileExtentLen; + page_desc->fileExtent.off = InvalidFileExtentOff; + header->flags = flags; + if (flags & O_BTREE_FLAG_LEAF) + { + header->field1 = 0; + PAGE_SET_N_VACATED(p, 0); + } + else + { + PAGE_SET_LEVEL(p, level); + PAGE_SET_N_ONDISK(p, 0); + } + header->rightLink = InvalidRightLink; + header->csn = COMMITSEQNO_FROZEN; + header->undoLocation = InvalidUndoLocation; + header->o_header.checkpointNum = 0; + header->itemsCount = 0; + header->prevInsertOffset = MaxOffsetNumber; + header->maxKeyLen = 0; + ppool_ucm_init(desc->ppool, blkno); + + memset(p + offsetof(BTreePageHeader, chunkDesc), + 0, + ORIOLEDB_BLCKSZ - offsetof(BTreePageHeader, chunkDesc)); +} + +void +init_meta_page(OInMemoryBlkno blkno, uint32 leafPagesNum) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno); + BTreeMetaPage *metaPage = (BTreeMetaPage *) p; + int i, + j; + + memset(p + O_PAGE_HEADER_SIZE, 0, ORIOLEDB_BLCKSZ - O_PAGE_HEADER_SIZE); + pg_atomic_init_u32(&metaPage->leafPagesNum, leafPagesNum); + pg_atomic_init_u64(&metaPage->numFreeBlocks, 0); + pg_atomic_init_u64(&metaPage->datafileLength[0], 0); + pg_atomic_init_u64(&metaPage->datafileLength[1], 0); + pg_atomic_init_u64(&metaPage->ctid, 0); + pg_atomic_init_u64(&metaPage->bridge_ctid, 0); + for (i = 0; i < NUM_SEQ_SCANS_ARRAY_SIZE; i++) + pg_atomic_init_u32(&metaPage->numSeqScans[i], 0); + + LWLockInitialize(&metaPage->copyBlknoLock, + 
checkpoint_state->copyBlknoTrancheId); + LWLockInitialize(&metaPage->metaLock, + checkpoint_state->oMetaTrancheId); + LWLockInitialize(&metaPage->punchHolesLock, + checkpoint_state->punchHolesTrancheId); + + page_desc->type = oIndexInvalid; + ORelOidsSetInvalid(page_desc->oids); + page_desc->fileExtent.len = InvalidFileExtentLen; + page_desc->fileExtent.off = InvalidFileExtentOff; + + for (i = 0; i < 2; i++) + { + metaPage->freeBuf.pages[i] = OInvalidInMemoryBlkno; + for (j = 0; j < 2; j++) + { + metaPage->nextChkp[j].pages[i] = OInvalidInMemoryBlkno; + metaPage->tmpBuf[j].pages[i] = OInvalidInMemoryBlkno; + } + + metaPage->partsInfo[i].writeMaxLocation = 0; + for (j = 0; j < MAX_NUM_DIRTY_PARTS; j++) + { + metaPage->partsInfo[i].dirtyParts[j].segNum = -1; + metaPage->partsInfo[i].dirtyParts[j].partNum = -1; + } + } + metaPage->punchHolesChkpNum = checkpoint_state->lastCheckpointNumber; + metaPage->toBeFreedOnSeqScanRelease = false; +} + +/* + * Estimate vacated space in the page after item replace on the given offset. + */ +LocationIndex +page_get_vacated_skip_item(BTreeDescr *desc, Page p, CommitSeqNo csn, + LocationIndex offset) +{ + LocationIndex vacatedBytes = 0; + BTreePageItemLocator loc; + + BTREE_PAGE_FOREACH_ITEMS(p, &loc) + { + BTreeLeafTuphdr *header; + OTuple tuple; + + if (BTREE_PAGE_LOCATOR_GET_OFFSET(p, &loc) == offset) + continue; + + BTREE_PAGE_READ_LEAF_ITEM(header, tuple, p, &loc); + if (XACT_INFO_FINISHED_FOR_EVERYBODY(header->xactInfo)) + { + if (header->deleted) + { + if (COMMITSEQNO_IS_INPROGRESS(csn) || XACT_INFO_MAP_CSN(header->xactInfo) < csn) + vacatedBytes += BTREE_PAGE_GET_ITEM_SIZE(p, &loc); + } + else + { + LocationIndex itemCompactedSize; + + itemCompactedSize = BTreeLeafTuphdrSize + MAXALIGN(o_btree_len(desc, tuple, OTupleLength)); + vacatedBytes += BTREE_PAGE_GET_ITEM_SIZE(p, &loc) - itemCompactedSize; + } + } + } + + return vacatedBytes; +} + +/* + * Estimate vacated space in the page. 
+ */ +LocationIndex +page_get_vacated_space(BTreeDescr *desc, Page p, CommitSeqNo csn) +{ + return page_get_vacated_skip_item(desc, p, csn, -1); +} + +/* + * Sets to 0 unused space on the page. + */ +void +null_unused_bytes(Page img) +{ + BTreePageHeader *header = (BTreePageHeader *) img; + + memset((Pointer) img + header->dataSize, 0, + ORIOLEDB_BLCKSZ - header->dataSize); +} + +void +page_cut_first_key(Page node) +{ + BTreeNonLeafTuphdr *tuphdr, + tmp; + BTreePageItemLocator loc; + + Assert(!O_PAGE_IS(node, LEAF)); + BTREE_PAGE_LOCATOR_FIRST(node, &loc); + Assert(BTREE_PAGE_GET_ITEM_SIZE(node, &loc) > BTreeNonLeafTuphdrSize); + + tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(node, &loc); + tmp = *tuphdr; + + page_locator_resize_item(node, &loc, BTreeNonLeafTuphdrSize); + + tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(node, &loc); + *tuphdr = tmp; +} + +void +put_page_image(OInMemoryBlkno blkno, Page img) +{ + Page page = O_GET_IN_MEMORY_PAGE(blkno); + int skipSize = offsetof(OrioleDBPageHeader, checkpointNum); + + pg_write_barrier(); + + memcpy(page + skipSize, + (char *) img + skipSize, + ORIOLEDB_BLCKSZ - skipSize); +} + +/* + * Calculates number of vacated bytes for leaf pages and number of + * disk downlinks for non-leaf pages. 
+ */ +void +o_btree_page_calculate_statistics(BTreeDescr *desc, Pointer p) +{ + BTreePageItemLocator loc; + + if (O_PAGE_IS(p, LEAF)) + { + int nVacated = 0; + + /* Bridge tuples not treated as vacated */ + if (desc->type == oIndexBridge) + return; + + BTREE_PAGE_FOREACH_ITEMS(p, &loc) + { + BTreeLeafTuphdr *tupHdr; + OTuple tuple; + + BTREE_PAGE_READ_LEAF_ITEM(tupHdr, tuple, p, &loc); + + if (tupHdr->deleted) + nVacated += BTREE_PAGE_GET_ITEM_SIZE(p, &loc); + else + nVacated += BTREE_PAGE_GET_ITEM_SIZE(p, &loc) - + (BTreeLeafTuphdrSize + MAXALIGN(o_btree_len(desc, tuple, OTupleLength))); + } + PAGE_SET_N_VACATED(p, nVacated); + } + else + { + int nOnDisk = 0; + + BTREE_PAGE_FOREACH_ITEMS(p, &loc) + { + BTreeNonLeafTuphdr *tupHdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc); + + if (DOWNLINK_IS_ON_DISK(tupHdr->downlink)) + nOnDisk++; + } + PAGE_SET_N_ONDISK(p, nOnDisk); + } +} + +void +copy_fixed_tuple(BTreeDescr *desc, OFixedTuple *dst, OTuple src) +{ + int tuplen; + + if (O_TUPLE_IS_NULL(src)) + { + clear_fixed_tuple(dst); + return; + } + + tuplen = o_btree_len(desc, src, OTupleLength); + Assert(tuplen <= sizeof(dst->fixedData)); + dst->tuple.formatFlags = src.formatFlags; + dst->tuple.data = dst->fixedData; + memcpy(dst->fixedData, src.data, tuplen); + if (tuplen != MAXALIGN(tuplen)) + memset(&dst->fixedData[tuplen], 0, MAXALIGN(tuplen) - tuplen); +} + +static void +copy_fixed_key_with_len(OFixedKey *dst, OTuple src, int tuplen) +{ + if (O_TUPLE_IS_NULL(src)) + { + clear_fixed_key(dst); + return; + } + + dst->tuple.formatFlags = src.formatFlags; + dst->tuple.data = dst->fixedData; + memcpy(dst->fixedData, src.data, tuplen); + if (tuplen != MAXALIGN(tuplen)) + memset(&dst->fixedData[tuplen], 0, MAXALIGN(tuplen) - tuplen); +} + +void +copy_fixed_key(BTreeDescr *desc, OFixedKey *dst, OTuple src) +{ + int tuplen; + + if (O_TUPLE_IS_NULL(src)) + { + clear_fixed_key(dst); + return; + } + + tuplen = o_btree_len(desc, src, OKeyLength); + Assert(tuplen 
<= sizeof(dst->fixedData)); + copy_fixed_key_with_len(dst, src, tuplen); +} + +void +copy_fixed_page_key(BTreeDescr *desc, OFixedKey *dst, + Page p, BTreePageItemLocator *loc) +{ + OTuple src; + + BTREE_PAGE_READ_TUPLE(src, p, loc); + copy_fixed_key(desc, dst, src); +} + +void +copy_fixed_hikey(BTreeDescr *desc, OFixedKey *dst, Page p) +{ + OTuple src; + + BTREE_PAGE_GET_HIKEY(src, p); + copy_fixed_key(desc, dst, src); +} + +void +clear_fixed_tuple(OFixedTuple *dst) +{ + dst->tuple.formatFlags = 0; + dst->tuple.data = NULL; +} + +void +clear_fixed_key(OFixedKey *dst) +{ + dst->tuple.formatFlags = 0; + dst->tuple.data = NULL; +} + +void +copy_from_fixed_shmem_key(OFixedKey *dst, OFixedShmemKey *src) +{ + if (!src->notNull) + { + clear_fixed_key(dst); + return; + } + + memcpy(dst->fixedData, src->data.fixedData, src->len); + dst->tuple.data = dst->fixedData; + dst->tuple.formatFlags = src->formatFlags; +} + +void +copy_fixed_shmem_key(BTreeDescr *desc, OFixedShmemKey *dst, OTuple src) +{ + if (O_TUPLE_IS_NULL(src)) + { + clear_fixed_shmem_key(dst); + return; + } + + dst->len = o_btree_len(desc, src, OKeyLength); + Assert(dst->len <= sizeof(dst->data.fixedData)); + memcpy(dst->data.fixedData, src.data, dst->len); + dst->notNull = true; + dst->formatFlags = src.formatFlags; +} + +void +copy_fixed_shmem_page_key(BTreeDescr *desc, OFixedShmemKey *dst, + Page p, BTreePageItemLocator *loc) +{ + OTuple src; + + BTREE_PAGE_READ_TUPLE(src, p, loc); + copy_fixed_shmem_key(desc, dst, src); +} + +void +copy_fixed_shmem_hikey(BTreeDescr *desc, OFixedShmemKey *dst, Page p) +{ + OTuple src; + + BTREE_PAGE_GET_HIKEY(src, p); + copy_fixed_shmem_key(desc, dst, src); +} + +void +clear_fixed_shmem_key(OFixedShmemKey *dst) +{ + dst->notNull = false; + dst->formatFlags = 0; + dst->len = 0; +} + +OTuple +fixed_shmem_key_get_tuple(OFixedShmemKey *src) +{ + OTuple result; + + if (src->notNull) + { + result.data = src->data.fixedData; + result.formatFlags = src->formatFlags; + } + else + { + 
result.data = NULL; + result.formatFlags = 0; + } + return result; +} + +OTuple +page_get_hikey(Page p) +{ + BTreePageChunkDesc *chunkDesc; + BTreePageHeader *header = (BTreePageHeader *) p; + OTuple result; + + Assert(!O_PAGE_IS(p, RIGHTMOST)); + + chunkDesc = &header->chunkDesc[header->chunksCount - 1]; + + result.formatFlags = chunkDesc->hikeyFlags; + result.data = (Pointer) p + SHORT_GET_LOCATION(chunkDesc->hikeyShortLocation); + + return result; +} + +int +page_get_hikey_size(Page p) +{ + BTreePageChunkDesc *chunkDesc; + BTreePageHeader *header = (BTreePageHeader *) p; + + Assert(!O_PAGE_IS(p, RIGHTMOST)); + chunkDesc = &header->chunkDesc[header->chunksCount - 1]; + + return (header->hikeysEnd - SHORT_GET_LOCATION(chunkDesc->hikeyShortLocation)); +} + +void +page_set_hikey_flags(Page p, uint8 flags) +{ + BTreePageChunkDesc *chunkDesc; + BTreePageHeader *header = (BTreePageHeader *) p; + + Assert(!O_PAGE_IS(p, RIGHTMOST)); + chunkDesc = &header->chunkDesc[header->chunksCount - 1]; + chunkDesc->hikeyFlags = flags; +} + +bool +page_fits_hikey(Page p, LocationIndex newHikeySize) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + LocationIndex dataShift, + hikeyLocation, + dataLocation; + + Assert(newHikeySize = MAXALIGN(newHikeySize)); + Assert(header->chunksCount == 1); + + hikeyLocation = SHORT_GET_LOCATION(header->chunkDesc[0].hikeyShortLocation); + dataLocation = SHORT_GET_LOCATION(header->chunkDesc[0].shortLocation); + if (hikeyLocation + newHikeySize <= dataLocation) + return true; + + dataShift = hikeyLocation + newHikeySize - dataLocation; + return (header->dataSize + dataShift <= ORIOLEDB_BLCKSZ); +} + +void +page_resize_hikey(Page p, LocationIndex newHikeySize) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + LocationIndex dataShift, + hikeyLocation, + dataLocation; + + Assert(newHikeySize = MAXALIGN(newHikeySize)); + Assert(header->chunksCount == 1); + + hikeyLocation = SHORT_GET_LOCATION(header->chunkDesc[0].hikeyShortLocation); + 
dataLocation = SHORT_GET_LOCATION(header->chunkDesc[0].shortLocation); + if (hikeyLocation + newHikeySize <= dataLocation) + { + /* Fits */ + header->hikeysEnd = hikeyLocation + newHikeySize; + return; + } + + dataShift = hikeyLocation + newHikeySize - dataLocation; + Assert(header->dataSize + dataShift <= ORIOLEDB_BLCKSZ); + memmove((Pointer) p + dataLocation + dataShift, + (Pointer) p + dataLocation, + header->dataSize - dataLocation); + header->chunkDesc[0].shortLocation += LOCATION_GET_SHORT(dataShift); + header->hikeysEnd = hikeyLocation + newHikeySize; + header->dataSize += dataShift; +} + +void +btree_page_update_max_key_len(BTreeDescr *desc, Page p) +{ + LocationIndex maxKeyLen; + BTreePageHeader *header = (BTreePageHeader *) p; + BTreePageItemLocator loc; + + if (!O_PAGE_IS(p, RIGHTMOST)) + maxKeyLen = BTREE_PAGE_GET_HIKEY_SIZE(p); + else + maxKeyLen = 0; + + + BTREE_PAGE_FOREACH_ITEMS(p, &loc) + { + LocationIndex keyLen; + + if (!O_PAGE_IS(p, LEAF)) + { + keyLen = BTREE_PAGE_GET_ITEM_SIZE(p, &loc) - + BTreeNonLeafTuphdrSize; + } + else + { + OTuple tuple; + + BTREE_PAGE_READ_TUPLE(tuple, p, &loc); + keyLen = o_btree_len(desc, tuple, OTupleKeyLengthNoVersion); + } + maxKeyLen = Max(maxKeyLen, keyLen); + } + header->maxKeyLen = maxKeyLen; +} diff --git a/contrib/orioledb/src/btree/page_state.c b/contrib/orioledb/src/btree/page_state.c new file mode 100644 index 00000000000..115b233e8f7 --- /dev/null +++ b/contrib/orioledb/src/btree/page_state.c @@ -0,0 +1,1420 @@ +/*------------------------------------------------------------------------- + * + * page_state.c + * OrioleDB B-tree page locking, waiting, reading etc. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/src/btree/page_state.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/find.h" +#include "btree/io.h" +#include "btree/page_chunks.h" +#include "btree/undo.h" +#include "recovery/recovery.h" +#include "storage/itemptr.h" +#include "tableam/descr.h" +#include "tableam/key_range.h" +#include "transam/oxid.h" +#include "transam/undo.h" +#include "utils/dsa.h" +#include "utils/page_pool.h" +#include "utils/stopevent.h" +#include "utils/ucm.h" + +#include "access/transam.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/proclist.h" +#include "storage/s_lock.h" +#include "utils/memdebug.h" + +/* Maximum simultaneously locked pages per process */ +#define MAX_PAGES_PER_PROCESS 8 + +/* + * Enable this to recheck page stats on every unlock. + */ +/* #define CHECK_PAGE_STATS */ + +typedef struct +{ + OInMemoryBlkno blkno; + uint64 state; +} MyLockedPage; + +static MyLockedPage myLockedPages[MAX_PAGES_PER_PROCESS]; +static OInMemoryBlkno myInProgressSplitPages[ORIOLEDB_MAX_DEPTH * 2]; +static int numberOfMyLockedPages = 0; +static int numberOfMyInProgressSplitPages = 0; + +OPageWaiterShmemState *lockerStates = NULL; + +#ifdef CHECK_PAGE_STATS +static void o_check_btree_page_statistics(BTreeDescr *desc, Pointer p); +#endif + +Size +page_state_shmem_needs(void) +{ + return CACHELINEALIGN(sizeof(OPageWaiterShmemState) * max_procs); +} + +void +page_state_shmem_init(Pointer buf, bool found) +{ + Pointer ptr = buf; + + lockerStates = (OPageWaiterShmemState *) ptr; +} + +static int +get_my_locked_page_index(OInMemoryBlkno blkno) +{ + int i; + + for (i = 0; i < numberOfMyLockedPages; i++) + if (myLockedPages[i].blkno == blkno) + return i; + return -1; +} + +static void +my_locked_page_add(OInMemoryBlkno blkno, uint64 state) +{ + Assert(get_my_locked_page_index(blkno) < 0); + 
Assert(numberOfMyLockedPages < MAX_PAGES_PER_PROCESS); + + Assert(pg_atomic_read_u64(&((OrioleDBPageHeader *) O_GET_IN_MEMORY_PAGE(blkno))->state) & PAGE_STATE_LOCKED_FLAG); + myLockedPages[numberOfMyLockedPages].blkno = blkno; + myLockedPages[numberOfMyLockedPages++].state = state; +} + +static uint64 +my_locked_page_del(OInMemoryBlkno blkno) +{ + int i = get_my_locked_page_index(blkno); + uint64 state; + + Assert(i >= 0 && i < MAX_PAGES_PER_PROCESS); + state = myLockedPages[i].state; + myLockedPages[i] = myLockedPages[--numberOfMyLockedPages]; + + return state; +} + +static uint64 +my_locked_page_get_state(OInMemoryBlkno blkno) +{ + int i = get_my_locked_page_index(blkno); + + Assert(i >= 0 && i < MAX_PAGES_PER_PROCESS); + return myLockedPages[i].state; +} + +static uint64 +lock_page_or_queue(OInMemoryBlkno blkno, uint32 pgprocnum) +{ + OPagePool *ppool = (OPagePool *) get_ppool_by_blkno(blkno); + Page p = O_GET_IN_MEMORY_PAGE(blkno); + OrioleDBPageHeader *header = (OrioleDBPageHeader *) p; + uint64 state; + OPageWaiterShmemState *lockerState = &lockerStates[pgprocnum]; + bool ucmUpdateTried = false; + + Assert(pgprocnum < max_procs); + Assert(!O_PAGE_IS_LOCAL(blkno)); + + state = pg_atomic_read_u64(&header->state); + while (true) + { + uint64 newState; + + if (!O_PAGE_STATE_IS_LOCKED(state)) + { + newState = O_PAGE_STATE_LOCK(state); + } + else + { + Assert((state & PAGE_STATE_LIST_TAIL_MASK) != pgprocnum); + lockerState->status = OPageWaitExclusive; + lockerState->next = (state & PAGE_STATE_LIST_TAIL_MASK); + newState = state & (~PAGE_STATE_LIST_TAIL_MASK); + newState |= pgprocnum; + } + + if (!ucmUpdateTried) + { + newState = ucm_update_state(&ppool->ucm, blkno, newState); + ucmUpdateTried = true; + } + + if (pg_atomic_compare_exchange_u64(&header->state, &state, newState)) + { + ucm_after_update_state(&ppool->ucm, blkno, state, newState); + break; + } + } + + return state; +} + +typedef struct +{ + char img[8192]; + PartialPageState partial; + bool load; +} 
PageImg; + +typedef enum +{ + LockPageResultLocked = 1, + LockPageResultQueued = 2, + LockPageResultSplitDetected = 3 +} LockPageResult; + +static LockPageResult +lock_page_or_queue_or_split_detect(BTreeDescr *desc, OInMemoryBlkno *blkno, + uint32 *pageChangeCount, uint32 pgprocnum, + PageImg *img, OTupleXactInfo xactInfo, + OTuple tuple, uint64 *prevState, + bool *keySerialized) +{ + OPagePool *ppool = (OPagePool *) get_ppool_by_blkno(*blkno); + Page p = O_GET_IN_MEMORY_PAGE(*blkno); + OrioleDBPageHeader *header = (OrioleDBPageHeader *) p; + OrioleDBPageHeader *imgHeader = (OrioleDBPageHeader *) img->img; + uint64 state; + OPageWaiterShmemState *lockerState = &lockerStates[pgprocnum]; + bool ucmUpdateTried = false; + + Assert(pgprocnum < max_procs); + Assert(!O_PAGE_IS_LOCAL(*blkno)); + + state = pg_atomic_read_u64(&header->state); + while (true) + { + uint64 newState; + + if (!O_PAGE_STATE_IS_LOCKED(state)) + { + newState = O_PAGE_STATE_LOCK(state); + } + else + { + if (!img->load || + (state & PAGE_STATE_CHANGE_COUNT_MASK) != (pg_atomic_read_u64(&imgHeader->state) & PAGE_STATE_CHANGE_COUNT_MASK)) + { + if (!o_btree_read_page(desc, *blkno, *pageChangeCount, img->img, + COMMITSEQNO_INPROGRESS, NULL, BTreeKeyNone, NULL, + &img->partial, true, NULL, NULL)) + { + return LockPageResultSplitDetected; + } + img->load = true; + + if (!O_PAGE_IS(img->img, RIGHTMOST)) + { + OTuple hikey; + + BTREE_PAGE_GET_HIKEY(hikey, img->img); + + if (o_btree_cmp(desc, &tuple, BTreeKeyLeafTuple, + &hikey, BTreeKeyNonLeafKey) >= 0) + { + uint64 rightlink = BTREE_PAGE_GET_RIGHTLINK(img->img); + + if (OInMemoryBlknoIsValid(RIGHTLINK_GET_BLKNO(rightlink))) + { + *blkno = RIGHTLINK_GET_BLKNO(rightlink); + *pageChangeCount = RIGHTLINK_GET_CHANGECOUNT(rightlink); + p = O_GET_IN_MEMORY_PAGE(*blkno); + header = (OrioleDBPageHeader *) p; + Assert(get_my_locked_page_index(*blkno) < 0); + state = pg_atomic_read_u64(&header->state); + continue; + } + else + { + return LockPageResultSplitDetected; + 
} + } + } + } + + if (!*keySerialized) + { + BTreeLeafTuphdr tuphdr; + int tuplen; + + tuphdr.deleted = false; + tuphdr.undoLocation = InvalidUndoLocation; + tuphdr.formatFlags = 0; + tuphdr.chainHasLocks = false; + tuphdr.xactInfo = xactInfo; + + lockerState->reloids = desc->oids; + if (desc->undoType != UndoLogNone) + lockerState->reservedUndoSize = get_reserved_undo_size(desc->undoType); + else + lockerState->reservedUndoSize = 0; + lockerState->tupleFlags = tuple.formatFlags; + memcpy(lockerState->tupleData.fixedData, + &tuphdr, + BTreeLeafTuphdrSize); + tuplen = o_btree_len(desc, tuple, OTupleLength); + memcpy(&lockerState->tupleData.fixedData[BTreeLeafTuphdrSize], + tuple.data, + tuplen); + if (tuplen != MAXALIGN(tuplen)) + memset(&lockerState->tupleData.fixedData[BTreeLeafTuphdrSize + tuplen], + 0, MAXALIGN(tuplen) - tuplen); + *keySerialized = true; + } + + Assert((state & PAGE_STATE_LIST_TAIL_MASK) != pgprocnum); + lockerState->status = OPageWaitInsert; + lockerState->undoLocation = InvalidUndoLocation; + lockerState->pageChangeCount = *pageChangeCount; + lockerState->autonomousNestingLevel = GET_CUR_PROCDATA()->autonomousNestingLevel; + Assert(!lockerState->inserted); + lockerState->next = (state & PAGE_STATE_LIST_TAIL_MASK); + newState = state & (~PAGE_STATE_LIST_TAIL_MASK); + newState |= pgprocnum; + } + + if (!ucmUpdateTried) + { + newState = ucm_update_state(&ppool->ucm, *blkno, newState); + ucmUpdateTried = true; + } + + if (pg_atomic_compare_exchange_u64(&header->state, &state, newState)) + { + ucm_after_update_state(&ppool->ucm, *blkno, state, newState); + break; + } + } + + *prevState = state; + + if (!O_PAGE_STATE_IS_LOCKED(state)) + return LockPageResultLocked; + else + return LockPageResultQueued; +} + +/* + * This function finishes when page is enable to read or we managed to lock + * the page list. 
+ */ +static uint64 +read_enabled_or_queue(OInMemoryBlkno blkno, uint32 pgprocnum) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + OrioleDBPageHeader *header = (OrioleDBPageHeader *) p; + uint64 state; + OPageWaiterShmemState *lockerState = &lockerStates[pgprocnum]; + + state = pg_atomic_read_u64(&header->state); + while (true) + { + uint64 newState; + + if (!O_PAGE_STATE_READ_IS_BLOCKED(state)) + { + break; + } + else + { + Assert((state & PAGE_STATE_LIST_TAIL_MASK) != pgprocnum); + lockerState->status = OPageWaitNonExclusive; + lockerState->next = (state & PAGE_STATE_LIST_TAIL_MASK); + newState = state & (~PAGE_STATE_LIST_TAIL_MASK); + newState |= pgprocnum; + } + + if (pg_atomic_compare_exchange_u64(&header->state, &state, newState)) + break; + } + + return state; +} + +static uint64 +state_changed_or_queue(OInMemoryBlkno blkno, uint32 pgprocnum, + uint64 oldState) +{ + OPagePool *ppool = (OPagePool *) get_ppool_by_blkno(blkno); + Page p = O_GET_IN_MEMORY_PAGE(blkno); + OrioleDBPageHeader *header = (OrioleDBPageHeader *) p; + uint64 state; + OPageWaiterShmemState *lockerState = &lockerStates[pgprocnum]; + bool ucmUpdateTried = false; + + Assert(!O_PAGE_IS_LOCAL(blkno)); + + state = pg_atomic_read_u64(&header->state); + while (true) + { + uint64 newState; + + if ((state & PAGE_STATE_CHANGE_COUNT_MASK) != + (oldState & PAGE_STATE_CHANGE_COUNT_MASK)) + { + break; + } + else + { + Assert((state & PAGE_STATE_LIST_TAIL_MASK) != pgprocnum); + lockerState->status = OPageWaitNonExclusive; + lockerState->next = (state & PAGE_STATE_LIST_TAIL_MASK); + newState = state & (~PAGE_STATE_LIST_TAIL_MASK); + newState |= pgprocnum; + } + + if (!ucmUpdateTried) + { + newState = ucm_update_state(&ppool->ucm, blkno, newState); + ucmUpdateTried = true; + } + + if (pg_atomic_compare_exchange_u64(&header->state, &state, newState)) + { + ucm_after_update_state(&ppool->ucm, blkno, state, newState); + break; + } + } + + return state; +} + + +/* + * Place exclusive lock on the page. 
Doesn't block readers before + * page_block_reads() is called. + */ +void +lock_page(OInMemoryBlkno blkno) +{ + OPageWaiterShmemState *lockerState = &lockerStates[MYPROCNUMBER]; + uint64 prevState; + int extraWaits = 0; + + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return; + + Assert(get_my_locked_page_index(blkno) < 0); + + EA_LOCK_INC(blkno); + + while (true) + { + prevState = lock_page_or_queue(blkno, MYPROCNUMBER); + + if (!O_PAGE_STATE_IS_LOCKED(prevState)) + break; + + pgstat_report_wait_start(PG_WAIT_LWLOCK | LWTRANCHE_BUFFER_CONTENT); + + for (;;) + { + PGSemaphoreLock(MyProc->sem); + if (lockerState->status == OPageWaitWakeUp) + break; + extraWaits++; + } + + pgstat_report_wait_end(); + } + + my_locked_page_add(blkno, prevState | PAGE_STATE_LOCKED_FLAG); + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(MyProc->sem); +} + +/* + * Place exclusive lock on the page. Doesn't block readers before + * page_block_reads() is called. 
+ */ +OLockPageWithTupleResult +lock_page_with_tuple(BTreeDescr *desc, + OInMemoryBlkno *blkno, uint32 *pageChangeCount, + OTupleXactInfo xactInfo, OTuple tuple) +{ + uint64 prevState; + int extraWaits = 0; + OPageWaiterShmemState *lockerState = &lockerStates[MYPROCNUMBER]; + bool keySerialized = false; + PageImg img; + + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(*blkno)) + return OLockPageWithTupleResultLocked; + + img.load = false; + Assert(get_my_locked_page_index(*blkno) < 0); + + while (true) + { + LockPageResult lockResult; + + lockResult = lock_page_or_queue_or_split_detect(desc, blkno, + pageChangeCount, + MYPROCNUMBER, + &img, xactInfo, + tuple, &prevState, + &keySerialized); + + if (lockResult == LockPageResultLocked) + { + break; + } + else if (lockResult == LockPageResultSplitDetected) + { + return OLockPageWithTupleResultRefindNeeded; + } + Assert(lockResult == LockPageResultQueued); + + pgstat_report_wait_start(PG_WAIT_LWLOCK | LWTRANCHE_BUFFER_CONTENT); + + for (;;) + { + PGSemaphoreLock(MyProc->sem); + if (lockerState->status == OPageWaitWakeUp) + break; + extraWaits++; + } + pgstat_report_wait_end(); + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. 
+ */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(MyProc->sem); + + if (lockerState->inserted) + { + UndoLogType undoType = desc->undoType; + + Assert(keySerialized); + lockerState->inserted = false; + if (undoType != UndoLogNone) + { + giveup_reserved_undo_size(undoType); + if (UndoLocationIsValid(lockerState->undoLocation) && + !UndoLocationIsValid(curRetainUndoLocations[undoType])) + curRetainUndoLocations[undoType] = lockerState->undoLocation; + } + + return OLockPageWithTupleResultInserted; + } + } + + EA_LOCK_INC(*blkno); + + my_locked_page_add(*blkno, prevState | PAGE_STATE_LOCKED_FLAG); + + return OLockPageWithTupleResultLocked; +} + +void +page_wait_for_read_enable(OInMemoryBlkno blkno) +{ + uint32 prevState; + int extraWaits = 0; + OPageWaiterShmemState *lockerState = &lockerStates[MYPROCNUMBER]; + + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return; + + while (true) + { + prevState = read_enabled_or_queue(blkno, MYPROCNUMBER); + + if (!(prevState & PAGE_STATE_NO_READ_FLAG)) + break; + + pgstat_report_wait_start(PG_WAIT_LWLOCK | LWTRANCHE_BUFFER_CONTENT); + + for (;;) + { + PGSemaphoreLock(MyProc->sem); + if (lockerState->status == OPageWaitWakeUp) + break; + extraWaits++; + } + + pgstat_report_wait_end(); + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. 
+ */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(MyProc->sem); + + return; +} + +static uint32 +page_wait_for_changecount(OInMemoryBlkno blkno, uint32 state) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + OrioleDBPageHeader *header = (OrioleDBPageHeader *) p; + uint64 curState; + int extraWaits = 0; + OPageWaiterShmemState *lockerState = &lockerStates[MYPROCNUMBER]; + + while (true) + { + bool exit_loop = false; + + curState = state_changed_or_queue(blkno, MYPROCNUMBER, state); + if ((curState & PAGE_STATE_CHANGE_COUNT_MASK) != + (state & PAGE_STATE_CHANGE_COUNT_MASK)) + { + return curState; + } + + pgstat_report_wait_start(PG_WAIT_LWLOCK | LWTRANCHE_BUFFER_CONTENT); + + for (;;) + { + PGSemaphoreLock(MyProc->sem); + if (lockerState->status == OPageWaitWakeUp) + { + curState = pg_atomic_read_u64(&header->state); + if ((curState & PAGE_STATE_CHANGE_COUNT_MASK) != + (state & PAGE_STATE_CHANGE_COUNT_MASK)) + exit_loop = true; + break; + } + extraWaits++; + } + if (exit_loop) + break; + + pgstat_report_wait_end(); + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(MyProc->sem); + + return curState; +} + +bool +have_locked_pages(void) +{ + return (numberOfMyLockedPages > 0); +} + +/* Wait for a change of the page and lock it. */ +void +relock_page(OInMemoryBlkno blkno) +{ + uint64 state; + + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return; + + state = my_locked_page_get_state(blkno); + unlock_page(blkno); + + STOPEVENT(STOPEVENT_RELOCK_PAGE, NULL); + + page_wait_for_changecount(blkno, state); + lock_page(blkno); +} + +/* + * Try to lock the given page from concurrent changes. Returns true on success. 
+ */ +bool +try_lock_page(OInMemoryBlkno blkno) +{ + PagePool *ppool = get_ppool_by_blkno(blkno); + Page p = O_GET_IN_MEMORY_PAGE(blkno); + uint64 state; + + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return true; + + state = pg_atomic_fetch_or_u64(&(O_PAGE_HEADER(p)->state), + PAGE_STATE_LOCKED_FLAG); + + if (O_PAGE_STATE_IS_LOCKED(state)) + return false; + + EA_LOCK_INC(blkno); + my_locked_page_add(blkno, state | PAGE_STATE_LOCKED_FLAG); + ppool_ucm_inc_usage(ppool, blkno); + + return true; +} + +/* + * Declare newly created page as already locked by our process. + */ +void +delare_page_as_locked(OInMemoryBlkno blkno) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return; + + my_locked_page_add(blkno, pg_atomic_read_u64(&(O_PAGE_HEADER(p)->state))); +} + +/* + * Check if page is locked. + */ +bool +page_is_locked(OInMemoryBlkno blkno) +{ + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return false; + + return (get_my_locked_page_index(blkno) >= 0); +} + +/* + * Block reads on locked page to prepare it for the modification. + */ +void +page_block_reads(OInMemoryBlkno blkno) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + uint64 state; + int i; + + if (O_PAGE_IS_LOCAL(blkno)) + { + /* + * Local pages don't go through the lock_page / unlock_page path that + * bumps the change count on modification, so a same-backend + * partial_load_chunk() would otherwise miss writes to the page + * between the descent and the iterator's later reads (parentImg + * carries partial state across find_page calls). No concurrency, so + * a plain RMW on state is enough. 
+ */ + OrioleDBPageHeader *hdr = (OrioleDBPageHeader *) p; + uint64 old = pg_atomic_read_u64(&hdr->state); + uint64 newChangeCount = ((old & PAGE_STATE_CHANGE_COUNT_MASK) + + PAGE_STATE_CHANGE_COUNT_ONE) & + PAGE_STATE_CHANGE_COUNT_MASK; + + pg_atomic_write_u64(&hdr->state, + (old & ~PAGE_STATE_CHANGE_COUNT_MASK) | newChangeCount); + return; + } + + i = get_my_locked_page_index(blkno); + + Assert((myLockedPages[i].state & PAGE_STATE_CHANGE_NON_WAITERS_MASK) == + (pg_atomic_read_u64(&(O_PAGE_HEADER(p)->state)) & PAGE_STATE_CHANGE_NON_WAITERS_MASK)); + + state = pg_atomic_fetch_or_u64(&(O_PAGE_HEADER(p)->state), PAGE_STATE_NO_READ_FLAG); + Assert((state & PAGE_STATE_LOCKED_FLAG)); + myLockedPages[i].state = state | PAGE_STATE_NO_READ_FLAG; +} + +int +get_waiters_with_tuples(BTreeDescr *desc, + OInMemoryBlkno blkno, + int result[BTREE_PAGE_MAX_SPLIT_ITEMS]) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + uint32 pgprocnum; + int count = 0; + + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return 0; + + pgprocnum = pg_atomic_read_u64(&(O_PAGE_HEADER(p)->state)) & PAGE_STATE_LIST_TAIL_MASK; + + while (pgprocnum != PAGE_STATE_INVALID_PROCNO) + { + OPageWaiterShmemState *lockerState = &lockerStates[pgprocnum]; + + if (lockerState->status == OPageWaitInsert && + lockerState->pageChangeCount == O_PAGE_HEADER(p)->pageChangeCount && + ORelOidsIsEqual(desc->oids, lockerState->reloids)) + { + result[count++] = pgprocnum; + if (count >= BTREE_PAGE_MAX_SPLIT_ITEMS) + { + Assert(count == BTREE_PAGE_MAX_SPLIT_ITEMS); + break; + } + } + + pgprocnum = lockerState->next; + } + + return count; +} + +void +mark_waiter_tuples_inserted(int procnums[BTREE_PAGE_MAX_SPLIT_ITEMS], + int count) +{ + int i; + + Assert(count > 0); + + for (i = 0; i < count; i++) + lockerStates[procnums[i]].inserted = true; + +} + +/* + * Check page before unlocking. 
/*
 * Check page before unlocking.
 *
 * Which checks run depends on the build configuration:
 *  - CHECK_PAGE_STRUCT: full structure check via o_check_page_struct();
 *    otherwise a cheap sanity check of the last chunk location and dataSize
 *    that PANICs on a broken page;
 *  - CHECK_PAGE_STATS: recheck statistics of leaf pages;
 *  - USE_ASSERT_CHECKING: recount on-disk downlinks of non-leaf pages;
 *  - Valgrind: verify that the whole page is defined memory.
 *
 * Pages whose descriptor type is oIndexInvalid skip the structure/sanity
 * and statistics checks.
 */
static void
unlock_check_page(OInMemoryBlkno blkno)
{
	Page		p = O_GET_IN_MEMORY_PAGE(blkno);

#ifdef CHECK_PAGE_STRUCT
	if (O_GET_IN_MEMORY_PAGEDESC(blkno)->type != oIndexInvalid)
		o_check_page_struct(NULL, p);
#else
	if (O_GET_IN_MEMORY_PAGEDESC(blkno)->type != oIndexInvalid)
	{
		BTreePageHeader *header = (BTreePageHeader *) p;
		BTreePageChunkDesc *lastChunk = &header->chunkDesc[header->chunksCount - 1];

		/* The last chunk must start within the data, and the data within the page */
		if (SHORT_GET_LOCATION(lastChunk->shortLocation) > header->dataSize ||
			header->dataSize > ORIOLEDB_BLCKSZ)
			elog(PANIC, "broken page: (blkno: %u, p: %p, lastChunk: %u, dataSize: %u)",
				 blkno, p, SHORT_GET_LOCATION(lastChunk->shortLocation),
				 header->dataSize);
	}
#endif

#ifdef CHECK_PAGE_STATS
	{
		/*
		 * XXX: index_oids_get_btree_descr() might expand a hash table under
		 * critical section.
		 */
		OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno);

		if (O_PAGE_IS(p, LEAF) && page_desc->type != oIndexInvalid)
		{
			ORelOids	oids = page_desc->oids;
			BTreeDescr *desc;

			/* System trees and regular indexes are looked up differently */
			if (!IS_SYS_TREE_OIDS(oids))
				desc = index_oids_get_btree_descr(oids, page_desc->type);
			else
				desc = get_sys_tree_no_init(oids.reloid);
			/* Without a descriptor the statistics can't be rechecked */
			if (desc)
				o_check_btree_page_statistics(desc, p);
		}
	}
#endif

#ifdef USE_ASSERT_CHECKING
	/* The stored count of on-disk downlinks must match a fresh recount */
	if (!O_PAGE_IS(p, LEAF) && OidIsValid(O_GET_IN_MEMORY_PAGEDESC(blkno)->oids.reloid))
	{
		int			on_disk = 0;
		BTreePageItemLocator loc;

		BTREE_PAGE_FOREACH_ITEMS(p, &loc)
		{
			BTreeNonLeafTuphdr *tuphdr = (BTreeNonLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc);

			if (DOWNLINK_IS_ON_DISK(tuphdr->downlink))
				on_disk++;
		}
		Assert(on_disk == PAGE_GET_N_ONDISK(p));
	}
#endif

	VALGRIND_CHECK_MEM_IS_DEFINED(O_GET_IN_MEMORY_PAGE(blkno), ORIOLEDB_BLCKSZ);
}
The tail + * (newest waiter) PGPROC number is packed into the low bits of the 64-bit + * page‑state word. A successful unlock therefore needs to: + * 1. Walk that list; + * 2. Move every suitable waiter (see `shouldWake`) and at most one + * exclusive waiter to a private wake list; + * 3. Patch the shared list so that the removed waiters vanish from it; + * 4. Publish a new page‑state word with the updated tail via atomic CAS; + * 5. If the CAS fails, process the newly added waiters (if any) and retry; + * 6. Finally, wake up all backends we collected on our private list. + * + * The two auxiliary variables `prevTail` and `prevTailPatch` are the key to + * the logic: if we fail the CAS, the list may already contain our previous + * patch (i.e. `prevTail->next` now points somewhere else). We detect that + * and re‑apply the patch in the next iteration instead of trying to start + * from scratch (the latter is not possible, because we might already have + * modified the list). + */ +static void +unlock_page_internal(OInMemoryBlkno blkno, bool split) +{ + Page page = O_GET_IN_MEMORY_PAGE(blkno); + OrioleDBPageHeader *hdr = (OrioleDBPageHeader *) page; + + /* Head of our private stack of waiters to wake once the page is unlocked */ + uint32 wakeListHead = PAGE_STATE_INVALID_PROCNO; + + /* Bookkeeping needed when the CAS fails and we must retry */ + uint32 prevTail = PAGE_STATE_INVALID_PROCNO; + uint32 prevTailPatch = PAGE_STATE_INVALID_PROCNO; + + /* We may wake **one** exclusive waiter per unlock attempt */ + bool exclusiveAlreadyWoken = false; + uint64 state; + + int expectedWakeCount PG_USED_FOR_ASSERTS_ONLY = 0; + int actualWakeCount PG_USED_FOR_ASSERTS_ONLY = 0; + + unlock_check_page(blkno); + + state = pg_atomic_read_u64(&hdr->state); + + for (;;) + { + /* Snapshot the tail encoded in the state word */ + uint32 tail = state & PAGE_STATE_LIST_TAIL_MASK; + uint32 cur = tail; + uint32 prev = PAGE_STATE_INVALID_PROCNO; + uint64 newState; + + uint32 newTail = tail; /* 
will become the new list tail */ + + /* Remember the first exclusive waiter we may decide to wake */ + uint32 exclusive = PAGE_STATE_INVALID_PROCNO; + uint32 exclusivePrev = PAGE_STATE_INVALID_PROCNO; + + /* -------------------------------------------------------------- + * 1. Walk the waiter list, unlinking suitable lockers on the fly + * --------------------------------------------------------------*/ + while (cur != prevTail) /* stop before the node we patched during the + * previous (failed) iteration */ + { + OPageWaiterShmemState *lock = &lockerStates[cur]; + + bool shouldWake = + lock->inserted || + lock->status == OPageWaitNonExclusive || + (split && lock->status == OPageWaitInsert); + + if (shouldWake) + { + uint32 next = lock->next; + + /* Unlink waiter from shared waiter list */ + if (prev == PAGE_STATE_INVALID_PROCNO) + newTail = next; /* removed the first element */ + else + lockerStates[prev].next = next; + + /* Push waiter onto our private wake list */ + lock->next = wakeListHead; + wakeListHead = cur; + expectedWakeCount++; + + cur = next; + continue; /* stay on the same `prev` */ + } + + /* Remember the first (oldest) exclusive waiter */ + if (!exclusiveAlreadyWoken && exclusive == PAGE_STATE_INVALID_PROCNO) + { + exclusive = cur; + exclusivePrev = prev; + } + + prev = cur; + cur = lock->next; + } + + /* ---------------------------------------------------------------- + * 2. 
Optionally move the first exclusive waiter to the wake list + * ----------------------------------------------------------------*/ + if (exclusive != PAGE_STATE_INVALID_PROCNO && !exclusiveAlreadyWoken) + { + OPageWaiterShmemState *lock = &lockerStates[exclusive]; + + exclusiveAlreadyWoken = true; + + if (exclusivePrev == PAGE_STATE_INVALID_PROCNO) + newTail = lock->next; /* exclusive was the first node */ + else + lockerStates[exclusivePrev].next = lock->next; + + /* push to wake list */ + lock->next = wakeListHead; + wakeListHead = exclusive; + expectedWakeCount++; + + if (prev == exclusive) + prev = exclusivePrev; + } + + /* ---------------------------------------------------------------- + * 3. Re‑apply the patch from the previous failed CAS attempt + * ----------------------------------------------------------------*/ + if (prevTail != prevTailPatch) + { + Assert(prevTail != PAGE_STATE_INVALID_PROCNO); + + if (prev == PAGE_STATE_INVALID_PROCNO) + newTail = prevTailPatch; /* new head is different */ + else + { + Assert(prev != prevTailPatch); + lockerStates[prev].next = prevTailPatch; + } + } + + /* ---------------------------------------------------------------- + * 4. Compose and try to publish the new page‑state word + * ----------------------------------------------------------------*/ + newState = state & + ~(PAGE_STATE_LIST_TAIL_MASK | + PAGE_STATE_LOCKED_FLAG | + PAGE_STATE_NO_READ_FLAG); + + /* Bump change‑counter if reads had been blocked */ + if (O_PAGE_STATE_READ_IS_BLOCKED(state)) + { + uint64 changeCount = (newState & PAGE_STATE_CHANGE_COUNT_MASK); + + newState &= ~PAGE_STATE_CHANGE_COUNT_MASK; + changeCount += PAGE_STATE_CHANGE_COUNT_ONE; + changeCount &= PAGE_STATE_CHANGE_COUNT_MASK; + newState |= changeCount; + } + + newState |= newTail; + + if (pg_atomic_compare_exchange_u64(&hdr->state, &state, newState)) + break; /* Success! Exit retry loop */ + + /* ---------------------------------------------------------------- + * 5. 
CAS failed – remember what we did and retry + * ----------------------------------------------------------------*/ + prevTail = tail; + prevTailPatch = newTail; + /* `state` now holds the value returned by the failed CAS */ + } + + /* Cleanup the local list of locked pages */ + my_locked_page_del(blkno); + + /* -------------------------------------------------------------------- + * 6. Waking collected waiters + * --------------------------------------------------------------------*/ + pg_write_barrier(); /* ensure list modifications are visible */ + + for (uint32 procno = wakeListHead; + procno != PAGE_STATE_INVALID_PROCNO;) + { + OPageWaiterShmemState *lockState = &lockerStates[procno]; + uint32 next; + PGPROC *proc = GetPGProcByNumber(procno); + + next = lockState->next; + + /* + * Ensure memory access ordering. The effect of statement above must + * materialize before waking up the waiter, which must see + * lockState->status == OPageWaitWakeUp and can modify + * lockState->next. + */ + pg_memory_barrier(); + + lockState->status = OPageWaitWakeUp; + + /* + * Also, ensure woken up waiter will see lockState->status == + * OPageWaitWakeUp. + */ + pg_memory_barrier(); + + PGSemaphoreUnlock(proc->sem); + actualWakeCount++; + + procno = next; + } + + Assert(actualWakeCount == expectedWakeCount); +} + +void +unlock_page(OInMemoryBlkno blkno) +{ + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return; + + unlock_page_internal(blkno, false); +} + +/* + * Unlock the page after page split. Page should be locked before. + */ +void +unlock_page_after_split(OInMemoryBlkno blkno) +{ + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(blkno)) + return; + + unlock_page_internal(blkno, true); +} + +/* + * Release all previously acquired page locks one-by-one. + */ +void +release_all_page_locks(void) +{ + pg_write_barrier(); + + while (numberOfMyLockedPages > 0) + unlock_page(myLockedPages[0].blkno); +} + +/* + * Register in-progress split.
This split will be marked as incomplete on + * error cleanup unless it's unregistered before. + * + * Must be called within critical section. + */ +void +btree_register_inprogress_split(OInMemoryBlkno rightBlkno) +{ +#ifdef USE_ASSERT_CHECKING + int i; + + for (i = 0; i < numberOfMyInProgressSplitPages; i++) + Assert(myInProgressSplitPages[i] != rightBlkno); +#endif + Assert(CritSectionCount > 0); + Assert((numberOfMyInProgressSplitPages + 1) <= sizeof(myInProgressSplitPages) / sizeof(myInProgressSplitPages[0])); + myInProgressSplitPages[numberOfMyInProgressSplitPages++] = rightBlkno; +} + +/* + * Unregister in-progress split. + * + * Must be called within critical section. + */ +void +btree_unregister_inprogress_split(OInMemoryBlkno rightBlkno) +{ + int i; + + Assert(CritSectionCount > 0); + Assert(numberOfMyInProgressSplitPages > 0); + for (i = 0; i < numberOfMyInProgressSplitPages; i++) + { + if (myInProgressSplitPages[i] == rightBlkno) + { + numberOfMyInProgressSplitPages--; + myInProgressSplitPages[i] = myInProgressSplitPages[numberOfMyInProgressSplitPages]; + return; + } + } + Assert(false); +} + +/* + * Marks all in-progress splits as incomplete. + */ +void +btree_mark_incomplete_splits(void) +{ + int i; + + for (i = 0; i < numberOfMyInProgressSplitPages; i++) + btree_split_mark_finished(myInProgressSplitPages[i], true, false); + numberOfMyInProgressSplitPages = 0; +} + +/* + * Marks the split as finished. + * + * It sets O_BTREE_FLAG_BROKEN_SPLIT if success = false or removes rightlink + * on the left page. + * + * It does not call modify_page if use_lock = false.
+ */ +void +btree_split_mark_finished(OInMemoryBlkno rightBlkno, bool use_lock, bool success) +{ + BTreePageHeader *leftHeader; + BTreePageHeader *rightHeader; + OrioleDBPageDesc *rightPageDesc = O_GET_IN_MEMORY_PAGEDESC(rightBlkno); + OInMemoryBlkno leftBlkno; + + /* Local pages do not need locking */ + if (O_PAGE_IS_LOCAL(rightBlkno)) + use_lock = false; + + leftBlkno = rightPageDesc->leftBlkno; + Assert(OInMemoryBlknoIsValid(leftBlkno)); + + /* + * Still need to lock the left page even if we're going to just set + * BROKEN_SPLIT on the right page, because we need to notify waiters in + * o_btree_split_is_incomplete(). + */ + if (use_lock) + { + while (true) + { + lock_page(leftBlkno); + + if (rightPageDesc->leftBlkno == leftBlkno) + break; + + unlock_page(leftBlkno); + leftBlkno = rightPageDesc->leftBlkno; + Assert(OInMemoryBlknoIsValid(leftBlkno)); + } + } + + lock_page(rightBlkno); /* NOTE(review): right page is locked even when use_lock is false; confirm this is intended */ + + if (use_lock) + page_block_reads(leftBlkno); + page_block_reads(rightBlkno); + + START_CRIT_SECTION(); + + leftHeader = (BTreePageHeader *) O_GET_IN_MEMORY_PAGE(leftBlkno); + rightHeader = (BTreePageHeader *) O_GET_IN_MEMORY_PAGE(rightBlkno); + + Assert(RightLinkIsValid(leftHeader->rightLink)); + Assert(use_lock || success); + + if (success) + { + rightHeader->flags &= ~O_BTREE_FLAG_BROKEN_SPLIT; + leftHeader->rightLink = InvalidRightLink; + rightPageDesc->leftBlkno = OInvalidInMemoryBlkno; + } + else + { + Assert(!O_PAGE_IS(O_GET_IN_MEMORY_PAGE(rightBlkno), BROKEN_SPLIT)); + rightHeader->flags |= O_BTREE_FLAG_BROKEN_SPLIT; + } + + END_CRIT_SECTION(); + + unlock_page(rightBlkno); + + if (use_lock) + unlock_page(leftBlkno); +} + +#ifdef CHECK_PAGE_STRUCT + +extern void log_btree(BTreeDescr *desc); + +/* + * Check if page has a consistent structure.
+ */ +void +o_check_page_struct(BTreeDescr *desc, Page p) +{ + BTreePageHeader *header = (BTreePageHeader *) p; + int i, + j, + itemsCount; + LocationIndex endLocation, + chunkSize; + OTuple prevChunkHikey; + + Assert(header->dataSize <= ORIOLEDB_BLCKSZ); + Assert(header->hikeysEnd <= header->dataSize); + + O_TUPLE_SET_NULL(prevChunkHikey); + + for (i = 0; i < header->chunksCount; i++) + { + BTreePageChunkDesc *chunk = &header->chunkDesc[i]; + BTreePageChunk *chunkData; + OTuple chunkHikey; + + if (O_PAGE_IS(p, RIGHTMOST) && i == header->chunksCount - 1) + { + O_TUPLE_SET_NULL(chunkHikey); + } + else + { + chunkHikey.formatFlags = header->chunkDesc[i].hikeyFlags; + chunkHikey.data = p + SHORT_GET_LOCATION(header->chunkDesc[i].hikeyShortLocation); + } + + if (!O_PAGE_IS(p, RIGHTMOST) || i < header->chunksCount - 1) + { + Assert((chunk->hikeyFlags & O_TUPLE_FLAGS_FIXED_FORMAT) || !(header->flags & O_BTREE_FLAG_HIKEYS_FIXED)); + } + + if (i > 0) + { + BTreePageChunkDesc *prevChunk = &header->chunkDesc[i - 1] PG_USED_FOR_ASSERTS_ONLY; + + Assert(chunk->shortLocation >= prevChunk->shortLocation); + Assert(chunk->offset >= prevChunk->offset); + Assert(chunk->hikeyShortLocation > prevChunk->hikeyShortLocation); + Assert(SHORT_GET_LOCATION(chunk->hikeyShortLocation) <= header->hikeysEnd); + Assert(SHORT_GET_LOCATION(chunk->shortLocation) <= header->dataSize); + Assert(chunk->offset <= header->itemsCount); + } + else + { + Assert(SHORT_GET_LOCATION(chunk->shortLocation) == header->hikeysEnd || SHORT_GET_LOCATION(chunk->shortLocation) == BTREE_PAGE_HIKEYS_END(NULL, p)); + Assert(chunk->offset == 0); + Assert(SHORT_GET_LOCATION(chunk->hikeyShortLocation) == MAXALIGN(offsetof(BTreePageHeader, chunkDesc) + sizeof(BTreePageChunkDesc) * header->chunksCount)); + } + + if (i == header->chunksCount - 1) + { + if (!O_PAGE_IS(p, RIGHTMOST)) + Assert(SHORT_GET_LOCATION(chunk->hikeyShortLocation) < header->hikeysEnd); + itemsCount = header->itemsCount - chunk->offset; + endLocation = 
header->dataSize; + } + else + { + Assert(header->chunkDesc[i + 1].offset <= header->itemsCount); + Assert(header->chunkDesc[i + 1].offset >= chunk->offset); + itemsCount = header->chunkDesc[i + 1].offset - chunk->offset; + endLocation = SHORT_GET_LOCATION(header->chunkDesc[i + 1].shortLocation); + Assert(endLocation <= header->dataSize); + } + + chunkData = (BTreePageChunk *) ((Pointer) p + SHORT_GET_LOCATION(chunk->shortLocation)); + chunkSize = endLocation - SHORT_GET_LOCATION(chunk->shortLocation); + Assert(MAXALIGN(sizeof(LocationIndex) * itemsCount) <= chunkSize); + + for (j = 0; j < itemsCount; j++) + { + if (!(i == 0 && j == 0 && !O_PAGE_IS(p, LEAF))) + { + Assert((ITEM_GET_FLAGS(chunkData->items[j]) & O_TUPLE_FLAGS_FIXED_FORMAT) || (chunk->chunkKeysFixed == 0)); + } + Assert(ITEM_GET_OFFSET(chunkData->items[j]) >= MAXALIGN(sizeof(LocationIndex) * itemsCount)); + Assert(ITEM_GET_OFFSET(chunkData->items[j]) <= chunkSize); + if (j > 0) + Assert(ITEM_GET_OFFSET(chunkData->items[j]) >= ITEM_GET_OFFSET(chunkData->items[j - 1])); + if (j < itemsCount - 1 && O_PAGE_IS(p, LEAF) && ITEM_GET_FLAGS(chunkData->items[j]) == 0) + Assert(ITEM_GET_OFFSET(chunkData->items[j]) < ITEM_GET_OFFSET(chunkData->items[j + 1])); + if (desc) + { + OTuple tuple; + int len; + + tuple.formatFlags = ITEM_GET_FLAGS(chunkData->items[j]); + if (O_PAGE_IS(p, LEAF)) + { + tuple.data = (Pointer) chunkData + ITEM_GET_OFFSET(chunkData->items[j]) + BTreeLeafTuphdrSize; + len = BTreeLeafTuphdrSize + o_btree_len(desc, tuple, OTupleLength); + if (!O_TUPLE_IS_NULL(chunkHikey)) + Assert(o_btree_cmp(desc, &tuple, BTreeKeyLeafTuple, &chunkHikey, BTreeKeyNonLeafKey) < 0); + if (!O_TUPLE_IS_NULL(prevChunkHikey)) + Assert(o_btree_cmp(desc, &tuple, BTreeKeyLeafTuple, &prevChunkHikey, BTreeKeyNonLeafKey) >= 0); + } + else + { +#ifdef ORIOLEDB_CUT_FIRST_KEY + if (i == 0 && j == 0) + { + len = BTreeNonLeafTuphdrSize; + O_TUPLE_SET_NULL(tuple); + } + else +#endif + { + tuple.data = (Pointer) chunkData + 
ITEM_GET_OFFSET(chunkData->items[j]) + BTreeNonLeafTuphdrSize; + len = BTreeNonLeafTuphdrSize + o_btree_len(desc, tuple, OKeyLength); + } + if (!O_TUPLE_IS_NULL(chunkHikey) && !O_TUPLE_IS_NULL(tuple)) + Assert(o_btree_cmp(desc, &tuple, BTreeKeyNonLeafKey, &chunkHikey, BTreeKeyNonLeafKey) < 0); + if (!O_TUPLE_IS_NULL(prevChunkHikey) && !O_TUPLE_IS_NULL(tuple)) + Assert(o_btree_cmp(desc, &tuple, BTreeKeyNonLeafKey, &prevChunkHikey, BTreeKeyNonLeafKey) >= 0); + } + + if (j < itemsCount - 1) + Assert(ITEM_GET_OFFSET(chunkData->items[j]) + len <= ITEM_GET_OFFSET(chunkData->items[j + 1])); + else + Assert(ITEM_GET_OFFSET(chunkData->items[j]) + len <= chunkSize); + + } + } + + prevChunkHikey = chunkHikey; + } + +} +#endif + +#ifdef CHECK_PAGE_STATS + +/* + * Check if precalculated number of vacated bytes for leaf pages and number + * of disk downlinks for non-leaf pages is correct. + */ +static void +o_check_btree_page_statistics(BTreeDescr *desc, Pointer p) +{ + if (O_PAGE_IS(p, LEAF)) + { + int nVacatedBytes; + + nVacatedBytes = PAGE_GET_N_VACATED(p); + o_btree_page_calculate_statistics(desc, p); + + Assert(nVacatedBytes == PAGE_GET_N_VACATED(p)); + } + else + { + int nDiskDownlinks; + + nDiskDownlinks = PAGE_GET_N_ONDISK(p); + o_btree_page_calculate_statistics(desc, p); + + Assert(nDiskDownlinks == PAGE_GET_N_ONDISK(p)); + } +} +#endif diff --git a/contrib/orioledb/src/btree/print.c b/contrib/orioledb/src/btree/print.c new file mode 100644 index 00000000000..5d545097186 --- /dev/null +++ b/contrib/orioledb/src/btree/print.c @@ -0,0 +1,896 @@ +/*------------------------------------------------------------------------- + * + * print.c + * Routines for printing orioledb B-tree structure and contents. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/src/btree/print.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/btree.h" +#include "btree/merge.h" +#include "btree/page_chunks.h" +#include "btree/print.h" +#include "btree/undo.h" +#include "transam/oxid.h" +#include "transam/undo.h" +#include "tuple/format.h" +#include "utils/page_pool.h" + +#include "access/transam.h" +#include "utils/builtins.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +typedef struct +{ + BTreePrintOptions *options; + /* Page number in NLR tree traversal */ + OInMemoryBlkno NLRPageNumber; + uint32 minCheckpointNum; + CommitSeqNo minCsn; + UndoLocation minUndoLoc; + /* Used for saving backend id number during NLR traversal. */ +#if PG_VERSION_NUM >= 170000 + ProcNumber backendIdInTraversal; +#else + BackendId backendIdInTraversal; +#endif + bool hasCsn; + /* hash mapping of the backend id to its number in the NLR traversal */ + HTAB *backendIdHash; + + /* + * hash mapping of the page number in memory with number in the NLR tree + * traversal + */ + HTAB *pageHash; + /* sorted list of unique undo locations in ascending order */ + List *undosList[(int) UndoLogsCount]; +} BTreePrintData; + +typedef OInMemoryBlkno PageHashKey; + +#if PG_VERSION_NUM >= 170000 +typedef ProcNumber BackendIdHashKey; +#else +typedef BackendId BackendIdHashKey; +#endif + +typedef struct +{ + BackendIdHashKey backendId; +#if PG_VERSION_NUM >= 170000 + ProcNumber backendIdInTraversal; +#else + BackendId backendIdInTraversal; +#endif +} BackendIdHashEntry; + +typedef struct +{ + PageHashKey inMemoryPageNumber; + OInMemoryBlkno NLRPageNumber; +} PageHashEntry; + +static void print_page_contents_recursive(BTreeDescr *desc, + OInMemoryBlkno blkno, + PrintFunc keyPrintFunc, + PrintFunc tuplePrintFunc, + Pointer printArg, + BTreePrintData *printData, + int depthLeft, StringInfo outbuf); +static void
btree_calculate_min_values(UndoLogType undoType, + OInMemoryBlkno blkno, + BTreePrintData *printData); +static bool btree_print_csn(CommitSeqNo csn, StringInfo outbuf, + BTreePrintData *printData, bool addComma); +static void btree_print_backend_id(OXid oxid, StringInfo outbuf, + BTreePrintData *printData); +static uint64 lundo_location(List *list, UndoLocation location); +static bool btree_print_undo_location(UndoLogType undoType, + UndoLocation undoLocation, + StringInfo outbuf, + BTreePrintData *printData, + bool addComma); +static bool btree_print_format_flags(int formatFlags, StringInfo outbuf, + BTreePrintData *printData, bool addComma); +static void btree_print_page_number(OInMemoryBlkno blkno, StringInfo outbuf, + BTreePrintData *printData); +static void btree_print_orioledb_downlink(uint64 downlink, StringInfo outbuf, + BTreePrintData *printData); +static void btree_print_rightlink(OInMemoryBlkno rightlink, StringInfo outbuf, + BTreePrintData *printData); +static void pdata_set_min_csn(BTreePrintData *printData, + CommitSeqNo csn); +static List *ladd_unique_undo(List *list, + UndoLogType undoType, + UndoLocation location); + + +/* + * Recursively print contents of B-tree pages with given depth. Uses + * callbacks for printing keys and tuples. 
+ */ +void +o_print_btree_pages(BTreeDescr *desc, StringInfo outbuf, + PrintFunc keyPrintFunc, PrintFunc tuplePrintFunc, + Pointer printArg, BTreePrintOptions *options, int depth) +{ + HASHCTL ctl; + BTreePrintData printData = {0}; + int i; + + if (options->undoLogLocationPrintType != BTreeNotPrint && desc->undoType != UndoLogNone) + { + update_min_undo_locations(desc->undoType, false, true); + if (desc->undoType != GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType)) + update_min_undo_locations(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType), false, true); + } + Assert(OInMemoryBlknoIsValid(desc->rootInfo.rootPageBlkno) && + OInMemoryBlknoIsValid(desc->rootInfo.metaPageBlkno)); + + printData.options = options; + + MemSet(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(BackendIdHashKey); + ctl.entrysize = sizeof(BackendIdHashEntry); + ctl.hcxt = CurrentMemoryContext; + printData.backendIdHash = hash_create("backend id hash", GetMaxBackends(), &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + ctl.keysize = sizeof(PageHashKey); + ctl.entrysize = sizeof(PageHashEntry); + ctl.hcxt = CurrentMemoryContext; + printData.pageHash = hash_create("page hash", 8, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* calculate minimal values only if one of the options set */ + if (printData.options->pagePrintType == BTreePrintRelative || + printData.options->csnPrintType == BTreePrintRelative || + printData.options->undoLogLocationPrintType == BTreePrintRelative || + printData.options->backendIdPrintType == BTreePrintAbsolute) + { + printData.minCheckpointNum = UINT32_MAX; + printData.hasCsn = false; + printData.minUndoLoc = InvalidUndoLocation; + printData.backendIdInTraversal = 0; + printData.NLRPageNumber = 0; + for (i = 0; i < (int) UndoLogsCount; i++) + printData.undosList[i] = NIL; + btree_calculate_min_values(desc->undoType, + desc->rootInfo.rootPageBlkno, + &printData); + } + printData.NLRPageNumber = 0; + print_page_contents_recursive(desc, desc->rootInfo.rootPageBlkno, keyPrintFunc, 
tuplePrintFunc, + printArg, &printData, depth, outbuf); + + for (i = 0; i < (int) UndoLogsCount; i++) + list_free_deep(printData.undosList[i]); + hash_destroy(printData.pageHash); + hash_destroy(printData.backendIdHash); +} + +/* + * Print contents of given B-tree page. If non-leaf page is given, recursively + * print children (nothing is printed once depthLeft reaches zero). + */ +static void +print_page_contents_recursive(BTreeDescr *desc, OInMemoryBlkno blkno, + PrintFunc keyPrintFunc, + PrintFunc tuplePrintFunc, + Pointer printArg, + BTreePrintData *printData, + int depthLeft, StringInfo outbuf) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + BTreePageHeader *header = (BTreePageHeader *) p; + OrioleDBPageDesc *page_desc = O_GET_IN_MEMORY_PAGEDESC(blkno); + BTreePageItemLocator loc; + OffsetNumber i, + j, + k; + + if (depthLeft <= 0) + return; + + btree_print_page_number(blkno, outbuf, printData); + appendStringInfo(outbuf, "level = %d, maxKeyLen = %d", + PAGE_GET_LEVEL(p), + header->maxKeyLen); + btree_print_csn(header->csn, outbuf, printData, true); + + if (UndoLocationIsValid(header->undoLocation)) + { + btree_print_undo_location(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType), + header->undoLocation, + outbuf, printData, true); + } + + if (O_PAGE_IS(p, LEAF)) + appendStringInfo(outbuf, ", nVacatedBytes = %u", PAGE_GET_N_VACATED(p)); + + if (is_page_too_sparse(desc, p)) + appendStringInfo(outbuf, ", sparse"); + + appendStringInfo(outbuf, "\n"); + + if (printData->options->printStateValue) + appendStringInfo(outbuf, "state = " UINT64_FORMAT, pg_atomic_read_u64(&(O_PAGE_HEADER(p)->state))); + else + { + uint64 state = pg_atomic_read_u64(&(O_PAGE_HEADER(p)->state)); + + if (O_PAGE_STATE_READ_IS_BLOCKED(state)) + appendStringInfo(outbuf, "state = modify"); + else if (O_PAGE_STATE_IS_LOCKED(state)) + appendStringInfo(outbuf, "state = locked"); + else + appendStringInfo(outbuf, "state = free"); + } + + if (printData->options->changeCountPrintType == BTreePrintAbsolute) + appendStringInfo(outbuf, ", pageChangeCount = %u",
O_PAGE_HEADER(p)->pageChangeCount); + + appendStringInfo(outbuf, ", datoid "); + if (printData->options->idsPrintType == BTreePrintRelative) + appendStringInfo(outbuf, "%sequal", page_desc->oids.datoid == desc->oids.datoid ? "" : "not "); + else + appendStringInfo(outbuf, "= %u", page_desc->oids.datoid); + + appendStringInfo(outbuf, ", relnode "); + if (printData->options->idsPrintType == BTreePrintRelative) + appendStringInfo(outbuf, "%sequal", page_desc->oids.relnode == desc->oids.relnode ? "" : "not "); + else + appendStringInfo(outbuf, "= %u", page_desc->oids.relnode); + + switch (desc->type) + { + case oIndexInvalid: + appendStringInfo(outbuf, ", ix_type = invalid"); + break; + case oIndexToast: + appendStringInfo(outbuf, ", ix_type = toast"); + break; + case oIndexPrimary: + appendStringInfo(outbuf, ", ix_type = primary"); + break; + case oIndexUnique: + appendStringInfo(outbuf, ", ix_type = unique"); + break; + case oIndexRegular: + appendStringInfo(outbuf, ", ix_type = regular"); + break; + case oIndexBridge: + appendStringInfo(outbuf, ", ix_type = bridge"); + break; + case oIndexExclusion: + appendStringInfo(outbuf, ", ix_type = exclusion"); + break; + default: + appendStringInfo(outbuf, ", ix_type = wrong"); + break; + } + + if (OCompressIsValid(desc->compress)) + appendStringInfo(outbuf, ", compression = %d", desc->compress); + + if (IS_DIRTY(blkno)) + { + if (IS_DIRTY_CONCURRENT(blkno)) /* mention concurrent IO only when that flag is set */ + appendStringInfo(outbuf, ", dirty, concurrent IO"); + else + appendStringInfo(outbuf, ", dirty"); + } + else + appendStringInfo(outbuf, ", clean"); + + if (printData->options->printFileOffset) + { + if (FileExtentIsValid(page_desc->fileExtent)) + appendStringInfo(outbuf, ", fileOffset = %lu", (long unsigned) page_desc->fileExtent.off); + else + appendStringInfo(outbuf, ", fileOffset is invalid"); + } + + if (printData->options->checkpointNumPrintType == BTreePrintAbsolute) + appendStringInfo(outbuf, ", checkpointNum = %u", header->o_header.checkpointNum); + else if
(printData->options->checkpointNumPrintType == BTreePrintRelative) + appendStringInfo(outbuf, ", checkpointNum = %u", header->o_header.checkpointNum - printData->minCheckpointNum); + + if (printData->options->printFixedFlags && O_PAGE_IS(p, HIKEYS_FIXED)) + appendStringInfo(outbuf, ", hikeys fixed"); + + if (O_PAGE_IS(p, BROKEN_SPLIT)) + appendStringInfo(outbuf, ", broken split"); + + appendStringInfo(outbuf, "\n"); + + appendStringInfo(outbuf, O_PAGE_IS(p, LEFTMOST) ? " Leftmost, " : " "); + if (!O_PAGE_IS(p, RIGHTMOST)) + { + OTuple hikey; + + btree_print_rightlink(RIGHTLINK_GET_BLKNO(header->rightLink), outbuf, printData); + BTREE_PAGE_GET_HIKEY(hikey, p); + appendStringInfo(outbuf, " Hikey: offset = %d, key = ", + (int) ((Pointer) hikey.data - (Pointer) p)); + keyPrintFunc(desc, outbuf, hikey, printArg); + appendStringInfo(outbuf, "\n"); + } + else + { + appendStringInfo(outbuf, "Rightmost\n"); + } + + i = 0; + j = 0; + for (j = 0; j < header->chunksCount; j++) + { + appendStringInfo(outbuf, " Chunk %i: offset = %u, location = %u, hikey location = %u", + j, + header->chunkDesc[j].offset, + SHORT_GET_LOCATION(header->chunkDesc[j].shortLocation), + SHORT_GET_LOCATION(header->chunkDesc[j].hikeyShortLocation)); + if (!O_PAGE_IS(p, RIGHTMOST) || j < header->chunksCount - 1) + { + OTuple hikey; + + hikey.formatFlags = header->chunkDesc[j].hikeyFlags; + hikey.data = (Pointer) p + SHORT_GET_LOCATION(header->chunkDesc[j].hikeyShortLocation); + appendStringInfo(outbuf, ", hikey = "); + keyPrintFunc(desc, outbuf, hikey, printArg); + } + if (printData->options->printFixedFlags && header->chunkDesc[j].chunkKeysFixed) + appendStringInfo(outbuf, ", items fixed"); + appendStringInfo(outbuf, "\n"); + page_chunk_fill_locator(p, j, &loc); + for (k = 0; k < loc.chunkItemsCount; k++) + { + loc.itemOffset = k; + if (O_PAGE_IS(p, LEAF)) + { + BTreeLeafTuphdr tuphdr, + *pageTuphdr; + OTuple tuple; + bool inUndo = false; + + BTREE_PAGE_READ_LEAF_ITEM(pageTuphdr, tuple, p, &loc); + 
tuphdr = *pageTuphdr; + appendStringInfo(outbuf, " Item %i: ", i); + + while (true) + { + bool needsComma = false; + + if (inUndo) + appendStringInfo(outbuf, " Undo item: "); + + if (XACT_INFO_IS_FINISHED(tuphdr.xactInfo)) + { + needsComma = btree_print_csn(XACT_INFO_MAP_CSN(tuphdr.xactInfo), outbuf, printData, false); + } + else + { + int lockMode = XACT_INFO_GET_LOCK_MODE(tuphdr.xactInfo); + + if (XACT_INFO_IS_LOCK_ONLY(tuphdr.xactInfo)) + { + appendStringInfo(outbuf, "lock only, "); + } + switch (lockMode) + { + case RowLockKeyShare: + appendStringInfo(outbuf, "mode = keyShare"); + break; + case RowLockShare: + appendStringInfo(outbuf, "mode = share"); + break; + case RowLockNoKeyUpdate: + appendStringInfo(outbuf, "mode = noKeyUpdate"); + break; + case RowLockUpdate: + appendStringInfo(outbuf, "mode = update"); + break; + default: + elog(ERROR, "Invalid lock mode: %u", lockMode); + break; + } + needsComma = true; + btree_print_backend_id(XACT_INFO_GET_OXID(tuphdr.xactInfo), outbuf, printData); + } + + needsComma |= btree_print_format_flags(tuphdr.formatFlags, + outbuf, printData, + needsComma); + + if (tuphdr.deleted != BTreeLeafTupleNonDeleted) + { + if (needsComma) + appendStringInfo(outbuf, ", "); + else + needsComma = true; + if (tuphdr.deleted == BTreeLeafTupleDeleted) + appendStringInfo(outbuf, "deleted"); + else if (tuphdr.deleted == BTreeLeafTupleMovedPartitions) + appendStringInfo(outbuf, "moved partitions"); + else if (tuphdr.deleted == BTreeLeafTuplePKChanged) + appendStringInfo(outbuf, "PK changed"); + } + needsComma |= btree_print_undo_location(desc->undoType, (UndoLocation) tuphdr.undoLocation, outbuf, printData, needsComma); + + if (!inUndo) + { + if (needsComma) + appendStringInfo(outbuf, ", "); + else + needsComma = true; + appendStringInfo(outbuf, "offset = %u", + BTREE_PAGE_GET_ITEM_OFFSET(p, &loc)); + } + + if (tuphdr.chainHasLocks) + { + if (needsComma) + appendStringInfo(outbuf, ", "); + else + needsComma = true; + appendStringInfo(outbuf, 
"chainHasLocks"); + } + + if (!O_TUPLE_IS_NULL(tuple)) + { + if (needsComma) + appendStringInfo(outbuf, ", "); + else + needsComma = true; + appendStringInfo(outbuf, "tuple = "); + tuplePrintFunc(desc, outbuf, tuple, printArg); + } + appendStringInfo(outbuf, "\n"); + + if ((!XACT_INFO_IS_FINISHED(tuphdr.xactInfo) || tuphdr.chainHasLocks) && + UndoLocationIsValid(tuphdr.undoLocation)) + { + if (!UNDO_REC_EXISTS(desc->undoType, tuphdr.undoLocation)) + { + appendStringInfo(outbuf, "INVALID UNDO LOCATION: %llu\n", (unsigned long long) tuphdr.undoLocation); + break; + } + if (inUndo && !O_TUPLE_IS_NULL(tuple)) + pfree(tuple.data); + if (!tuphdr.deleted && !XACT_INFO_IS_LOCK_ONLY(tuphdr.xactInfo)) + { + get_prev_leaf_header_and_tuple_from_undo(desc->undoType, + &tuphdr, &tuple, 0); + } + else + { + get_prev_leaf_header_from_undo(desc->undoType, &tuphdr, false); + O_TUPLE_SET_NULL(tuple); + } + inUndo = true; + } + else + { + break; + } + } + if (inUndo && !O_TUPLE_IS_NULL(tuple)) + pfree(tuple.data); + } + else + { + BTreeNonLeafTuphdr *tuphdr; + OTuple tuple; + + BTREE_PAGE_READ_INTERNAL_ITEM(tuphdr, tuple, p, &loc); + + appendStringInfo(outbuf, " Item %i: ", i); + appendStringInfo(outbuf, "offset = %u", + BTREE_PAGE_GET_ITEM_OFFSET(p, &loc)); + if (DOWNLINK_IS_IN_MEMORY(tuphdr->downlink)) + btree_print_orioledb_downlink(tuphdr->downlink, outbuf, printData); + else if (DOWNLINK_IS_IN_IO(tuphdr->downlink)) + appendStringInfo(outbuf, ", in-progress (%u)", + DOWNLINK_GET_IO_LOCKNUM(tuphdr->downlink)); + else + appendStringInfo(outbuf, ", downlink = on-disk (%lu, %u)", + DOWNLINK_GET_DISK_OFF(tuphdr->downlink), + DOWNLINK_GET_DISK_LEN(tuphdr->downlink)); + if (i != 0) + { + appendStringInfo(outbuf, ", key = "); + keyPrintFunc(desc, outbuf, tuple, printArg); + } + appendStringInfo(outbuf, "\n"); + } + i++; + } + } + appendStringInfo(outbuf, "\n"); + + printData->NLRPageNumber++; + + if (!O_PAGE_IS(p, LEAF)) + { + BTREE_PAGE_FOREACH_ITEMS(p, &loc) + { + Pointer ptr = 
BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc); + BTreeNonLeafTuphdr *tuphdr = (BTreeNonLeafTuphdr *) ptr; + + if (DOWNLINK_IS_IN_MEMORY(tuphdr->downlink)) + print_page_contents_recursive(desc, + DOWNLINK_GET_IN_MEMORY_BLKNO(tuphdr->downlink), + keyPrintFunc, tuplePrintFunc, printArg, printData, + depthLeft - 1, outbuf); + } + } + + blkno = RIGHTLINK_GET_BLKNO(BTREE_PAGE_GET_RIGHTLINK(p)); + if (OInMemoryBlknoIsValid(blkno)) + print_page_contents_recursive(desc, blkno, keyPrintFunc, tuplePrintFunc, + printArg, printData, depthLeft, outbuf); +} + +/* + * Calculate values needed for printing page positions in NLR traversal, + * relative CSNs and Undo locations. + */ +static void +btree_calculate_min_values(UndoLogType undoType, OInMemoryBlkno blkno, + BTreePrintData *printData) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + BTreePageHeader *header = (BTreePageHeader *) p; + PageHashEntry *pageHashEntry; + BackendIdHashEntry *backendIdHashEntry; + BTreePageItemLocator loc; + bool found; + UndoLogType pageUndoType = GET_PAGE_LEVEL_UNDO_TYPE(undoType); + + + /* if page number is not in hash, then add new value to hash */ + pageHashEntry = (PageHashEntry *) hash_search(printData->pageHash, + &blkno, HASH_ENTER, &found); + if (!found) + { + pageHashEntry->NLRPageNumber = printData->NLRPageNumber; + printData->NLRPageNumber++; + } + + printData->minCheckpointNum = Min(printData->minCheckpointNum, + header->o_header.checkpointNum); + printData->undosList[(int) pageUndoType] = ladd_unique_undo(printData->undosList[(int) pageUndoType], + pageUndoType, + header->undoLocation); + + /* Iterate over the child nodes */ + BTREE_PAGE_FOREACH_ITEMS(p, &loc) + { + Pointer ptr = BTREE_PAGE_LOCATOR_GET_ITEM(p, &loc); + + if (O_PAGE_IS(p, LEAF)) + { + BTreeLeafTuphdr tuphdr = *((BTreeLeafTuphdr *) ptr); + + while (true) + { + printData->undosList[(int) undoType] = ladd_unique_undo(printData->undosList[(int) undoType], + undoType, + tuphdr.undoLocation); + + if 
(XACT_INFO_IS_FINISHED(tuphdr.xactInfo)) + { + pdata_set_min_csn(printData, XACT_INFO_MAP_CSN(tuphdr.xactInfo)); + } + else + { + OXid oxid = XACT_INFO_GET_OXID(tuphdr.xactInfo); + uint32 procnum = oxid_get_procnum(oxid); + + backendIdHashEntry = (BackendIdHashEntry *) hash_search(printData->backendIdHash, + &procnum, + HASH_ENTER, + &found); + + /* + * if the backend id was not in the hash yet, this is its first + * appearance in the traversal, so store its traversal-order + * number in the new hash entry + */ + if (!found) + { + backendIdHashEntry->backendIdInTraversal = printData->backendIdInTraversal; + printData->backendIdInTraversal++; + } + } + if ((!XACT_INFO_IS_FINISHED(tuphdr.xactInfo) || tuphdr.chainHasLocks) && + UndoLocationIsValid(tuphdr.undoLocation)) + get_prev_leaf_header_from_undo(undoType, &tuphdr, false); + else + break; + } + } + else + { + BTreeNonLeafTuphdr *tuphdr = (BTreeNonLeafTuphdr *) ptr; + + /* If tuple is downlink in memory */ + if (DOWNLINK_IS_IN_MEMORY(tuphdr->downlink)) + { + /* recursively traverse to every child */ + btree_calculate_min_values(undoType, + DOWNLINK_GET_IN_MEMORY_BLKNO(tuphdr->downlink), + printData); + } + } + } + + /* If node has valid rightlink also traverse to it */ + blkno = RIGHTLINK_GET_BLKNO(BTREE_PAGE_GET_RIGHTLINK(p)); + if (OInMemoryBlknoIsValid(blkno)) + btree_calculate_min_values(undoType, blkno, printData); +} + +/* + * Print the csn if csn printing is enabled; returns true if it was printed + */ +static bool +btree_print_csn(CommitSeqNo csn, StringInfo outbuf, BTreePrintData *printData, bool addComma) +{ + CommitSeqNo printedCsn = csn; + + /* print csn if the option has any value other than BTreeNotPrint */ + if (printData->options->csnPrintType != BTreeNotPrint) + { + if (addComma) + appendStringInfo(outbuf, ", "); + appendStringInfo(outbuf, "csn = "); + if (COMMITSEQNO_IS_FROZEN(printedCsn)) + appendStringInfo(outbuf, "FROZEN"); + else if (COMMITSEQNO_IS_INPROGRESS(printedCsn)) + appendStringInfo(outbuf, "INPROGRESS"); + else + { + /* + * If relative csn option set, 
then substract min csn from + * absolute node csn + */ + if (printData->options->csnPrintType == BTreePrintRelative) + printedCsn = csn - printData->minCsn; + appendStringInfo(outbuf, UINT64_FORMAT, printedCsn); + } + return true; + } + return false; +} + +/* + * Print node backend id + */ +static void +btree_print_backend_id(OXid oxid, StringInfo outbuf, BTreePrintData *printData) +{ +#if PG_VERSION_NUM >= 170000 + ProcNumber backendId = oxid_get_procnum(oxid); +#else + BackendId backendId = oxid_get_procnum(oxid); +#endif + + BackendIdHashEntry *hentry; + bool found; + + if (printData->options->backendIdPrintType != BTreeNotPrint) + { + /* find backend id in traversal by backend id */ + hentry = (BackendIdHashEntry *) hash_search(printData->backendIdHash, &backendId, HASH_FIND, &found); + Assert(found); + appendStringInfo(outbuf, ", backend = %d", hentry->backendIdInTraversal); + } +} + +static uint64 +lundo_location(List *list, UndoLocation location) +{ + ListCell *lc; + uint64 i = 0; + + foreach(lc, list) + { + if (*((UndoLocation *) lfirst(lc)) == location) + break; + i++; + } + return i; +} + +static bool +btree_print_undo_location(UndoLogType undoType, UndoLocation undoLocation, + StringInfo outbuf, BTreePrintData *printData, + bool addComma) +{ + UndoLocation printedUndoLoc = undoLocation; + BTreePrintOption printType = printData->options->undoLogLocationPrintType; + + if (printType != BTreeNotPrint && undoType != UndoLogNone) + { + /* print undo location only if it is valid */ + if (UndoLocationIsValid(undoLocation) && + (((UNDO_REC_EXISTS(undoType, undoLocation) && printType == BTreePrintAbsolute)) || + ((UNDO_REC_XACT_RETAIN(undoType, undoLocation) && printType == BTreePrintRelative)))) + { + /* + * if ascending number option set, then it gets number of undo + * location in sorted list + */ + if (printData->options->undoLogLocationPrintType == + BTreePrintRelative) + { + printedUndoLoc = lundo_location(printData->undosList[(int) undoType], undoLocation); 
+ Assert(!printedUndoLoc || + printedUndoLoc != list_length(printData->undosList[(int) undoType])); + } + if (addComma) + appendStringInfo(outbuf, ", "); + appendStringInfo(outbuf, "undoLocation = " UINT64_FORMAT, printedUndoLoc); + return true; + } + } + return false; +} + +static bool +btree_print_format_flags(int formatFlags, StringInfo outbuf, + BTreePrintData *printData, bool addComma) +{ + if (printData->options->printFormatFlags) + { + if (addComma) + appendStringInfo(outbuf, ", "); + appendStringInfo(outbuf, "format = %sFIXED", + formatFlags == O_TUPLE_FLAGS_FIXED_FORMAT ? "" : + "NOT "); + return true; + } + return false; +} + +/* + * Print page number for node + */ +static void +btree_print_page_number(OInMemoryBlkno blkno, StringInfo outbuf, BTreePrintData *printData) +{ + PageHashEntry *hentry; + bool found; + OInMemoryBlkno printedPageNumber = blkno; + + /* print page number in NLR traverse only if corresponding option set */ + if (printData->options->pagePrintType == BTreePrintRelative) + { + /* find the corresponding page number in NLR traversal */ + hentry = (PageHashEntry *) hash_search(printData->pageHash, &blkno, HASH_FIND, &found); + Assert(found); + printedPageNumber = hentry->NLRPageNumber; + } + appendStringInfo(outbuf, "Page %u: ", printedPageNumber); +} + +/* + * Print in memory downlink for child node + */ +static void +btree_print_orioledb_downlink(uint64 downlink, StringInfo outbuf, BTreePrintData *printData) +{ + PageHashEntry *hentry; + bool found; + OInMemoryBlkno printedPageNumber = DOWNLINK_GET_IN_MEMORY_BLKNO(downlink); + + /* print page number in NLR traverse only if corresponding option set */ + if (printData->options->pagePrintType == BTreePrintRelative) + { + /* find the corresponding page number in NLR traversal */ + hentry = (PageHashEntry *) hash_search(printData->pageHash, &printedPageNumber, HASH_FIND, &found); + Assert(found); + printedPageNumber = hentry->NLRPageNumber; + } + appendStringInfo(outbuf, ", downlink = %u", 
printedPageNumber); + if (printData->options->changeCountPrintType == BTreePrintAbsolute) + appendStringInfo(outbuf, " (%u)", DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(downlink)); +} + +/* + * Print rightlink for node + */ +static void +btree_print_rightlink(OInMemoryBlkno rightlink, StringInfo outbuf, BTreePrintData *printData) +{ + PageHashEntry *hentry; + bool found; + OInMemoryBlkno printedPageNumber = rightlink; + + /* + * print rightlink page number in NLR traverse only if corresponding + * option set + */ + if (OInMemoryBlknoIsValid(rightlink)) + { + if (printData->options->pagePrintType == BTreePrintRelative) + { + /* find the corresponding page number in NLR traversal */ + hentry = (PageHashEntry *) hash_search(printData->pageHash, &printedPageNumber, HASH_FIND, &found); + Assert(found); + printedPageNumber = hentry->NLRPageNumber; + } + appendStringInfo(outbuf, "Rightlink = %u\n", printedPageNumber); + } + else + appendStringInfo(outbuf, "Rightlink is invalid\n"); +} + +static void +pdata_set_min_csn(BTreePrintData *printData, CommitSeqNo csn) +{ + if (!COMMITSEQNO_IS_NORMAL(csn)) + return; + + if (!printData->hasCsn || printData->minCsn > csn) + { + printData->hasCsn = true; + printData->minCsn = csn; + } +} + +/* adds undo location to the list keeping it unique and in ascending order; invalid or non-retained locations are skipped and the list is returned as is */ +static List * +ladd_unique_undo(List *list, UndoLogType undoType, UndoLocation location) +{ + ListCell *lc; + UndoLocation lLoc, + *copyLoc; + int insertAt = -1, + i; + + if (!UndoLocationIsValid(location) || + !UNDO_REC_XACT_RETAIN(undoType, location)) + return list; + + copyLoc = palloc(sizeof(UndoLocation)); + *copyLoc = location; + + /* lappend_cell does not work with NIL list */ + if (list == NIL) + return lappend(list, copyLoc); + + lLoc = *((UndoLocation *) linitial(list)); + if (lLoc > location) + return lcons(copyLoc, list); + + i = 0; + foreach(lc, list) + { + lLoc = *((UndoLocation *) lfirst(lc)); + if (lLoc == location) + { + pfree(copyLoc); + return list; + } + if (lLoc > location) + break; 
+ insertAt = i; + i++; + } + Assert(insertAt >= 0); + list = list_insert_nth(list, insertAt + 1, copyLoc); /* insertAt is the last smaller entry; the new cell goes right after it */ + return list; +} diff --git a/contrib/orioledb/src/btree/scan.c b/contrib/orioledb/src/btree/scan.c new file mode 100644 index 00000000000..2924a4b7e2a --- /dev/null +++ b/contrib/orioledb/src/btree/scan.c @@ -0,0 +1,1979 @@ +/*------------------------------------------------------------------------- + * + * scan.c + * Routines for sequential scan of orioledb B-tree + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/src/btree/scan.c + * + * ALGORITHM + * + * The big picture algorithm of sequential scan is as follows. + * 1. Scan all the internal pages with level == 1. The total amount of + * internal pages are expected to be small. So, it should be OK to + * scan them in logical order. + * 1.1. Immediately scan children's leaves and return their contents. + * 1.2. Edge cases are handled using iterators. They are expected to + * be very rare. + * 1.3. Collect on-disk downlinks into an array together with CSN at + * the moment of the corresponding internal page read. + * 2. Ascending sort array of downlinks providing as sequential access + * pattern as possible. + * 3. Scan sorted downlink and apply the corresponding CSN. + * + * PARALLEL SCAN + * + * The parallel sequential scan is implemented as follows. + * 1. The scan leader creates a shared DSM array for on-disk downlinks, + * initially sized to TREE_NUM_LEAF_PAGES. + * 2. Two internal page images (level == 1) are kept in shared memory. + * 3. Workers are iterating the downlinks of these pages in parallel + * one by one. On-disk downlinks are written directly to the shared + * DSM array. If the array is full, it is reallocated under lock. + * 4. Once the internal page is finished, one worker loads the next page in + * its place. Other workers continue to process the downlink of the + * remaining page. + * 5. 
Once internal page processing is finished, one worker sorts the + * shared on-disk downlinks array under lock. + * 6. Workers process on-disk downlinks in parallel one by one. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/btree.h" +#include "btree/find.h" +#include "btree/io.h" +#include "btree/iterator.h" +#include "btree/page_chunks.h" +#include "btree/scan.h" +#include "btree/undo.h" +#include "tableam/descr.h" +#include "transam/oxid.h" +#include "tableam/descr.h" +#include "tuple/slot.h" +#include "utils/page_pool.h" +#include "utils/resowner.h" +#include "utils/sampling.h" +#include "utils/stopevent.h" + +#include "miscadmin.h" +#include "utils/wait_event.h" + +typedef enum +{ + BTreeSeqScanInMemory, + BTreeSeqScanDisk, + BTreeSeqScanFinished +} BTreeSeqScanStatus; + +typedef struct +{ + uint64 downlink; + CommitSeqNo csn; +} BTreeSeqScanDiskDownlink; + +struct BTreeSeqScan +{ + BTreeDescr *desc; + + char leafImg[ORIOLEDB_BLCKSZ]; + char histImg[ORIOLEDB_BLCKSZ]; + + bool initialized; + bool checkpointNumberSet; + OSnapshot oSnapshot; + OBTreeFindPageContext context; + OFixedKey prevHikey; + BTreeLocationHint hint; + + BTreePageItemLocator intLoc; + + /* + * The page offset we started with according to `prevHikey`; + */ + OffsetNumber intStartOffset; + + BTreePageItemLocator leafLoc; + + bool haveHistImg; + BTreePageItemLocator histLoc; + + BTreeSeqScanStatus status; + MemoryContext mctx; + + BTreeSeqScanDiskDownlink *diskDownlinks; + int64 downlinksCount; /* Used only for serial scan */ + int64 downlinkIndex; + int64 allocatedDownlinks; + + BTreeIterator *iter; + OTuple iterEnd; + + /* + * Number of the last completed checkpoint when scan was started. We need + * on-disk pages of this checkpoint to be not overridden until scan + * finishes. This means we shouldn't start using free blocks of later + * checkpoints before this scan is finished. 
+ */ + uint32 checkpointNumber; + + BTreeMetaPage *metaPageBlkno; + dlist_node listNode; + + OFixedKey nextKey; + + bool needSampling; + BlockSampler sampler; + BlockNumber samplingNumber; + BlockNumber samplingNext; + + BTreeSeqScanCallbacks *cb; + void *arg; + bool isSingleLeafPage; /* Scan couldn't read first internal page */ + OFixedKey keyRangeLow, + keyRangeHigh; + bool firstPageIsLoaded; + + /* Private parallel worker info in a backend */ + ParallelOScanDesc poscan; + int workerNumber; + dsm_segment *dsmSeg; + + /* Ensures scan cleanup on transaction abort or resource owner release */ + ResourceOwner resowner; +}; + +static dlist_head listOfScans = DLIST_STATIC_INIT(listOfScans); + +static void scan_make_iterator(BTreeSeqScan *scan, OTuple startKey, OTuple keyRangeHigh); +static void get_next_key(BTreeSeqScan *scan, BTreePageItemLocator *intLoc, OFixedKey *nextKey, Page page); +static void ResourceOwnerRememberBTreeSeqScan(ResourceOwner owner, BTreeSeqScan *scan); +static void ResourceOwnerForgetBTreeSeqScan(ResourceOwner owner, BTreeSeqScan *scan); + +/* + * Resource owner integration for BTreeSeqScan. + * + * Previously seq_scans_cleanup() only ran after transaction finish, so seq + * scans were not released correctly on subtransaction finish, release of + * prepared statements, etc. Binding seq scans to ResourceOwner solves this. + * + * PG >= 17 uses custom ResourceOwner resources. PG 16 uses a release + * callback. 
+ */ +#if PG_VERSION_NUM >= 170000 +static void ResOwnerReleaseBTreeSeqScan(Datum res); +static char *ResOwnerPrintBTreeSeqScan(Datum res); + +static const ResourceOwnerDesc btree_seq_scan_resowner_desc = +{ + .name = "OrioleDB BTreeSeqScans", + .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS, + .release_priority = RELEASE_PRIO_RELCACHE_REFS - 1, + .ReleaseResource = ResOwnerReleaseBTreeSeqScan, + .DebugPrint = ResOwnerPrintBTreeSeqScan +}; +#endif + +BTreeScanShmem *btreeScanShmem; + +Size +btree_scan_shmem_needs(void) +{ + return CACHELINEALIGN(sizeof(BTreeScanShmem)); +} + +void +btree_scan_init_shmem(Pointer ptr, bool found) +{ + btreeScanShmem = (BTreeScanShmem *) ptr; + + if (!found) + { + btreeScanShmem->pageLoadTrancheId = LWLockNewTrancheId(); + btreeScanShmem->downlinksPublishTrancheId = LWLockNewTrancheId(); + } + + LWLockRegisterTranche(btreeScanShmem->pageLoadTrancheId, + "OBTreeScanPageLoadTrancheId"); + LWLockRegisterTranche(btreeScanShmem->downlinksPublishTrancheId, + "OBTreeScanDownlinksPublishTrancheId"); +} + + +static void +load_first_historical_page(BTreeSeqScan *scan) +{ + BTreePageHeader *header = (BTreePageHeader *) scan->leafImg; + Pointer key = NULL; + BTreeKeyType kind = BTreeKeyNone; + OFixedKey lokey, + *lokeyPtr = &lokey; + OFixedKey hikey; + + scan->haveHistImg = false; + if (!COMMITSEQNO_IS_NORMAL(scan->oSnapshot.csn)) + return; + + if (!O_PAGE_IS(scan->leafImg, RIGHTMOST)) + copy_fixed_hikey(scan->desc, &hikey, scan->leafImg); + else + O_TUPLE_SET_NULL(hikey.tuple); + O_TUPLE_SET_NULL(lokey.tuple); + + while (COMMITSEQNO_IS_NORMAL(header->csn) && + header->csn >= scan->oSnapshot.csn) + { + if (!UNDO_REC_EXISTS(GET_PAGE_LEVEL_UNDO_TYPE(scan->desc->undoType), + header->undoLocation)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("snapshot too old"))); + } + + (void) get_page_from_undo(scan->desc, header->undoLocation, key, kind, + scan->histImg, NULL, NULL, NULL, + lokeyPtr, &hikey.tuple); + + if 
(!O_PAGE_IS(scan->histImg, RIGHTMOST)) + copy_fixed_hikey(scan->desc, &hikey, scan->histImg); + else + O_TUPLE_SET_NULL(hikey.tuple); + + scan->haveHistImg = true; + header = (BTreePageHeader *) scan->histImg; + if (!O_TUPLE_IS_NULL(lokey.tuple)) + { + key = (Pointer) &lokey.tuple; + kind = BTreeKeyNonLeafKey; + lokeyPtr = NULL; + } + } + + if (!scan->haveHistImg) + return; + + if (!O_TUPLE_IS_NULL(lokey.tuple)) + { + (void) btree_page_search(scan->desc, scan->histImg, + (Pointer) &lokey.tuple, + BTreeKeyNonLeafKey, NULL, + &scan->histLoc); + (void) page_locator_find_real_item(scan->histImg, NULL, &scan->histLoc); + } + else + { + BTREE_PAGE_LOCATOR_FIRST(scan->histImg, &scan->histLoc); + } + +} + +static void +load_next_historical_page(BTreeSeqScan *scan) +{ + BTreePageHeader *header = (BTreePageHeader *) scan->leafImg; + OFixedKey prevHikey; + + copy_fixed_hikey(scan->desc, &prevHikey, scan->histImg); + + while (COMMITSEQNO_IS_NORMAL(header->csn) && + header->csn >= scan->oSnapshot.csn) + { + if (!UNDO_REC_EXISTS(GET_PAGE_LEVEL_UNDO_TYPE(scan->desc->undoType), + header->undoLocation)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("snapshot too old"))); + } + (void) get_page_from_undo(scan->desc, header->undoLocation, + (Pointer) &prevHikey.tuple, BTreeKeyNonLeafKey, + scan->histImg, NULL, NULL, NULL, + NULL, NULL); + header = (BTreePageHeader *) scan->histImg; + } + BTREE_PAGE_LOCATOR_FIRST(scan->histImg, &scan->histLoc); +} + +static Jsonb * +btree_lokey_stopevent_params(BTreeDescr *desc, OTuple lokey, + bool prevIsLeftmostOrNone) +{ + JsonbParseState *state = NULL; + Jsonb *res; + MemoryContext mctx = MemoryContextSwitchTo(stopevents_cxt); + + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + btree_desc_stopevent_params_internal(desc, &state); + jsonb_push_key(&state, "lokey"); + (void) o_btree_key_to_jsonb(desc, lokey, &state); + jsonb_push_bool_key(&state, "prevIsLeftmostOrNone", prevIsLeftmostOrNone); + res = 
JsonbValueToJsonb(pushJsonbValue(&state, WJB_END_OBJECT, NULL)); + MemoryContextSwitchTo(mctx); + + return res; +} + +/* + * Loads the next internal page. Outputs the page, start locator and offset; returns false if the page found is a leaf (level 0). + * + * In case of parallel scan the caller should hold a lock preventing the other workers from modifying + * a page in a shared state and updating prevHikey. + */ +static bool +load_next_internal_page(BTreeSeqScan *scan, OTuple prevHikey, + Page page, + BTreePageItemLocator *intLoc, + OffsetNumber *startOffset, + const bool prevIsLeftmostOrNone) +{ + bool has_next = false; + OFindPageResult findResult PG_USED_FOR_ASSERTS_ONLY; + + CHECK_FOR_INTERRUPTS(); + elog(DEBUG3, "load_next_internal_page"); + scan->context.flags |= BTREE_PAGE_FIND_DOWNLINK_LOCATION; + + if (!O_TUPLE_IS_NULL(prevHikey)) + { + STOPEVENT(STOPEVENT_SEQ_SCAN_LOAD_INTERNAL_PAGE, + btree_lokey_stopevent_params(scan->desc, prevHikey, prevIsLeftmostOrNone)); + findResult = find_page(&scan->context, &prevHikey, BTreeKeyNonLeafKey, 1); + } + else + { + findResult = find_page(&scan->context, NULL, BTreeKeyNone, 1); + } + Assert(findResult == OFindPageResultSuccess); + + /* In case of parallel scan copy page image into shared state */ + if (page) + { + Assert(scan->poscan); + memcpy(page, scan->context.img, ORIOLEDB_BLCKSZ); + } + else + { + Assert(!scan->poscan); + scan->firstPageIsLoaded = true; + page = scan->context.img; + } + + if (PAGE_GET_LEVEL(page) == 1) + { + /* + * Check if the left bound of the found keyrange corresponds to the + * previous hikey. Otherwise, use iterator to correct the situation. 
+ */ + *intLoc = scan->context.items[scan->context.index].locator; + *startOffset = BTREE_PAGE_LOCATOR_GET_OFFSET(page, intLoc); + if (!O_TUPLE_IS_NULL(prevHikey)) + { + OTuple intTup; + + if (*startOffset > 0) + BTREE_PAGE_READ_INTERNAL_TUPLE(intTup, page, intLoc); + else + intTup = scan->context.lokey.tuple; + + if (O_TUPLE_IS_NULL(intTup) || + o_btree_cmp(scan->desc, + &prevHikey, BTreeKeyNonLeafKey, + &intTup, BTreeKeyNonLeafKey) != 0) + { + get_next_key(scan, intLoc, &scan->keyRangeHigh, page); + elog(DEBUG3, "scan_make_iterator"); + + scan_make_iterator(scan, prevHikey, scan->keyRangeHigh.tuple); + } + } + has_next = true; + } + else + { + Assert(PAGE_GET_LEVEL(page) == 0); + memcpy(scan->leafImg, page, ORIOLEDB_BLCKSZ); + BTREE_PAGE_LOCATOR_FIRST(scan->leafImg, &scan->leafLoc); + scan->hint.blkno = scan->context.items[0].blkno; + scan->hint.pageChangeCount = scan->context.items[0].pageChangeCount; + BTREE_PAGE_LOCATOR_SET_INVALID(&scan->intLoc); + O_TUPLE_SET_NULL(scan->nextKey.tuple); + load_first_historical_page(scan); + has_next = false; + } + return has_next; +} + +static void +add_on_disk_downlink(BTreeSeqScan *scan, uint64 downlink, CommitSeqNo csn) +{ + ParallelOScanDesc poscan = scan->poscan; + + if (!poscan) + { + /* Non-parallel: use local array */ + if (scan->downlinksCount >= scan->allocatedDownlinks) + { + scan->allocatedDownlinks *= 2; + scan->diskDownlinks = (BTreeSeqScanDiskDownlink *) repalloc_huge(scan->diskDownlinks, + sizeof(scan->diskDownlinks[0]) * scan->allocatedDownlinks); + } + scan->diskDownlinks[scan->downlinksCount].downlink = downlink; + scan->diskDownlinks[scan->downlinksCount].csn = csn; + scan->downlinksCount++; + } + else + { + /* Parallel: write directly to shared DSM array */ + while (true) + { + uint64 index; + BTreeSeqScanDiskDownlink *shared; + + LWLockAcquire(&poscan->downlinksPublish, LW_SHARED); + + /* Re-attach to DSM if it was reallocated */ + if (scan->dsmSeg == NULL || + dsm_segment_handle(scan->dsmSeg) != 
poscan->dsmHandle) + { + if (scan->dsmSeg) + dsm_detach(scan->dsmSeg); + scan->dsmSeg = dsm_attach(poscan->dsmHandle); + } + + index = pg_atomic_fetch_add_u64(&poscan->downlinksCount, 1); + + if (index < poscan->dsmAllocated) + { + shared = (BTreeSeqScanDiskDownlink *) dsm_segment_address(scan->dsmSeg); + shared[index].downlink = downlink; + shared[index].csn = csn; + LWLockRelease(&poscan->downlinksPublish); + return; + } + + /* Over capacity: undo increment, grow under exclusive lock */ + pg_atomic_fetch_sub_u64(&poscan->downlinksCount, 1); + LWLockRelease(&poscan->downlinksPublish); + + LWLockAcquire(&poscan->downlinksPublish, LW_EXCLUSIVE); + + /* Re-check: another worker may have already grown it */ + if (poscan->dsmAllocated <= (uint64) index) + { + dsm_segment *newSeg; + uint64 newAllocated = poscan->dsmAllocated * 2; + uint64 oldCount = pg_atomic_read_u64(&poscan->downlinksCount); + + newSeg = dsm_create(MAXALIGN(newAllocated * sizeof(BTreeSeqScanDiskDownlink)), DSM_CREATE_NULL_IF_MAXSEGMENTS); + if (newSeg == NULL) + { + if (scan->dsmSeg) + dsm_detach(scan->dsmSeg); + LWLockRelease(&poscan->downlinksPublish); + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("parallel scan failed: too many dynamic shared memory segments"))); + } + + if (oldCount > 0) + memcpy(dsm_segment_address(newSeg), + dsm_segment_address(scan->dsmSeg), + oldCount * sizeof(BTreeSeqScanDiskDownlink)); + + if (scan->dsmSeg) + dsm_detach(scan->dsmSeg); + scan->dsmSeg = newSeg; + poscan->dsmHandle = dsm_segment_handle(newSeg); + poscan->dsmAllocated = newAllocated; + } + + LWLockRelease(&poscan->downlinksPublish); + /* Retry the insert */ + } + } +} + +static int +cmp_downlinks(const void *p1, const void *p2) +{ + uint64 d1 = ((BTreeSeqScanDiskDownlink *) p1)->downlink; + uint64 d2 = ((BTreeSeqScanDiskDownlink *) p2)->downlink; + + if (d1 < d2) + return -1; + else if (d1 == d2) + return 0; + else + return 1; +} + +static void +switch_to_disk_scan(BTreeSeqScan *scan) +{ 
+ ParallelOScanDesc poscan = scan->poscan; + + scan->status = BTreeSeqScanDisk; + BTREE_PAGE_LOCATOR_SET_INVALID(&scan->leafLoc); + if (!poscan) + { + /* Serial scan */ + qsort(scan->diskDownlinks, + scan->downlinksCount, + sizeof(scan->diskDownlinks[0]), + cmp_downlinks); + } + else + { + /* Parallel scan */ + + /* + * Wait for any in-flight add_on_disk_downlink() calls to complete. A + * worker that already got a disk downlink from get_next_downlink() + * but hasn't finished writing it yet will have incremented + * downlinksWritersInProgress. + */ + while (pg_atomic_read_u32(&poscan->downlinksWritersInProgress) > 0) + { + pg_usleep(10L); + CHECK_FOR_INTERRUPTS(); + } + + /* + * First worker to grab the exclusive lock sorts the shared downlinks + * array. Other workers wait on the lock and then see the sorted + * flag. + */ + LWLockAcquire(&poscan->downlinksPublish, LW_EXCLUSIVE); + + if (!(poscan->flags & O_PARALLEL_DOWNLINKS_SORTED)) + { + uint64 count = pg_atomic_read_u64(&poscan->downlinksCount); + + /* Re-attach to DSM if it was reallocated */ + if (scan->dsmSeg == NULL || + dsm_segment_handle(scan->dsmSeg) != poscan->dsmHandle) + { + if (scan->dsmSeg) + dsm_detach(scan->dsmSeg); + scan->dsmSeg = dsm_attach(poscan->dsmHandle); + } + + if (count > 0) + { + qsort(dsm_segment_address(scan->dsmSeg), count, + sizeof(BTreeSeqScanDiskDownlink), cmp_downlinks); + } + + pg_atomic_write_u64(&poscan->downlinkIndex, 0); + pg_write_barrier(); + poscan->flags |= O_PARALLEL_DOWNLINKS_SORTED; + } + + LWLockRelease(&poscan->downlinksPublish); + + /* Ensure attached to current DSM for disk scan phase */ + if (poscan->dsmHandle && + (scan->dsmSeg == NULL || + dsm_segment_handle(scan->dsmSeg) != poscan->dsmHandle)) + { + if (scan->dsmSeg) + dsm_detach(scan->dsmSeg); + scan->dsmSeg = dsm_attach(poscan->dsmHandle); + } + } +} + +/* + * Make an iterator to read the key range from `keyRangeLow` to the next + * downlink or the internal page hikey if we're considering the last + * 
downlink. + */ +static void +scan_make_iterator(BTreeSeqScan *scan, OTuple keyRangeLow, OTuple keyRangeHigh) +{ + MemoryContext mctx; + + mctx = MemoryContextSwitchTo(scan->mctx); + if (!O_TUPLE_IS_NULL(keyRangeLow)) + scan->iter = o_btree_iterator_create(scan->desc, &keyRangeLow, BTreeKeyNonLeafKey, + &scan->oSnapshot, ForwardScanDirection); + else + scan->iter = o_btree_iterator_create(scan->desc, NULL, BTreeKeyNone, + &scan->oSnapshot, ForwardScanDirection); + MemoryContextSwitchTo(mctx); + + BTREE_PAGE_LOCATOR_SET_INVALID(&scan->leafLoc); + scan->haveHistImg = false; + scan->iterEnd = keyRangeHigh; +} + +/* Output item downlink and key using provided page and current locator */ +static void +get_current_downlink_key(BTreeSeqScan *scan, + BTreePageItemLocator *loc, + OffsetNumber startOffset, + OTuple prevHiKey, + OFixedKey *curKey, + uint64 *downlink, + Page page) +{ + BTreeNonLeafTuphdr *tuphdr; + OTuple tuple; + + STOPEVENT(STOPEVENT_STEP_DOWN, btree_downlink_stopevent_params(scan->desc, + page, loc)); + + BTREE_PAGE_READ_INTERNAL_ITEM(tuphdr, tuple, page, loc); + *downlink = tuphdr->downlink; + + if (BTREE_PAGE_LOCATOR_GET_OFFSET(page, loc) != startOffset) + { + copy_fixed_key(scan->desc, curKey, tuple); + } + else if (!O_PAGE_IS(page, LEFTMOST)) + { + Assert(!O_TUPLE_IS_NULL(prevHiKey)); + copy_fixed_key(scan->desc, curKey, prevHiKey); + } + else + { + /* + * It might happen that due to concurrent page merge, we're visiting + * the leftmost page the second time. In this case, prevHiKey is not + * NULL, so there is no assertion here. 
+ */ + clear_fixed_key(curKey); + } +} + +/* Output next key and locator on a provided internal page */ +static void +get_next_key(BTreeSeqScan *scan, BTreePageItemLocator *intLoc, OFixedKey *nextKey, Page page) +{ + BTREE_PAGE_LOCATOR_NEXT(page, intLoc); + if (BTREE_PAGE_LOCATOR_IS_VALID(page, intLoc)) + copy_fixed_page_key(scan->desc, nextKey, page, intLoc); + else if (!O_PAGE_IS(page, RIGHTMOST)) + copy_fixed_hikey(scan->desc, nextKey, page); + else + clear_fixed_key(nextKey); +} + +/* + * Gets the next downlink with its keyrange (low and high keys of the + * keyrange). + * + * Returns true on success. False result can be caused by one of three reasons: + * 1) The rightmost internal page is processed; + * 2) There is just a single leaf page in the tree (and it's loaded into + * scan->context.img); + * 3) There is scan->iter to be processed before we can get downlinks from the + * current internal page. + */ +static bool +get_next_downlink(BTreeSeqScan *scan, uint64 *downlink, + OFixedKey *keyRangeLow, OFixedKey *keyRangeHigh) +{ + ParallelOScanDesc poscan = scan->poscan; + + if (!poscan) + { + /* Non-parallel case */ + bool pageIsLoaded = scan->firstPageIsLoaded; + bool prevIsLeftmostOrNone = true; + + while (true) + { + + /* Try to load next internal page if needed */ + if (!pageIsLoaded) + { + if (scan->firstPageIsLoaded) + { + Assert(!O_PAGE_IS(scan->context.img, RIGHTMOST)); + if (scan->context.img) + prevIsLeftmostOrNone = O_PAGE_IS(scan->context.img, LEFTMOST); + copy_fixed_hikey(scan->desc, &scan->prevHikey, scan->context.img); + } + + if (!load_next_internal_page(scan, scan->prevHikey.tuple, + NULL, + &scan->intLoc, + &scan->intStartOffset, + prevIsLeftmostOrNone)) + { + /* first page only */ + Assert(O_PAGE_IS(scan->context.img, LEFTMOST)); + scan->isSingleLeafPage = true; + clear_fixed_key(keyRangeLow); + clear_fixed_key(keyRangeHigh); + return false; + } + + if (scan->iter) + return false; + } + + if (BTREE_PAGE_LOCATOR_IS_VALID(scan->context.img, 
&scan->intLoc)) + { + get_current_downlink_key(scan, &scan->intLoc, scan->intStartOffset, + scan->prevHikey.tuple, keyRangeLow, + downlink, scan->context.img); + + /* + * construct fixed hikey of internal item and get next + * internal locator + */ + get_next_key(scan, &scan->intLoc, keyRangeHigh, scan->context.img); + return true; + } + + if (O_PAGE_IS(scan->context.img, RIGHTMOST)) + return false; + + pageIsLoaded = false; + } + } + else + { + /* Parallel case */ + while (true) + { + BTreeIntPageParallelData *curPage; + BTreeIntPageParallelData *nextPage; + BTreePageItemLocator loc; + + SpinLockAcquire(&poscan->intpageAccess); + curPage = CUR_PAGE(poscan); + nextPage = NEXT_PAGE(poscan); + + if (poscan->flags & O_PARALLEL_IS_SINGLE_LEAF_PAGE) + { + SpinLockRelease(&poscan->intpageAccess); + scan->haveHistImg = false; + BTREE_PAGE_LOCATOR_SET_INVALID(&scan->leafLoc); + return false; + } + + if (curPage->status == OParallelScanPageInvalid) + { + bool next_loaded; + + Assert(nextPage->status == OParallelScanPageInvalid); + + if (!(poscan->flags & O_PARALLEL_FIRST_PAGE_LOADED)) + { + clear_fixed_shmem_key(&curPage->prevHikey); + } + else + { + Assert(O_PAGE_IS(nextPage->img, RIGHTMOST)); + SpinLockRelease(&poscan->intpageAccess); + return false; + } + curPage->status = OParallelScanPageInProgress; + LWLockAcquire(&poscan->intpageLoad, LW_EXCLUSIVE); + SpinLockRelease(&poscan->intpageAccess); + + next_loaded = load_next_internal_page(scan, + fixed_shmem_key_get_tuple(&curPage->prevHikey), + curPage->img, + &loc, + &curPage->startOffset, + false); + if (!next_loaded) + { + SpinLockAcquire(&poscan->intpageAccess); + poscan->flags |= O_PARALLEL_IS_SINGLE_LEAF_PAGE; + clear_fixed_key(keyRangeLow); + clear_fixed_key(keyRangeHigh); + SpinLockRelease(&poscan->intpageAccess); + LWLockRelease(&poscan->intpageLoad); + return false; + } + + SpinLockAcquire(&poscan->intpageAccess); + curPage->imgReadCsn = scan->context.imgReadCsn; + curPage->offset = 
BTREE_PAGE_LOCATOR_GET_OFFSET(curPage->img, &loc); + curPage->status = OParallelScanPageValid; + poscan->flags |= O_PARALLEL_FIRST_PAGE_LOADED; + SpinLockRelease(&poscan->intpageAccess); + LWLockRelease(&poscan->intpageLoad); + + if (scan->iter) + return false; + continue; + } + else if (curPage->status == OParallelScanPageInProgress) + { + SpinLockRelease(&poscan->intpageAccess); + if (LWLockAcquireOrWait(&poscan->intpageLoad, LW_EXCLUSIVE)) + LWLockRelease(&poscan->intpageLoad); + continue; + } + + if (nextPage->status == OParallelScanPageInvalid && + !O_PAGE_IS(curPage->img, RIGHTMOST)) + { + bool next_loaded PG_USED_FOR_ASSERTS_ONLY; + + copy_fixed_shmem_hikey(scan->desc, &nextPage->prevHikey, curPage->img); + nextPage->status = OParallelScanPageInProgress; + LWLockAcquire(&poscan->intpageLoad, LW_EXCLUSIVE); + SpinLockRelease(&poscan->intpageAccess); + + next_loaded = load_next_internal_page(scan, + fixed_shmem_key_get_tuple(&nextPage->prevHikey), + nextPage->img, + &loc, + &nextPage->startOffset, + false); + Assert(next_loaded); + + SpinLockAcquire(&poscan->intpageAccess); + nextPage->imgReadCsn = scan->context.imgReadCsn; + nextPage->offset = BTREE_PAGE_LOCATOR_GET_OFFSET(nextPage->img, &loc); + nextPage->status = OParallelScanPageValid; + SpinLockRelease(&poscan->intpageAccess); + LWLockRelease(&poscan->intpageLoad); + + if (scan->iter) + return false; + continue; + } + + BTREE_PAGE_OFFSET_GET_LOCATOR(curPage->img, curPage->offset, &loc); + + if (BTREE_PAGE_LOCATOR_IS_VALID(curPage->img, &loc)) /* inside int page */ + { + get_current_downlink_key(scan, &loc, curPage->startOffset, + fixed_shmem_key_get_tuple(&curPage->prevHikey), + keyRangeLow, downlink, curPage->img); + /* Get next internal page locator and next internal item hikey */ + get_next_key(scan, &loc, keyRangeHigh, curPage->img); + + /* Push next internal item page offset into shared state */ + curPage->offset = BTREE_PAGE_LOCATOR_GET_OFFSET(curPage->img, &loc); + scan->context.imgReadCsn = 
curPage->imgReadCsn; + + /* + * Become the shared downlink writer. This is to be cleared by + * the caller: immediately for in-memory and in IO downlinks, + * after downlink is written to shared DSM array for disk + * downlinks. + */ + pg_atomic_fetch_add_u32(&poscan->downlinksWritersInProgress, 1); + + SpinLockRelease(&poscan->intpageAccess); + return true; + } + else + { + curPage->status = OParallelScanPageInvalid; + poscan->flags ^= O_PARALLEL_CURRENT_PAGE; + SpinLockRelease(&poscan->intpageAccess); + } + } + } +} + +/* + * Checks whether the loaded leaf page matches the downlink of the internal + * page. Makes an iterator to read the considered key range if the check + * fails. + * + * The hikey of the leaf page should match the next downlink or the internal + * page hikey if we're considering the last downlink. The two keys are + * treated as matching when both are NULL; exactly one being NULL, or a + * non-zero o_btree_cmp() result, counts as a mismatch. + */ +static void +check_in_memory_leaf_page(BTreeSeqScan *scan, OTuple keyRangeLow, OTuple keyRangeHigh) +{ + OTuple leafHikey; + bool result = false; + + if (!O_PAGE_IS(scan->leafImg, RIGHTMOST)) + BTREE_PAGE_GET_HIKEY(leafHikey, scan->leafImg); + else + O_TUPLE_SET_NULL(leafHikey); + + if (O_TUPLE_IS_NULL(keyRangeHigh) && O_TUPLE_IS_NULL(leafHikey)) + return; + + if (O_TUPLE_IS_NULL(keyRangeHigh) || O_TUPLE_IS_NULL(leafHikey)) + { + result = true; + } + else + { + if (o_btree_cmp(scan->desc, + &keyRangeHigh, BTreeKeyNonLeafKey, + &leafHikey, BTreeKeyNonLeafKey) != 0) + result = true; + } + + if (result) + { + elog(DEBUG3, "scan_make_iterator 2"); + scan_make_iterator(scan, keyRangeLow, keyRangeHigh); + } +} + + +/* + * Iterates the internal page till we either: + * - Successfully read the next in-memory leaf page; + * - Made an iterator to read key range, which belongs to current downlink; + * - Reached the end of internal page. 
+ */ +static bool +iterate_internal_page(BTreeSeqScan *scan) +{ + uint64 downlink = 0; + + while (get_next_downlink(scan, &downlink, &scan->keyRangeLow, &scan->keyRangeHigh)) + { + bool valid_downlink = true; + + if (scan->cb && scan->cb->isRangeValid) + valid_downlink = scan->cb->isRangeValid(scan->keyRangeLow.tuple, scan->keyRangeHigh.tuple, + scan->arg); + else if (scan->needSampling) + { + if (scan->samplingNumber < scan->samplingNext) + { + valid_downlink = false; + } + else + { + if (BlockSampler_HasMore(scan->sampler)) + scan->samplingNext = BlockSampler_Next(scan->sampler); + else + scan->samplingNext = InvalidBlockNumber; + } + scan->samplingNumber++; + } + + if (valid_downlink) + { + if (DOWNLINK_IS_ON_DISK(downlink)) + { + add_on_disk_downlink(scan, downlink, scan->context.imgReadCsn); + if (scan->poscan) + pg_atomic_fetch_sub_u32(&scan->poscan->downlinksWritersInProgress, 1); + } + else if (DOWNLINK_IS_IN_MEMORY(downlink)) + { + ReadPageResult result; + + if (scan->poscan) + pg_atomic_fetch_sub_u32(&scan->poscan->downlinksWritersInProgress, 1); + + result = o_btree_try_read_page(scan->desc, + DOWNLINK_GET_IN_MEMORY_BLKNO(downlink), + DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(downlink), + scan->leafImg, + scan->context.imgReadCsn, + NULL, + BTreeKeyNone, + NULL, + true, + NULL); + + if (result == ReadPageResultOK) + { + check_in_memory_leaf_page(scan, scan->keyRangeLow.tuple, scan->keyRangeHigh.tuple); + if (scan->iter) + return true; + + scan->hint.blkno = DOWNLINK_GET_IN_MEMORY_BLKNO(downlink); + scan->hint.pageChangeCount = DOWNLINK_GET_IN_MEMORY_CHANGECOUNT(downlink); + BTREE_PAGE_LOCATOR_FIRST(scan->leafImg, &scan->leafLoc); + O_TUPLE_SET_NULL(scan->nextKey.tuple); + load_first_historical_page(scan); + return true; + } + else + { + scan_make_iterator(scan, scan->keyRangeLow.tuple, scan->keyRangeHigh.tuple); + Assert(scan->iter); + return true; + } + } + else if (DOWNLINK_IS_IN_IO(downlink)) + { + /* + * Downlink has currently IO in-progress. 
Wait for IO + * completion and refind this downlink. + */ + int ionum = DOWNLINK_GET_IO_LOCKNUM(downlink); + + if (scan->poscan) + pg_atomic_fetch_sub_u32(&scan->poscan->downlinksWritersInProgress, 1); + + wait_for_io_completion(ionum); + + elog(DEBUG3, "DOWNLINK_IS_IN_IO"); + scan_make_iterator(scan, scan->keyRangeLow.tuple, scan->keyRangeHigh.tuple); + Assert(scan->iter); + return true; + } + } + else if (scan->poscan) + { + pg_atomic_fetch_sub_u32(&scan->poscan->downlinksWritersInProgress, 1); + } + } + + if (scan->iter) + return true; + + elog(DEBUG3, "Worker %d iterate_internal_page complete", scan->workerNumber); + return false; +} + +static bool +load_next_disk_leaf_page(BTreeSeqScan *scan) +{ + FileExtent extent; + bool success; + BTreePageHeader *header; + BTreeSeqScanDiskDownlink downlink; + ParallelOScanDesc poscan = scan->poscan; + + if (!poscan) + { + if (scan->downlinkIndex >= scan->downlinksCount) + return false; + + downlink = scan->diskDownlinks[scan->downlinkIndex]; + } + else + { + uint64 index = pg_atomic_fetch_add_u64(&poscan->downlinkIndex, 1); + + if (index >= pg_atomic_read_u64(&poscan->downlinksCount)) + { + if (scan->dsmSeg) + { + dsm_detach(scan->dsmSeg); + scan->dsmSeg = NULL; + } + return false; + } + downlink = ((BTreeSeqScanDiskDownlink *) dsm_segment_address(scan->dsmSeg))[index]; + } + + success = read_page_from_disk(scan->desc, + scan->leafImg, + downlink.downlink, + &extent); + header = (BTreePageHeader *) scan->leafImg; + if (header->csn >= downlink.csn) + read_page_from_undo(scan->desc, scan->leafImg, header->undoLocation, + downlink.csn, NULL, BTreeKeyNone, NULL); + + STOPEVENT(STOPEVENT_SCAN_DISK_PAGE, + btree_page_stopevent_params(scan->desc, + scan->leafImg)); + + if (!success) + elog(ERROR, "can not read leaf page from disk"); + + BTREE_PAGE_LOCATOR_FIRST(scan->leafImg, &scan->leafLoc); + scan->downlinkIndex++; + scan->hint.blkno = OInvalidInMemoryBlkno; + scan->hint.pageChangeCount = InvalidOPageChangeCount; + 
O_TUPLE_SET_NULL(scan->nextKey.tuple); + load_first_historical_page(scan); + return true; +} + +static inline bool +single_leaf_page_rel(BTreeSeqScan *scan) +{ + if (scan->poscan) + return (scan->poscan->flags & O_PARALLEL_IS_SINGLE_LEAF_PAGE) != 0; + else + return scan->isSingleLeafPage; +} + +static void +init_checkpoit_number(BTreeSeqScan *scan) +{ + uint32 checkpointNumberBefore, + checkpointNumberAfter; + bool checkpointConcurrent; + BTreeMetaPage *metaPage; + BTreeDescr *desc = scan->desc; + + o_btree_load_shmem(scan->desc); + metaPage = BTREE_GET_META(scan->desc); + + START_CRIT_SECTION(); + + /* + * Get the checkpoint number for the scan. There is a race condition with + * the tree being concurrently switched to the next checkpoint, so we + * work around it with a recheck-retry loop: bump numSeqScans for the + * number we read, re-read the number, and undo the bump and retry if it + * has changed. + */ + checkpointNumberBefore = get_cur_checkpoint_number(&desc->oids, + desc->type, + &checkpointConcurrent); + while (true) + { + (void) pg_atomic_fetch_add_u32(&metaPage->numSeqScans[checkpointNumberBefore % NUM_SEQ_SCANS_ARRAY_SIZE], 1); + checkpointNumberAfter = get_cur_checkpoint_number(&desc->oids, + desc->type, + &checkpointConcurrent); + if (checkpointNumberAfter == checkpointNumberBefore) + { + scan->checkpointNumber = checkpointNumberBefore; + scan->checkpointNumberSet = true; + break; + } + (void) pg_atomic_fetch_sub_u32(&metaPage->numSeqScans[checkpointNumberBefore % NUM_SEQ_SCANS_ARRAY_SIZE], 1); + checkpointNumberBefore = checkpointNumberAfter; + } + END_CRIT_SECTION(); +} + +static void +init_btree_seq_scan(BTreeSeqScan *scan) +{ + ParallelOScanDesc poscan = scan->poscan; + BlockSampler sampler = scan->sampler; + BTreeDescr *desc = scan->desc; + + if (poscan) + { + /* + * Scan worker numbers are assigned by the order of workers init of + * local seqscan. 
In case of call seqscan in an index build worker, + * the numbers of scan workers, and who is a scan leader is not + * related to index build leader (who merges workers sort results + * after all workers completed their scans). + */ + SpinLockAcquire(&poscan->workerStart); +#ifdef USE_ASSERT_CHECKING + for (scan->workerNumber = 0; poscan->worker_active[scan->workerNumber] == true; scan->workerNumber++) + { + } + + poscan->worker_active[scan->workerNumber] = true; + poscan->nworkers = scan->workerNumber + 1; +#else + scan->workerNumber = poscan->nworkers; + poscan->nworkers++; +#endif + /* Scan leader */ + if (scan->workerNumber == 0) + { + uint32 numLeafPages; + uint64 allocSize; + + Assert(!(poscan->flags & O_PARALLEL_LEADER_STARTED)); + poscan->flags |= O_PARALLEL_LEADER_STARTED; + init_checkpoit_number(scan); + + /* + * Create a shared DSM segment for on-disk downlinks upfront, + * sized to hold as many downlinks as there are leaf pages in the + * tree. + */ + numLeafPages = TREE_NUM_LEAF_PAGES(desc); + if (numLeafPages < 16) + numLeafPages = 16; + allocSize = MAXALIGN((uint64) numLeafPages * sizeof(BTreeSeqScanDiskDownlink)); + scan->dsmSeg = dsm_create(allocSize, DSM_CREATE_NULL_IF_MAXSEGMENTS); + if (scan->dsmSeg == NULL) + { + SpinLockRelease(&poscan->workerStart); + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("parallel scan failed: too many dynamic shared memory segments"))); + } + poscan->dsmHandle = dsm_segment_handle(scan->dsmSeg); + poscan->dsmAllocated = numLeafPages; + pg_write_barrier(); + poscan->flags |= O_PARALLEL_DSM_CREATED; + } + SpinLockRelease(&poscan->workerStart); + + /* Non-leader workers: wait for DSM creation and attach */ + Assert(scan->workerNumber >= 0); + if (scan->workerNumber > 0) + { + while (!(poscan->flags & O_PARALLEL_DSM_CREATED)) + { + pg_usleep(100L); + CHECK_FOR_INTERRUPTS(); + } + pg_read_barrier(); + if (poscan->dsmHandle) + scan->dsmSeg = dsm_attach(poscan->dsmHandle); + } + + elog(DEBUG3, 
"init_btree_seq_scan. %s %d started", poscan ? "Parallel worker" : "Worker", scan->workerNumber); + } + else + { + scan->workerNumber = -1; + init_checkpoit_number(scan); + } + + if (sampler) + { + scan->needSampling = true; + if (BlockSampler_HasMore(scan->sampler)) + scan->samplingNext = BlockSampler_Next(scan->sampler); + else + scan->samplingNext = InvalidBlockNumber; + } + else + { + scan->needSampling = false; + scan->samplingNext = InvalidBlockNumber; + } + + O_TUPLE_SET_NULL(scan->nextKey.tuple); + + init_page_find_context(&scan->context, desc, scan->oSnapshot.csn, + BTREE_PAGE_FIND_IMAGE | + BTREE_PAGE_FIND_KEEP_LOKEY | + BTREE_PAGE_FIND_READ_CSN); + clear_fixed_key(&scan->prevHikey); + clear_fixed_key(&scan->keyRangeHigh); + clear_fixed_key(&scan->keyRangeLow); + scan->isSingleLeafPage = false; + o_btree_load_shmem(desc); + if (!iterate_internal_page(scan) && !single_leaf_page_rel(scan)) + { + switch_to_disk_scan(scan); + if (!load_next_disk_leaf_page(scan)) + scan->status = BTreeSeqScanFinished; + } + + scan->initialized = true; +} + +static BTreeSeqScan * +make_btree_seq_scan_internal(BTreeDescr *desc, OSnapshot *oSnapshot, + BTreeSeqScanCallbacks *cb, void *arg, + BlockSampler sampler, ParallelOScanDesc poscan) +{ + BTreeSeqScan *scan = (BTreeSeqScan *) MemoryContextAlloc(btree_seqscan_context, + sizeof(BTreeSeqScan)); + + scan->poscan = poscan; + scan->desc = desc; + if (!IS_SYS_TREE_OIDS(desc->oids)) + ((OIndexDescr *) desc->arg)->refcnt++; + scan->oSnapshot = *oSnapshot; + scan->status = BTreeSeqScanInMemory; + scan->allocatedDownlinks = 16; + scan->downlinksCount = 0; + scan->downlinkIndex = 0; + scan->diskDownlinks = (BTreeSeqScanDiskDownlink *) MemoryContextAlloc(btree_seqscan_context, + sizeof(scan->diskDownlinks[0]) * scan->allocatedDownlinks); + scan->mctx = CurrentMemoryContext; + scan->iter = NULL; + scan->cb = cb; + scan->arg = arg; + scan->firstPageIsLoaded = false; + scan->intStartOffset = 0; + scan->samplingNumber = 0; + scan->sampler = 
sampler; + scan->dsmSeg = NULL; + scan->initialized = false; + scan->checkpointNumberSet = false; + scan->haveHistImg = false; + BTREE_PAGE_LOCATOR_SET_INVALID(&scan->leafLoc); + + dlist_push_tail(&listOfScans, &scan->listNode); + scan->resowner = NULL; +#if PG_VERSION_NUM >= 170000 + ResourceOwnerEnlarge(CurrentResourceOwner); +#endif + ResourceOwnerRememberBTreeSeqScan(CurrentResourceOwner, scan); + scan->resowner = CurrentResourceOwner; + + return scan; +} + +BTreeSeqScan * +make_btree_seq_scan(BTreeDescr *desc, OSnapshot *oSnapshot, void *poscan) +{ + return make_btree_seq_scan_internal(desc, oSnapshot, NULL, NULL, NULL, poscan); +} + +BTreeSeqScan * +make_btree_seq_scan_cb(BTreeDescr *desc, OSnapshot *oSnapshot, + BTreeSeqScanCallbacks *cb, void *arg) +{ + return make_btree_seq_scan_internal(desc, oSnapshot, cb, arg, NULL, NULL); +} + +BTreeSeqScan * +make_btree_sampling_scan(BTreeDescr *desc, BlockSampler sampler) +{ + return make_btree_seq_scan_internal(desc, &o_in_progress_snapshot, + NULL, NULL, sampler, NULL); +} + +static OTuple +btree_seq_scan_get_tuple_from_iterator(BTreeSeqScan *scan, + CommitSeqNo *tupleCsn, + BTreeLocationHint *hint) +{ + OTuple result; + + if (!O_TUPLE_IS_NULL(scan->iterEnd)) + result = o_btree_iterator_fetch(scan->iter, tupleCsn, + &scan->iterEnd, BTreeKeyNonLeafKey, + false, hint); + else + result = o_btree_iterator_fetch(scan->iter, tupleCsn, + NULL, BTreeKeyNone, + false, hint); + + if (O_TUPLE_IS_NULL(result)) + { + btree_iterator_free(scan->iter); + scan->iter = NULL; + scan->haveHistImg = false; + } + return result; +} + +static bool +adjust_location_with_next_key(BTreeSeqScan *scan, + Page p, BTreePageItemLocator *loc) +{ + BTreeDescr *desc = scan->desc; + BTreePageHeader *header = (BTreePageHeader *) p; + int cmp; + OTuple key; + + if (!BTREE_PAGE_LOCATOR_IS_VALID(p, loc)) + return false; + + BTREE_PAGE_READ_LEAF_TUPLE(key, p, loc); + + cmp = o_btree_cmp(desc, &key, BTreeKeyLeafTuple, + &scan->nextKey.tuple, 
BTreeKeyNonLeafKey); + if (cmp == 0) + return true; + if (cmp > 0) + return false; + + while (true) + { + if (loc->chunkOffset == (header->chunksCount - 1)) + break; + + key.formatFlags = header->chunkDesc[loc->chunkOffset].hikeyFlags; + key.data = (Pointer) p + SHORT_GET_LOCATION(header->chunkDesc[loc->chunkOffset].hikeyShortLocation); + cmp = o_btree_cmp(desc, &key, BTreeKeyNonLeafKey, + &scan->nextKey.tuple, BTreeKeyNonLeafKey); + if (cmp > 0) + break; + loc->itemOffset = loc->chunkItemsCount; + if (!page_locator_next_chunk(p, loc)) + { + BTREE_PAGE_LOCATOR_SET_INVALID(loc); + return false; + } + } + + while (BTREE_PAGE_LOCATOR_IS_VALID(p, loc)) + { + BTREE_PAGE_READ_LEAF_TUPLE(key, p, loc); + cmp = o_btree_cmp(desc, + &key, BTreeKeyLeafTuple, + &scan->nextKey.tuple, BTreeKeyNonLeafKey); + if (cmp == 0) + return true; + if (cmp > 0) + break; + BTREE_PAGE_LOCATOR_NEXT(p, loc); + } + + return false; +} + +static void +apply_next_key(BTreeSeqScan *scan) +{ + BTreeDescr *desc = scan->desc; + + Assert(BTREE_PAGE_LOCATOR_IS_VALID(scan->leafImg, &scan->leafLoc) || + (scan->haveHistImg && BTREE_PAGE_LOCATOR_IS_VALID(scan->histImg, &scan->histLoc))); + + while (true) + { + OTuple key; + bool leafResult, + histResult; + + if (BTREE_PAGE_LOCATOR_IS_VALID(scan->leafImg, &scan->leafLoc)) + BTREE_PAGE_READ_LEAF_TUPLE(key, scan->leafImg, &scan->leafLoc); + else + O_TUPLE_SET_NULL(key); + + if (scan->haveHistImg && + BTREE_PAGE_LOCATOR_IS_VALID(scan->histImg, &scan->histLoc)) + { + if (O_TUPLE_IS_NULL(key)) + { + BTREE_PAGE_READ_LEAF_TUPLE(key, scan->histImg, &scan->histLoc); + } + else + { + OTuple histKey; + + BTREE_PAGE_READ_LEAF_TUPLE(histKey, scan->histImg, &scan->histLoc); + if (o_btree_cmp(desc, + &key, BTreeKeyLeafTuple, + &histKey, BTreeKeyNonLeafKey) > 0) + key = histKey; + } + } + + scan->nextKey.tuple = key; + if (O_TUPLE_IS_NULL(key) || + !scan->cb->getNextKey(&scan->nextKey, true, scan->arg)) + { + BTREE_PAGE_LOCATOR_SET_INVALID(&scan->leafLoc); + return; + } + + 
leafResult = adjust_location_with_next_key(scan, + scan->leafImg, + &scan->leafLoc); + if (scan->haveHistImg) + { + histResult = adjust_location_with_next_key(scan, + scan->histImg, + &scan->histLoc); + if (leafResult || histResult) + return; + } + else if (leafResult) + return; + + if (!BTREE_PAGE_LOCATOR_IS_VALID(scan->leafImg, &scan->leafLoc) && + !(scan->haveHistImg && + BTREE_PAGE_LOCATOR_IS_VALID(scan->histImg, &scan->histLoc))) + return; + } +} + +static OTuple +btree_seq_scan_getnext_internal(BTreeSeqScan *scan, MemoryContext mctx, + CommitSeqNo *tupleCsn, BTreeLocationHint *hint) +{ + OTuple tuple; + + if (scan->iter) + { + tuple = btree_seq_scan_get_tuple_from_iterator(scan, tupleCsn, hint); + if (!O_TUPLE_IS_NULL(tuple)) + return tuple; + } + + while (true) + { + while (scan->haveHistImg) + { + OTuple histTuple; + + while (!BTREE_PAGE_LOCATOR_IS_VALID(scan->histImg, &scan->histLoc)) + { + if (O_PAGE_IS(scan->histImg, RIGHTMOST)) + { + scan->haveHistImg = false; + break; + } + if (!O_PAGE_IS(scan->leafImg, RIGHTMOST)) + { + OTuple leafHikey, + histHikey; + + BTREE_PAGE_GET_HIKEY(leafHikey, scan->leafImg); + BTREE_PAGE_GET_HIKEY(histHikey, scan->histImg); + if (o_btree_cmp(scan->desc, + &histHikey, BTreeKeyNonLeafKey, + &leafHikey, BTreeKeyNonLeafKey) >= 0) + { + scan->haveHistImg = false; + break; + } + } + load_next_historical_page(scan); + } + + if (!scan->haveHistImg) + break; + + if (scan->cb && scan->cb->getNextKey) + apply_next_key(scan); + + if (!BTREE_PAGE_LOCATOR_IS_VALID(scan->histImg, &scan->histLoc)) + continue; + + BTREE_PAGE_READ_LEAF_TUPLE(histTuple, scan->histImg, + &scan->histLoc); + if (!BTREE_PAGE_LOCATOR_IS_VALID(scan->leafImg, &scan->leafLoc)) + { + OTuple leafHikey; + + if (!O_PAGE_IS(scan->leafImg, RIGHTMOST)) + { + BTREE_PAGE_GET_HIKEY(leafHikey, scan->leafImg); + if (o_btree_cmp(scan->desc, + &histTuple, BTreeKeyLeafTuple, + &leafHikey, BTreeKeyNonLeafKey) >= 0) + { + scan->haveHistImg = false; + break; + } + } + } + else + { + 
BTreeLeafTuphdr *tuphdr; + OTuple leafTuple; + int cmp; + + BTREE_PAGE_READ_LEAF_ITEM(tuphdr, leafTuple, + scan->leafImg, &scan->leafLoc); + + cmp = o_btree_cmp(scan->desc, + &histTuple, BTreeKeyLeafTuple, + &leafTuple, BTreeKeyLeafTuple); + if (cmp > 0) + break; + + if (cmp == 0) + { + if (XACT_INFO_OXID_IS_CURRENT(tuphdr->xactInfo)) + { + BTREE_PAGE_LOCATOR_NEXT(scan->histImg, &scan->histLoc); + break; + } + else + { + BTREE_PAGE_LOCATOR_NEXT(scan->leafImg, &scan->leafLoc); + } + } + } + + tuple = o_find_tuple_version(scan->desc, + scan->histImg, + &scan->histLoc, + &scan->oSnapshot, + tupleCsn, + mctx, + NULL, + NULL); + BTREE_PAGE_LOCATOR_NEXT(scan->histImg, &scan->histLoc); + if (!O_TUPLE_IS_NULL(tuple)) + { + if (hint) + *hint = scan->hint; + return tuple; + } + } + + if (scan->cb && scan->cb->getNextKey && + BTREE_PAGE_LOCATOR_IS_VALID(scan->leafImg, &scan->leafLoc)) + apply_next_key(scan); + + if (!BTREE_PAGE_LOCATOR_IS_VALID(scan->leafImg, &scan->leafLoc)) + { + if (scan->status == BTreeSeqScanInMemory) + { + if (iterate_internal_page(scan)) + { + if (scan->iter) + { + tuple = btree_seq_scan_get_tuple_from_iterator(scan, + tupleCsn, + hint); + if (!O_TUPLE_IS_NULL(tuple)) + return tuple; + } + } + else + { + switch_to_disk_scan(scan); + } + } + if (scan->status == BTreeSeqScanDisk) + { + if (!load_next_disk_leaf_page(scan)) + { + scan->status = BTreeSeqScanFinished; + O_TUPLE_SET_NULL(tuple); + return tuple; + } + } + continue; + } + + tuple = o_find_tuple_version(scan->desc, + scan->leafImg, + &scan->leafLoc, + &scan->oSnapshot, + tupleCsn, + mctx, + NULL, + NULL); + BTREE_PAGE_LOCATOR_NEXT(scan->leafImg, &scan->leafLoc); + if (!O_TUPLE_IS_NULL(tuple)) + { + if (hint) + *hint = scan->hint; + return tuple; + } + } + + /* keep compiler quiet */ + O_TUPLE_SET_NULL(tuple); + return tuple; +} + +OTuple +btree_seq_scan_getnext(BTreeSeqScan *scan, MemoryContext mctx, + CommitSeqNo *tupleCsn, BTreeLocationHint *hint) +{ + OTuple tuple; + + Assert(scan); + if 
(!scan->initialized) + init_btree_seq_scan(scan); + + if (scan->status == BTreeSeqScanInMemory || + scan->status == BTreeSeqScanDisk) + { + tuple = btree_seq_scan_getnext_internal(scan, mctx, tupleCsn, hint); + + if (!O_TUPLE_IS_NULL(tuple)) + return tuple; + } + Assert(scan->status == BTreeSeqScanFinished); + + O_TUPLE_SET_NULL(tuple); + return tuple; +} + +static OTuple +btree_seq_scan_get_tuple_from_iterator_raw(BTreeSeqScan *scan, + bool *end, + BTreeLocationHint *hint) +{ + OTuple result; + + if (!O_TUPLE_IS_NULL(scan->iterEnd)) + result = btree_iterate_raw(scan->iter, &scan->iterEnd, BTreeKeyNonLeafKey, + false, end, hint); + else + result = btree_iterate_raw(scan->iter, NULL, BTreeKeyNone, + false, end, hint); + + if (*end) + { + btree_iterator_free(scan->iter); + scan->iter = NULL; + scan->haveHistImg = false; + } + return result; +} + +static OTuple +btree_seq_scan_getnext_raw_internal(BTreeSeqScan *scan, MemoryContext mctx, + BTreeLocationHint *hint) +{ + BTreeLeafTuphdr *tupHdr; + OTuple tuple; + + if (scan->iter) + { + bool end; + + tuple = btree_seq_scan_get_tuple_from_iterator_raw(scan, &end, hint); + if (!end) + return tuple; + } + + while (!BTREE_PAGE_LOCATOR_IS_VALID(scan->leafImg, &scan->leafLoc)) + { + if (scan->status == BTreeSeqScanInMemory) + { + elog(DEBUG3, "load_next_in_memory_leaf_page START3"); + if (iterate_internal_page(scan)) + { + if (scan->iter) + { + bool end; + + tuple = btree_seq_scan_get_tuple_from_iterator_raw(scan, &end, hint); + if (!end) + return tuple; + } + } + else + { + switch_to_disk_scan(scan); + } + } + if (scan->status == BTreeSeqScanDisk) + { + if (!load_next_disk_leaf_page(scan)) + { + scan->status = BTreeSeqScanFinished; + O_TUPLE_SET_NULL(tuple); + return tuple; + } + } + } + + BTREE_PAGE_READ_LEAF_ITEM(tupHdr, tuple, scan->leafImg, &scan->leafLoc); + BTREE_PAGE_LOCATOR_NEXT(scan->leafImg, &scan->leafLoc); + + if (!tupHdr->deleted) + { + if (hint) + *hint = scan->hint; + + return tuple; + } + else + { + 
O_TUPLE_SET_NULL(tuple); + return tuple; + } +} + +OTuple +btree_seq_scan_getnext_raw(BTreeSeqScan *scan, MemoryContext mctx, + bool *end, BTreeLocationHint *hint) +{ + OTuple tuple; + + if (!scan->initialized) + init_btree_seq_scan(scan); + + if (scan->status == BTreeSeqScanInMemory || + scan->status == BTreeSeqScanDisk) + { + tuple = btree_seq_scan_getnext_raw_internal(scan, mctx, hint); + if (scan->status == BTreeSeqScanInMemory || + scan->status == BTreeSeqScanDisk) + { + *end = false; + return tuple; + } + } + Assert(scan->status == BTreeSeqScanFinished); + + O_TUPLE_SET_NULL(tuple); + *end = true; + return tuple; +} + +/* + * Internal cleanup for a sequential scan: decrements the numSeqScans counter + * and completes deferred meta page free if this was the last scan. Called + * from both the normal free path and the resource owner release callback. + */ +static void +free_btree_seq_scan_internal(BTreeSeqScan *scan, bool fromResowner) +{ + BTreeDescr *desc = scan->desc; + + START_CRIT_SECTION(); + + if (scan->resowner) + { + ResourceOwnerForgetBTreeSeqScan(scan->resowner, scan); + scan->resowner = NULL; + } + + if (scan->checkpointNumberSet && OInMemoryBlknoIsValid(desc->rootInfo.metaPageBlkno)) + { + BTreeMetaPage *metaPage = BTREE_GET_META(scan->desc); + + (void) pg_atomic_fetch_sub_u32(&metaPage->numSeqScans[scan->checkpointNumber % NUM_SEQ_SCANS_ARRAY_SIZE], 1); + + /* Complete deferred meta page free if this was the last scan. */ + if (metaPage->toBeFreedOnSeqScanRelease && meta_page_get_num_seq_scans(desc->rootInfo.metaPageBlkno) == 0) + ppool_free_page(desc->ppool, desc->rootInfo.metaPageBlkno, false); + + scan->checkpointNumberSet = false; + } + + if (scan->dsmSeg) + { + /* + * Skip dsm_detach when called from ResourceOwner release: the DSM + * segment is also registered as a resource and will be detached by + * ResourceOwner independently. Calling dsm_detach here would attempt + * ResourceOwnerForget on a DSM that may have already been released. 
+ */ + if (!fromResowner) + dsm_detach(scan->dsmSeg); + scan->dsmSeg = NULL; + } + + if (scan->iter) + { + btree_iterator_free(scan->iter); + scan->iter = NULL; + } + + if (scan->diskDownlinks) + { + pfree(scan->diskDownlinks); + scan->diskDownlinks = NULL; + } + + if (!IS_SYS_TREE_OIDS(desc->oids)) + ((OIndexDescr *) desc->arg)->refcnt--; + scan->status = BTreeSeqScanFinished; + + if (!fromResowner) + { + dlist_delete_from_thoroughly(&listOfScans, &scan->listNode); + pfree(scan); + } + + END_CRIT_SECTION(); +} + +void +free_btree_seq_scan(BTreeSeqScan *scan) +{ + free_btree_seq_scan_internal(scan, false); +} + +/* + * Error cleanup for sequential scans. No scan survives the error, but they + * aren't cleaned up individually. Thus, we have to walk through all the scans + * and revert changes made to the metaPageBlkno->numSeqScans. + */ +void +seq_scans_cleanup(void) +{ + START_CRIT_SECTION(); + while (!dlist_is_empty(&listOfScans)) + { + BTreeSeqScan *scan = dlist_head_element(BTreeSeqScan, listNode, &listOfScans); + + free_btree_seq_scan_internal(scan, false); + } + END_CRIT_SECTION(); +} + +/* + * Return the total number of active sequential scans across all checkpoint + * number slots for the given meta page. 
+ */ +int +meta_page_get_num_seq_scans(OInMemoryBlkno metaPageBlkno) +{ + BTreeMetaPage *metaPage = (BTreeMetaPage *) O_GET_IN_MEMORY_PAGE(metaPageBlkno); + int result = 0; + int i; + + for (i = 0; i < NUM_SEQ_SCANS_ARRAY_SIZE; i++) + result += pg_atomic_read_u32(&metaPage->numSeqScans[i]); + + return result; +} + +#if PG_VERSION_NUM >= 170000 + +static void +ResourceOwnerRememberBTreeSeqScan(ResourceOwner owner, BTreeSeqScan *scan) +{ + ResourceOwnerRemember(owner, PointerGetDatum(scan), &btree_seq_scan_resowner_desc); +} +static void +ResourceOwnerForgetBTreeSeqScan(ResourceOwner owner, BTreeSeqScan *scan) +{ + ResourceOwnerForget(owner, PointerGetDatum(scan), &btree_seq_scan_resowner_desc); +} + +static void +ResOwnerReleaseBTreeSeqScan(Datum res) +{ + BTreeSeqScan *scan = (BTreeSeqScan *) DatumGetPointer(res); + + scan->resowner = NULL; + free_btree_seq_scan_internal(scan, true); +} + +static char * +ResOwnerPrintBTreeSeqScan(Datum res) +{ + BTreeSeqScan *scan = (BTreeSeqScan *) DatumGetPointer(res); + ORelOids oids = scan->desc->oids; + + return psprintf("OrioleDB BTreeSeqScans (%u, %u, %u)", + oids.datoid, oids.reloid, oids.relnode); +} + +#else + +/* + * PG16 lacks the per-owner ResourceOwnerRemember API, so we fall back to + * RegisterResourceReleaseCallback. The callback fires for every + * ResourceOwner release, so it filters by scan->resowner to only free the + * scan when its own binding owner is being released. + */ +static void +ResOwnerReleaseBTreeSeqScanCallback(ResourceReleasePhase phase, + bool isCommit, bool isTopLevel, void *arg) +{ + BTreeSeqScan *scan = (BTreeSeqScan *) arg; + + if (phase != RESOURCE_RELEASE_BEFORE_LOCKS) + return; + if (scan->resowner != CurrentResourceOwner) + return; + + /* + * Unregister this callback before letting free_btree_seq_scan_internal + * clear scan->resowner. 
The scan itself is not pfreed here (fromResowner + * skips dlist_delete/pfree and leaves cleanup to seq_scans_cleanup), and + * that later pfree would leave a dangling arg pointer in the global + * callback list if we did not drop the entry now. Self-removal during + * the release walk is safe: resowner.c captures the next pointer before + * invoking each callback. + */ + UnregisterResourceReleaseCallback(ResOwnerReleaseBTreeSeqScanCallback, + scan); + scan->resowner = NULL; + free_btree_seq_scan_internal(scan, true); +} + +static void +ResourceOwnerRememberBTreeSeqScan(ResourceOwner owner, BTreeSeqScan *scan) +{ + RegisterResourceReleaseCallback(ResOwnerReleaseBTreeSeqScanCallback, scan); +} + +static void +ResourceOwnerForgetBTreeSeqScan(ResourceOwner owner, BTreeSeqScan *scan) +{ + UnregisterResourceReleaseCallback(ResOwnerReleaseBTreeSeqScanCallback, scan); +} + +#endif diff --git a/contrib/orioledb/src/btree/split.c b/contrib/orioledb/src/btree/split.c new file mode 100644 index 00000000000..36c3c67164a --- /dev/null +++ b/contrib/orioledb/src/btree/split.c @@ -0,0 +1,512 @@ +/*------------------------------------------------------------------------- + * + * split.c + * Routines for implementation of splitting B-tree page. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. 
+ * + * IDENTIFICATION + * contrib/orioledb/src/btree/split.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/find.h" +#include "btree/split.h" +#include "btree/page_chunks.h" +#include "btree/undo.h" +#include "checkpoint/checkpoint.h" +#include "recovery/recovery.h" +#include "transam/undo.h" +#include "utils/page_pool.h" +#include "utils/stopevent.h" + +#include "miscadmin.h" +#include "utils/memutils.h" + +void +make_split_items(BTreeDescr *desc, Page page, + BTreeSplitItems *items, + OffsetNumber *offset, Pointer tupleheader, OTuple tuple, + LocationIndex tuplesize, bool replace, CommitSeqNo csn) +{ + BTreePageItemLocator loc; + bool leaf = O_PAGE_IS(page, LEAF); + LocationIndex tuple_header_size = leaf ? BTreeLeafTuphdrSize : BTreeNonLeafTuphdrSize; + int i; + static char newItem[Max(BTreeLeafTuphdrSize, BTreeNonLeafTuphdrSize) + O_BTREE_MAX_TUPLE_SIZE]; + int maxKeyLen = MAXALIGN(((BTreePageHeader *) page)->maxKeyLen); + + i = 0; + BTREE_PAGE_LOCATOR_FIRST(page, &loc); + while (BTREE_PAGE_LOCATOR_IS_VALID(page, &loc) || i == *offset) + { + if (i == *offset) + { + int newKeyLen; + + memcpy(newItem, tupleheader, tuple_header_size); + memcpy(&newItem[tuple_header_size], tuple.data, tuplesize); + if (tuplesize != MAXALIGN(tuplesize)) + memset(&newItem[tuple_header_size + tuplesize], 0, MAXALIGN(tuplesize) - tuplesize); + items->items[i].data = newItem; + items->items[i].flags = tuple.formatFlags; + items->items[i].size = tuple_header_size + MAXALIGN(tuplesize); + newKeyLen = o_btree_len(desc, tuple, leaf ? OTupleKeyLengthNoVersion : OKeyLength); + maxKeyLen = Max(maxKeyLen, newKeyLen); + i++; + if (replace) + { + BTREE_PAGE_LOCATOR_NEXT(page, &loc); + continue; + } + } + + if (!BTREE_PAGE_LOCATOR_IS_VALID(page, &loc)) + break; + + /* + * In leaf pages, get rid of tuples deleted by finished transactions. + * Also, resize tuples to minimal size. 
In non-leaf pages, copy + * tuples as-is. + */ + if (leaf) + { + BTreeLeafTuphdr *tupHdr; + OTuple tup; + bool finished; + + BTREE_PAGE_READ_LEAF_ITEM(tupHdr, tup, page, &loc); + finished = COMMITSEQNO_IS_FROZEN(csn) ? false : XACT_INFO_FINISHED_FOR_EVERYBODY(tupHdr->xactInfo); + if (finished && tupHdr->deleted && + (COMMITSEQNO_IS_INPROGRESS(csn) || XACT_INFO_MAP_CSN(tupHdr->xactInfo) < csn)) + { + if (i < *offset) + (*offset)--; + BTREE_PAGE_LOCATOR_NEXT(page, &loc); + continue; + } + + items->items[i].data = (Pointer) tupHdr; + items->items[i].flags = tup.formatFlags; + items->items[i].size = finished ? + (BTreeLeafTuphdrSize + MAXALIGN(o_btree_len(desc, tup, OTupleLength))) : + BTREE_PAGE_GET_ITEM_SIZE(page, &loc); + } + else + { + items->items[i].data = BTREE_PAGE_LOCATOR_GET_ITEM(page, &loc); + items->items[i].flags = BTREE_PAGE_GET_ITEM_FLAGS(page, &loc); + items->items[i].size = BTREE_PAGE_GET_ITEM_SIZE(page, &loc); + } + + i++; + BTREE_PAGE_LOCATOR_NEXT(page, &loc); + } + items->itemsCount = i; + items->maxKeyLen = maxKeyLen; + items->hikeySize = O_PAGE_IS(page, RIGHTMOST) ? 0 : BTREE_PAGE_GET_HIKEY_SIZE(page); + items->hikeysEnd = BTREE_PAGE_HIKEYS_END(desc, page); + items->leaf = O_PAGE_IS(page, LEAF); +} + +void +perform_page_compaction(BTreeDescr *desc, OInMemoryBlkno blkno, + BTreeSplitItems *items, bool needsUndo, + CommitSeqNo csn) +{ + Page p = O_GET_IN_MEMORY_PAGE(blkno); + BTreePageHeader *header = (BTreePageHeader *) p; + UndoLocation undoLocation; + OFixedKey hikey; + LocationIndex hikeySize; + + START_CRIT_SECTION(); + + Assert(O_PAGE_IS(p, LEAF)); + + /* Make a page-level undo item if needed */ + if (needsUndo) + { + undoLocation = page_add_image_to_undo(desc, p, csn, NULL, 0); + + /* + * Start page modification. It contains the required memory barrier + * between making undo image and setting the undo location. 
+ */ + page_block_reads(blkno); + + /* Update the old page meta-data */ + + header->undoLocation = undoLocation; + header->prevInsertOffset = MaxOffsetNumber; + + /* + * Memory barrier between write undo location and csn. See comment in + * the o_btree_read_page() for details. + */ + pg_write_barrier(); + + header->csn = csn; + } + else + { + page_block_reads(blkno); + } + + if (O_PAGE_IS(p, RIGHTMOST)) + { + O_TUPLE_SET_NULL(hikey.tuple); + hikeySize = 0; + } + else + { + copy_fixed_hikey(desc, &hikey, p); + hikeySize = BTREE_PAGE_GET_HIKEY_SIZE(p); + } + + btree_page_reorg(desc, p, items->items, + items->itemsCount, hikeySize, hikey.tuple); + Assert(header->dataSize <= ORIOLEDB_BLCKSZ); + o_btree_page_calculate_statistics(desc, p); + + END_CRIT_SECTION(); +} + +/* + * Check if all split items fit on a single page. Used after + * make_split_items() reclaims deleted tuples to determine whether compaction + * is sufficient instead of a page split. + */ +bool +split_items_fit_single_page(BTreeSplitItems *items) +{ + int totalDataSize = 0; + int hikeysEnd; + int spaceAvailable; + + for (int i = 0; i < items->itemsCount; i++) + totalDataSize += items->items[i].size; + + hikeysEnd = Max(items->hikeysEnd, + MAXALIGN(sizeof(BTreePageHeader)) + items->maxKeyLen); + + spaceAvailable = ORIOLEDB_BLCKSZ - hikeysEnd + - totalDataSize + - MAXALIGN(sizeof(LocationIndex) * items->itemsCount); + + return spaceAvailable >= 0; +} + +/* + * Shared core of the page-split partitioning algorithm. Walk the items + * placing them onto the prospective left or right page (chosen by + * targetLocation / spaceRatio, with overflowed sides forced opposite), + * until the boundary settles at a single location. Returns true and + * writes that location through *splitLocation on success; returns false + * if no valid two-page split exists (first/last item doesn't fit its + * page, or both sides exhaust simultaneously mid-loop). 
/*
 * Shared core of the page-split partitioning algorithm.
 *
 * Walks the items from both ends, assigning them to the prospective left or
 * right page until the boundary settles at a single location.
 *
 * items          - the split items with their sizes, plus hikeysEnd,
 *                  maxKeyLen and hikeySize used to compute the free space of
 *                  each prospective page.
 * targetLocation - preferred number of items on the left page; 0 means
 *                  "no preference, follow spaceRatio instead".
 * spaceRatio     - used only when targetLocation == 0 to weigh the remaining
 *                  space of the two pages against each other.
 * splitLocation  - optional output; receives the number of items placed on
 *                  the left page.  May be NULL for a dry run.
 *
 * Returns true on success.  Returns false if there are fewer than two items,
 * if the first or the last item does not fit its page on its own, or if the
 * page chosen for the next item has no space left.
 */
static bool
btree_page_split_find_location(BTreeSplitItems *items,
							   OffsetNumber targetLocation,
							   float4 spaceRatio,
							   OffsetNumber *splitLocation)
{
	int			leftPageSpaceLeft,
				rightPageSpaceLeft,
				minLeftPageItemsCount,
				maxLeftPageItemsCount;

	/* A split needs at least one item for each page. */
	if (items->itemsCount < 2)
		return false;

	/*
	 * Free space of each page after its header area.  The left page reserves
	 * room for the longest key, the right page for the current hikey size.
	 */
	leftPageSpaceLeft = ORIOLEDB_BLCKSZ -
		Max(items->hikeysEnd,
			MAXALIGN(sizeof(BTreePageHeader)) + items->maxKeyLen);
	rightPageSpaceLeft = ORIOLEDB_BLCKSZ -
		Max(items->hikeysEnd,
			MAXALIGN(sizeof(BTreePageHeader)) + items->hikeySize);

	/*
	 * Left page must contain at least one item and leave at least one for the
	 * right page.  Charge the first item to the left page and the last item
	 * to the right page, each with one aligned location slot.
	 */
	minLeftPageItemsCount = 1;
	maxLeftPageItemsCount = items->itemsCount - 1;
	leftPageSpaceLeft -= items->items[0].size + MAXALIGN(sizeof(LocationIndex));
	rightPageSpaceLeft -= items->items[items->itemsCount - 1].size +
		MAXALIGN(sizeof(LocationIndex));

	/* First / last items must individually fit their respective pages. */
	if (leftPageSpaceLeft < 0 || rightPageSpaceLeft < 0)
		return false;

	/*
	 * Shift minimal and maximal left-page item counts until they are equal.
	 * Items [0, minLeftPageItemsCount) are already on the left page, items
	 * [maxLeftPageItemsCount, itemsCount) are already on the right page.
	 */
	while (minLeftPageItemsCount != maxLeftPageItemsCount)
	{
		Assert(minLeftPageItemsCount < maxLeftPageItemsCount);

		/*
		 * Choose the page to add an item to.  First, prefer the page that
		 * still has space.  Then try to follow `targetLocation`.  If
		 * `targetLocation` isn't given, follow `spaceRatio`.
		 */
		if (rightPageSpaceLeft <= 0 || (leftPageSpaceLeft > 0 &&
										(targetLocation == 0 ?
										 (float4) leftPageSpaceLeft * spaceRatio > (float4) rightPageSpaceLeft * (1.0f - spaceRatio) :
										 minLeftPageItemsCount < targetLocation)))
		{
			/* Place item on the left page. */
			if (leftPageSpaceLeft <= 0)
				return false;

			/*
			 * Charge the item plus the growth of the aligned location array
			 * from minLeftPageItemsCount to minLeftPageItemsCount + 1 slots.
			 */
			leftPageSpaceLeft -= items->items[minLeftPageItemsCount].size +
				MAXALIGN(sizeof(LocationIndex) * (minLeftPageItemsCount + 1)) -
				MAXALIGN(sizeof(LocationIndex) * minLeftPageItemsCount);

			/*
			 * The item did not fit: it is not counted, and the space counter
			 * is left negative so that the next iteration picks the right
			 * page (or fails above if that one is exhausted as well).
			 */
			if (leftPageSpaceLeft < 0)
				continue;
			minLeftPageItemsCount++;
		}
		else
		{
			/* Place item on the right page. */
			if (rightPageSpaceLeft <= 0)
				return false;

			/*
			 * Same accounting as for the left page, with the right page's
			 * current item count being itemsCount - maxLeftPageItemsCount.
			 */
			rightPageSpaceLeft -= items->items[maxLeftPageItemsCount - 1].size +
				MAXALIGN(sizeof(LocationIndex) *
						 (items->itemsCount - maxLeftPageItemsCount + 1)) -
				MAXALIGN(sizeof(LocationIndex) *
						 (items->itemsCount - maxLeftPageItemsCount));

			/* Did not fit: leave the counter negative, retry on the left. */
			if (rightPageSpaceLeft < 0)
				continue;
			maxLeftPageItemsCount--;
		}
	}

	/* Both bounds met: that is the number of items on the left page. */
	if (splitLocation)
		*splitLocation = minLeftPageItemsCount;
	return true;
}
+ */ +OffsetNumber +btree_page_split_location(BTreeDescr *desc, + BTreeSplitItems *items, + OffsetNumber targetLocation, float4 spaceRatio, + OTuple *split_item) +{ + OffsetNumber splitLocation; + bool ok PG_USED_FOR_ASSERTS_ONLY; + + Assert(spaceRatio >= 0.0f && spaceRatio <= 1.0f); + + ok = btree_page_split_find_location(items, targetLocation, spaceRatio, + &splitLocation); + Assert(ok); + + if (split_item) + { + split_item->formatFlags = items->items[splitLocation].flags; + split_item->data = items->items[splitLocation].data + + (items->leaf ? BTreeLeafTuphdrSize : BTreeNonLeafTuphdrSize); + } + + return splitLocation; +} + +OffsetNumber +btree_get_split_left_count(BTreeDescr *desc, Page page, + OffsetNumber offset, bool replace, + BTreeSplitItems *items, + OTuple *split_key, LocationIndex *split_key_len) +{ + BTreePageHeader *header = (BTreePageHeader *) page; + OffsetNumber targetCount; + OffsetNumber result; + float4 spaceRatio; + float4 fillfactorRatio = ((float4) desc->fillfactor) / 100.0f; + OTuple split_item; + + /* The default target is to split the page 50%/50% */ + targetCount = 0; + spaceRatio = 0.5f; + + /* + * Try to autodetect ordered inserts and split near the insertion point. + * If we're close to the end of the page, split already inserted data away + * from the insertion point (if it gives at least 90% utilization). + * Otherwise, place already inserted data together with the insertion + * point. Hopefuly, we still have many tuple to insert and that will give + * us the good utilization. 
+ */ + if (offset == header->prevInsertOffset + 1) + { + if ((float) offset / (float) header->itemsCount > fillfactorRatio) + spaceRatio = fillfactorRatio; + else if ((float) offset / (float) header->itemsCount >= 0.9f) + targetCount = offset; + else + targetCount = offset + 1; + } + else if ((!replace && offset == header->prevInsertOffset) || + (replace && offset == header->prevInsertOffset - 1)) + { + if ((float) offset / (float) header->itemsCount < 1.0f - fillfactorRatio) + spaceRatio = 1.0f - fillfactorRatio; + else if ((float) offset / (float) header->itemsCount <= 0.1f) + targetCount = offset + 1; + else + targetCount = offset; + } + + /* + * If we don't autodetect the insertion order, we still assume TOAST and + * rightmost inserts are always assumed to be ordered ascendingly. + */ + else if ((desc->type == oIndexToast && O_PAGE_IS(page, LEAF)) || O_PAGE_IS(page, RIGHTMOST)) + spaceRatio = fillfactorRatio; + + result = btree_page_split_location(desc, items, targetCount, spaceRatio, + &split_item); + + /* + * Fill the split key. Convert tuple to key if needed. + */ + if (split_key) + { + bool allocated = true; + + if (O_PAGE_IS(page, LEAF)) + split_item = o_btree_tuple_make_key(desc, split_item, NULL, + false, &allocated); + + *split_key_len = o_btree_len(desc, split_item, OKeyLength); + if (!O_PAGE_IS(page, LEAF) || !allocated) + { + split_key->data = (Pointer) palloc(*split_key_len); + split_key->formatFlags = split_item.formatFlags; + memcpy(split_key->data, split_item.data, *split_key_len); + } + else + { + *split_key = split_item; + } + } + + return result; +} + +/* + * Split B-tree page into two. + * + * Returns OInvalidInMemoryBlkno if the page can not be split due to the fact that + * it is under processing by the checkpointer worker. 
/*
 * Split the in-memory B-tree page `blkno` into two pages.
 *
 * The first `left_count` entries of `items` stay on `blkno`; the remaining
 * entries go to the freshly initialized page `new_blkno`, which becomes the
 * right sibling.  `splitkey` / `splitkey_len` are handed to
 * btree_page_reorg() for the left page in the same argument positions where
 * the right page receives the old high key.  `csn` and `undoLoc` are stored
 * in the headers of both pages.
 *
 * This function returns nothing; both pages are marked dirty before return.
 */
void
perform_page_split(BTreeDescr *desc, OInMemoryBlkno blkno,
				   OInMemoryBlkno new_blkno,
				   BTreeSplitItems *items,
				   OffsetNumber left_count,
				   OTuple splitkey, LocationIndex splitkey_len,
				   CommitSeqNo csn, UndoLocation undoLoc)
{
	Page		left_page = O_GET_IN_MEMORY_PAGE(blkno),
				right_page = O_GET_IN_MEMORY_PAGE(new_blkno);
	BTreePageHeader *left_header = (BTreePageHeader *) left_page,
			   *right_header = (BTreePageHeader *) right_page;
	bool		leaf = O_PAGE_IS(left_page, LEAF);
	OTuple		hikey;
	uint64		rightlink;
	LocationIndex hikeySize;

	/* Remember the old right link; it moves to the new right page below. */
	rightlink = left_header->rightLink;

	/* The new page takes the left page's flags and level, minus LEFTMOST. */
	init_new_btree_page(desc, new_blkno,
						left_header->flags & ~(O_BTREE_FLAG_LEFTMOST),
						PAGE_GET_LEVEL(left_page), false);

#ifdef ORIOLEDB_CUT_FIRST_KEY
	/* On non-leaf pages the first right-page item shrinks to a bare header. */
	if (!leaf)
		items->items[left_count].size = BTreeNonLeafTuphdrSize;
#endif

	/* The right page inherits the high key of the page being split, if any. */
	if (O_PAGE_IS(left_page, RIGHTMOST))
	{
		hikeySize = 0;
		O_TUPLE_SET_NULL(hikey);
	}
	else
	{
		hikeySize = BTREE_PAGE_GET_HIKEY_SIZE(left_page);
		BTREE_PAGE_GET_HIKEY(hikey, left_page);
	}

	/* Fill the right page first, while the left page is still unmodified. */
	btree_page_reorg(desc, right_page, &items->items[left_count],
					 items->itemsCount - left_count,
					 hikeySize, hikey);

	/*
	 * Start page modification.  It contains the required memory barrier
	 * between making undo image and setting the undo location.
	 */
	page_block_reads(blkno);

	/* Link undo record with pages */
	left_header->undoLocation = undoLoc;
	right_header->undoLocation = undoLoc;

	/*
	 * Memory barrier between write undo location and csn.  See comment in the
	 * o_btree_read_page() for details.
	 */
	pg_write_barrier();

	left_header->csn = csn;
	right_header->csn = csn;

	/* Chain the pages: left -> new right -> former right neighbor. */
	right_header->rightLink = rightlink;
	left_header->rightLink = MAKE_IN_MEMORY_RIGHTLINK(new_blkno,
													  O_PAGE_GET_CHANGE_COUNT(right_page));
	left_header->flags &= ~(O_BTREE_FLAG_RIGHTMOST);

	/* Fix the left-sibling pointers kept in the page descriptors. */
	if (RightLinkIsValid(rightlink))
		O_GET_IN_MEMORY_PAGEDESC(RIGHTLINK_GET_BLKNO(rightlink))->leftBlkno = new_blkno;
	O_GET_IN_MEMORY_PAGEDESC(new_blkno)->leftBlkno = blkno;

	/* Rebuild the left page with the items that remain on it. */
	btree_page_reorg(desc, left_page, &items->items[0], left_count,
					 splitkey_len, splitkey);

	o_btree_page_calculate_statistics(desc, left_page);
	o_btree_page_calculate_statistics(desc, right_page);

	MARK_DIRTY(desc, blkno);
	MARK_DIRTY(desc, new_blkno);
}
+ * + * IDENTIFICATION + * contrib/orioledb/src/btree/undo.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/find.h" +#include "btree/io.h" +#include "btree/merge.h" +#include "btree/page_chunks.h" +#include "btree/undo.h" +#include "catalog/o_sys_cache.h" +#include "recovery/recovery.h" +#include "rewind/rewind.h" +#include "tableam/descr.h" +#include "transam/oxid.h" +#include "transam/undo.h" +#include "utils/memutils.h" +#include "utils/palloc.h" +#include "utils/stopevent.h" +#include "utils/page_pool.h" + +#include "access/transam.h" +#include "miscadmin.h" +#include "utils/inval.h" +#include "utils/wait_event.h" + +static void clean_chain_has_locks_flag(UndoLogType undoType, + UndoLocation location, + BTreeLeafTuphdr *pageTuphdr, + OInMemoryBlkno blkno); +static bool update_leaf_header_in_undo_if_exists(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr, + UndoLocation location); + +/* + * Add page image to the undo log. 
+ */ +UndoLocation +page_add_image_to_undo(BTreeDescr *desc, Pointer p, CommitSeqNo imageCsn, + OTuple *splitKey, LocationIndex splitKeyLen) +{ + UndoPageImageHeader *header; + UndoLocation undoLocation; + Pointer ptr; + + Assert(O_PAGE_IS(p, LEAF)); + + Assert(desc->undoType != UndoLogNone); + if (splitKey) + ptr = get_undo_record(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType), + &undoLocation, + O_SPLIT_UNDO_IMAGE_SIZE(splitKeyLen)); + else + ptr = get_undo_record(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType), + &undoLocation, + O_COMPACT_UNDO_IMAGE_SIZE); + + header = (UndoPageImageHeader *) ptr; + if (splitKey) + { + header->type = UndoPageImageSplit; + header->splitKeyFlags = splitKey->formatFlags; + header->splitKeyLen = splitKeyLen; + } + else + { + header->type = UndoPageImageCompact; + } + ptr += MAXALIGN(sizeof(UndoPageImageHeader)); + memcpy(ptr, p, ORIOLEDB_BLCKSZ); + if (splitKey) + { + ptr += ORIOLEDB_BLCKSZ; + memcpy(ptr, splitKey->data, splitKeyLen); + } + + release_reserved_undo_location(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType)); + + return undoLocation; +} + +/* + * Given page item modified by in-progress transaction. Rollback changes + * using undo chain. Specify 'wholeChain' flag to revert all in-progress + * changes from the chain. Otherise, only last change item is reverted. + * + * Return true if page item still exists. + * + * 'nonLockTuphdrPtr' and 'nonLockUndoLocation' are a hint to the first + * non-lock-only undo record in the chain. 
+ */ +bool +page_item_rollback(BTreeDescr *desc, Page p, BTreePageItemLocator *locator, + bool wholeChain, BTreeLeafTuphdr *nonLockTuphdrPtr, + UndoLocation nonLockUndoLocation) +{ + Pointer item; + BTreeLeafTuphdr *tuphdr, + nonLockTuphdr; + + item = BTREE_PAGE_LOCATOR_GET_ITEM(p, locator); + tuphdr = (BTreeLeafTuphdr *) item; + + if (!nonLockTuphdrPtr) + { + nonLockTuphdr = *tuphdr; + nonLockTuphdrPtr = &nonLockTuphdr; + nonLockUndoLocation = find_non_lock_only_undo_record(desc->undoType, + nonLockTuphdrPtr); + } + +retry: + + Assert(O_PAGE_IS(p, LEAF)); + + if (tuphdr->deleted != BTreeLeafTupleNonDeleted) + { + OTuple prev_tuple; + + /* + * Revert deletion. Assuming tuple is deleted, we shouldn't have any + * row-level lock on this tuple. + */ + Assert(!UndoLocationIsValid(nonLockUndoLocation)); + Assert(UndoLocationIsValid(tuphdr->undoLocation)); + Assert(UNDO_REC_EXISTS(desc->undoType, tuphdr->undoLocation)); + + get_prev_leaf_header_from_undo(desc->undoType, tuphdr, true); + BTREE_PAGE_READ_TUPLE(prev_tuple, p, locator); + /* Bridge index deleted tuples not treated as vacated */ + if (desc->type != oIndexBridge) + { + PAGE_SUB_N_VACATED(p, + BTreeLeafTuphdrSize + + MAXALIGN(o_btree_len(desc, prev_tuple, OTupleLength))); + } + tuphdr->formatFlags = 0; + + if (!UndoLocationIsValid(nonLockUndoLocation)) + *nonLockTuphdrPtr = *tuphdr; + + if (!XACT_INFO_IS_FINISHED(tuphdr->xactInfo) && wholeChain) + goto retry; + } + else if (UndoLocationIsValid(nonLockTuphdrPtr->undoLocation)) + { + /* + * Current tuple is not deleted. And there is a pointer to previous + * version in chain. This must be update (or insert to previously + * deleted tuple). 
+ */ + OTuple tuple; + int prev_tuplen, + tuplen, + itemlen; + BTreeLeafTuphdr prev_header; + + prev_header = *nonLockTuphdrPtr; + tuple.formatFlags = BTREE_PAGE_GET_ITEM_FLAGS(p, locator); + tuple.data = item + BTreeLeafTuphdrSize; + prev_tuplen = o_btree_len(desc, tuple, OTupleLength); + + tuplen = BTREE_PAGE_GET_ITEM_SIZE(p, locator) - BTreeLeafTuphdrSize; + get_prev_leaf_header_and_tuple_from_undo(desc->undoType, + &prev_header, + &tuple, + tuplen); + tuplen = o_btree_len(desc, tuple, OTupleLength); + itemlen = BTreeLeafTuphdrSize + MAXALIGN(tuplen); + + Assert(itemlen <= BTREE_PAGE_GET_ITEM_SIZE(p, locator)); + if (XACT_INFO_IS_FINISHED(prev_header.xactInfo)) + { + PAGE_SUB_N_VACATED(p, BTREE_PAGE_GET_ITEM_SIZE(p, locator) - + (BTreeLeafTuphdrSize + MAXALIGN(prev_tuplen))); + page_locator_resize_item(p, locator, itemlen); + } + else + { + PAGE_ADD_N_VACATED(p, MAXALIGN(prev_tuplen)); + PAGE_SUB_N_VACATED(p, MAXALIGN(tuplen)); + } + if (prev_header.deleted != BTreeLeafTupleNonDeleted) + PAGE_ADD_N_VACATED(p, BTreeLeafTuphdrSize + MAXALIGN(tuplen)); + + if (!UndoLocationIsValid(nonLockUndoLocation)) + { + *nonLockTuphdrPtr = *tuphdr = prev_header; + } + else + { + tuphdr->deleted = prev_header.deleted; + nonLockTuphdrPtr->undoLocation = prev_header.undoLocation; + nonLockTuphdrPtr->xactInfo = prev_header.xactInfo; + update_leaf_header_in_undo(desc->undoType, + nonLockTuphdrPtr, + nonLockUndoLocation); + } + + BTREE_PAGE_SET_ITEM_FLAGS(p, locator, tuple.formatFlags); + + /* Follow the row-level undo chain if needed */ + if ((UndoLocationIsValid(nonLockUndoLocation) || + !XACT_INFO_IS_FINISHED(prev_header.xactInfo)) && wholeChain) + { + /* Find the next item in the chain */ + nonLockUndoLocation = find_non_lock_only_undo_record(desc->undoType, + nonLockTuphdrPtr); + if (XACT_INFO_IS_FINISHED(nonLockTuphdrPtr->xactInfo)) + return true; + item = BTREE_PAGE_LOCATOR_GET_ITEM(p, locator); + tuphdr = (BTreeLeafTuphdr *) item; + goto retry; + } + } + else + { + OTuple 
prev_tuple; + + /* + * Revert insertion of new tuple. Assuming insertion is in-progress, + * we shouldn't have any row-level lock on this tuple. + */ + Assert(!UndoLocationIsValid(nonLockUndoLocation)); + + if (desc->type == oIndexBridge) + { + /* + * A special case for bridge index: we must keep entries for + * VACUUM purposes. Just mark tuple as deleted. + */ + tuphdr->deleted = BTreeLeafTupleDeleted; + return true; + } + + BTREE_PAGE_READ_TUPLE(prev_tuple, p, locator); + PAGE_SUB_N_VACATED(p, BTREE_PAGE_GET_ITEM_SIZE(p, locator) - + (BTreeLeafTuphdrSize + MAXALIGN(o_btree_len(desc, prev_tuple, OTupleLength)))); + + page_locator_delete_item(p, locator); + return false; + } + return true; +} + +static Jsonb * +undo_record_key_stopevent_params(BTreeOperationType action, + BTreeDescr *desc, + OTuple tuple, OXid oxid) +{ + JsonbParseState *state = NULL; + Jsonb *res; + MemoryContext mctx = MemoryContextSwitchTo(stopevents_cxt); + + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + if (action == BTreeOperationInsert) + jsonb_push_string_key(&state, "action", "insert"); + else if (action == BTreeOperationUpdate) + jsonb_push_string_key(&state, "action", "update"); + else if (action == BTreeOperationDelete) + jsonb_push_string_key(&state, "action", "delete"); + else if (action == BTreeOperationLock) + jsonb_push_string_key(&state, "action", "lock"); + jsonb_push_int8_key(&state, "oxid", oxid); + btree_desc_stopevent_params_internal(desc, &state); + jsonb_push_key(&state, "key"); + if (action == BTreeOperationUpdate) + { + OTuple key; + bool allocated; + + key = o_btree_tuple_make_key(desc, tuple, NULL, true, &allocated); + (void) o_btree_key_to_jsonb(desc, key, &state); + if (allocated) + pfree(key.data); + } + else + { + (void) o_btree_key_to_jsonb(desc, tuple, &state); + } + res = JsonbValueToJsonb(pushJsonbValue(&state, WJB_END_OBJECT, NULL)); + MemoryContextSwitchTo(mctx); + + return res; +} + +/* + * Make undo record associated with give tuple and operation. 
+ */ +UndoLocation +make_undo_record(BTreeDescr *desc, OTuple tuple, bool is_tuple, + BTreeOperationType action, OInMemoryBlkno blkno, + uint32 pageChangeCount, + BTreeLeafTuphdr *curTupHdr) +{ + LocationIndex tuplelen; + BTreeModifyUndoStackItem *item; + LocationIndex size; + CommandId commandId; + UndoLocation undoLocation; + + if (action == BTreeOperationUpdate) + { + Assert(is_tuple); + tuplelen = o_btree_len(desc, tuple, OTupleLength); + } + else + { + tuplelen = o_btree_len(desc, tuple, is_tuple ? OTupleKeyLength : OKeyLength); + } + + size = sizeof(BTreeModifyUndoStackItem) + tuplelen; + item = (BTreeModifyUndoStackItem *) get_undo_record(desc->undoType, + &undoLocation, + MAXALIGN(size)); + item->header.itemSize = size; + if (action == BTreeOperationLock) + item->header.type = RowLockUndoItemType; + else + item->header.type = ModifyUndoItemType; + item->header.indexType = desc->type; + item->action = action; + item->blkno = blkno; + item->pageChangeCount = pageChangeCount; + item->oids = desc->oids; + + if (action == BTreeOperationUpdate || !is_tuple) + { + memcpy((Pointer) item + sizeof(BTreeModifyUndoStackItem), + tuple.data, + tuplelen); + item->tuphdr.formatFlags = tuple.formatFlags; + } + else + { + bool key_palloc = false; + OTuple key; + + memset((Pointer) item + sizeof(BTreeModifyUndoStackItem), 0, tuplelen); + key = o_btree_tuple_make_key(desc, tuple, + (Pointer) item + sizeof(BTreeModifyUndoStackItem), + true, &key_palloc); + item->tuphdr.formatFlags = key.formatFlags; + Assert(!key_palloc); + } + + if (curTupHdr) + { + item->tuphdr.xactInfo = curTupHdr->xactInfo; + item->tuphdr.undoLocation = curTupHdr->undoLocation; + item->tuphdr.deleted = curTupHdr->deleted; + item->tuphdr.chainHasLocks = curTupHdr->chainHasLocks; + } + + add_new_undo_stack_item(desc->undoType, undoLocation); + + undoLocation += offsetof(BTreeModifyUndoStackItem, tuphdr); + + commandId = o_get_current_command(); + if (desc->undoType == UndoLogRegular && + commandId != 
InvalidCommandId && + !is_recovery_process()) + update_command_undo_location(commandId, undoLocation); + + return undoLocation; +} + +/* + * Create an undo record for a tuple insert on behalf of a waiting process + * (group insert optimization). Called by the lock holder after it decides + * to insert the waiter's tuple into the page. + * + * The undo record is allocated from the current process's undo space (via + * get_undo_record on our undoType), but then linked into the *waiter's* + * undo stack via add_new_undo_stack_item_to_process(). The waiter's + * autonomousNestingLevel (captured when it queued) is used to select the + * correct undo stack slot. + * + * The waiter process is blocked on a semaphore in lock_page_with_tuple() + * throughout this operation, so its shared state is stable. + */ +void +make_waiter_undo_record(BTreeDescr *desc, OInMemoryBlkno blkno, int pgprocno, + OPageWaiterShmemState *lockerState) +{ + LocationIndex tuplelen; + UndoLocation undoLocation; + BTreeModifyUndoStackItem *item; + LocationIndex size; + OTuple tuple; + bool key_palloc = false; + OTuple key; + + tuple.formatFlags = lockerState->tupleFlags; + tuple.data = &lockerState->tupleData.fixedData[BTreeLeafTuphdrSize]; + + tuplelen = o_btree_len(desc, tuple, OTupleKeyLength); + + size = sizeof(BTreeModifyUndoStackItem) + tuplelen; + item = (BTreeModifyUndoStackItem *) get_undo_record(desc->undoType, + &undoLocation, + MAXALIGN(size)); + item->header.itemSize = size; + item->header.type = ModifyUndoItemType; + item->header.indexType = desc->type; + item->action = BTreeOperationInsert; + item->blkno = blkno; + item->pageChangeCount = lockerState->pageChangeCount; + item->oids = desc->oids; + + memset((Pointer) item + sizeof(BTreeModifyUndoStackItem), 0, tuplelen); + key = o_btree_tuple_make_key(desc, tuple, + (Pointer) item + sizeof(BTreeModifyUndoStackItem), + true, &key_palloc); + item->tuphdr.formatFlags = key.formatFlags; + Assert(!key_palloc); + + lockerState->undoLocation = 
undoLocation; + add_new_undo_stack_item_to_process(desc->undoType, undoLocation, pgprocno, + lockerState->autonomousNestingLevel); +} + +static BTreeDescr * +get_tree_descr(ORelOids oids, OIndexType type) +{ + if (IS_SYS_TREE_OIDS(oids)) + { + return get_sys_tree(oids.relnode); + } + else + { + OIndexDescr *descr = o_fetch_index_descr(oids, type, false, NULL); + + if (!descr) + return NULL; + return &descr->desc; + } +} + +/* + * Callback for aborting B-tree record modification. + */ +void +modify_undo_callback(UndoLogType undoType, UndoLocation location, + UndoStackItem *baseItem, OXid oxid, + OUndoCallbackStage stage, bool changeCountsValid) +{ + BTreeModifyUndoStackItem *item = (BTreeModifyUndoStackItem *) baseItem; + BTreeDescr *desc = get_tree_descr(item->oids, item->header.indexType); + OTuple tuple; + Page p; + int cmp; + OInMemoryBlkno blkno; + BTreePageItemLocator *loc; + BTreeLeafTuphdr *tupHdr, + nonLockTupHdr; + UndoLocation nonLockUndoLocation; + OBTreeFindPageContext context; + BTreeKeyType keyType = item->action == BTreeOperationUpdate ? BTreeKeyLeafTuple : BTreeKeyNonLeafKey; + OFindPageResult findResult; + + Assert(stage == OUndoCallbackStageAbort); + + if (!desc) + return; + + tuple.formatFlags = item->tuphdr.formatFlags; + tuple.data = (Pointer) item + sizeof(BTreeModifyUndoStackItem); + + if (STOPEVENTS_ENABLED()) + { + Jsonb *params = undo_record_key_stopevent_params(item->action, + desc, + tuple, oxid); + + STOPEVENT(STOPEVENT_APPLY_UNDO, params); + } + + init_page_find_context(&context, desc, + COMMITSEQNO_INPROGRESS, + BTREE_PAGE_FIND_MODIFY); + + if (!changeCountsValid) + item->pageChangeCount = InvalidOPageChangeCount; + + o_set_syscache_hooks(); + findResult = refind_page(&context, (Pointer) &tuple, keyType, + 0, item->blkno, item->pageChangeCount); + o_unset_syscache_hooks(); + if (findResult == OFindPageResultFailure) + { + /* + * BTree can be already deleted and cleaned by + * btree_relnode_undo_callback(). 
+ */ + return; + } + Assert(findResult == OFindPageResultSuccess); + + blkno = context.items[context.index].blkno; + p = O_GET_IN_MEMORY_PAGE(blkno); + loc = &context.items[context.index].locator; + + if (BTREE_PAGE_LOCATOR_IS_VALID(p, loc)) + { + OTuple leafTup; + + BTREE_PAGE_READ_LEAF_ITEM(tupHdr, leafTup, p, loc); + cmp = o_btree_cmp(desc, &tuple, keyType, &leafTup, BTreeKeyLeafTuple); + } + else + cmp = 1; + + if (cmp != 0) + { + /* + * We can't find the required key. This might happend if operation + * was already "undone" earlier. + */ + unlock_page(blkno); + return; + } + + nonLockTupHdr = *tupHdr; + nonLockUndoLocation = find_non_lock_only_undo_record(desc->undoType, + &nonLockTupHdr); + + if (!XACT_INFO_OXID_EQ(nonLockTupHdr.xactInfo, oxid)) + { + /* + * The key is found, but it doesn't belong to our transaction. Again, + * this might happend if operation was already "undone" earlier. + */ + unlock_page(blkno); + return; + } + + page_block_reads(blkno); + + /* + * Check that undo chain item matches to the tuple item. + */ + if (nonLockTupHdr.undoLocation == location + offsetof(BTreeModifyUndoStackItem, tuphdr) || + (!UndoLocationIsValid(nonLockTupHdr.undoLocation) && item->action == BTreeOperationInsert)) + { + (void) page_item_rollback(desc, p, loc, false, + &nonLockTupHdr, nonLockUndoLocation); + } + + MARK_DIRTY(desc, blkno); + if (blkno != desc->rootInfo.rootPageBlkno && is_page_too_sparse(desc, p)) + { + /* We can try to merge this page */ + btree_try_merge_and_unlock(context.desc, blkno, true, true); + } + else + unlock_page(blkno); +} + +/* + * Callback for aborting B-tree tuple lock. 
+ */ +void +lock_undo_callback(UndoLogType undoType, UndoLocation location, + UndoStackItem *baseItem, OXid oxid, + OUndoCallbackStage stage, bool changeCountsValid) +{ + BTreeModifyUndoStackItem *item = (BTreeModifyUndoStackItem *) baseItem; + BTreeDescr *desc = get_tree_descr(item->oids, item->header.indexType); + OTuple key; + Page p; + int cmp; + OInMemoryBlkno blkno; + BTreeLeafTuphdr *page_tuphdr, + tuphdr; + BTreePageItemLocator *locptr; + OBTreeFindPageContext context; + UndoLocation tuphdrUndoLocation, + lastLockOnlyUndoLocation = InvalidUndoLocation; + OFindPageResult findResult; + + Assert(stage == OUndoCallbackStageAbort); + + if (!desc) + return; + + key.formatFlags = item->tuphdr.formatFlags; + key.data = (Pointer) item + sizeof(BTreeModifyUndoStackItem); + + if (STOPEVENTS_ENABLED()) + { + Jsonb *params = undo_record_key_stopevent_params(BTreeOperationLock, + desc, key, oxid); + + STOPEVENT(STOPEVENT_APPLY_UNDO, params); + } + + init_page_find_context(&context, desc, COMMITSEQNO_INPROGRESS, BTREE_PAGE_FIND_MODIFY); + if (!changeCountsValid) + item->pageChangeCount = InvalidOPageChangeCount; + + findResult = refind_page(&context, (Pointer) &key, + BTreeKeyNonLeafKey, 0, item->blkno, + item->pageChangeCount); + + if (findResult == OFindPageResultFailure) + { + /* + * BTree can be already deleted and cleaned by + * btree_relnode_undo_callback(). + */ + return; + } + Assert(findResult == OFindPageResultSuccess); + + blkno = context.items[context.index].blkno; + p = O_GET_IN_MEMORY_PAGE(blkno); + locptr = &context.items[context.index].locator; + + if (BTREE_PAGE_LOCATOR_GET_OFFSET(p, locptr) < BTREE_PAGE_ITEMS_COUNT(p)) + { + OTuple leafTup; + + BTREE_PAGE_READ_TUPLE(leafTup, p, locptr); + cmp = o_btree_cmp(desc, &key, BTreeKeyNonLeafKey, &leafTup, BTreeKeyLeafTuple); + } + else + cmp = 1; + + if (cmp != 0) + { + /* Row already gone. Nothing to do. 
*/ + unlock_page(blkno); + return; + } + + page_tuphdr = (BTreeLeafTuphdr *) BTREE_PAGE_LOCATOR_GET_ITEM(p, locptr); + tuphdr = *page_tuphdr; + tuphdrUndoLocation = InvalidUndoLocation; + + while (!XACT_INFO_IS_FINISHED(tuphdr.xactInfo) || tuphdr.chainHasLocks) + { + bool delete_record = false; + UndoLocation undoLocation = tuphdr.undoLocation; + BTreeLeafTuphdr prev_tuphdr = tuphdr; + + /* + * A concurrent transaction may have committed and released its undo + * while we are walking the chain. Treat this the same as reaching a + * committed record — stop walking. + */ + if (!get_prev_leaf_header_from_undo_if_exists(desc->undoType, &prev_tuphdr)) + break; + + if (XACT_INFO_IS_LOCK_ONLY(tuphdr.xactInfo) && XACT_INFO_GET_OXID(tuphdr.xactInfo) == oxid) + { + if (tuphdr.undoLocation == location + offsetof(BTreeModifyUndoStackItem, tuphdr)) + delete_record = true; + } + + if (delete_record) + { + if (!tuphdr.chainHasLocks && + XACT_INFO_IS_LOCK_ONLY(tuphdr.xactInfo)) + clean_chain_has_locks_flag(desc->undoType, + lastLockOnlyUndoLocation, + page_tuphdr, blkno); + + if (!UndoLocationIsValid(tuphdrUndoLocation)) + { + page_block_reads(blkno); + page_tuphdr->xactInfo = prev_tuphdr.xactInfo; + page_tuphdr->undoLocation = prev_tuphdr.undoLocation; + page_tuphdr->chainHasLocks = prev_tuphdr.chainHasLocks; + tuphdr = *page_tuphdr; + MARK_DIRTY(desc, blkno); + } + else + { + tuphdr.xactInfo = prev_tuphdr.xactInfo; + tuphdr.undoLocation = prev_tuphdr.undoLocation; + tuphdr.chainHasLocks = prev_tuphdr.chainHasLocks; + update_leaf_header_in_undo_if_exists(desc->undoType, &tuphdr, + tuphdrUndoLocation); + } + } + + if (XACT_INFO_IS_LOCK_ONLY(tuphdr.xactInfo)) + lastLockOnlyUndoLocation = tuphdrUndoLocation; + + tuphdr = prev_tuphdr; + tuphdrUndoLocation = undoLocation; + undoLocation = tuphdr.undoLocation; + } + unlock_page(blkno); +} + +#define PENDING_TRUNCATES_FILENAME (ORIOLEDB_DATA_DIR "/pending_truncates") + +static void +add_pending_truncate(ORelOids relOids, int numTrees, 
OIndexKey *trees) +{ + File pendingTruncatesFile; + uint64 offset; + uint64 length; + + LWLockAcquire(&pending_truncates_meta->pendingTruncatesLock, LW_EXCLUSIVE); + + pendingTruncatesFile = PathNameOpenFile(PENDING_TRUNCATES_FILENAME, + O_RDWR | O_CREAT | PG_BINARY); + if (pendingTruncatesFile < 0) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not open pending truncates file %s: %m", + PENDING_TRUNCATES_FILENAME))); + + offset = pending_truncates_meta->pendingTruncatesLocation; + length = sizeof(relOids); + + if (FileWrite(pendingTruncatesFile, (Pointer) &relOids, length, offset, + WAIT_EVENT_BUFFILE_WRITE) != length) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not write pending truncates file %s: %m", + PENDING_TRUNCATES_FILENAME))); + + offset += length; + length = sizeof(numTrees); + + if (FileWrite(pendingTruncatesFile, (Pointer) &numTrees, length, 0, + WAIT_EVENT_BUFFILE_WRITE) != length) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not write pending truncates file %s: %m", + PENDING_TRUNCATES_FILENAME))); + + offset += length; + length = sizeof(*trees) * numTrees; + + if (FileWrite(pendingTruncatesFile, (Pointer) &trees, length, 0, + WAIT_EVENT_BUFFILE_WRITE) != length) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not write pending truncates file %s: %m", + PENDING_TRUNCATES_FILENAME))); + + offset += length; + pending_truncates_meta->pendingTruncatesLocation = offset; + + FileClose(pendingTruncatesFile); + + LWLockRelease(&pending_truncates_meta->pendingTruncatesLock); +} + +void +check_pending_truncates(void) +{ + uint64 offset; + uint64 maxOffset; + OIndexKey *trees = NULL; + int treesAllocated = 0; + File pendingTruncatesFile; + + if (have_backup_in_progress() || pending_truncates_meta->pendingTruncatesLocation == 0) + return; + + if (!LWLockConditionalAcquire(&pending_truncates_meta->pendingTruncatesLock, + LW_EXCLUSIVE)) + return; + + if (have_backup_in_progress() || 
pending_truncates_meta->pendingTruncatesLocation == 0) + { + LWLockRelease(&pending_truncates_meta->pendingTruncatesLock); + return; + } + + pendingTruncatesFile = PathNameOpenFile(PENDING_TRUNCATES_FILENAME, + O_RDONLY | PG_BINARY); + if (pendingTruncatesFile < 0) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not open pending truncates file %s: %m", + PENDING_TRUNCATES_FILENAME))); + + offset = 0; + maxOffset = pending_truncates_meta->pendingTruncatesLocation; + while (offset < maxOffset) + { + uint64 length; + int numTrees; + ORelOids relOids; + + length = sizeof(relOids); + if (FileRead(pendingTruncatesFile, (Pointer) &relOids, length, offset, + WAIT_EVENT_BUFFILE_READ) != length) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not read pending truncates file %s: %m", + PENDING_TRUNCATES_FILENAME))); + + offset += length; + length = sizeof(numTrees); + + if (FileRead(pendingTruncatesFile, (Pointer) &numTrees, length, offset, + WAIT_EVENT_BUFFILE_READ) != length) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not read pending truncates file %s: %m", + PENDING_TRUNCATES_FILENAME))); + + if (numTrees > treesAllocated) + { + if (!trees) + trees = palloc(sizeof(OIndexKey) * numTrees); + else + trees = repalloc(trees, sizeof(OIndexKey) * numTrees); + treesAllocated = numTrees; + } + + offset += length; + length = sizeof(OIndexKey) * numTrees; + + if (FileRead(pendingTruncatesFile, (Pointer) trees, length, offset, + WAIT_EVENT_BUFFILE_READ) != length) + ereport(FATAL, (errcode_for_file_access(), + errmsg("could not read pending truncates file %s: %m", + PENDING_TRUNCATES_FILENAME))); + + for (int i = 0; i < numTrees; i++) + cleanup_btree_files(trees[i], true); + } + + pending_truncates_meta->pendingTruncatesLocation = 0; + + LWLockRelease(&pending_truncates_meta->pendingTruncatesLock); + + if (trees) + pfree(trees); +} + +void +btree_relnode_undo_callback(UndoLogType undoType, UndoLocation location, + UndoStackItem *baseItem, + 
OXid oxid, OUndoCallbackStage stage, + bool changeCountsValid) +{ + RelnodeUndoStackItem *relnode_item = (RelnodeUndoStackItem *) baseItem; + Oid datoid, + reloid, + dropRelnode, + remainRelnode; + int dropNumTrees; + OIndexKey *dropTrees; + bool doCleanup; + bool cleanupFiles = true; + + /* + * Fsync new files on precommit, before the commit WAL record is written, + * to guarantee durability in case of a crash between WAL write and fsync. + */ + if (stage == OUndoCallbackStagePreCommit) + { + if (OidIsValid(relnode_item->newRelnode) && relnode_item->fsync) + { + int numTrees = relnode_item->newNumTrees; + OIndexKey *trees = &relnode_item->trees[relnode_item->oldNumTrees]; + int i; + + for (i = 0; i < numTrees; i++) + fsync_btree_files(trees[i]); + } + return; + } + + if (!enable_rewind || stage == OUndoCallbackStageAbort) + doCleanup = true; + else + doCleanup = is_rewind_worker(); + + datoid = relnode_item->datoid; + reloid = relnode_item->relid; + + if (stage == OUndoCallbackStageCommit) + { + remainRelnode = relnode_item->newRelnode; + dropRelnode = relnode_item->oldRelnode; + dropTrees = &relnode_item->trees[0]; + dropNumTrees = relnode_item->oldNumTrees; + + if (have_backup_in_progress() && doCleanup) + { + ORelOids oids = {datoid, reloid, relnode_item->oldRelnode}; + + dropRelnode = InvalidOid; + add_pending_truncate(oids, relnode_item->oldNumTrees, + &relnode_item->trees[0]); + cleanupFiles = false; + } + } + else + { + remainRelnode = relnode_item->oldRelnode; + dropRelnode = relnode_item->newRelnode; + dropTrees = &relnode_item->trees[relnode_item->oldNumTrees]; + dropNumTrees = relnode_item->newNumTrees; + } + + if (OidIsValid(dropRelnode)) + { + ORelOids oids = {datoid, reloid, dropRelnode}; + bool recovery = is_recovery_in_progress(); + int i; + + if (!recovery) + o_tables_rel_lock_exclusive_no_inval_no_log(&oids); + o_tables_rel_lock_extended_no_inval(&oids, AccessExclusiveLock, true); + CacheInvalidateRelcacheByDbidRelid(oids.datoid, oids.reloid); + 
o_invalidate_oids(oids); + if (!recovery) + o_tables_rel_unlock_extended(&oids, AccessExclusiveLock, false); + o_tables_rel_unlock_extended(&oids, AccessExclusiveLock, true); + + for (i = 0; i < dropNumTrees; i++) + { + if (!recovery) + o_tables_rel_lock_exclusive_no_inval_no_log(&dropTrees[i].oids); + o_tables_rel_lock_extended_no_inval(&dropTrees[i].oids, + AccessExclusiveLock, true); + if (doCleanup) + { + cleanup_btree(dropTrees[i], cleanupFiles, true); + o_delete_chkp_num(dropTrees[i].oids.datoid, + dropTrees[i].oids.relnode); + } + o_invalidate_oids(dropTrees[i].oids); + if (!recovery) + o_tables_rel_unlock_extended(&dropTrees[i].oids, AccessExclusiveLock, false); + o_tables_rel_unlock_extended(&dropTrees[i].oids, AccessExclusiveLock, true); + } + } + + if (OidIsValid(remainRelnode)) + { + ORelOids oids = {datoid, reloid, remainRelnode}; + + o_invalidate_oids(oids); + } +} + +/* + * oldTrees and newTrees should be allocated in CurTransactionContext. + */ +static inline void +add_undo_relnode(ORelOids oldOids, OIndexKey *oldTrees, int oldNumTrees, + ORelOids newOids, OIndexKey *newTrees, int newNumTrees, + bool fsync) +{ + LocationIndex size; + UndoLocation location; + RelnodeUndoStackItem *item; + int stepItemsCapacity = (O_MAX_UNDO_RECORD_SIZE - offsetof(RelnodeUndoStackItem, trees)) / sizeof(OIndexKey); + + /* + * This might happend before we accessed oxid. So, ensure we've assigned + * it. 
+ */ + (void) get_current_oxid(); + + oxid_needs_wal_flush = true; + + Assert(oldNumTrees >= 0 && newNumTrees >= 0); + + while (oldNumTrees + newNumTrees > 0) + { + int stepOldTrees; + int stepNewTrees; + + stepOldTrees = Min(oldNumTrees, stepItemsCapacity); + stepNewTrees = Min(newNumTrees, stepItemsCapacity - stepOldTrees); + + size = offsetof(RelnodeUndoStackItem, trees) + sizeof(OIndexKey) * (stepOldTrees + stepNewTrees); + item = (RelnodeUndoStackItem *) get_undo_record_unreserved(UndoLogSystem, &location, MAXALIGN(size)); + + item->header.base.type = RelnodeUndoItemType; + item->header.base.itemSize = size; + item->header.base.indexType = oIndexPrimary; + Assert(ORelOidsIsValid(oldOids) || ORelOidsIsValid(newOids)); + if (ORelOidsIsValid(oldOids)) + { + item->datoid = oldOids.datoid; + item->relid = oldOids.reloid; + } + else + { + item->datoid = newOids.datoid; + item->relid = newOids.reloid; + } + item->oldRelnode = oldOids.relnode; + item->oldNumTrees = stepOldTrees; + item->newRelnode = newOids.relnode; + item->newNumTrees = stepNewTrees; + item->fsync = fsync; + + if (oldNumTrees > 0) + { + Assert(oldTrees); + memcpy(item->trees, + oldTrees, + sizeof(OIndexKey) * stepOldTrees); + } + if (newNumTrees > 0) + { + Assert(newTrees); + memcpy(&item->trees[oldNumTrees], + newTrees, + sizeof(OIndexKey) * stepNewTrees); + } + + add_new_undo_stack_item(UndoLogSystem, location); + + release_undo_size(UndoLogSystem); + + oldTrees += stepOldTrees; + oldNumTrees -= stepOldTrees; + newTrees += stepNewTrees; + newNumTrees -= stepNewTrees; + } +} + +void +add_undo_truncate_relnode(ORelOids oldOids, OIndexKey *oldTrees, + int oldNumTrees, + ORelOids newOids, OIndexKey *newTrees, + int newNumTrees, bool fsync) +{ + Assert(ORelOidsIsValid(oldOids) && ORelOidsIsValid(newOids)); + Assert(oldOids.datoid == newOids.datoid); + Assert(oldOids.reloid == newOids.reloid); + + add_undo_relnode(oldOids, oldTrees, oldNumTrees, + newOids, newTrees, newNumTrees, fsync); +} + +void 
+add_undo_drop_relnode(ORelOids oids, OIndexKey *trees, int numTrees) +{ + ORelOids invalid = {InvalidOid, InvalidOid, InvalidOid}; + + Assert(ORelOidsIsValid(oids)); + add_undo_relnode(oids, trees, numTrees, invalid, NULL, 0, false); +} + +void +add_undo_create_relnode(ORelOids oids, OIndexKey *trees, int numTrees, bool fsync) +{ + ORelOids invalid = {InvalidOid, InvalidOid, InvalidOid}; + + Assert(ORelOidsIsValid(oids)); + add_undo_relnode(invalid, NULL, 0, oids, trees, numTrees, fsync); +} + +static void +read_hikey_from_undo(UndoLogType undoType, UndoLocation location, + Page dest, LocationIndex *loc) +{ + undo_read(undoType, location, sizeof(BTreePageHeader), dest); + *loc = sizeof(BTreePageHeader); + undo_read(undoType, + location + *loc, + ((BTreePageHeader *) dest)->hikeysEnd - *loc, + dest + *loc); + *loc = ((BTreePageHeader *) dest)->hikeysEnd; +} + +/* + * Finds page image in undoLocation. + */ +void +get_page_from_undo(BTreeDescr *desc, UndoLocation undoLocation, Pointer key, + BTreeKeyType kind, Pointer dest, + bool *is_left, bool *is_right, OFixedKey *lokey, + OFixedKey *page_lokey, OTuple *page_hikey) +{ + UndoPageImageHeader header = {UndoPageImageInvalid, 0, 0}; + int cmp, + cmp_expected; + OTuple hikey; + UndoLocation left_loc, + right_loc; + LocationIndex loc = 0; + UndoLogType undoType = GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType); + + undo_read(undoType, undoLocation, + sizeof(UndoPageImageHeader), (Pointer) &header); + left_loc = undoLocation + MAXALIGN(sizeof(UndoPageImageHeader)); + + if (is_left != NULL) + *is_left = false; + + if (is_right != NULL) + *is_right = false; + + /* there is only one page, no need to choose */ + if (header.type == UndoPageImageSplit || + header.type == UndoPageImageCompact) + { + if (is_left != NULL) + *is_left = true; + if (is_right != NULL) + *is_right = true; + undo_read(undoType, left_loc, ORIOLEDB_BLCKSZ, dest); + if (page_lokey && header.type == UndoPageImageSplit) + { + bool set_page_lokey = false; + + if 
(!page_hikey || O_TUPLE_IS_NULL(*page_hikey)) + { + set_page_lokey = true; + } + else if (!O_PAGE_IS(dest, RIGHTMOST)) + { + BTREE_PAGE_GET_HIKEY(hikey, dest); + cmp = o_btree_cmp(desc, page_hikey, BTreeKeyNonLeafKey, &hikey, BTreeKeyNonLeafKey); + Assert(cmp <= 0); + if (cmp == 0) + set_page_lokey = true; + } + + if (set_page_lokey) + { + undo_read(undoType, + left_loc + ORIOLEDB_BLCKSZ, + header.splitKeyLen, + page_lokey->fixedData); + page_lokey->tuple.formatFlags = header.splitKeyFlags; + page_lokey->tuple.data = (Pointer) &page_lokey->fixedData; + } + } + return; + } + + right_loc = left_loc + ORIOLEDB_BLCKSZ; + + /* + * It's dual undo log page image. We should make decision which page (left + * or right) should be returned. + */ + Assert(header.type == UndoPageImageMerge); + switch (kind) + { + case BTreeKeyNone: + if (is_left != NULL) + *is_left = true; + undo_read(undoType, left_loc, ORIOLEDB_BLCKSZ, dest); + break; + case BTreeKeyRightmost: + if (is_right != NULL) + *is_right = true; + if (lokey != NULL) + { + read_hikey_from_undo(undoType, left_loc, dest, &loc); + copy_fixed_hikey(desc, lokey, dest); + } + undo_read(undoType, right_loc, ORIOLEDB_BLCKSZ, dest); + break; + case BTreeKeyLeafTuple: + case BTreeKeyNonLeafKey: + case BTreeKeyBound: + case BTreeKeyPageHiKey: + Assert(key != NULL); + + read_hikey_from_undo(undoType, left_loc, dest, &loc); + + cmp_expected = kind == BTreeKeyPageHiKey ? 1 : 0; + kind = kind == BTreeKeyPageHiKey ? 
BTreeKeyNonLeafKey : kind; + BTREE_PAGE_GET_HIKEY(hikey, dest); + + cmp = o_btree_cmp(desc, key, kind, &hikey, BTreeKeyNonLeafKey); + + if (cmp >= cmp_expected) + { + if (is_right != NULL) + *is_right = true; + if (lokey != NULL) + copy_fixed_hikey(desc, lokey, dest); + undo_read(undoType, right_loc, ORIOLEDB_BLCKSZ, dest); + } + else + { + if (is_left != NULL) + *is_left = true; + undo_read(undoType, left_loc + loc, ORIOLEDB_BLCKSZ - loc, dest + loc); + } + break; + default: + Assert(false); + } +} + +/* + * Copy images of the left and the right pages into undo log. + */ +UndoLocation +make_merge_undo_image(BTreeDescr *desc, Pointer left, + Pointer right, CommitSeqNo imageCsn) +{ + UndoPageImageHeader *header; + UndoLocation undoLocation; + Pointer undo_rec; + UndoLogType undoType = GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType); + + Assert(O_PAGE_IS(left, LEAF) && O_PAGE_IS(right, LEAF)); + + Assert(undoType != UndoLogNone); + undo_rec = get_undo_record(GET_PAGE_LEVEL_UNDO_TYPE(undoType), + &undoLocation, O_MERGE_UNDO_IMAGE_SIZE); + + header = (UndoPageImageHeader *) undo_rec; + header->type = UndoPageImageMerge; + undo_rec = undo_rec + MAXALIGN(sizeof(UndoPageImageHeader)); + + memcpy(undo_rec, left, ORIOLEDB_BLCKSZ); + memcpy(undo_rec + ORIOLEDB_BLCKSZ, right, ORIOLEDB_BLCKSZ); + + release_reserved_undo_location(GET_PAGE_LEVEL_UNDO_TYPE(desc->undoType)); + + return undoLocation; +} + +/* + * Clean `chainHasLocks` flag on given and previous undo locations. 
+ */ +static void +clean_chain_has_locks_flag(UndoLogType undoType, UndoLocation location, + BTreeLeafTuphdr *pageTuphdr, OInMemoryBlkno blkno) +{ + BTreeLeafTuphdr tuphdr = {0, 0}; + UndoLocation retainedUndoLocation; + + if (!is_recovery_process()) + retainedUndoLocation = get_snapshot_retained_undo_location(undoType); + else + retainedUndoLocation = pg_atomic_read_u64(&get_undo_meta_by_type(undoType)->checkpointRetainStartLocation); + + /* + * Invalid location means that we should update starting from the + * pageTuphdr. Clean `chainHasLocks` flag there if needed. + */ + if (!UndoLocationIsValid(location) || location < retainedUndoLocation) + { + if (!pageTuphdr->chainHasLocks) + return; + + page_block_reads(blkno); + + pageTuphdr->chainHasLocks = false; + location = pageTuphdr->undoLocation; + } + + /* + * Iteratively clean `chainHasLocks` flag in the rest of chain. + */ + while (UndoLocationIsValid(location) && location >= retainedUndoLocation) + { + if (!undo_read_if_exists(undoType, location, sizeof(tuphdr), (Pointer) &tuphdr)) + break; + + if (!tuphdr.chainHasLocks) + break; + + tuphdr.chainHasLocks = false; + if (!undo_write_if_exists(undoType, location, sizeof(tuphdr), (Pointer) &tuphdr)) + break; + + location = tuphdr.undoLocation; + } +} + + +/* + * Check for row-level lock conflict + * + * Returns true if lock conflict. On lock conflict places the conflicting undo + * record info *conflictTuphdr. + * + * Otherwise, places the first csn undo record info *conflictTuphdr. + * If there is no such undo records, then *conflictTuphdr is set to + * *pageTuphdr. + * + * Lock-only undo records from committed and aborted transactions are removed. + * Own lock-only undo records of the same or weaker level are removed. 
+ */ +bool +row_lock_conflicts(BTreeLeafTuphdr *pageTuphdr, + BTreeLeafTuphdr *conflictTuphdr, + UndoLogType undoType, + UndoLocation *conflictUndoLocation, + RowLockMode mode, OXid my_oxid, CommitSeqNo my_csn, + OInMemoryBlkno blkno, UndoLocation savepointUndoLocation, + bool *redundant_row_locks, BTreeModifyLockStatus *lock_status) +{ + OTupleXactInfo xactInfo; + bool xactIsFinished; + bool xactIsFinal; + RowLockMode xactMode; + UndoLocation undoLocation; + UndoLocation lastLockOnlyUndoLocation; + BTreeLeafTuphdr curTuphdr, + finalTuphdr; + UndoLocation curUndoLocation, + finalUndoLocation; + UndoLocation retainedUndoLocation = get_snapshot_retained_undo_location(undoType); + bool foundFinal; + bool result = false; + + finalTuphdr = curTuphdr = *pageTuphdr; + finalUndoLocation = curUndoLocation = InvalidUndoLocation; + lastLockOnlyUndoLocation = InvalidUndoLocation; + xactInfo = curTuphdr.xactInfo; + xactMode = XACT_INFO_GET_LOCK_MODE(xactInfo); + if (ROW_LOCKS_CONFLICT(xactMode, mode)) + { + xactIsFinal = xactIsFinished = XACT_INFO_IS_FINISHED(xactInfo); + } + else + { + CommitSeqNo csn = XACT_INFO_MAP_CSN(xactInfo); + + xactIsFinished = !COMMITSEQNO_IS_INPROGRESS(csn); + xactIsFinal = (csn < my_csn); + } + foundFinal = xactIsFinal; + undoLocation = curTuphdr.undoLocation; + + while (curTuphdr.chainHasLocks || + XACT_INFO_IS_LOCK_ONLY(xactInfo) || + !xactIsFinal) + { + bool prevChainHasLocks = false; + bool delete_record = false; + + if (XACT_INFO_IS_LOCK_ONLY(xactInfo)) + { + OXid oxid = XACT_INFO_GET_OXID(xactInfo); + + if (oxid == my_oxid) + { + /* Check if there are redundant row-level locks */ + if (xactMode <= mode && + (!UndoLocationIsValid(savepointUndoLocation) || + (UndoLocationIsValid(undoLocation) && + undoLocation >= savepointUndoLocation))) + *redundant_row_locks = true; + if (xactMode >= mode) + *lock_status = Max(*lock_status, BTreeModifySameOrStrongerLock); + else + *lock_status = Max(*lock_status, BTreeModifyWeakerLock); + } + else + { + 
CommitSeqNo csn; + + /* + * Row-level locks make sense only for in-progress + * transactions. We delete RLL for both committed and aborted + * transactions. + */ + csn = oxid_get_csn(oxid, false); + if (COMMITSEQNO_IS_ABORTED(csn) || + COMMITSEQNO_IS_NORMAL(csn) || + COMMITSEQNO_IS_FROZEN(csn)) + { + delete_record = true; + } + else if (ROW_LOCKS_CONFLICT(xactMode, mode) && + (!result || XACT_INFO_GET_OXID(conflictTuphdr->xactInfo) == my_oxid)) + { + *conflictTuphdr = curTuphdr; + *conflictUndoLocation = curUndoLocation; + result = true; + } + } + } + else if (!xactIsFinished) + { + if (XACT_INFO_GET_OXID(xactInfo) == my_oxid) + { + if (xactMode >= mode) + *lock_status = Max(*lock_status, BTreeModifySameOrStrongerLock); + else + *lock_status = Max(*lock_status, BTreeModifyWeakerLock); + } + if (ROW_LOCKS_CONFLICT(xactMode, mode) && + (!result || (XACT_INFO_GET_OXID(conflictTuphdr->xactInfo) == my_oxid && + XACT_INFO_GET_OXID(xactInfo) != my_oxid))) + { + *conflictTuphdr = curTuphdr; + *conflictUndoLocation = curUndoLocation; + result = true; + } + } + + if (delete_record && undoLocation >= retainedUndoLocation) + { + BTreeLeafTuphdr prev_tuphdr; + + prev_tuphdr = curTuphdr; + if (!get_prev_leaf_header_from_undo_if_exists(undoType, &prev_tuphdr)) + { + /* Undo gone — skip deletion, treat as end of chain */ + goto next_record; + } + if (!UndoLocationIsValid(curUndoLocation)) + { + page_block_reads(blkno); + pageTuphdr->xactInfo = prev_tuphdr.xactInfo; + pageTuphdr->undoLocation = prev_tuphdr.undoLocation; + pageTuphdr->chainHasLocks = prev_tuphdr.chainHasLocks; + } + else + { + /* + * Update chainHasLocks flag of the next undo records if + * needed. 
+ */ + if (XACT_INFO_IS_LOCK_ONLY(curTuphdr.xactInfo) && + !curTuphdr.chainHasLocks) + { + clean_chain_has_locks_flag(undoType, + lastLockOnlyUndoLocation, + pageTuphdr, + blkno); + lastLockOnlyUndoLocation = InvalidUndoLocation; + } + + curTuphdr.xactInfo = prev_tuphdr.xactInfo; + curTuphdr.undoLocation = prev_tuphdr.undoLocation; + curTuphdr.chainHasLocks = prev_tuphdr.chainHasLocks; + update_leaf_header_in_undo_if_exists(undoType, + &curTuphdr, + curUndoLocation); + + } + } + +next_record: + if (!UndoLocationIsValid(undoLocation) || + undoLocation < retainedUndoLocation) + { + /* + * We have reached the end of "in-progress" undo chain. Fix tail + * "chainHasLocks" flag if needed. + */ + if (curTuphdr.chainHasLocks) + { + clean_chain_has_locks_flag(undoType, + lastLockOnlyUndoLocation, + pageTuphdr, + blkno); + lastLockOnlyUndoLocation = InvalidUndoLocation; + } + + if (!result) + { + *conflictTuphdr = finalTuphdr; + *conflictUndoLocation = finalUndoLocation; + } + return result; + } + + if (!delete_record) + { + /* + * Update previous location of lock-only record. + */ + if (XACT_INFO_IS_LOCK_ONLY(xactInfo)) + lastLockOnlyUndoLocation = undoLocation; + + prevChainHasLocks = curTuphdr.chainHasLocks; + + /* + * A concurrent commit may have released the undo. Treat as end + * of chain. 
+ */ + if (!get_prev_leaf_header_from_undo_if_exists(undoType, &curTuphdr)) + { + if (!result) + { + *conflictTuphdr = finalTuphdr; + *conflictUndoLocation = finalUndoLocation; + } + return result; + } + } + + curUndoLocation = undoLocation; + xactInfo = curTuphdr.xactInfo; + xactMode = XACT_INFO_GET_LOCK_MODE(xactInfo); + if (ROW_LOCKS_CONFLICT(xactMode, mode)) + { + xactIsFinal = xactIsFinished = XACT_INFO_IS_FINISHED(xactInfo); + } + else + { + CommitSeqNo csn = XACT_INFO_MAP_CSN(xactInfo); + + xactIsFinished = !COMMITSEQNO_IS_INPROGRESS(csn); + xactIsFinal = (csn < my_csn); + } + undoLocation = curTuphdr.undoLocation; + + if (prevChainHasLocks && + !curTuphdr.chainHasLocks && + !XACT_INFO_IS_LOCK_ONLY(xactInfo)) + { + /* + * We have reached the end of "in-progress" undo chain. Fix tail + * "chainHasLocks" flag if needed. + */ + clean_chain_has_locks_flag(undoType, + lastLockOnlyUndoLocation, + pageTuphdr, + blkno); + lastLockOnlyUndoLocation = InvalidUndoLocation; + } + + if (!foundFinal && xactIsFinal) + { + finalTuphdr = curTuphdr; + finalUndoLocation = curUndoLocation; + foundFinal = true; + } + } + + if (!result) + { + *conflictTuphdr = finalTuphdr; + *conflictUndoLocation = finalUndoLocation; + } + return result; +} + +/* + * Remove redudant row-level locks. 
+ */ +void +remove_redundant_row_locks(BTreeLeafTuphdr *pageTuphdr, + BTreeLeafTuphdr *conflictTuphdrPtr, + UndoLogType undoType, + UndoLocation *conflictTupHdrUndoLocation, + RowLockMode mode, + OXid my_oxid, OInMemoryBlkno blkno, + UndoLocation savepointUndoLocation) +{ + BTreeLeafTuphdr tuphdr = *pageTuphdr; + OTupleXactInfo xactInfo = tuphdr.xactInfo; + bool chainHasLocks = tuphdr.chainHasLocks, + xactIsFinished = XACT_INFO_IS_FINISHED(xactInfo); + UndoLocation undoLocation = tuphdr.undoLocation, + prevUndoLoc = InvalidUndoLocation, + lastLockOnlyUndoLocation = InvalidUndoLocation; + UndoLocation retainedUndoLocation = get_snapshot_retained_undo_location(undoType); + int prevFormatFlags = 0; + + while ((!xactIsFinished || chainHasLocks) && + undoLocation >= retainedUndoLocation && + UndoLocationIsValid(undoLocation)) + { + /* + * A concurrent commit may have released the undo. Treat as end of + * chain. + */ + if (!get_prev_leaf_header_from_undo_if_exists(undoType, &tuphdr)) + break; + + if (XACT_INFO_IS_LOCK_ONLY(xactInfo) && XACT_INFO_GET_OXID(xactInfo) == my_oxid) + { + bool delete_record = false; + + if (!UndoLocationIsValid(undoLocation) || !UNDO_REC_EXISTS(undoType, undoLocation)) + break; + + if (XACT_INFO_GET_LOCK_MODE(xactInfo) <= mode && + (!UndoLocationIsValid(savepointUndoLocation) || + (UndoLocationIsValid(undoLocation) && + undoLocation >= savepointUndoLocation))) + delete_record = true; + + if (delete_record) + { + if (*conflictTupHdrUndoLocation == undoLocation) + { + *conflictTuphdrPtr = tuphdr; + *conflictTupHdrUndoLocation = prevUndoLoc; + } + if (!UndoLocationIsValid(prevUndoLoc)) + { + page_block_reads(blkno); + pageTuphdr->xactInfo = tuphdr.xactInfo; + pageTuphdr->undoLocation = tuphdr.undoLocation; + } + else + { + /* + * Update chainHasLocks flag of the next undo records if + * needed. 
+ */ + if (XACT_INFO_IS_LOCK_ONLY(xactInfo) && !chainHasLocks) + { + clean_chain_has_locks_flag(undoType, + lastLockOnlyUndoLocation, + pageTuphdr, + blkno); + } + tuphdr.formatFlags = prevFormatFlags; + update_leaf_header_in_undo_if_exists(undoType, &tuphdr, prevUndoLoc); + } + } + } + + /* + * Update last location of lock-only record. + */ + if (XACT_INFO_IS_LOCK_ONLY(xactInfo)) + lastLockOnlyUndoLocation = prevUndoLoc; + + prevUndoLoc = undoLocation; + prevFormatFlags = tuphdr.formatFlags; + xactInfo = tuphdr.xactInfo; + xactIsFinished = XACT_INFO_IS_FINISHED(xactInfo); + undoLocation = tuphdr.undoLocation; + chainHasLocks = tuphdr.chainHasLocks; + } +} + +/* + * Finds first non-lock-only undo record and returns pointer to it. Returns + * NULL if such record is not found. + */ +UndoLocation +find_non_lock_only_undo_record(UndoLogType undoType, BTreeLeafTuphdr *tuphdr) +{ + OTupleXactInfo xactInfo = tuphdr->xactInfo; + UndoLocation undoLocation = InvalidUndoLocation; + + while (XACT_INFO_IS_LOCK_ONLY(xactInfo)) + { + undoLocation = tuphdr->undoLocation; + if (!UndoLocationIsValid(undoLocation)) + return InvalidUndoLocation; + if (!get_prev_leaf_header_from_undo_if_exists(undoType, tuphdr)) + return InvalidUndoLocation; + xactInfo = tuphdr->xactInfo; + } + + return undoLocation; +} + +void +get_prev_leaf_header_from_undo(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr, bool inPage) +{ + BTreeLeafTuphdr prevTuphdr = {0, 0}; + + Assert(UndoLocationIsValid(tuphdr->undoLocation)); + Assert(UNDO_REC_EXISTS(undoType, tuphdr->undoLocation)); + + undo_read(undoType, tuphdr->undoLocation, + sizeof(prevTuphdr), (Pointer) &prevTuphdr); + + if (!XACT_INFO_IS_LOCK_ONLY(tuphdr->xactInfo) || !inPage) + { + *tuphdr = prevTuphdr; + } + else + { + tuphdr->xactInfo = prevTuphdr.xactInfo; + tuphdr->undoLocation = prevTuphdr.undoLocation; + tuphdr->chainHasLocks = prevTuphdr.chainHasLocks; + } +} + +/* + * Like get_prev_leaf_header_from_undo(), but tolerates a concurrently + * cleaned 
undo record. Returns false if the undo record no longer exists + * (treat the chain end as if the transaction committed). + */ +bool +get_prev_leaf_header_from_undo_if_exists(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr) +{ + BTreeLeafTuphdr prevTuphdr = {0, 0}; + + if (!UndoLocationIsValid(tuphdr->undoLocation)) + return false; + + if (!undo_read_if_exists(undoType, tuphdr->undoLocation, + sizeof(prevTuphdr), (Pointer) &prevTuphdr)) + return false; + + *tuphdr = prevTuphdr; + return true; +} + +void +get_prev_leaf_header_and_tuple_from_undo(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr, + OTuple *tuple, + LocationIndex sizeAvailable) +{ + BTreeModifyUndoStackItem item = {0}; + LocationIndex tupleSize; + UndoLocation undoLocation = tuphdr->undoLocation; + + Assert(UndoLocationIsValid(undoLocation)); + if (!UNDO_REC_EXISTS(undoType, undoLocation)) + { + UndoLocation catalogRetainUndoLocation; + TransactionId xmin = InvalidTransactionId; + TransactionId catalog_xmin; + UndoLocation retainLocation; + + catalogRetainUndoLocation = get_current_replication_catalog_retain_undo_location(); + + ProcArrayGetReplicationSlotXmin(&xmin, &catalog_xmin); + + retainLocation = pg_atomic_read_u64(enable_rewind + ? 
&get_undo_meta_by_type(undoType)->minRewindRetainLocation + : &get_undo_meta_by_type(undoType)->minProcRetainLocation); + + elog(PANIC, "undo record does not exist: " + "undoLocation = %lu, retainLocation = %lu, " + "replicationRetainUndoLocation = %lu, replicationXmin = %u", + (unsigned long) undoLocation, + (unsigned long) retainLocation, + (unsigned long) catalogRetainUndoLocation, + xmin); + } + + undo_read(undoType, + tuphdr->undoLocation - offsetof(BTreeModifyUndoStackItem, tuphdr), + sizeof(BTreeModifyUndoStackItem), + (Pointer) &item); + Assert(item.header.type == ModifyUndoItemType); + Assert(item.action == BTreeOperationUpdate); + + *tuphdr = item.tuphdr; + tuple->formatFlags = tuphdr->formatFlags; + tupleSize = item.header.itemSize - sizeof(BTreeModifyUndoStackItem); + if (sizeAvailable == 0) + tuple->data = palloc(tupleSize); + Assert(sizeAvailable == 0 || sizeAvailable >= tupleSize); + undo_read(undoType, + undoLocation + BTreeLeafTuphdrSize, + tupleSize, + tuple->data); + tuphdr->formatFlags = 0; +} + +void +update_leaf_header_in_undo(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr, + UndoLocation location) +{ + Assert(UndoLocationIsValid(location) && UNDO_REC_EXISTS(undoType, location)); + + undo_write(undoType, + location, + sizeof(*tuphdr), + (Pointer) tuphdr); +} + +/* + * Like update_leaf_header_in_undo(), but tolerates a concurrently cleaned + * undo record. Returns false without writing if the undo is gone. 
+ */ +static bool +update_leaf_header_in_undo_if_exists(UndoLogType undoType, + BTreeLeafTuphdr *tuphdr, + UndoLocation location) +{ + if (!UndoLocationIsValid(location)) + return false; + + return undo_write_if_exists(undoType, + location, + sizeof(*tuphdr), + (Pointer) tuphdr); +} diff --git a/contrib/orioledb/src/catalog/ddl.c b/contrib/orioledb/src/catalog/ddl.c new file mode 100644 index 00000000000..8a34a431d70 --- /dev/null +++ b/contrib/orioledb/src/catalog/ddl.c @@ -0,0 +1,4693 @@ +/*------------------------------------------------------------------------- + * + * ddl.c + * Routines for DDL handling. + * + * Copyright (c) 2021-2026, Oriole DB Inc. + * Copyright (c) 2025-2026, Supabase Inc. + * + * IDENTIFICATION + * contrib/orioledb/src/catalog/ddl.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "orioledb.h" + +#include "btree/scan.h" +#include "btree/undo.h" +#include "catalog/indices.h" +#include "catalog/o_indices.h" +#include "catalog/o_tables.h" +#include "catalog/o_sys_cache.h" +#include "storage/lockdefs.h" +#include "tableam/descr.h" +#include "tableam/operations.h" +#include "catalog/pg_am.h" +#include "tableam/toast.h" +#include "transam/oxid.h" +#include "transam/undo.h" +#include "tuple/slot.h" +#include "utils/compress.h" +#include "recovery/wal.h" + +#include "access/heapam.h" +#include "access/reloptions.h" +#include "access/tableam.h" +#include "access/toast_compression.h" +#include "access/transam.h" +#include "catalog/catalog.h" +#include "catalog/heap.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_attrdef.h" +#include "catalog/pg_authid.h" +#include "catalog/pg_class.h" +#include "catalog/pg_constraint.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_database.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_enum.h" +#include "catalog/pg_extension.h" +#include 
"catalog/pg_inherits.h" +#include "catalog/pg_namespace.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_tablespace.h" +#include "catalog/pg_type.h" +#include "catalog/toasting.h" +#include "commands/createas.h" +#include "commands/dbcommands.h" +#include "commands/defrem.h" +#include "commands/event_trigger.h" +#include "commands/matview.h" +#include "commands/prepare.h" +#include "commands/tablespace.h" +#include "commands/vacuum.h" +#include "commands/view.h" +#include "commands/tablecmds.h" +#include "fmgr.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "nodes/makefuncs.h" +#include "nodes/pg_list.h" +#include "nodes/primnodes.h" +#include "optimizer/optimizer.h" +#include "optimizer/planner.h" +#include "parser/parse_coerce.h" +#include "parser/parse_collate.h" +#include "parser/parse_expr.h" +#include "parser/parse_relation.h" +#include "parser/parse_type.h" +#include "parser/parse_utilcmd.h" +#include "partitioning/partdesc.h" +#include "pgstat.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/lwlock.h" +#include "storage/smgr.h" +#include "tcop/dest.h" +#include "tcop/utility.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/fmgroids.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/resowner.h" +#include "utils/rls.h" +#include "utils/syscache.h" +#include "utils/snapmgr.h" + +#include +#include + +static ProcessUtility_hook_type next_ProcessUtility_hook = NULL; +static object_access_hook_type old_objectaccess_hook = NULL; + +List *drop_index_list = NIL; +List *partition_drop_index_list = NIL; +static List *alter_type_exprs = NIL; +static List *o_alter_generated_column_id = NIL; +static List *dropped_attrs = NIL; +Oid o_saved_relrewrite = InvalidOid; +Oid o_saved_reltablespace = InvalidOid; +List *o_reuse_indices = NIL; +static ORelOids saved_oids; +static bool in_rewrite = false; +List *reindex_list = 
NIL; +Query *savedDataQuery = NULL; +IndexBuildResult o_pkey_result = {0}; +bool o_in_add_column = false; +static CreateStmt *create_stmt = NULL; +static List *o_added_columns = NIL; + +static void orioledb_utility_command(PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *env, + DestReceiver *dest, + struct QueryCompletion *qc); +static void orioledb_object_access_hook(ObjectAccessType access, Oid classId, + Oid objectId, int subId, void *arg); + +static void o_alter_column_type(AlterTableCmd *cmd, const char *queryString, + Relation rel); +static Node *o_get_alter_type_expr(Relation rel, int attidx); +static void o_fill_new_slot(OTable *new_o_table, Relation rel, int attidx, + Node *expr, TupleTableSlot *old_slot, + TupleTableSlot *new_slot, TupleTableSlot *scan_slot); +static void o_find_collation_dependencies(Oid colloid); +static void redefine_indices(Relation rel, OTable *new_o_table, bool primary, + Oid oldRelnode); + +static bool get_db_info(const char *name, LOCKMODE lockmode, Oid *dbIdP); +static Oid o_createdb(ParseState *pstate, const CreatedbStmt *stmt); +static void o_validate_replica_identity(Relation rel, ReplicaIdentityStmt *stmt); +static void o_process_added_column(AlterTableCmd *cmd); + +void +orioledb_setup_ddl_hooks(void) +{ + next_ProcessUtility_hook = ProcessUtility_hook; + ProcessUtility_hook = orioledb_utility_command; + old_objectaccess_hook = object_access_hook; + object_access_hook = orioledb_object_access_hook; +} + + +static const char * +alter_table_type_to_string(AlterTableType cmdtype) +{ + switch (cmdtype) + { + case AT_AddColumn: + case AT_AddColumnToView: + return "ADD COLUMN"; + case AT_ColumnDefault: + case AT_CookedColumnDefault: + return "ALTER COLUMN ... SET DEFAULT"; + case AT_DropNotNull: + return "ALTER COLUMN ... DROP NOT NULL"; + case AT_SetNotNull: + return "ALTER COLUMN ... 
SET NOT NULL"; + case AT_DropExpression: + return "ALTER COLUMN ... DROP EXPRESSION"; + case AT_CheckNotNull: + return NULL; /* not real grammar */ + case AT_SetStatistics: + return "ALTER COLUMN ... SET STATISTICS"; + case AT_SetOptions: + return "ALTER COLUMN ... SET"; + case AT_ResetOptions: + return "ALTER COLUMN ... RESET"; + case AT_SetStorage: + return "ALTER COLUMN ... SET STORAGE"; + case AT_SetCompression: + return "ALTER COLUMN ... SET COMPRESSION"; + case AT_DropColumn: + case AT_AddIndex: + case AT_ReAddIndex: + return NULL; /* not real grammar */ + case AT_AddConstraint: + case AT_ReAddConstraint: + case AT_ReAddDomainConstraint: + case AT_AddIndexConstraint: + return "ADD CONSTRAINT"; + case AT_AlterConstraint: + return "ALTER CONSTRAINT"; + case AT_ValidateConstraint: + return "VALIDATE CONSTRAINT"; + case AT_DropConstraint: + case AT_ReAddComment: + return NULL; /* not real grammar */ + case AT_AlterColumnType: + return "ALTER COLUMN ... SET DATA TYPE"; + case AT_AlterColumnGenericOptions: + return "ALTER COLUMN ... 
OPTIONS"; + case AT_ChangeOwner: + return "OWNER TO"; + case AT_ClusterOn: + return "CLUSTER ON"; + case AT_DropCluster: + return "SET WITHOUT CLUSTER"; + case AT_SetAccessMethod: + return "SET ACCESS METHOD"; + case AT_SetLogged: + return "SET LOGGED"; + case AT_SetUnLogged: + return "SET UNLOGGED"; + case AT_DropOids: + return "SET WITHOUT OIDS"; + case AT_SetTableSpace: + return "SET TABLESPACE"; + case AT_SetRelOptions: + return "SET"; + case AT_ResetRelOptions: + return "RESET"; + case AT_ReplaceRelOptions: + return NULL; /* not real grammar */ + case AT_EnableTrig: + return "ENABLE TRIGGER"; + case AT_EnableAlwaysTrig: + return "ENABLE ALWAYS TRIGGER"; + case AT_EnableReplicaTrig: + return "ENABLE REPLICA TRIGGER"; + case AT_DisableTrig: + return "DISABLE TRIGGER"; + case AT_EnableTrigAll: + return "ENABLE TRIGGER ALL"; + case AT_DisableTrigAll: + return "DISABLE TRIGGER ALL"; + case AT_EnableTrigUser: + return "ENABLE TRIGGER USER"; + case AT_DisableTrigUser: + return "DISABLE TRIGGER USER"; + case AT_EnableRule: + return "ENABLE RULE"; + case AT_EnableAlwaysRule: + return "ENABLE ALWAYS RULE"; + case AT_EnableReplicaRule: + return "ENABLE REPLICA RULE"; + case AT_DisableRule: + return "DISABLE RULE"; + case AT_AddInherit: + return "INHERIT"; + case AT_DropInherit: + return "NO INHERIT"; + case AT_AddOf: + return "OF"; + case AT_DropOf: + return "NOT OF"; + case AT_ReplicaIdentity: + return "REPLICA IDENTITY"; + case AT_EnableRowSecurity: + return "ENABLE ROW SECURITY"; + case AT_DisableRowSecurity: + return "DISABLE ROW SECURITY"; + case AT_ForceRowSecurity: + return "FORCE ROW SECURITY"; + case AT_NoForceRowSecurity: + return "NO FORCE ROW SECURITY"; + case AT_GenericOptions: + return "OPTIONS"; + case AT_AttachPartition: + return "ATTACH PARTITION"; + case AT_DetachPartition: + return "DETACH PARTITION"; + case AT_DetachPartitionFinalize: + return "DETACH PARTITION ... FINALIZE"; + case AT_AddIdentity: + return "ALTER COLUMN ... 
ADD IDENTITY"; + case AT_SetIdentity: + return "ALTER COLUMN ... SET"; + case AT_DropIdentity: + return "ALTER COLUMN ... DROP IDENTITY"; +#if PG_VERSION_NUM >= 170000 + case AT_SetExpression: + return "ALTER COLUMN ... SET EXPRESSION"; +#endif + case AT_ReAddStatistics: + return NULL; /* not real grammar */ + } + + return NULL; +} + +static bool +is_alter_table_partition(PlannedStmt *pstmt) +{ + AlterTableStmt *top_atstmt = (AlterTableStmt *) pstmt->utilityStmt; + + if (list_length(top_atstmt->cmds) == 1) + { + AlterTableCmd *cmd = linitial(top_atstmt->cmds); + + if (cmd->subtype == AT_AttachPartition || + cmd->subtype == AT_DetachPartition || + cmd->subtype == AT_DetachPartitionFinalize) + return true; + } + return false; +} + + +/* + * Given a VacuumRelation, fill in the table OID if it wasn't specified, + * and optionally add VacuumRelations for partitions of the table. + * + * If a VacuumRelation does not have an OID supplied and is a partitioned + * table, an extra entry will be added to the output for each partition. + * Presently, only autovacuum supplies OIDs when calling vacuum(), and + * it does not want us to expand partitioned tables. + */ +static List * +expand_vacuum_rel(VacuumRelation *vrel, int options) +{ + List *vacrels = NIL; + + /* If caller supplied OID, there's nothing we need do here. */ + if (OidIsValid(vrel->oid)) + { + vacrels = lappend(vacrels, vrel); + } + else + { + /* Process a specific relation, and possibly partitions thereof */ + Oid relid; + HeapTuple tuple; + Form_pg_class classForm; + bool include_parts; + int rvr_opts; + + /* + * We transiently take AccessShareLock to protect the syscache lookup + * below, as well as find_all_inheritors's expectation that the caller + * holds some lock on the starting relation. + */ + rvr_opts = (options & VACOPT_SKIP_LOCKED) ? 
RVR_SKIP_LOCKED : 0; + relid = RangeVarGetRelidExtended(vrel->relation, + AccessShareLock, + rvr_opts, + NULL, NULL); + + /* + * If the lock is unavailable, emit the same log statement that + * vacuum_rel() and analyze_rel() would. + */ + if (!OidIsValid(relid)) + { + return vacrels; + } + + /* + * To check whether the relation is a partitioned table and its + * ownership, fetch its syscache entry. + */ + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relid); + classForm = (Form_pg_class) GETSTRUCT(tuple); + + /* + * Make a returnable VacuumRelation for this rel if user is a proper + * owner. + */ + if (vacuum_is_relation_owner(relid, classForm, options)) + { + vacrels = lappend(vacrels, makeVacuumRelation(vrel->relation, + relid, + vrel->va_cols)); + } + + include_parts = (classForm->relkind == RELKIND_PARTITIONED_TABLE); + ReleaseSysCache(tuple); + + /* + * If it is, make relation list entries for its partitions. Note that + * the list returned by find_all_inheritors() includes the passed-in + * OID, so we have to skip that. There's no point in taking locks on + * the individual partitions yet, and doing so would just add + * unnecessary deadlock risk. For this last reason we do not check + * yet the ownership of the partitions, which get added to the list to + * process. Ownership will be checked later on anyway. + */ + if (include_parts) + { + List *part_oids = find_all_inheritors(relid, NoLock, NULL); + ListCell *part_lc; + + foreach(part_lc, part_oids) + { + Oid part_oid = lfirst_oid(part_lc); + + if (part_oid == relid) + continue; /* ignore original table */ + + /* + * We omit a RangeVar since it wouldn't be appropriate to + * complain about failure to open one of these relations + * later. + */ + vacrels = lappend(vacrels, makeVacuumRelation(NULL, + part_oid, + vrel->va_cols)); + } + } + + /* + * Release lock again. 
This means that by the time we actually try to + * process the table, it might be gone or renamed. In the former case + * we'll silently ignore it; in the latter case we'll process it + * anyway, but we must beware that the RangeVar doesn't necessarily + * identify it anymore. This isn't ideal, perhaps, but there's little + * practical alternative, since we're typically going to commit this + * transaction and begin a new one between now and then. Moreover, + * holding locks on multiple relations would create significant risk + * of deadlock. + */ + UnlockRelationOid(relid, AccessShareLock); + } + + return vacrels; +} + +/* + * Construct a list of VacuumRelations for all vacuumable rels in + * the current database. + */ +static List * +get_all_vacuum_rels(int options) +{ + List *vacrels = NIL; + Relation pgclass; + TableScanDesc scan; + HeapTuple tuple; + + pgclass = table_open(RelationRelationId, AccessShareLock); + + scan = table_beginscan_catalog(pgclass, 0, NULL); + + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple); + Oid relid = classForm->oid; + + /* check permissions of relation */ + if (!vacuum_is_relation_owner(relid, classForm, options)) + continue; + + /* + * We include partitioned tables here; depending on which operation is + * to be performed, caller will decide whether to process or ignore + * them. + */ + if (classForm->relkind != RELKIND_RELATION && + classForm->relkind != RELKIND_MATVIEW && + classForm->relkind != RELKIND_PARTITIONED_TABLE) + continue; + + /* + * Build VacuumRelation(s) specifying the table OIDs to be processed. + * We omit a RangeVar since it wouldn't be appropriate to complain + * about failure to open one of these relations later. 
+ */ + vacrels = lappend(vacrels, makeVacuumRelation(NULL, + relid, + NIL)); + } + + table_endscan(scan); + table_close(pgclass, AccessShareLock); + return vacrels; +} + +/* Based on postgres function ReindexMultipleTables */ +static bool +check_multiple_tables(const char *objectName, ReindexObjectType objectKind, bool concurrently) +{ + Oid objectOid; + Relation relationRelation; + TableScanDesc scan; + ScanKeyData scan_keys[1]; + HeapTuple tuple; + MemoryContext private_context; + int num_keys; + bool has_orioledb = false; + + Assert(objectKind == REINDEX_OBJECT_SCHEMA || + objectKind == REINDEX_OBJECT_SYSTEM || + objectKind == REINDEX_OBJECT_DATABASE); + + /* + * This matches the options enforced by the grammar, where the object name + * is optional for DATABASE and SYSTEM. + */ + Assert(objectName || objectKind != REINDEX_OBJECT_SCHEMA); + + if (objectKind == REINDEX_OBJECT_SYSTEM && concurrently) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot reindex system catalogs concurrently"))); + + /* + * Get OID of object to reindex, being the database currently being used + * by session for a database or for system catalogs, or the schema defined + * by caller. At the same time do permission checks that need different + * processing depending on the object type. 
+ */ + if (objectKind == REINDEX_OBJECT_SCHEMA) + { + objectOid = get_namespace_oid(objectName, false); + + if (!object_ownercheck(NamespaceRelationId, objectOid, GetUserId()) +#if PG_VERSION_NUM >= 170000 + && !has_privs_of_role(GetUserId(), ROLE_PG_MAINTAIN) +#endif + ) + aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_SCHEMA, + objectName); + } + else + { + objectOid = MyDatabaseId; + + if (objectName && strcmp(objectName, get_database_name(objectOid)) != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can only reindex the currently open database"))); + if (!object_ownercheck(DatabaseRelationId, objectOid, GetUserId()) +#if PG_VERSION_NUM >= 170000 + && !has_privs_of_role(GetUserId(), ROLE_PG_MAINTAIN) +#endif + ) + aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_DATABASE, + get_database_name(objectOid)); + } + + /* + * Create a memory context that will survive forced transaction commits we + * do below. Since it is a child of PortalContext, it will go away + * eventually even if we suffer an error; there's no need for special + * abort cleanup logic. + */ + private_context = AllocSetContextCreate(PortalContext, + "check_multiple_tables", + ALLOCSET_SMALL_SIZES); + + /* + * Define the search keys to find the objects to reindex. For a schema, we + * select target relations using relnamespace, something not necessary for + * a database-wide operation. + */ + if (objectKind == REINDEX_OBJECT_SCHEMA) + { + num_keys = 1; + ScanKeyInit(&scan_keys[0], + Anum_pg_class_relnamespace, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(objectOid)); + } + else + num_keys = 0; + + /* + * Scan pg_class to build a list of the relations we need to reindex. + * + * We only consider plain relations and materialized views here (toast + * rels will be processed indirectly by reindex_relation). 
+ */ + relationRelation = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(relationRelation, num_keys, scan_keys); + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + Form_pg_class classtuple = (Form_pg_class) GETSTRUCT(tuple); + Oid relid = classtuple->oid; + Relation tbl; + + /* + * Only regular tables and matviews can have indexes, so ignore any + * other kind of relation. + * + * Partitioned tables/indexes are skipped but matching leaf partitions + * are processed. + */ + if (classtuple->relkind != RELKIND_RELATION && + classtuple->relkind != RELKIND_MATVIEW) + continue; + + /* Skip temp tables of other backends; we can't reindex them at all */ + if (classtuple->relpersistence == RELPERSISTENCE_TEMP && + !isTempNamespace(classtuple->relnamespace)) + continue; + + /* + * Check user/system classification. SYSTEM processes all the + * catalogs, and DATABASE processes everything that's not a catalog. + */ + if (objectKind == REINDEX_OBJECT_SYSTEM && + !IsCatalogRelationOid(relid)) + continue; + else if (objectKind == REINDEX_OBJECT_DATABASE && + IsCatalogRelationOid(relid)) + continue; + + /* + * The table can be reindexed if the user is superuser, the table + * owner, or the database/schema owner (but in the latter case, only + * if it's not a shared relation). object_ownercheck includes the + * superuser case, and depending on objectKind we already know that + * the user has permission to run REINDEX on this database or schema + * per the permission checks at the beginning of this routine. + */ + if (classtuple->relisshared && + object_ownercheck(RelationRelationId, relid, GetUserId())) + continue; + + /* + * Skip system tables, since index_create() would reject indexing them + * concurrently (and it would likely fail if we tried). 
+ */ + if (concurrently && IsCatalogRelationOid(relid)) + { + continue; + } + + tbl = relation_open(relid, AccessShareLock); + if (is_orioledb_rel(tbl)) + { + ListCell *index; + + foreach(index, RelationGetIndexList(tbl)) + { + Oid indexOid = lfirst_oid(index); + Relation ind = relation_open(indexOid, AccessShareLock); + OBTOptions *options = (OBTOptions *) ind->rd_options; + + if (ind->rd_rel->relam == BTREE_AM_OID && !(options && !options->orioledb_index)) + { + String *ix_name = makeString(pstrdup(ind->rd_rel->relname.data)); + + reindex_list = list_append_unique(reindex_list, ix_name); + } + relation_close(ind, AccessShareLock); + } + + if (concurrently) + has_orioledb = true; + } + relation_close(tbl, AccessShareLock); + } + table_endscan(scan); + table_close(relationRelation, AccessShareLock); + + MemoryContextDelete(private_context); + return has_orioledb; +} + +#if PG_VERSION_NUM >= 170000 +/* + * create_ctas_internal + * + * Internal utility used for the creation of the definition of a relation + * created via CREATE TABLE AS or a materialized view. Caller needs to + * provide a list of attributes (ColumnDef nodes). + */ +static ObjectAddress +create_ctas_internal(List *attrList, IntoClause *into) +{ + CreateStmt *create = makeNode(CreateStmt); + bool is_matview; + char relkind; + Datum toast_options; + static char *validnsps[] = HEAP_RELOPT_NAMESPACES; + ObjectAddress intoRelationAddr; + + /* This code supports both CREATE TABLE AS and CREATE MATERIALIZED VIEW */ + is_matview = (into->viewQuery != NULL); + relkind = is_matview ? RELKIND_MATVIEW : RELKIND_RELATION; + + /* + * Create the target relation by faking up a CREATE TABLE parsetree and + * passing it to DefineRelation. 
+ */ + create->relation = into->rel; + create->tableElts = attrList; + create->inhRelations = NIL; + create->ofTypename = NULL; + create->constraints = NIL; + create->options = into->options; + create->oncommit = into->onCommit; + create->tablespacename = into->tableSpaceName; + create->if_not_exists = false; + create->accessMethod = into->accessMethod; + + /* + * Create the relation. (This will error out if there's an existing view, + * so we don't need more code to complain if "replace" is false.) + */ + intoRelationAddr = DefineRelation(create, relkind, InvalidOid, NULL, NULL); + + /* + * If necessary, create a TOAST table for the target table. Note that + * NewRelationCreateToastTable ends with CommandCounterIncrement(), so + * that the TOAST table will be visible for insertion. + */ + CommandCounterIncrement(); + + /* parse and validate reloptions for the toast table */ + toast_options = transformRelOptions((Datum) 0, + create->options, + "toast", + validnsps, + true, false); + + (void) heap_reloptions(RELKIND_TOASTVALUE, toast_options, true); + + NewRelationCreateToastTable(intoRelationAddr.objectId, toast_options); + + /* Create the "view" part of a materialized view. */ + if (is_matview) + { + /* StoreViewQuery scribbles on tree, so make a copy */ + Query *query = (Query *) copyObject(into->viewQuery); + + StoreViewQuery(intoRelationAddr.objectId, query, false); + CommandCounterIncrement(); + } + + return intoRelationAddr; +} + +/* + * create_ctas_nodata + * + * Create CTAS or materialized view when WITH NO DATA is used, starting from + * the targetlist of the SELECT or view definition. + */ +static ObjectAddress +create_ctas_nodata(List *tlist, IntoClause *into) +{ + List *attrList; + ListCell *t, + *lc; + + /* + * Build list of ColumnDefs from non-junk elements of the tlist. If a + * column name list was specified in CREATE TABLE AS, override the column + * names in the query. (Too few column names are OK, too many are not.) 
+ */ + attrList = NIL; + lc = list_head(into->colNames); + foreach(t, tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(t); + + if (!tle->resjunk) + { + ColumnDef *col; + char *colname; + + if (lc) + { + colname = strVal(lfirst(lc)); + lc = lnext(into->colNames, lc); + } + else + colname = tle->resname; + + col = makeColumnDef(colname, + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), + exprCollation((Node *) tle->expr)); + + /* + * It's possible that the column is of a collatable type but the + * collation could not be resolved, so double-check. (We must + * check this here because DefineRelation would adopt the type's + * default collation rather than complaining.) + */ + if (!OidIsValid(col->collOid) && + type_is_collatable(col->typeName->typeOid)) + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("no collation was derived for column \"%s\" with collatable type %s", + col->colname, + format_type_be(col->typeName->typeOid)), + errhint("Use the COLLATE clause to set the collation explicitly."))); + + attrList = lappend(attrList, col); + } + } + + if (lc != NULL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("too many column names were specified"))); + + /* Create the relation definition using the ColumnDef list */ + return create_ctas_internal(attrList, into); +} +#endif + +static bool +ReindexPartitions(Oid relid, bool concurrently) +{ + List *inhoids; + ListCell *lc; + bool has_orioledb = false; + + inhoids = find_all_inheritors(relid, ShareLock, NULL); + + foreach(lc, inhoids) + { + Oid partoid = lfirst_oid(lc); + Relation part_rel = relation_open(partoid, AccessShareLock); + + /* + * This discards partitioned tables, partitioned indexes and foreign + * tables. 
+ */ + if (!RELKIND_HAS_STORAGE(part_rel->rd_rel->relkind)) + { + relation_close(part_rel, AccessShareLock); + continue; + } + + Assert(part_rel->rd_rel->relkind == RELKIND_INDEX || + part_rel->rd_rel->relkind == RELKIND_RELATION); + + if (concurrently) + { + if ((part_rel->rd_rel->relkind == RELKIND_RELATION || + part_rel->rd_rel->relkind == RELKIND_MATVIEW) && + is_orioledb_rel(part_rel)) + { + has_orioledb = true; + } + else if (part_rel->rd_rel->relkind == RELKIND_INDEX) + { + Relation tbl; + + tbl = relation_open(part_rel->rd_index->indrelid, AccessShareLock); + + if ((tbl->rd_rel->relkind == RELKIND_RELATION) && + is_orioledb_rel(tbl)) + { + has_orioledb = true; + } + relation_close(tbl, AccessShareLock); + } + } + relation_close(part_rel, AccessShareLock); + } + return has_orioledb; +} + +static void +orioledb_utility_command(PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *env, + DestReceiver *dest, + struct QueryCompletion *qc) +{ + bool isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL); + ParseState *pstate; + bool call_next = true; + + /* copied from standard_ProcessUtility */ + if (readOnlyTree) + pstmt = copyObject(pstmt); + + /* Is this enough? */ + if (isTopLevel) + { + in_rewrite = false; + o_saved_relrewrite = InvalidOid; + o_saved_reltablespace = InvalidOid; + ORelOidsSetInvalid(saved_oids); + savedDataQuery = NULL; + in_nontransactional_truncate = false; + } + + pstate = make_parsestate(NULL); + pstate->p_sourcetext = queryString; + pstate->p_queryEnv = env; + + /* + * DDL WAL ordering barrier. + * + * OrioleDB does not emit its WAL records immediately and directly into + * PostgreSQL WAL. We first accumulate Oriole records in a per-backend + * local WAL buffer and flush that buffer either: - when it overflows, or + * - at transaction finalization. 
+ * + * This deferred flushing can reorder OrioleDB WAL records relative to + * PostgreSQL-native WAL records generated by utility commands (DDL). In + * particular, a utility command may generate PG WAL immediately, while + * Oriole's related records stay buffered until later. On crash/recovery + * or logical decoding this may surface as observing PG DDL changes before + * the corresponding Oriole metadata/state changes, which breaks + * assumptions about atomicity and visibility of DDL boundaries. + * + * To enforce a stable ordering across the two WAL streams, we treat entry + * into ProcessUtility as a barrier: before executing any utility command + * we flush any pending Oriole local WAL so that all Oriole records + * produced by prior statements become durable/visible in WAL *before* + * this DDL starts producing PostgreSQL WAL. + * + * Note: recovery workers do not produce local WAL in the same way and + * must not perform this flush here. + */ + if (!is_recovery_process() && !local_wal_is_empty()) + flush_local_wal(false, false); + + if (IsA(pstmt->utilityStmt, AlterTableStmt) && + !is_alter_table_partition(pstmt)) + { + AlterTableStmt *atstmt = (AlterTableStmt *) pstmt->utilityStmt; + Oid relid; + LOCKMODE lockmode; + ObjectType objtype; + + objtype = atstmt->objtype; + + /* + * alter_type_exprs is expected to be allocated in PortalContext so it + * isn't freed by us and pointer may be invalid there + */ + alter_type_exprs = NIL; + dropped_attrs = NIL; + + /* + * Figure out lock mode, and acquire lock. This also does basic + * permissions checks, so that we won't wait for a lock on (for + * example) a relation on which we have no permissions. 
+ */ + lockmode = AlterTableGetLockLevel(atstmt->cmds); + relid = AlterTableLookupRelation(atstmt, lockmode); + + if (OidIsValid(relid) && objtype == OBJECT_TABLE && + (lockmode == AccessExclusiveLock || lockmode == ShareUpdateExclusiveLock)) + { + Relation rel = relation_open(relid, lockmode); + + if (is_orioledb_rel(rel)) + { + ListCell *lc; + + foreach(lc, atstmt->cmds) + { + AlterTableCmd *cmd = (AlterTableCmd *) lfirst(lc); + + /* make checks */ + switch (cmd->subtype) + { + case AT_AddColumn: + case AT_AddConstraint: + case AT_AddIdentity: + case AT_AddIndex: + case AT_AddInherit: + case AT_AlterColumnType: + case AT_ChangeOwner: + case AT_ColumnDefault: + case AT_CookedColumnDefault: + case AT_DisableRowSecurity: + case AT_DropColumn: + case AT_DropConstraint: + case AT_DropExpression: + case AT_DropIdentity: + case AT_DropInherit: + case AT_DropNotNull: + case AT_EnableRowSecurity: + case AT_GenericOptions: + case AT_ResetRelOptions: + case AT_SetIdentity: + case AT_SetNotNull: + case AT_SetRelOptions: + case AT_EnableRule: + case AT_EnableAlwaysRule: + case AT_EnableReplicaRule: + case AT_DisableRule: + case AT_SetTableSpace: + case AT_SetStorage: + case AT_ReplicaIdentity: + case AT_AddIndexConstraint: + case AT_AddOf: + case AT_AlterColumnGenericOptions: + case AT_AlterConstraint: + case AT_DisableTrig: + case AT_DisableTrigAll: + case AT_DisableTrigUser: + case AT_DropOf: + case AT_EnableAlwaysTrig: + case AT_EnableReplicaTrig: + case AT_EnableTrig: + case AT_EnableTrigAll: + case AT_EnableTrigUser: + case AT_ForceRowSecurity: + case AT_NoForceRowSecurity: + case AT_ReplaceRelOptions: + case AT_ResetOptions: + case AT_SetLogged: + case AT_SetOptions: + case AT_SetStatistics: + case AT_SetUnLogged: + case AT_ValidateConstraint: +#if PG_VERSION_NUM >= 170000 + case AT_SetExpression: +#endif + break; + case AT_DropOids: + ereport(WARNING, + (errmsg("alter table subcommand \"%s\" has no effect on OrioleDB tables since they do not use OIDs", + 
alter_table_type_to_string(cmd->subtype)))); + break; + case AT_ClusterOn: + case AT_DropCluster: + ereport(WARNING, + (errmsg("alter table subcommand \"%s\" has no performance effect on OrioleDB tables with primary key", + alter_table_type_to_string(cmd->subtype)))); + break; + case AT_SetAccessMethod: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("changing access method is not supported for OrioleDB tables"))); + break; + case AT_SetCompression: + default: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unsupported alter table subcommand")), + errdetail("Subcommand \"%s\" is not supported on OrioleDB tables yet. This will be implemented in future.", + alter_table_type_to_string(cmd->subtype))); + break; + } + + switch (cmd->subtype) + { + case AT_AlterColumnType: + o_alter_column_type(cmd, queryString, rel); + break; + case AT_ReplicaIdentity: + o_validate_replica_identity(rel, (ReplicaIdentityStmt *) cmd->def); + break; + case AT_AddColumn: + o_process_added_column(cmd); + break; + default: + break; + } + } + } + else if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + /* + * Parent partition table is always heap-based, however child + * partitions can use orioledb, so we need to process + * non-oriole relations as well + */ + ListCell *lc; + + foreach(lc, atstmt->cmds) + { + AlterTableCmd *cmd = (AlterTableCmd *) lfirst(lc); + + if (cmd->subtype == AT_AddColumn) + { + o_process_added_column(cmd); + } + } + } + table_close(rel, lockmode); + } + } + else if (IsA(pstmt->utilityStmt, ClusterStmt)) + { + ClusterStmt *stmt = (ClusterStmt *) pstmt->utilityStmt; + + if (stmt->relation != NULL) + { + /* This is the single-relation case. 
*/ + Oid tableOid; + Relation rel = NULL; + bool orioledb; + + tableOid = RangeVarGetRelid(stmt->relation, AccessShareLock, + false); + rel = table_open(tableOid, AccessShareLock); + orioledb = is_orioledb_rel(rel); + table_close(rel, AccessShareLock); + if (orioledb) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("orioledb tables does not support CLUSTER")), + errdetail("CLUSTER makes no much sense for index-organized tables.")); + } + } + else if (IsA(pstmt->utilityStmt, VacuumStmt)) + { + VacuumStmt *vacstmt = (VacuumStmt *) pstmt->utilityStmt; + ListCell *lc; + bool full = false, + skip_locked = false, + analyze = false; + int options; + + foreach(lc, vacstmt->options) + { + DefElem *opt = (DefElem *) lfirst(lc); + + if (strcmp(opt->defname, "full") == 0) + full = defGetBoolean(opt); + else if (strcmp(opt->defname, "skip_locked") == 0) + skip_locked = defGetBoolean(opt); + else if (strcmp(opt->defname, "analyze") == 0) + analyze = defGetBoolean(opt); + } + options = + (vacstmt->is_vacuumcmd ? VACOPT_VACUUM : VACOPT_ANALYZE) | + (skip_locked ? VACOPT_SKIP_LOCKED : 0) | + (analyze ? VACOPT_ANALYZE : 0) | + (full ? 
VACOPT_FULL : 0); + if (full) + { + List *relations = vacstmt->rels; + + if (relations != NIL) + { + List *newrels = NIL; + + foreach(lc, relations) + { + VacuumRelation *vrel = lfirst_node(VacuumRelation, lc); + List *sublist; + + sublist = expand_vacuum_rel(vrel, options); + newrels = list_concat(newrels, sublist); + } + relations = newrels; + } + else + relations = get_all_vacuum_rels(options); + foreach(lc, relations) + { + VacuumRelation *vrel = lfirst_node(VacuumRelation, lc); + Relation rel; + bool orioledb; + + if (options & VACOPT_SKIP_LOCKED) + { + if (ConditionalLockRelationOid(vrel->oid, AccessShareLock)) + rel = relation_open(vrel->oid, NoLock); + else + continue; + } + else + { + rel = relation_open(vrel->oid, AccessShareLock); + } + + orioledb = is_orioledb_rel(rel); + if (orioledb) + { + if (orioledb_strict_mode) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("orioledb table \"%s\" does not support VACUUM FULL", + RelationGetRelationName(rel))), + errdetail("VACUUM FULL is not supported for OrioleDB tables yet.")); + } + else + { + ListCell *lc2; + + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("orioledb table \"%s\" does not support VACUUM FULL, using a plain VACUUM instead", + RelationGetRelationName(rel)))); + + foreach(lc2, vacstmt->options) + { + DefElem *opt = (DefElem *) lfirst(lc2); + + if (strcmp(opt->defname, "full") == 0) + opt->arg = (Node *) makeInteger(0); + } + } + } + relation_close(rel, AccessShareLock); + } + } + } + else if (IsA(pstmt->utilityStmt, ReindexStmt)) + { + ReindexStmt *stmt = (ReindexStmt *) pstmt->utilityStmt; + char *tablespacename = NULL; + bool concurrently = false; + bool has_orioledb = false; + ListCell *lc; + + foreach(lc, stmt->params) + { + DefElem *opt = (DefElem *) lfirst(lc); + + if (strcmp(opt->defname, "concurrently") == 0) + concurrently = defGetBoolean(opt); + else if (strcmp(opt->defname, "tablespace") == 0) + tablespacename = defGetString(opt); + else 
if (strcmp(opt->defname, "verbose") != 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized REINDEX option \"%s\"", + opt->defname), + parser_errposition(pstate, opt->location))); + } + + /* Show same error as in ExecReindex */ + if (concurrently) + PreventInTransactionBlock(isTopLevel, + "REINDEX CONCURRENTLY"); + + if (tablespacename != NULL) + { + Oid tablespaceOid = get_tablespace_oid(tablespacename, false); + + /* Check permissions except when moving to database's default */ + if (OidIsValid(tablespaceOid) && + tablespaceOid != MyDatabaseTableSpace) + { + AclResult aclresult; + + aclresult = object_aclcheck(TableSpaceRelationId, tablespaceOid, + GetUserId(), ACL_CREATE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_TABLESPACE, + get_tablespace_name(tablespaceOid)); + } + } + + switch (stmt->kind) + { + case REINDEX_OBJECT_INDEX: + { + Oid indOid = RangeVarGetRelid(stmt->relation, + AccessShareLock, + false); + Relation iRel, + tbl; + OBTOptions *options; + + if (get_rel_relkind(indOid) == RELKIND_PARTITIONED_INDEX) + { + has_orioledb = ReindexPartitions(indOid, concurrently); + break; + } + + iRel = index_open(indOid, AccessShareLock); + tbl = relation_open(iRel->rd_index->indrelid, + AccessShareLock); + options = (OBTOptions *) iRel->rd_options; + if (is_orioledb_rel(tbl) && + iRel->rd_rel->relam == BTREE_AM_OID && + !(options && !options->orioledb_index)) + { + String *ix_name; + + ix_name = makeString(pstrdup(iRel->rd_rel->relname.data)); + reindex_list = list_append_unique(reindex_list, ix_name); + if (concurrently) + has_orioledb = true; + } + relation_close(tbl, AccessShareLock); + relation_close(iRel, AccessShareLock); + } + break; + case REINDEX_OBJECT_TABLE: + { + Oid tblOid = RangeVarGetRelid(stmt->relation, + AccessShareLock, + false); + Relation tbl; + + if (get_rel_relkind(tblOid) == RELKIND_PARTITIONED_TABLE) + { + has_orioledb = ReindexPartitions(tblOid, concurrently); + break; + } + tbl = 
relation_open(tblOid, AccessShareLock); + if (is_orioledb_rel(tbl)) + { + ListCell *index; + + foreach(index, RelationGetIndexList(tbl)) + { + Oid indexOid = lfirst_oid(index); + Relation ind = relation_open(indexOid, AccessShareLock); + OBTOptions *options = (OBTOptions *) ind->rd_options; + + if (ind->rd_rel->relam == BTREE_AM_OID && !(options && !options->orioledb_index)) + { + String *ix_name = makeString(pstrdup(ind->rd_rel->relname.data)); + + reindex_list = list_append_unique(reindex_list, ix_name); + } + relation_close(ind, AccessShareLock); + if (concurrently) + has_orioledb = true; + } + } + relation_close(tbl, AccessShareLock); + } + break; + case REINDEX_OBJECT_SCHEMA: + case REINDEX_OBJECT_SYSTEM: + case REINDEX_OBJECT_DATABASE: + has_orioledb = check_multiple_tables(stmt->name, stmt->kind, concurrently); + break; + default: + elog(ERROR, "unrecognized object type: %d", + (int) stmt->kind); + break; + } + + if (has_orioledb && concurrently) + { + if (tablespacename != NULL) + { + Oid tablespaceOid = get_tablespace_oid(tablespacename, false); + + if (tablespaceOid == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot move non-shared relation to tablespace \"%s\"", + get_tablespace_name(tablespaceOid)))); + } + + if (orioledb_strict_mode) + elog(ERROR, "REINDEX CONCURRENTLY is not supported for orioledb tables yet"); + else + elog(WARNING, "REINDEX CONCURRENTLY is not supported for orioledb tables yet, using a plain REINDEX instead"); + + foreach(lc, stmt->params) + { + DefElem *opt = (DefElem *) lfirst(lc); + + if (strcmp(opt->defname, "concurrently") == 0) + stmt->params = foreach_delete_current(stmt->params, lc); + } + } + } + else if (IsA(pstmt->utilityStmt, TransactionStmt)) + { + TransactionStmt *tstmt = (TransactionStmt *) pstmt->utilityStmt; + + if (tstmt->kind == TRANS_STMT_PREPARE && have_retained_undo_location()) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use 
PREPARE TRANSACTION in transaction that uses orioledb table")), + errdetail("OrioleDB does not support prepared transactions yet.")); + } + } + else if (IsA(pstmt->utilityStmt, AlterCollationStmt)) + { + AlterCollationStmt *astmt = (AlterCollationStmt *) pstmt->utilityStmt; + Oid collOid = get_collation_oid(astmt->collname, false); + + o_find_collation_dependencies(collOid); + } + else if (IsA(pstmt->utilityStmt, CreateStmt)) + { + create_stmt = (CreateStmt *) pstmt->utilityStmt; + } +#if PG_VERSION_NUM >= 170000 + else if (IsA(pstmt->utilityStmt, CreateTableAsStmt)) + { + CreateTableAsStmt *stmt = (CreateTableAsStmt *) pstmt->utilityStmt; + IntoClause *into = stmt->into; + + if (!into->skipData) + { + bool is_matview = (into->viewQuery != NULL); + + if (is_matview && + ((into->accessMethod && strcmp(into->accessMethod, "orioledb") == 0) || + (!into->accessMethod && strcmp(default_table_access_method, "orioledb") == 0))) + { + Query *query = castNode(Query, stmt->query); + ObjectAddress address; + + Assert(query->commandType == CMD_SELECT); + + address = create_ctas_nodata(query->targetList, into); + + /* + * We cannot just use rel->rd_rules in access hook, because it + * recalculates expression two times if it executes postgreses + * code, even if it skips insertion to table + */ + savedDataQuery = (Query *) copyObject(into->viewQuery); + RefreshMatViewByOid(address.objectId, true, false, + queryString, NULL, qc); + savedDataQuery = NULL; + + if (qc) + qc->commandTag = CMDTAG_SELECT; + + call_next = false; + } + } + } +#endif + else if (IsA(pstmt->utilityStmt, RefreshMatViewStmt)) + { + RefreshMatViewStmt *stmt = (RefreshMatViewStmt *) pstmt->utilityStmt; + Oid matviewOid; + Relation matviewRel; +#if PG_VERSION_NUM >= 170000 + matviewOid = RangeVarGetRelidExtended(stmt->relation, NoLock, 0, + RangeVarCallbackMaintainsTable, NULL); +#else + matviewOid = RangeVarGetRelidExtended(stmt->relation, NoLock, 0, + RangeVarCallbackOwnsTable, NULL); +#endif + matviewRel = 
table_open(matviewOid, AccessShareLock); + + if (matviewRel->rd_rel->relkind == RELKIND_MATVIEW && + is_orioledb_rel(matviewRel)) + { + if (!stmt->skipData) + { + savedDataQuery = linitial_node(Query, matviewRel->rd_rules->rules[0]->actions); + if (stmt->concurrent) + { + if (orioledb_strict_mode) + { + elog(ERROR, "REFRESH MATERIALIZED VIEW CONCURRENTLY is not supported for orioledb tables yet"); + } + else + { + stmt->concurrent = false; + elog(WARNING, "REFRESH MATERIALIZED VIEW CONCURRENTLY is not supported for orioledb tables yet, using a plain REFRESH MATERIALIZED VIEW instead"); + } + } + } + stmt->skipData = true; + } + table_close(matviewRel, AccessShareLock); + } + else if (IsA(pstmt->utilityStmt, IndexStmt)) + { + IndexStmt *stmt = (IndexStmt *) pstmt->utilityStmt; + + if (stmt->concurrent) + { + Oid relid; + Relation rel; + LOCKMODE lockmode; + + PreventInTransactionBlock(context == PROCESS_UTILITY_TOPLEVEL, + "CREATE INDEX CONCURRENTLY"); + + lockmode = ShareUpdateExclusiveLock; + relid = + RangeVarGetRelidExtended(stmt->relation, lockmode, + 0, + RangeVarCallbackOwnsRelation, + NULL); + rel = table_open(relid, lockmode); + + if (is_orioledb_rel(rel)) + { + if (orioledb_strict_mode) + { + table_close(rel, lockmode); + elog(ERROR, "concurrent index creation is not supported for orioledb tables yet"); + } + else + { + stmt->concurrent = false; + elog(WARNING, "concurrent index creation is not supported for orioledb tables yet, using a plain CREATE INDEX instead"); + } + } + table_close(rel, lockmode); + } + } + else if (IsA(pstmt->utilityStmt, CreatedbStmt)) + { + /* no event triggers for global objects */ + PreventInTransactionBlock(isTopLevel, "CREATE DATABASE"); + o_createdb(pstate, (CreatedbStmt *) pstmt->utilityStmt); + + call_next = false; + } + + if (call_next) + { + if (next_ProcessUtility_hook) + (*next_ProcessUtility_hook) (pstmt, queryString, + readOnlyTree, + context, params, env, + dest, qc); + else + standard_ProcessUtility(pstmt, 
queryString, + readOnlyTree, + context, params, env, + dest, qc); + } + + if (IsA(pstmt->utilityStmt, ReindexStmt)) + { + if (reindex_list) + { + list_free_deep(reindex_list); + reindex_list = NIL; + } + } + else if (IsA(pstmt->utilityStmt, DropStmt)) + { + if (partition_drop_index_list) + { + list_free(partition_drop_index_list); + partition_drop_index_list = NIL; + } + if (dropped_attrs) + { + list_free(dropped_attrs); + dropped_attrs = NIL; + } + } + else if (IsA(pstmt->utilityStmt, AlterTableStmt)) + { + if (alter_type_exprs) + { + list_free_deep(alter_type_exprs); + alter_type_exprs = NIL; + } + if (o_alter_generated_column_id) + { + list_free_deep(o_alter_generated_column_id); + o_alter_generated_column_id = NIL; + } + if (dropped_attrs) + { + list_free(dropped_attrs); + dropped_attrs = NIL; + } + + /* + * Don't free memory explicitly, delegate it to the memory context + * mechanism + */ + o_added_columns = NIL; + } + else if (IsA(pstmt->utilityStmt, CreateStmt)) + { + create_stmt = NULL; + } + else if (IsA(pstmt->utilityStmt, CreateSeqStmt) && o_added_columns != NIL) + { + CreateSeqStmt *seqstmt = (CreateSeqStmt *) pstmt->utilityStmt; + + if (seqstmt->for_identity) + { + /* + * Here we enrich already existing list elements with data about + * created sequences. 
We reuse the same list for enriched data, so + * first pop the head element, enrich it with data, then push it + * back to the list tail + */ + NextValueExpr *nve = makeNode(NextValueExpr); + + List *pair = linitial(o_added_columns); + Oid typeOid = intVal(linitial(pair)); + char *colname = strVal(lsecond(pair)); + + o_added_columns = list_delete_first(o_added_columns); + + nve->seqid = RangeVarGetRelid(seqstmt->sequence, NoLock, false); + nve->typeId = typeOid; + + o_added_columns = lappend(o_added_columns, + /* cppcheck-suppress unknownEvaluationOrder */ + list_make2(expression_planner((Expr *) nve), makeString(colname))); + } + } + + free_parsestate(pstate); +} + +static void +o_validate_replica_identity(Relation rel, ReplicaIdentityStmt *stmt) +{ + elog(DEBUG4, "Current replident %c, setting replident %c", rel->rd_rel->relreplident, stmt->identity_type); + + if (stmt->identity_type == REPLICA_IDENTITY_DEFAULT) + { + return; + } + else if (stmt->identity_type == REPLICA_IDENTITY_FULL) + { + return; + } + else if (stmt->identity_type == REPLICA_IDENTITY_NOTHING) + { + elog(ERROR, "replica identity type NOTHING is not supported for OrioleDB tables yet"); + } + else if (stmt->identity_type == REPLICA_IDENTITY_INDEX) + { + elog(ERROR, "replica identity type INDEX is not supported for OrioleDB tables yet"); + } +} + +static void +o_alter_column_type(AlterTableCmd *cmd, const char *queryString, Relation rel) +{ + ColumnDef *def = (ColumnDef *) cmd->def; + + if (def->raw_default) + { + Node *cooked_default; + ParseState *pstate; + ParseNamespaceItem *nsitem; + AttrNumber attnum; + + pstate = make_parsestate(NULL); + pstate->p_sourcetext = queryString; + nsitem = addRangeTableEntryForRelation(pstate, rel, AccessShareLock, + NULL, false, true); + addNSItemToQuery(pstate, nsitem, false, true, true); + cooked_default = transformExpr(pstate, def->raw_default, + EXPR_KIND_ALTER_COL_TRANSFORM); + attnum = get_attnum(RelationGetRelid(rel), cmd->name); + alter_type_exprs = + 
lappend(alter_type_exprs, + /* cppcheck-suppress unknownEvaluationOrder */ + list_make4(makeInteger(attnum), makeInteger(rel->rd_rel->oid), cooked_default, makeString(cmd->name))); + } +} + +static void +o_find_collation_dependencies(Oid colloid) +{ + Relation depRel; + ScanKeyData key[2]; + SysScanDesc depScan; + HeapTuple depTup; + HeapTuple collationtup; + Form_pg_collation collform; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + /* + * We scan pg_depend to find those things that depend on the given type. + * (We assume we can ignore refobjsubid for a type.) + */ + depRel = table_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_depend_refclassid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(CollationRelationId)); + ScanKeyInit(&key[1], + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(colloid)); + + depScan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 2, key); + + collationtup = SearchSysCache1(COLLOID, colloid); + if (!HeapTupleIsValid(collationtup)) + elog(ERROR, "cache lookup failed for collation (%u)", colloid); + collform = (Form_pg_collation) GETSTRUCT(collationtup); + + while (HeapTupleIsValid(depTup = systable_getnext(depScan))) + { + Form_pg_depend pg_depend = (Form_pg_depend) GETSTRUCT(depTup); + Relation rel; + + /* Else, ignore dependees that aren't user columns of relations */ + /* (we assume system columns are never of interesting types) */ + if (pg_depend->classid != RelationRelationId) + continue; + + rel = relation_open(pg_depend->objid, AccessShareLock); + + if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + is_orioledb_rel(rel)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot refresh collation \"%s\" because " + "orioledb table \"%s\" uses it", + collform->collname.data, + RelationGetRelationName(rel)))); + } + else if 
(rel->rd_rel->relkind == RELKIND_INDEX) + { + Relation tbl; + + tbl = relation_open(rel->rd_index->indrelid, AccessShareLock); + + if ((tbl->rd_rel->relkind == RELKIND_RELATION || + tbl->rd_rel->relkind == RELKIND_MATVIEW) && + is_orioledb_rel(tbl)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot refresh collation \"%s\" because " + "orioledb index \"%s\" uses it", + collform->collname.data, + RelationGetRelationName(rel)))); + } + relation_close(tbl, AccessShareLock); + } + + relation_close(rel, AccessShareLock); + } + ReleaseSysCache(collationtup); + systable_endscan(depScan); + + relation_close(depRel, AccessShareLock); +} + +static void +o_find_composite_type_dependencies(Oid typeOid, Relation origRelation) +{ + Relation depRel; + ScanKeyData key[2]; + SysScanDesc depScan; + HeapTuple depTup; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + /* + * We scan pg_depend to find those things that depend on the given type. + * (We assume we can ignore refobjsubid for a type.) + */ + depRel = table_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_depend_refclassid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(TypeRelationId)); + ScanKeyInit(&key[1], + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(typeOid)); + + depScan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 2, key); + + while (HeapTupleIsValid(depTup = systable_getnext(depScan))) + { + Form_pg_depend pg_depend = (Form_pg_depend) GETSTRUCT(depTup); + Relation rel; + + /* Check for directly dependent types */ + if (pg_depend->classid == TypeRelationId) + { + /* + * This must be an array, domain, or range containing the given + * type, so recursively check for uses of this type. Note that + * any error message will mention the original type not the + * container; this is intentional. 
+ */ + o_find_composite_type_dependencies(pg_depend->objid, origRelation); + continue; + } + + /* Else, ignore dependees that aren't user columns of relations */ + /* (we assume system columns are never of interesting types) */ + if (pg_depend->classid != RelationRelationId || + pg_depend->objsubid <= 0) + continue; + + rel = relation_open(pg_depend->objid, AccessShareLock); + + if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + is_orioledb_rel(rel)) + { + OTable *table; + ORelOids table_oids; + bool found = false; + int i; + + ORelOidsSetFromRel(table_oids, rel); + + table = o_tables_get(table_oids); + if (table == NULL) + { + elog(NOTICE, "orioledb table %s not found", RelationGetRelationName(rel)); + } + else + { + for (i = 0; i < table->nindices && !found; i++) + { + int j; + + for (j = 0; j < table->indices[i].nfields && !found; j++) + { + if (table->indices[i].fields[j].attnum == + pg_depend->objsubid - 1) + found = true; + } + + } + + + if (found) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot alter type \"%s\" because index \"%s\" uses it", + RelationGetRelationName(origRelation), + NameStr(table->indices[i - 1].name)))); + } + o_table_free(table); + } + } + else if (OidIsValid(rel->rd_rel->reltype)) + { + /* + * A view or composite type itself isn't a problem, but we must + * recursively check for indirect dependencies via its rowtype. 
+ */ + o_find_composite_type_dependencies(rel->rd_rel->reltype, + origRelation); + } + + relation_close(rel, AccessShareLock); + } + + systable_endscan(depScan); + + relation_close(depRel, AccessShareLock); +} + +static bool +ATColumnChangeRequiresRewrite(OTableField *old_field, OTableField *field, Oid objectId, + int subId) +{ + ParseState *pstate = make_parsestate(NULL); + Node *expr = NULL; + bool rewrite = false; + ListCell *lc; + bool append_transform = false; + + foreach(lc, alter_type_exprs) + { + AttrNumber attnum = intVal(linitial((List *) lfirst(lc))); + Oid oid = intVal(lsecond((List *) lfirst(lc))); + const char *attname = ((String *) (lfourth((List *) lfirst(lc))))->sval; + + if (attnum == subId) + { + expr = (Node *) lthird((List *) lfirst(lc)); + append_transform = strcmp(attname, field->name.data) == 0 && objectId != oid; + break; + } + } + + /* code from ATPrepAlterColumnType */ + if (!expr) + { + expr = (Node *) makeVar(1, subId, old_field->typid, old_field->typmod, + old_field->collation, 0); + append_transform = true; + } + expr = coerce_to_target_type(pstate, expr, exprType(expr), field->typid, + field->typmod, COERCION_EXPLICIT, + COERCE_IMPLICIT_CAST, -1); + if (expr != NULL) + { + if (append_transform) + { + char *field_name = pstrdup(field->name.data); + + /* cppcheck-suppress unknownEvaluationOrder */ + alter_type_exprs = lappend(alter_type_exprs, list_make4(makeInteger(subId), makeInteger(objectId), expr, makeString(field_name))); + } + assign_expr_collations(pstate, expr); + expr = (Node *) expression_planner((Expr *) expr); + + while (!rewrite) + { + /* only one varno, so no need to check that */ + if (IsA(expr, Var) && ((Var *) expr)->varattno == subId) + break; + else if (IsA(expr, RelabelType)) + expr = (Node *) ((RelabelType *) expr)->arg; + else if (IsA(expr, CoerceToDomain)) + { + CoerceToDomain *d = (CoerceToDomain *) expr; + + if (DomainHasConstraints(d->resulttype)) + rewrite = true; + expr = (Node *) d->arg; + } + else if 
(IsA(expr, FuncExpr)) + { + FuncExpr *f = (FuncExpr *) expr; + + switch (f->funcid) + { + case F_TIMESTAMPTZ_TIMESTAMP: + case F_TIMESTAMP_TIMESTAMPTZ: + if (TimestampTimestampTzRequiresRewrite()) + rewrite = true; + else + expr = linitial(f->args); + break; + default: + rewrite = true; + } + } + else + rewrite = true; + } + } + + return rewrite; +} + +static void +set_toast_oids_and_options(Relation rel, Relation toast_rel, bool only_fillfactor, bool index_bridging) +{ + ORelOids oids, + toastOids; + OIndexKey *trees; + int numTrees; + OTable *o_table; + ORelOptions *options = (ORelOptions *) rel->rd_options; + OCompress compress = default_compress, + primary_compress = default_primary_compress, + toast_compress = default_toast_compress; + uint8 fillfactor = BTREE_DEFAULT_FILLFACTOR; + OXid oxid = InvalidOXid; + OSnapshot oSnapshot; + bool is_temp; + + Assert(RelIsInMyDatabase(rel)); + ORelOidsSetFromRel(oids, rel); + ORelOidsSetFromRel(toastOids, toast_rel); + + o_table = o_tables_get(oids); + + if (!only_fillfactor) + o_table->toast_oids = toastOids; + + if (options) + { + if (!only_fillfactor) + { + if (options->compress_offset > 0) + { + char *str; + + str = (char *) (((Pointer) options) + + options->compress_offset); + if (str) + compress = o_parse_compress(str); + } + if (options->primary_compress_offset > 0) + { + char *str; + + str = (char *) (((Pointer) options) + + options->primary_compress_offset); + if (str) + primary_compress = o_parse_compress(str); + } + if (options->toast_compress_offset > 0) + { + char *str; + + str = (char *) (((Pointer) options) + + options->toast_compress_offset); + if (str) + toast_compress = o_parse_compress(str); + } + index_bridging = index_bridging || options->index_bridging; + } + fillfactor = options->std_options.fillfactor; + } + + if (!only_fillfactor) + { + if (rel->rd_rel->relpersistence != + RELPERSISTENCE_PERMANENT && + (OCompressIsValid(compress) || + OCompressIsValid(primary_compress) || + 
OCompressIsValid(toast_compress))) + { + o_table_free(o_table); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("temp and unlogged orioledb tables does not " + "support compression options"))); + } + + if (OCompressIsValid(compress)) + { + if (!OCompressIsValid(primary_compress)) + primary_compress = compress; + if (!OCompressIsValid(toast_compress)) + toast_compress = compress; + } + o_table->default_compress = compress; + o_table->toast_compress = toast_compress; + o_table->primary_compress = primary_compress; + o_table->index_bridging = index_bridging; + + if (index_bridging) + { + o_table->bridge_oids.datoid = MyDatabaseId; + o_table->bridge_oids.relnode = GetNewRelFileNumber(MyDatabaseTableSpace, NULL, + rel->rd_rel->relpersistence); + o_table->bridge_oids.reloid = o_table->bridge_oids.relnode; + } + } + + o_table->fillfactor = fillfactor; + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + + o_tables_rel_meta_lock(rel); + o_indices_update(o_table, PrimaryIndexNumber, oxid, oSnapshot.csn); + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_after_update(o_table, oxid, oSnapshot.csn); + + trees = o_table_make_index_keys(o_table, &numTrees); + is_temp = o_table->persistence == RELPERSISTENCE_TEMP; + add_undo_create_relnode(oids, trees, numTrees, !is_temp); + o_tables_rel_meta_unlock(rel, InvalidOid); + pfree(trees); + o_table_free(o_table); +} + +static void +create_o_table_for_rel(Relation rel) +{ + ORelOids oids; + TupleDesc tupdesc; + OTable *o_table; + OSnapshot oSnapshot; + OXid oxid = InvalidOXid; + XLogRecPtr cur_lsn; + Oid datoid; + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + + Assert(RelIsInMyDatabase(rel)); + ORelOidsSetFromRel(oids, rel); + tupdesc = RelationGetDescr(rel); + + o_tables_rel_meta_lock(rel); + o_table = o_table_tableam_create(oids, tupdesc, + rel->rd_rel->relpersistence, + RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR), + rel->rd_rel->reltablespace, + false); + o_cache_table_types(o_table); + 
+ o_sys_cache_set_datoid_lsn(&cur_lsn, &datoid); + o_database_cache_add_if_needed(datoid, datoid, cur_lsn, NULL); + + o_tables_add(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(rel, InvalidOid); + o_table_free(o_table); +} + +typedef struct +{ + DestReceiver pub; /* publicly-known function pointers */ + Relation rel; + OTableDescr *descr; + CommitSeqNo csn; + OXid oxid; +} DR_transientrel; + +/* + * transientrel_startup --- executor startup + */ +static void +transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + DR_transientrel *myState = (DR_transientrel *) self; + OSnapshot oSnapshot; + + fill_current_oxid_osnapshot(&myState->oxid, &oSnapshot); + myState->csn = oSnapshot.csn; +} + +/* + * transientrel_receive --- receive one tuple + */ +static bool +transientrel_receive(TupleTableSlot *slot, DestReceiver *self) +{ + DR_transientrel *myState = (DR_transientrel *) self; + + o_tbl_insert(myState->descr, myState->rel, slot, myState->oxid, myState->csn); + + /* We know this is a newly created relation, so there are no indexes */ + + return true; +} + +/* + * transientrel_shutdown --- executor end + */ +static void +transientrel_shutdown(DestReceiver *self) +{ +} + +/* + * transientrel_destroy --- release DestReceiver object + */ +static void +transientrel_destroy(DestReceiver *self) +{ + pfree(self); +} + +static DestReceiver * +CreateOrioledbDestReceiver(Relation rel) +{ + DR_transientrel *self = (DR_transientrel *) palloc0(sizeof(DR_transientrel)); + + self->pub.receiveSlot = transientrel_receive; + self->pub.rStartup = transientrel_startup; + self->pub.rShutdown = transientrel_shutdown; + self->pub.rDestroy = transientrel_destroy; + self->pub.mydest = DestTransientRel; + self->rel = rel; + self->descr = relation_get_descr(rel); + Assert(self->descr != NULL); + + return (DestReceiver *) self; +} + +void +o_drop_table(ORelOids oids) +{ + OSnapshot oSnapshot; + OXid oxid; + OTable *table; + OIndexKey *trees; + int numTrees; + + 
fill_current_oxid_osnapshot(&oxid, &oSnapshot); + + o_tables_table_meta_lock(NULL); + table = o_tables_drop_by_oids(oids, oxid, oSnapshot.csn); + o_tables_table_meta_unlock(NULL, InvalidOid); + trees = o_table_make_index_keys(table, &numTrees); + add_undo_drop_relnode(oids, trees, numTrees); + pfree(trees); + o_table_free(table); +} + +static void +rewrite_matview(Relation rel, OTable *old_o_table, OTable *new_o_table) +{ + DestReceiver *dest = CreateOrioledbDestReceiver(rel); + List *rewritten; + PlannedStmt *plan; + QueryDesc *queryDesc; + Query *copied_query; + Query *query; + + /* Lock and rewrite, using a copy to preserve the original query. */ + copied_query = copyObject(savedDataQuery); + AcquireRewriteLocks(copied_query, true, false); + rewritten = QueryRewrite(copied_query); + + /* SELECT should never rewrite to more or less than one SELECT query */ + if (list_length(rewritten) != 1) + elog(ERROR, "unexpected rewrite result for REFRESH MATERIALIZED VIEW"); + query = (Query *) linitial(rewritten); + + /* Check for user-requested abort. */ + CHECK_FOR_INTERRUPTS(); + + /* Plan the query which will generate data for the refresh. */ + plan = pg_plan_query(query, "ORIOLEDB rewrite_matview", CURSOR_OPT_PARALLEL_OK, NULL); + + /* + * Use a snapshot with an updated command ID to ensure this query sees + * results of any previously executed queries. (This could only matter if + * the planner executed an allegedly-stable function that changed the + * database contents, but let's do it anyway to be safe.) 
+ */ + PushCopiedSnapshot(GetActiveSnapshot()); + UpdateActiveSnapshotCommandId(); + + /* Create a QueryDesc, redirecting output to our tuple receiver */ + queryDesc = CreateQueryDesc(plan, "ORIOLEDB rewrite_matview", + GetActiveSnapshot(), InvalidSnapshot, + dest, NULL, NULL, 0); + + /* call ExecutorStart to prepare the plan for execution */ + ExecutorStart(queryDesc, 0); + + /* run the plan */ + ExecutorRun(queryDesc, ForwardScanDirection, 0, true); + + pgstat_count_heap_insert(rel, queryDesc->estate->es_processed); + + /* and clean up */ + ExecutorFinish(queryDesc); + ExecutorEnd(queryDesc); + + FreeQueryDesc(queryDesc); + + PopActiveSnapshot(); + + SetMatViewPopulatedState(rel, true); +} + +static void +rewrite_table(Relation rel, OTable *old_o_table, OTable *new_o_table) +{ + OTableDescr *old_descr = NULL; + void *sscan; + TupleTableSlot *old_slot; + TupleTableSlot *new_slot; + OTuple tup; + CommitSeqNo tupleCsn; + BTreeLocationHint hint; + OTableDescr *descr; + OSnapshot oSnapshot; + OXid oxid; + int primary_init_nfields = old_o_table->primary_init_nfields; + int num_check = rel->rd_att->constr ? rel->rd_att->constr->num_check : 0; + EState *check_estate = NULL; + ExprState **check_exprs = NULL; + ExprContext *check_econtext = NULL; + + if (!old_o_table->has_primary) + primary_init_nfields--; + + old_descr = o_fetch_table_descr(old_o_table->oids); + ResourceOwnerRememberOTableDescr(CurrentResourceOwner, old_descr); + descr = relation_get_descr(rel); + ResourceOwnerRememberOTableDescr(CurrentResourceOwner, descr); + old_slot = MakeSingleTupleTableSlot(old_descr->tupdesc, &TTSOpsOrioleDB); + new_slot = MakeSingleTupleTableSlot(descr->tupdesc, &TTSOpsOrioleDB); + sscan = make_btree_seq_scan(&GET_PRIMARY(old_descr)->desc, &o_in_progress_snapshot, NULL); + + /* + * OrioleDB engine change execution order when relation is rewrited. So + * real data transfer from old relation ti the new one executed after + * dropping. 
So in statments with moving data from one column to another + * via ALTER COLUMN and DROP we gather an error that collumn already + * dropped. To avoid this behavior mark column dropped in current + * statement as not dropped. This is ugly solution actually need refactor + * handling of ALTER TABLE to avoid global vars and lists that brings alot + * of bugs. + */ + if (OidIsValid(o_saved_relrewrite)) + { + for (int i = 0; i < old_slot->tts_tupleDescriptor->natts; i++) + { + ListCell *lc; + + foreach(lc, dropped_attrs) + { + Oid relOid = intVal(linitial((List *) lfirst(lc))); + AttrNumber attnum = intVal(lsecond((List *) lfirst(lc))); + + if (relOid == rel->rd_rel->oid && attnum == i + 1) + { + old_slot->tts_tupleDescriptor->attrs[i].attisdropped = false; + break; + } + } + } + } + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + + /* Prepare CHECK constraint expressions for validation during rewrite */ + if (num_check > 0) + { + int i; + ConstrCheck *check = rel->rd_att->constr->check; + + check_estate = CreateExecutorState(); + check_econtext = GetPerTupleExprContext(check_estate); + check_exprs = (ExprState **) palloc(num_check * sizeof(ExprState *)); + + for (i = 0; i < num_check; i++) + { + Expr *checkconstexpr = stringToNode(check[i].ccbin); + + check_exprs[i] = ExecPrepareExpr(checkconstexpr, check_estate); + } + } + + while (!O_TUPLE_IS_NULL(tup = btree_seq_scan_getnext(sscan, old_slot->tts_mcxt, &tupleCsn, &hint))) + { + tts_orioledb_store_tuple(old_slot, tup, old_descr, + COMMITSEQNO_INPROGRESS, PrimaryIndexNumber, + true, &hint); + slot_getallattrs(old_slot); + tts_orioledb_detoast(old_slot); + + /* + * Constraints and GENERATED expressions might reference the tableoid + * column, so fill tts_tableOid with the desired value. 
+ */ + new_slot->tts_tableOid = RelationGetRelid(rel); + + /* + * Process tuple in two phases: 1) Non-generated attrs 2) Generated + * attrs only Rewriting tuples in such manner helps to handle + * situations when a generated column depends on the value of another + * changing column. + */ + for (int i = 0; i < old_slot->tts_tupleDescriptor->natts; i++) + { + Node *expr = NULL; + bool has_def = false; + bool should_build_def = false; + Form_pg_attribute attr = &old_slot->tts_tupleDescriptor->attrs[i]; + + if (attr->attgenerated) + continue; + + expr = o_get_alter_type_expr(rel, i); + + /* + * old_slot may not contain all new properties if there are + * multiple expressions within a single ALTER TABLE. + */ + has_def = attr->atthasdef || rel->rd_att->attrs[i].atthasdef; + + /* + * Build default for columns which have explicit DEFAULT + * expressions + */ + should_build_def = !expr && has_def && !attr->atthasmissing && + i >= primary_init_nfields && + old_slot->tts_isnull[i]; + + /* If column has domain type, try to build domain default value */ + should_build_def |= !expr && get_typtype(attr->atttypid) == TYPTYPE_DOMAIN && old_slot->tts_isnull[i]; + + if (should_build_def) + { + Node *defaultexpr = build_column_default(rel, i + 1); + + expr = defaultexpr; + } + + /* + * When the new column is of a domain type: the domain might have + * a not-null constraint, or a check constraint that indirectly + * rejects nulls. If there are any domain constraints then we + * construct an explicit NULL default value that will be passed + * through CoerceToDomain processing. 
+ */ + if (!attr->attisdropped && !expr && DomainHasConstraints(attr->atttypid) && + old_slot->tts_isnull[i]) + { + Oid baseTypeId; + int32 baseTypeMod; + Oid baseTypeColl; + Node *defval; + + defval = build_column_default(rel, i + 1); + + if (!defval) + { + baseTypeMod = attr->atttypmod; + baseTypeId = getBaseTypeAndTypmod(attr->atttypid, &baseTypeMod); + baseTypeColl = get_typcollation(baseTypeId); + defval = (Node *) makeNullConst(baseTypeId, baseTypeMod, baseTypeColl); + } + else + { + baseTypeId = exprType(defval); + } + defval = (Node *) coerce_to_target_type(NULL, + defval, + baseTypeId, + attr->atttypid, + attr->atttypmod, + COERCION_ASSIGNMENT, + COERCE_IMPLICIT_CAST, + -1); + if (defval == NULL) /* should not happen */ + elog(ERROR, "failed to coerce base type to domain"); + expr = defval; + } + else if (rel->rd_att->attrs[i].attidentity && old_slot->tts_isnull[i]) + { + ListCell *lc; + + foreach(lc, o_added_columns) + { + List *pair = lfirst(lc); + + if (!strcmp(strVal(lsecond(pair)), attr->attname.data)) + { + expr = (Node *) linitial(pair); + break; + } + } + + if (expr == NULL) /* should not happen */ + elog(ERROR, "failed to find sequence for brand-new column %s", attr->attname.data); + } + + o_fill_new_slot(new_o_table, rel, i, expr, + old_slot, new_slot, old_slot); + } + + /* Make new_slot valid for using it as a scan_slot in o_fill_new_slot */ + ExecStoreVirtualTuple(new_slot); + + for (int i = 0; i < old_slot->tts_tupleDescriptor->natts; i++) + { + Node *expr = NULL; + Form_pg_attribute attr = &old_slot->tts_tupleDescriptor->attrs[i]; + + if (!attr->attgenerated) + continue; + + expr = o_get_alter_type_expr(rel, i); + + /* + * Build new value for GENERATED column if the generation + * expression has been updated using ALTER TABLE ... SET + * EXPRESSION ..., if the value was not present in the existing + * row, or if the column type has changed. 
+ */ + if (expr || (old_slot->tts_isnull[i] || + (o_alter_generated_column_id != NIL + && list_member( + o_alter_generated_column_id, + /* cppcheck-suppress unknownEvaluationOrder */ + list_make2(makeInteger(RelationGetRelid(rel)), makeInteger(i + 1)))))) + { + Node *defaultexpr = build_column_default(rel, i + 1); + + expr = defaultexpr; + } + + /* + * Use new_slot as a scan slot, because all generated attrs depend + * only on new values + */ + o_fill_new_slot(new_o_table, rel, i, expr, + old_slot, new_slot, new_slot); + } + + new_slot->tts_nvalid = new_slot->tts_tupleDescriptor->natts; + + /* Validate CHECK constraints on the rewritten tuple */ + if (num_check > 0) + { + int j; + + check_econtext->ecxt_scantuple = new_slot; + for (j = 0; j < num_check; j++) + { + if (!ExecCheck(check_exprs[j], check_econtext)) + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("check constraint \"%s\" of relation \"%s\" is violated by some row", + rel->rd_att->constr->check[j].ccname, + RelationGetRelationName(rel)))); + } + } + + o_tbl_insert(descr, rel, new_slot, oxid, oSnapshot.csn); + + ExecClearTuple(old_slot); + ExecClearTuple(new_slot); + } + + if (check_estate) + { + pfree(check_exprs); + FreeExecutorState(check_estate); + } + + ExecDropSingleTupleTableSlot(old_slot); + ExecDropSingleTupleTableSlot(new_slot); + free_btree_seq_scan(sscan); + ResourceOwnerForgetOTableDescr(CurrentResourceOwner, descr); + ResourceOwnerForgetOTableDescr(CurrentResourceOwner, old_descr); + + o_drop_table(old_o_table->oids); +} + +static void +redefine_indices(Relation rel, OTable *new_o_table, bool primary, Oid oldRelnode) +{ + ListCell *index; + + foreach(index, RelationGetIndexList(rel)) + { + bool closed = false; + Oid indexOid = lfirst_oid(index); + Relation ind = relation_open(indexOid, AccessShareLock); + + if ((primary && ind->rd_index->indisprimary) || (!primary && !ind->rd_index->indisprimary)) + { + OBTOptions *options = (OBTOptions *) ind->rd_options; + + if 
(ind->rd_rel->relam != BTREE_AM_OID || (options && !options->orioledb_index)) + { + ReindexParams reindex_params = {0}; + + relation_close(ind, AccessShareLock); + reindex_index( +#if PG_VERSION_NUM >= 170000 + NULL, +#endif + indexOid, 0, ind->rd_rel->relpersistence, &reindex_params); + closed = true; + } + else + { + o_define_index_validate(new_o_table->oids, ind, NULL, NULL); + relation_close(ind, AccessShareLock); + o_define_index(rel, NULL, ind->rd_rel->oid, false, + InvalidIndexNumber, oldRelnode, NULL); + closed = true; + } + } + if (!closed) + relation_close(ind, AccessShareLock); + } + + if (primary) + { + ORelOids oids; + OTable *updated_o_table; + + /* + * Partial reimplementation of assign_new_oids just for toast, because + * it isn't called for tables without pkeys here, but it should + */ + + ORelOidsSetFromRel(oids, rel); + updated_o_table = o_tables_get(oids); + Assert(updated_o_table != NULL); + + if (!updated_o_table->has_primary) + { + Oid toast_relid; + Relation toast_rel; + OSnapshot oSnapshot; + OXid oxid; + + toast_relid = rel->rd_rel->reltoastrelid; + toast_rel = table_open(toast_relid, AccessExclusiveLock); + RelationSetNewRelfilenode(toast_rel, toast_rel->rd_rel->relpersistence); + ORelOidsSetFromRel(updated_o_table->toast_oids, toast_rel); + table_close(toast_rel, AccessExclusiveLock); + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + o_tables_table_meta_lock(updated_o_table); + o_indices_update(updated_o_table, PrimaryIndexNumber, oxid, oSnapshot.csn); + o_tables_update(updated_o_table, oxid, oSnapshot.csn); + o_tables_after_update(updated_o_table, oxid, oSnapshot.csn); + o_tables_table_meta_unlock(updated_o_table, oldRelnode); + recreate_table_descr_by_oids(updated_o_table->oids); + orioledb_free_rd_amcache(rel); + } + o_table_free(updated_o_table); + } + +} + +void +redefine_pkey_for_rel(Relation rel) +{ + ORelOids oids; + OTable *o_table; + + ORelOidsSetFromRel(oids, rel); + o_table = o_tables_get(oids); + Assert(o_table != NULL); + 
+ redefine_indices(rel, o_table, true, InvalidOid); + + o_table_free(o_table); +} + +static void +change_bridging_option(Relation rel, bool value, bool isReset) +{ + Oid relid; + Relation pgclass; + HeapTuple tuple; + HeapTuple newtuple; + Datum datum; + bool isnull; + Datum newOptions; + Datum repl_val[Natts_pg_class]; + bool repl_null[Natts_pg_class]; + bool repl_repl[Natts_pg_class]; + static char *validnsps[] = HEAP_RELOPT_NAMESPACES; + DefElem *bridging_def; + + pgclass = table_open(RelationRelationId, RowExclusiveLock); + + /* Fetch heap tuple */ + relid = RelationGetRelid(rel); + tuple = SearchSysCacheLocked1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relid); + + /* Get the old reloptions */ + datum = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, &isnull); + + /* Generate new proposed reloptions (text array) */ + bridging_def = makeDefElem("index_bridging", isReset ? NULL : (Node *) makeBoolean(value), -1); + newOptions = transformRelOptions(isnull ? (Datum) 0 : datum, + list_make1(bridging_def), NULL, validnsps, false, isReset); + + /* Validate */ + (void) table_reloptions(rel, rel->rd_rel->relkind, newOptions, true); + + /* + * All we need do here is update the pg_class row; the new options will be + * propagated into relcaches during post-commit cache inval. 
+ */ + memset(repl_val, 0, sizeof(repl_val)); + memset(repl_null, false, sizeof(repl_null)); + memset(repl_repl, false, sizeof(repl_repl)); + + if (newOptions != (Datum) 0) + repl_val[Anum_pg_class_reloptions - 1] = newOptions; + else + repl_null[Anum_pg_class_reloptions - 1] = true; + + repl_repl[Anum_pg_class_reloptions - 1] = true; + + newtuple = heap_modify_tuple(tuple, RelationGetDescr(pgclass), + repl_val, repl_null, repl_repl); + + CatalogTupleUpdate(pgclass, &newtuple->t_self, newtuple); + UnlockTuple(pgclass, &tuple->t_self, InplaceUpdateTupleLock); + + heap_freetuple(newtuple); + + ReleaseSysCache(tuple); + + table_close(pgclass, RowExclusiveLock); +} + /* add_bridge_index - switch the OrioleDB table behind tbl to index bridging. Unless 'manually' is set, a NOTICE naming access method 'amoid' is reported first (this is the only use of amoid). The OTable is then re-read by o_table->oids, index_bridging is set on the fresh copy, new oids are assigned, the indices are rebuilt from the old table into the new one, index and table metadata are updated, invalidations are queued and the 'index_bridging' reloption is set through change_bridging_option(). Both the passed-in o_table and the re-read copy are freed before returning, so the caller must not use or free o_table afterwards. NOTE(review): the result of o_tables_get() is dereferenced without a NULL check - confirm the table entry always exists on this path. */ +static void +add_bridge_index(Relation tbl, OTable *o_table, bool manually, Oid amoid) +{ + OSnapshot oSnapshot; + OXid oxid; + OTable *old_o_table; + OTableDescr *descr; + OTableDescr *old_descr; + int ix_num = InvalidIndexNumber; + + if (!manually) + { + HeapTuple tuple; + Form_pg_am amform; + + tuple = SearchSysCache1(AMOID, ObjectIdGetDatum(amoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for access method %u", amoid); + + amform = (Form_pg_am) GETSTRUCT(tuple); + + if (amoid != BTREE_AM_OID) + { + ereport(NOTICE, + errmsg("index bridging is enabled for orioledb table '%s'", + RelationGetRelationName(tbl)), + errdetail("index access method '%s' is supported only via index bridging for OrioleDB table", NameStr(amform->amname))); + } + else + { + ereport(NOTICE, + errmsg("index bridging is enabled for orioledb table '%s'", + RelationGetRelationName(tbl)), + errdetail("index access method '%s' is requested with index bridging for OrioleDB table", NameStr(amform->amname))); + } + + ReleaseSysCache(tuple); + } + + old_o_table = o_table; + o_table = o_tables_get(o_table->oids); + o_table->index_bridging = true; + assign_new_oids(o_table, tbl, false); + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + o_table->primary_init_nfields = o_table->nfields + 1; + + 
o_tables_table_meta_lock(NULL); + old_descr = o_fetch_table_descr(old_o_table->oids); + recreate_o_table(old_o_table, o_table); + descr = o_fetch_table_descr(o_table->oids); + rebuild_indices_insert_placeholders(descr); + o_tables_table_meta_unlock(NULL, InvalidOid); + + rebuild_indices(old_o_table, old_descr, o_table, descr, false, NULL); + o_tables_rel_meta_lock(tbl); + for (ix_num = 0; ix_num < o_table->nindices; ix_num++) + { + int ctid_idx_off; + OTableIndex *index; + + ctid_idx_off = o_table->has_primary ? 0 : 1; + index = &o_table->indices[ix_num]; + + o_indices_update(o_table, ix_num + ctid_idx_off, oxid, oSnapshot.csn); + o_invalidate_oids(index->oids); + o_add_invalidate_undo_item(index->oids, O_INVALIDATE_OIDS_ON_ABORT); + } + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(tbl, old_o_table->oids.relnode); + o_invalidate_oids(o_table->bridge_oids); + o_add_invalidate_undo_item(o_table->bridge_oids, O_INVALIDATE_OIDS_ON_ABORT); + o_invalidate_oids(o_table->oids); + o_add_invalidate_undo_item(o_table->oids, O_INVALIDATE_OIDS_ON_ABORT); + + change_bridging_option(tbl, true, false); + + o_table_free(old_o_table); + o_table_free(o_table); +} + /* drop_bridge_index - turn index bridging off for the OrioleDB table behind tbl. The OTable is re-read by o_table->oids; on the fresh copy index_bridging is cleared and bridge_oids is set invalid, then new oids are assigned, the indices are rebuilt from the old table into the new one, index and table metadata are updated, invalidations are queued and the 'index_bridging' reloption is reset through change_bridging_option(). Both the passed-in o_table and the re-read copy are freed before returning, so the caller must not use or free o_table afterwards. */ +static void +drop_bridge_index(Relation tbl, OTable *o_table) +{ + OSnapshot oSnapshot; + OXid oxid; + OTable *old_o_table; + OTableDescr *descr; + OTableDescr *old_descr; + int ix_num = InvalidIndexNumber; + + old_o_table = o_table; + o_table = o_tables_get(o_table->oids); + o_table->index_bridging = false; + ORelOidsSetInvalid(o_table->bridge_oids); + assign_new_oids(o_table, tbl, false); + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + o_table->primary_init_nfields = o_table->nfields - 1; + + o_tables_table_meta_lock(NULL); + old_descr = o_fetch_table_descr(old_o_table->oids); + recreate_o_table(old_o_table, o_table); + descr = o_fetch_table_descr(o_table->oids); + rebuild_indices_insert_placeholders(descr); + o_tables_table_meta_unlock(NULL, InvalidOid); + + 
rebuild_indices(old_o_table, old_descr, o_table, descr, false, NULL); + o_tables_rel_meta_lock(tbl); + for (ix_num = 0; ix_num < o_table->nindices; ix_num++) + { + int ctid_idx_off; + OTableIndex *index; + + ctid_idx_off = o_table->has_primary ? 0 : 1; + index = &o_table->indices[ix_num]; + + o_indices_update(o_table, ix_num + ctid_idx_off, oxid, oSnapshot.csn); + o_invalidate_oids(index->oids); + o_add_invalidate_undo_item(index->oids, O_INVALIDATE_OIDS_ON_ABORT); + } + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(tbl, old_o_table->oids.relnode); + o_invalidate_oids(old_o_table->bridge_oids); + o_add_invalidate_undo_item(old_o_table->bridge_oids, O_INVALIDATE_OIDS_ON_ABORT); + o_invalidate_oids(o_table->oids); + o_add_invalidate_undo_item(o_table->oids, O_INVALIDATE_OIDS_ON_ABORT); + + change_bridging_option(tbl, false, true); + + o_table_free(old_o_table); + o_table_free(o_table); +} + +static void +cleanup_tablespace_dir(char *tablespace_path) +{ + DIR *dir; + struct dirent *file; + + dir = opendir(tablespace_path); + if (dir == NULL) + return; + + while (errno = 0, (file = readdir(dir)) != NULL) + { + Oid dbOid; + char *dbDirName; + + if (sscanf(file->d_name, "%u", &dbOid) != 1) + continue; + + dbDirName = psprintf("%s/%u", tablespace_path, dbOid); + + /* We assume that postgres throws it's own errors on not empty dirs */ + if (rmdir(dbDirName) < 0 && errno != ENOTEMPTY) + { + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove orioledb db dir \"%s\": %m", + dbDirName))); + } + pfree(dbDirName); + } + fsync_fname_ext(tablespace_path, true, false, FATAL); + + /* We assume that postgres throws it's own errors on not empty dirs */ + if (rmdir(tablespace_path) < 0 && errno != ENOTEMPTY) + { + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove tablespace orioledb dir \"%s\": %m", + tablespace_path))); + } + + /* We assume that postgres throws it's own errors on not empty dirs */ + if (errno != 0 
&& errno != ENOTEMPTY) + { + ereport(ERROR, (errcode_for_file_access(), + errmsg("unable to clean up orioledb tablespace: %m"))); + } + closedir(dir); +} + +/* + * get_collation - fetch qualified name of a collation + * + * If collation is InvalidOid or is the default for the given actual_datatype, + * then the return value is NIL. + */ +static List * +get_collation(Oid collation, Oid actual_datatype) +{ + List *result; + HeapTuple ht_coll; + Form_pg_collation coll_rec; + char *nsp_name; + char *coll_name; + + if (!OidIsValid(collation)) + return NIL; /* easy case */ + if (collation == get_typcollation(actual_datatype)) + return NIL; /* just let it default */ + + ht_coll = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation)); + if (!HeapTupleIsValid(ht_coll)) + elog(ERROR, "cache lookup failed for collation %u", collation); + coll_rec = (Form_pg_collation) GETSTRUCT(ht_coll); + + /* For simplicity, we always schema-qualify the name */ + nsp_name = get_namespace_name(coll_rec->collnamespace); + coll_name = pstrdup(NameStr(coll_rec->collname)); + /* cppcheck-suppress unknownEvaluationOrder */ + result = list_make2(makeString(nsp_name), makeString(coll_name)); + + ReleaseSysCache(ht_coll); + return result; +} + +/* + * get_opclass - fetch qualified name of an index operator class + * + * If the opclass is the default for the given actual_datatype, then + * the return value is NIL. 
+ */ +static List * +get_opclass(Oid opclass, Oid actual_datatype) +{ + List *result = NIL; + HeapTuple ht_opc; + Form_pg_opclass opc_rec; + + ht_opc = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclass)); + if (!HeapTupleIsValid(ht_opc)) + elog(ERROR, "cache lookup failed for opclass %u", opclass); + opc_rec = (Form_pg_opclass) GETSTRUCT(ht_opc); + /* Build a name only when opclass differs from the default opclass of actual_datatype for this opclass's access method; otherwise result stays NIL. */ + if (GetDefaultOpClass(actual_datatype, opc_rec->opcmethod) != opclass) + { + /* For simplicity, we always schema-qualify the name */ + char *nsp_name = get_namespace_name(opc_rec->opcnamespace); + char *opc_name = pstrdup(NameStr(opc_rec->opcname)); + + /* cppcheck-suppress unknownEvaluationOrder */ + result = list_make2(makeString(nsp_name), makeString(opc_name)); + } + /* opc_name was pstrdup'ed above, so it does not point into the syscache tuple released here. */ + ReleaseSysCache(ht_opc); + return result; +} + +static void +orioledb_object_access_hook(ObjectAccessType access, Oid classId, Oid objectId, + int subId, void *arg) +{ + Relation rel; + + if (access == OAT_POST_CREATE && classId == ExtensionRelationId) + { +#if PG_VERSION_NUM >= 170000 + if (IsTransactionState()) + { + XLogRecPtr cur_lsn; + + o_sys_cache_set_datoid_lsn(&cur_lsn, NULL); + o_database_cache_add_if_needed(Template1DbOid, Template1DbOid, cur_lsn, NULL); + } +#endif + } + else if (access == OAT_DROP && classId == RelationRelationId) + { + ObjectAccessDrop *drop_arg = (ObjectAccessDrop *) arg; + + ASAN_UNPOISON_MEMORY_REGION(drop_arg, sizeof(*drop_arg)); + +#ifdef USE_ASSERT_CHECKING + { + LOCKTAG locktag; + + memset(&locktag, 0, sizeof(LOCKTAG)); + SET_LOCKTAG_RELATION(locktag, MyDatabaseId, objectId); + + Assert(DoLocalLockExist(&locktag)); + } +#endif + + rel = relation_open(objectId, AccessShareLock); + + if (rel != NULL) + { + bool is_open = true; + + if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + (subId == 0) && is_orioledb_rel(rel) && + !OidIsValid(rel->rd_rel->relrewrite)) + { + ListCell *lc; + ORelOids oids; + OTableDescr *descr; + + ORelOidsSetFromRel(oids, rel); + foreach(lc, 
partition_drop_index_list) + { + List *drop_oids = (List *) lfirst(lc); + + if (lsecond_oid(drop_oids) == rel->rd_rel->oid) + partition_drop_index_list = foreach_delete_current(partition_drop_index_list, lc); + } + + descr = relation_get_descr(rel); + + /* + * Descriptor should be there as long as it's not temporary + * relation. Descriptors of temporary relations might be + * already deleted. + */ + Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP || + descr); + + if (descr != NULL) + o_drop_table(oids); + } + else if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + (subId != 0) && is_orioledb_rel(rel)) + { + OTable *o_table; + OTableField *o_field = NULL; + ORelOids oids; + + ORelOidsSetFromRel(oids, rel); + o_table = o_tables_get(oids); + if (o_table == NULL) + { + /* table does not exist */ + elog(NOTICE, "orioledb table \"%s\" not found", + RelationGetRelationName(rel)); + } + else + { + o_field = &o_table->fields[subId - 1]; + + if (o_field && !o_field->droped) + { + OSnapshot oSnapshot; + OXid oxid; + + o_field->droped = true; + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + o_tables_rel_meta_lock(rel); + o_indices_update(o_table, PrimaryIndexNumber, oxid, oSnapshot.csn); + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_after_update(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(rel, InvalidOid); + /* cppcheck-suppress unknownEvaluationOrder */ + dropped_attrs = lappend(dropped_attrs, list_make2(makeInteger(objectId), makeInteger(subId))); + } + o_table_free(o_table); + } + } + else if (rel->rd_rel->relkind == RELKIND_INDEX && + !(drop_arg->dropflags & PERFORM_DELETION_OF_RELATION)) + { + /* + * dropflags == PERFORM_DELETION_OF_RELATION ignored, to not + * drop indices when whole table dropped + */ + Relation tbl = relation_open(rel->rd_index->indrelid, + AccessShareLock); + + if ((tbl->rd_rel->relkind == RELKIND_RELATION || + tbl->rd_rel->relkind == RELKIND_MATVIEW) && + 
is_orioledb_rel(tbl)) + { + OIndexNumber ix_num; + OTableDescr *descr = relation_get_descr(tbl); + + Assert(descr != NULL); + ix_num = o_find_ix_num_by_name(descr, + rel->rd_rel->relname.data); + if (ix_num != InvalidIndexNumber) + { + String *relname; + + if (descr->indices[ix_num]->primaryIsCtid) + ix_num--; + relation_close(rel, AccessShareLock); + is_open = false; + + relname = makeString(rel->rd_rel->relname.data); + if (list_member_oid(o_reuse_indices, objectId)) + { + /* Do not drop index if it is set for reuse */ + elog(DEBUG1, "object_access_hook: skipping index %d drop as it is set for reuse", objectId); + } + else if (!(drop_arg->dropflags & + PERFORM_DELETION_INTERNAL) || + list_member(drop_index_list, relname)) + { + drop_index_list = list_delete(drop_index_list, + relname); + o_index_drop(tbl, ix_num); + } + } + } + relation_close(tbl, AccessShareLock); + } + else if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE && + (subId != 0)) + { + OClassArg arg = {.column_drop = true,.dropped = subId}; + + o_find_composite_type_dependencies(rel->rd_rel->reltype, rel); + CommandCounterIncrement(); + o_class_cache_update_if_needed(MyDatabaseId, rel->rd_rel->oid, + (Pointer) &arg); + } + else if ((rel->rd_rel->relkind == RELKIND_INDEX) && + (drop_arg->dropflags & PERFORM_DELETION_OF_RELATION)) + { + Relation tbl = relation_open(rel->rd_index->indrelid, + AccessShareLock); + + if ((tbl->rd_rel->relkind == RELKIND_RELATION || + tbl->rd_rel->relkind == RELKIND_MATVIEW) && + is_orioledb_rel(tbl)) + { + /* + * TODO: probably better way would be to add hook to + * findDependentObjects and filter partition index + * dependencies there, but for now + * PERFORM_DELETION_OF_RELATION passed for partiton index + * dependency and I'm not sure how to properly filter out + * only this kind of dependency and do not touch behaviour + * that not drops indices during table drop to not rebuild + * them + */ + + Relation depRel; + ObjectAddress object; + ScanKeyData key[2]; + int 
nkeys; + SysScanDesc scan; + HeapTuple tup; + + depRel = table_open(DependRelationId, RowExclusiveLock); + + object.classId = classId; + object.objectId = objectId; + object.objectSubId = subId; + + ScanKeyInit(&key[0], + Anum_pg_depend_classid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(object.classId)); + ScanKeyInit(&key[1], + Anum_pg_depend_objid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(object.objectId)); + nkeys = 2; + + scan = systable_beginscan(depRel, DependDependerIndexId, true, NULL, nkeys, key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_depend foundDep = (Form_pg_depend) GETSTRUCT(tup); + + if (foundDep->deptype == DEPENDENCY_PARTITION_PRI || + foundDep->deptype == DEPENDENCY_PARTITION_SEC) + { + partition_drop_index_list = list_append_unique(partition_drop_index_list, + /* cppcheck-suppress unknownEvaluationOrder */ + list_make2_oid(rel->rd_rel->oid, + rel->rd_index->indrelid)); + break; + } + } + + systable_endscan(scan); + + table_close(depRel, RowExclusiveLock); + } + relation_close(tbl, AccessShareLock); + } + else if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + { + ListCell *lc; + Relation tbl = relation_open(rel->rd_index->indrelid, + AccessShareLock); + + /* + * We don't have secondary index dependencies at this moment + * so we are passing them in partition_drop_index_list from + * before + */ + if (partition_drop_index_list != NIL) + { + foreach(lc, partition_drop_index_list) + { + List *oids = (List *) lfirst(lc); + Relation part_tbl = relation_open(lsecond_oid(oids), AccessShareLock); + OIndexNumber ix_num; + OTableDescr *descr; + int i; + + Assert((part_tbl->rd_rel->relkind == RELKIND_RELATION || + part_tbl->rd_rel->relkind == RELKIND_MATVIEW) && + is_orioledb_rel(part_tbl)); + + descr = relation_get_descr(part_tbl); + Assert(descr != NULL); + + ix_num = InvalidIndexNumber; + for (i = 0; i < descr->nIndices; i++) + { + if (descr->indices[i]->oids.reloid == linitial_oid(oids)) + { 
+ ix_num = i; + break; + } + } + if (ix_num != InvalidIndexNumber) + { + if (descr->indices[ix_num]->primaryIsCtid) + ix_num--; + + o_index_drop(part_tbl, ix_num); + } + relation_close(part_tbl, AccessShareLock); + } + list_free(partition_drop_index_list); + partition_drop_index_list = NIL; + } + relation_close(tbl, AccessShareLock); + } + if (is_open) + relation_close(rel, AccessShareLock); + } + } + else if (access == OAT_DROP && classId == DatabaseRelationId) + { + OSnapshot oSnapshot; + OXid oxid; + + Assert(OidIsValid(objectId)); + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + + o_tables_table_meta_lock(NULL); + o_tables_drop_all(oxid, oSnapshot.csn, objectId); + o_tables_table_meta_unlock(NULL, InvalidOid); + } + else if (access == OAT_DROP && classId == TypeRelationId && + ActiveSnapshotSet()) + { + OSnapshot oSnapshot; + OXid oxid; + Form_pg_type typeform; + HeapTuple tuple = NULL; + + Assert(OidIsValid(objectId)); + + fill_current_oxid_osnapshot_no_check(&oxid, &oSnapshot); + + o_tables_drop_columns_by_type(oxid, oSnapshot.csn, objectId); + + tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(objectId)); + Assert(tuple); + typeform = (Form_pg_type) GETSTRUCT(tuple); + + switch (typeform->typtype) + { + case TYPTYPE_COMPOSITE: + if (typeform->typtypmod == -1) + { + o_class_cache_delete(MyDatabaseId, typeform->typrelid); + } + break; + case TYPTYPE_RANGE: + o_range_cache_delete(MyDatabaseId, typeform->oid); + break; + case TYPTYPE_ENUM: + o_enum_cache_delete_all(MyDatabaseId, typeform->oid); + break; + } + if (typeform->typtype != TYPTYPE_BASE && + typeform->typtype != TYPTYPE_PSEUDO) + o_type_cache_delete(MyDatabaseId, typeform->oid); + if (tuple != NULL) + ReleaseSysCache(tuple); + } + else if (access == OAT_POST_CREATE && classId == RelationRelationId) + { + bool closed = false; + + rel = relation_open(objectId, AccessShareLock); + + if (rel != NULL) + { + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE && + (subId != 0)) + { + 
o_find_composite_type_dependencies(rel->rd_rel->reltype, rel); + } + else if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + (subId != 0) && is_orioledb_rel(rel)) + { + /* Branch is taken during ALTER TABLE ... ADD COLUMN */ + OTableField *field; + Form_pg_attribute attr; + OTable *o_table; + ORelOids oids; + OSnapshot oSnapshot; + OXid oxid; + + ORelOidsSetFromRel(oids, rel); + + o_table = o_tables_get(oids); + if (o_table == NULL) + { + /* table does not exist */ + elog(NOTICE, "orioledb table \"%s\" not found", RelationGetRelationName(rel)); + } + else + { + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + + o_table->nfields++; + o_table->fields = repalloc(o_table->fields, + o_table->nfields * + sizeof(OTableField)); + memset(&o_table->fields[o_table->nfields - 1], 0, + sizeof(OTableField)); + + CommandCounterIncrement(); + field = &o_table->fields[o_table->nfields - 1]; + attr = &rel->rd_att->attrs[rel->rd_att->natts - 1]; + orioledb_attr_to_field(field, attr); + + o_in_add_column = true; + + o_table_resize_constr(o_table); + + /* + * The domain expression may have already been created, so + * we need to explicitly call o_table_fill_constr to + * propagate the default value of the domain type to the + * new column. For non-domain types this will be called by + * another access_hook call only after the default value + * is created in pg catalog. 
+ */ + if (get_typtype(field->typid) == TYPTYPE_DOMAIN && !in_rewrite) + { + o_table_fill_constr(o_table, rel, subId - 1, NULL, field); + } + + o_tables_rel_meta_lock(rel); + o_indices_update(o_table, PrimaryIndexNumber, oxid, oSnapshot.csn); + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_after_update(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(rel, InvalidOid); + + o_table_free(o_table); + } + } + else if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + (subId == 0) && is_orioledb_rel(rel)) + { + if (!OidIsValid(rel->rd_rel->relrewrite)) + { + create_o_table_for_rel(rel); + } + else + { + Relation old_rel = relation_open(rel->rd_rel->relrewrite, AccessShareLock); + + o_saved_relrewrite = rel->rd_rel->relrewrite; + ORelOidsSetFromRel(saved_oids, old_rel); + relation_close(old_rel, AccessShareLock); + } + } + else if ((rel->rd_rel->relkind == RELKIND_TOASTVALUE) && + (subId == 0) && !OidIsValid(rel->rd_rel->relrewrite)) + { + Oid tbl_oid; + Relation tbl = NULL; + + /* This is faster than dependency scan */ + tbl_oid = pg_strtoint64(strrchr(rel->rd_rel->relname.data, + '_') + 1); + + tbl = try_table_open(tbl_oid, AccessShareLock); + if (tbl && is_orioledb_rel(tbl)) + { + set_toast_oids_and_options(tbl, rel, false, false); + } + if (tbl) + table_close(tbl, AccessShareLock); + } + else if (rel->rd_rel->relkind == RELKIND_INDEX) + { + /* Checks and adds bridged indexes */ + Relation tbl; + + CommandCounterIncrement(); + tbl = relation_open(rel->rd_index->indrelid, AccessShareLock); + + if ((tbl->rd_rel->relkind == RELKIND_RELATION || + tbl->rd_rel->relkind == RELKIND_MATVIEW) && + is_orioledb_rel(tbl)) + { + OSnapshot oSnapshot; + OXid oxid; + OTable *o_table; + ORelOids table_oids; + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + + ORelOidsSetFromRel(table_oids, tbl); + o_table = o_tables_get(table_oids); + if (o_table == NULL) + { + elog(NOTICE, "orioledb table %s not found", + 
RelationGetRelationName(tbl)); + } + else + { + int ix_num = InvalidIndexNumber; + int i; + bool add_bridging = false; + bool btree_bridging = false; + + for (i = 0; i < o_table->nindices; i++) + { + if (strcmp(o_table->indices[i].name.data, rel->rd_rel->relname.data) == 0) + { + ix_num = i; + break; + } + } + + Assert(rel->rd_rel->relkind == RELKIND_INDEX); + + /* In case of index reuse, update the index oid */ + if (ix_num != InvalidIndexNumber && list_member_oid(o_reuse_indices, o_table->indices[ix_num].oids.reloid)) + { + Oid old_oid = o_table->indices[ix_num].oids.reloid; + + elog(DEBUG1, "object_access_hook: updating index oid %d to %d", old_oid, objectId); + o_table->indices[ix_num].oids.reloid = objectId; + o_tables_rel_meta_lock(tbl); + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(tbl, InvalidOid); + o_invalidate_oids(o_table->oids); + o_reuse_indices = list_delete_oid(o_reuse_indices, old_oid); + drop_index_list = list_delete(drop_index_list, makeString(rel->rd_rel->relname.data)); + } + + if (!o_table->index_bridging) + { + if (rel->rd_rel->relam == BTREE_AM_OID) + { + OBTOptions *options = (OBTOptions *) rel->rd_options; + + if (options && !options->orioledb_index) + { + add_bridging = true; + btree_bridging = true; + } + } + else + { + add_bridging = true; + } + } + else if (rel->rd_rel->relam == BTREE_AM_OID) + { + if (!in_rewrite && !rel->rd_index->indisprimary && ix_num == InvalidIndexNumber) + { + OBTOptions *options = (OBTOptions *) rel->rd_options; + + if (options && !options->orioledb_index) + btree_bridging = true; + } + } + + if (btree_bridging) + { + ereport(WARNING, + errcode(ERRCODE_WARNING), + errmsg("using bridged btree index for orioledb"), + errdetail("This feature is intended for testing purposes and is not recommended for normal usage.")); + } + + if (add_bridging) + { + /* + * Ensure the table has a toast relation before + * adding the bridge index. 
The bridge rebuild + * path recreates all indices including the toast + * tree, so the toast OIDs must already be set. + * During CREATE TABLE the toast table may not + * exist yet if table inherits indices from parent + * table. + */ + if (!ORelOidsIsValid(o_table->toast_oids)) + { + Datum toast_options; + static char *validnsps[] = HEAP_RELOPT_NAMESPACES; + + Assert(create_stmt != NULL); + + toast_options = transformRelOptions((Datum) 0, + create_stmt->options, + "toast", + validnsps, + true, false); + (void) heap_reloptions(RELKIND_TOASTVALUE, + toast_options, + true); + + relation_close(tbl, AccessShareLock); + + /* + * NewRelationCreateToastTable ends with + * CommandCounterIncrement(), so that the + * TOAST table will be visible for + * add_bridge_index(). + */ + NewRelationCreateToastTable(rel->rd_index->indrelid, toast_options); + + tbl = relation_open(rel->rd_index->indrelid, AccessShareLock); + + ORelOidsSetFromRel(table_oids, tbl); + o_table_free(o_table); + o_table = o_tables_get(table_oids); + } + add_bridge_index(tbl, o_table, false, rel->rd_rel->relam); + } + else + o_table_free(o_table); + } + } + relation_close(tbl, AccessShareLock); + } + if (!closed) + relation_close(rel, AccessShareLock); + } + } + else if (access == OAT_POST_CREATE && classId == AttrDefaultRelationId) + { + rel = relation_open(objectId, AccessShareLock); + + if (rel != NULL && (rel->rd_rel->relkind == RELKIND_RELATION) && + (subId != 0) && is_orioledb_rel(rel)) + { + Form_pg_attribute attr; + OTable *o_table; + ORelOids oids; + OSnapshot oSnapshot; + OXid oxid; + + ORelOidsSetFromRel(oids, rel); + o_table = o_tables_get(oids); + if (o_table == NULL) + { + /* table does not exist */ + elog(NOTICE, "orioledb table \"%s\" not found", + RelationGetRelationName(rel)); + } + else + { + OTableField old_field; + OTableField *field; + bool changed; + + old_field = o_table->fields[subId - 1]; + CommandCounterIncrement(); + field = &o_table->fields[subId - 1]; + attr = 
&rel->rd_att->attrs[subId - 1]; + orioledb_attr_to_field(field, attr); + + /* TODO: Probably use CheckIndexCompatible here */ + changed = old_field.typid != field->typid || + old_field.typmod != field->typmod || + old_field.collation != field->collation; + + if (changed) + { + if (ATColumnChangeRequiresRewrite(&old_field, field, objectId, + subId)) + in_rewrite = true; + } + + if (!in_rewrite) + { + o_table_fill_constr(o_table, rel, subId - 1, + &old_field, field); + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + o_tables_rel_meta_lock(rel); + o_indices_update(o_table, PrimaryIndexNumber, oxid, oSnapshot.csn); + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_after_update(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(rel, InvalidOid); + + /* This has no effect? */ + o_table->fields[subId - 1] = old_field; + o_table_free(o_table); + } + } + } + relation_close(rel, AccessShareLock); + } + else if (access == OAT_POST_ALTER && classId == RelationRelationId) + { + rel = relation_open(objectId, AccessShareLock); + + if (rel != NULL) + { + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + { + OClassArg arg = {0}; + + o_find_composite_type_dependencies(rel->rd_rel->reltype, rel); + CommandCounterIncrement(); + o_class_cache_update_if_needed(MyDatabaseId, rel->rd_rel->oid, + (Pointer) &arg); + if (arg.found) + { + XLogRecPtr cur_lsn; + + o_sys_cache_set_datoid_lsn(&cur_lsn, NULL); + o_cache_type(MyDatabaseId, rel->rd_rel->reltype, InvalidOid, + cur_lsn); + } + } + else if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + (subId != 0) && is_orioledb_rel(rel)) + { + OTable *o_table; + ORelOids oids; + + ORelOidsSetFromRel(oids, rel); + o_table = o_tables_get(oids); + if (o_table == NULL) + { + /* table does not exist */ + elog(NOTICE, "orioledb table \"%s\" not found", + RelationGetRelationName(rel)); + } + else + { + OTableField old_field; + OTableField *field; + Form_pg_attribute attr; + OSnapshot 
oSnapshot; + OXid oxid; + int ix_num; + bool changed_ty; + + old_field = o_table->fields[subId - 1]; + CommandCounterIncrement(); + field = &o_table->fields[subId - 1]; + attr = &rel->rd_att->attrs[subId - 1]; + orioledb_attr_to_field(field, attr); + + /* TODO: Probably use CheckIndexCompatible here */ + changed_ty = old_field.typid != field->typid || + old_field.typmod != field->typmod || + old_field.collation != field->collation; + + if (changed_ty) + { + if (ATColumnChangeRequiresRewrite(&old_field, field, objectId, + subId)) + in_rewrite = true; + } + + /* + * Alter table on generated column triggers table rewrite + * due to need of recalculating column value for existing + * rows + */ + if (old_field.generated) + { + in_rewrite = true; + o_alter_generated_column_id = lappend(o_alter_generated_column_id, + /* cppcheck-suppress unknownEvaluationOrder */ + list_make2(makeInteger(objectId), makeInteger(subId))); + } + + if (!in_rewrite) + { + orioledb_save_collation(field->collation); + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + o_tables_rel_meta_lock(rel); + for (ix_num = 0; ix_num < o_table->nindices; ix_num++) + { + bool compatible = false; + int field_num; + int ctid_idx_off; + OTableIndex *o_table_index; + List *attributeList = NIL; + int expr_field = 0; + ListCell *indexpr; + + ctid_idx_off = o_table->has_primary ? 
0 : 1; + o_table_index = &o_table->indices[ix_num]; + indexpr = list_head(o_table_index->expressions); + + for (field_num = 0; field_num < o_table_index->nkeyfields; field_num++) + { + IndexElem *iparam; + OTableIndexField *iField = &o_table_index->fields[field_num]; + int attnum = iField->attnum; + OTableField *table_field; + + iparam = makeNode(IndexElem); + if (attnum != EXPR_ATTNUM) + { + table_field = &o_table->fields[attnum]; + iparam->name = table_field->name.data; + iparam->expr = NULL; + } + else + { + table_field = &o_table_index->exprfields[expr_field++]; + iparam->name = NULL; + iparam->expr = lfirst(indexpr); + indexpr = lnext(o_table_index->expressions, indexpr); + } + + iparam->collation = get_collation(table_field->collation, table_field->typid); + iparam->opclass = get_opclass(iField->opclass, table_field->typid); + iparam->ordering = iField->ordering; + iparam->nulls_ordering = iField->nullsOrdering; + + attributeList = lappend(attributeList, iparam); + } + + compatible = CheckIndexCompatible(o_table_index->oids.reloid, "btree", attributeList, NIL); + + for (field_num = 0; field_num < o_table_index->nkeyfields; + field_num++) + { + bool has_field; + + has_field = o_table_index->fields[field_num].attnum == + subId - 1; + if (o_table_index->type == oIndexPrimary || has_field) + { + o_indices_update(o_table, + ix_num + ctid_idx_off, + oxid, oSnapshot.csn); + o_invalidate_oids(o_table_index->oids); + o_add_invalidate_undo_item(o_table_index->oids, + O_INVALIDATE_OIDS_ON_ABORT); + } + if (changed_ty && has_field && (o_table_index->type == oIndexPrimary || !compatible)) + { + String *ix_name; + + ix_name = + makeString(pstrdup(o_table_index->name.data)); + drop_index_list = + list_append_unique(drop_index_list, + ix_name); + } + } + } + o_indices_update(o_table, PrimaryIndexNumber, oxid, oSnapshot.csn); + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_after_update(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(rel, InvalidOid); + } 
+ o_table->fields[subId - 1] = old_field; + o_table_free(o_table); + } + } + else if (rel->rd_rel->relkind == RELKIND_INDEX) + { + Relation tbl = relation_open(rel->rd_index->indrelid, + AccessShareLock); + + if ((tbl->rd_rel->relkind == RELKIND_RELATION || + tbl->rd_rel->relkind == RELKIND_MATVIEW) && + is_orioledb_rel(tbl)) + { + OTable *o_table; + ORelOids table_oids; + + ORelOidsSetFromRel(table_oids, tbl); + o_table = o_tables_get(table_oids); + if (o_table == NULL) + { + elog(NOTICE, "orioledb table %s not found", + RelationGetRelationName(tbl)); + } + else if (rel->rd_rel->relam == BTREE_AM_OID && + !(rel->rd_options && + !((OBTOptions *) rel->rd_options)->orioledb_index)) + { + int ix_num; + OSnapshot oSnapshot; + OXid oxid; + ORelOids idx_oids; + Oid reltablespace; + + ORelOidsSetFromRel(idx_oids, rel); + CommandCounterIncrement(); + if (rel->rd_options && !((OBTOptions *) rel->rd_options)->orioledb_index) + elog(ERROR, "Cannot change 'orioledb_index' option for existing indices"); + reltablespace = rel->rd_rel->reltablespace; + for (ix_num = 0; ix_num < o_table->nindices; ix_num++) + { + OTableIndex *index = &o_table->indices[ix_num]; + OBTOptions *options = (OBTOptions *) rel->rd_options; + + if (ORelOidsIsEqual(index->oids, idx_oids)) + { + namestrcpy(&index->name, + rel->rd_rel->relname.data); + index->fillfactor = options ? options->bt_options.fillfactor : BTREE_DEFAULT_FILLFACTOR; + break; + } + } + Assert(ix_num < o_table->nindices); + if (!OidIsValid(reltablespace)) + reltablespace = MyDatabaseTableSpace; + if (o_table->indices[ix_num].tablespace == reltablespace) + { + int ctid_idx_off = o_table->has_primary ? 
0 : 1; + + fill_current_oxid_osnapshot(&oxid, &oSnapshot); + o_tables_rel_meta_lock(tbl); + o_indices_update(o_table, ix_num + ctid_idx_off, oxid, oSnapshot.csn); + o_tables_update(o_table, oxid, oSnapshot.csn); + o_tables_rel_meta_unlock(tbl, InvalidOid); + o_invalidate_oids(idx_oids); + o_add_invalidate_undo_item(idx_oids, + O_INVALIDATE_OIDS_ON_ABORT); + if (!ORelOidsIsEqual(idx_oids, table_oids)) + { + o_invalidate_oids(table_oids); + o_add_invalidate_undo_item(table_oids, + O_INVALIDATE_OIDS_ON_ABORT); + } + o_table_free(o_table); + } + else + { + o_index_drop(tbl, ix_num); + o_table_free(o_table); + o_define_index(tbl, rel, InvalidOid, false, + InvalidIndexNumber, InvalidOid, NULL); + } + } + else if (rel->rd_options) + { + bool old_orioledb_index = false; + bool new_orioledb_index = false; + + switch (rel->rd_amhandler) + { + case F_BTHANDLER: + old_orioledb_index = ((OBTOptions *) rel->rd_options)->orioledb_index; + CommandCounterIncrement(); + new_orioledb_index = rel->rd_options && + ((OBTOptions *) rel->rd_options)->orioledb_index; + break; + default: + break; + } + if (old_orioledb_index != new_orioledb_index || !rel->rd_options) + elog(ERROR, "Cannot change 'orioledb_index' option for existing indices"); + + } + } + relation_close(tbl, AccessShareLock); + } + else if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + (subId == 0)) + { + /* + * We come here during "ALTER TABLE ... 
SET TABLESPACE" after + * orioledb_relation_copy_data + */ + if (is_orioledb_rel(rel)) + { + ORelOids old_oids; + Oid old_reltablespace = rel->rd_rel->reltablespace; + + if (!OidIsValid(old_reltablespace)) + old_reltablespace = MyDatabaseTableSpace; + ORelOidsSetFromRel(old_oids, rel); + CommandCounterIncrement(); + if (old_reltablespace != rel->rd_rel->reltablespace) + { + o_saved_reltablespace = old_reltablespace; + saved_oids = old_oids; + } + } + } + else if (rel->rd_rel->relkind == RELKIND_TOASTVALUE && + OidIsValid(o_saved_relrewrite) && + OidIsValid(rel->rd_rel->relrewrite) && + (subId == 0)) + { + Relation tbl = NULL; + + tbl = table_open(o_saved_relrewrite, AccessShareLock); + if (is_orioledb_rel(tbl)) + { + ORelOids new_oids; + OTable *old_o_table, + *new_o_table; + + CommandCounterIncrement(); + + old_o_table = o_tables_get(saved_oids); + Assert(old_o_table != NULL); + + create_o_table_for_rel(tbl); + + set_toast_oids_and_options(tbl, rel, false, old_o_table->index_bridging); + + ORelOidsSetFromRel(new_oids, tbl); + new_o_table = o_tables_get(new_oids); + Assert(new_o_table != NULL); + + relation_close(tbl, AccessShareLock); + CommandCounterIncrement(); + tbl = relation_open(o_saved_relrewrite, AccessShareLock); + + /* + * Redefinig primary key here to not do rebuild after + * rewrite_table + */ + redefine_indices(tbl, new_o_table, true, InvalidOid); + + o_table_free(new_o_table); + new_o_table = o_tables_get(new_oids); + Assert(new_o_table != NULL); + + switch (tbl->rd_rel->relkind) + { + case RELKIND_RELATION: + rewrite_table(tbl, old_o_table, new_o_table); + break; + case RELKIND_MATVIEW: + o_saved_relrewrite = InvalidOid; + if (savedDataQuery != NULL) + rewrite_matview(tbl, old_o_table, new_o_table); + o_drop_table(old_o_table->oids); + break; + default: + Assert(false); + break; + } + + redefine_indices(tbl, new_o_table, false, InvalidOid); + + o_table_free(old_o_table); + o_table_free(new_o_table); + o_saved_relrewrite = InvalidOid; + } + 
table_close(tbl, AccessShareLock); + } + else if (rel->rd_rel->relkind == RELKIND_TOASTVALUE && + (subId == 0)) + { + Oid tbl_oid; + Relation tbl = NULL; + + /* This is faster than dependency scan */ + tbl_oid = pg_strtoint64(strrchr(rel->rd_rel->relname.data, + '_') + 1); + CommandCounterIncrement(); + + tbl = try_table_open(tbl_oid, AccessShareLock); + if (tbl && is_orioledb_rel(tbl)) + { + ORelOids oids; + OTableDescr *descr; + ORelOptions *options = (ORelOptions *) tbl->rd_options; + uint8 new_fillfactor; + bool new_index_bridging; + Oid reltablespace = rel->rd_rel->reltablespace; + + if (reltablespace == 0) + reltablespace = MyDatabaseTableSpace; + CommandCounterIncrement(); + ORelOidsSetFromRel(oids, tbl); + if (OidIsValid(o_saved_reltablespace) && + o_saved_reltablespace != reltablespace) + { + /* + * We come here during "ALTER TABLE ... SET + * TABLESPACE" + */ + OTable *old_o_table, + *new_o_table; + + Assert(ORelOidsIsValid(saved_oids)); + old_o_table = o_tables_get(saved_oids); + Assert(old_o_table != NULL); + + create_o_table_for_rel(tbl); + + set_toast_oids_and_options(tbl, rel, false, old_o_table->index_bridging); + + new_o_table = o_tables_get(oids); + Assert(new_o_table != NULL); + + relation_close(tbl, AccessShareLock); + CommandCounterIncrement(); + tbl = relation_open(tbl_oid, AccessShareLock); + + /* + * Redefinig primary key here to not do rebuild after + * rewrite_table + */ + redefine_indices(tbl, new_o_table, true, old_o_table->oids.relnode); + + o_table_free(new_o_table); + new_o_table = o_tables_get(oids); + Assert(new_o_table != NULL); + + switch (tbl->rd_rel->relkind) + { + case RELKIND_RELATION: + case RELKIND_MATVIEW: + + /* + * for matview we just copy data to not + * recalculate expressions + */ + Assert(alter_type_exprs == NIL); + rewrite_table(tbl, old_o_table, new_o_table); + break; + default: + Assert(false); + break; + } + + redefine_indices(tbl, new_o_table, false, old_o_table->oids.relnode); + + o_table_free(old_o_table); + 
o_table_free(new_o_table); + + o_saved_reltablespace = InvalidOid; + ORelOidsSetInvalid(saved_oids); + } + else + { + /* + * We come here during "ALTER TABLE ... SET