diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh
new file mode 100755
index 0000000000000..7bd661ab4a06e
--- /dev/null
+++ b/benchmarks/collect_bench.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script is meant for developers of DataFusion -- it is runnable
+# from the standard DataFusion development environment and uses cargo,
+# etc. It orchestrates gathering data and runs the benchmark binary to
+# collect benchmarks from the current main and the last 5 major releases.
+#
+# Usage: ./collect_bench.sh <benchmark>
+#
+# <benchmark> is passed to `./bench.sh run`; each run's JSON result is
+# converted by lineprotocol.py and appended to
+# results/<benchmark>-<unix timestamp>.lp.
+
+trap 'git checkout main' EXIT # return to main on exit, even after a failure
+ARG1=${1:?"usage: collect_bench.sh <benchmark>"}
+
+main(){
+    timestamp=$(date +%s)
+    lp_file="results/$ARG1-$timestamp.lp"
+    mkdir -p results
+    touch "$lp_file"
+
+    # Snapshot the converter before switching revisions so the same
+    # lineprotocol.py is used for every checkout below.
+    cp lineprotocol.py results/
+
+    git fetch upstream main
+    # Abort rather than benchmark some other branch and label it as main.
+    git checkout main || exit 1
+
+    # get current major version
+    output=$(cargo metadata --format-version=1 --no-deps | jq '.packages[] | select(.name == "datafusion") | .version')
+    major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1)
+    if [ -z "$major_version" ]; then
+        echo "could not determine the datafusion major version" >&2
+        exit 1
+    fi
+
+    # run for current main
+    echo "current major version: $major_version"
+    export RESULTS_DIR="results/$major_version.0.0"
+    ./bench.sh run "$ARG1"
+    python3 results/lineprotocol.py "$RESULTS_DIR/$ARG1.json" >> "$lp_file"
+
+    # run for last 5 major releases
+    for i in {1..5}; do
+        version="$((major_version - i)).0.0"
+        echo "running benchmark on $version"
+        git fetch upstream "$version"
+        # Skip the release instead of recording results under the wrong version.
+        if ! git checkout "$version"; then
+            echo "skipping $version: checkout failed" >&2
+            continue
+        fi
+        export RESULTS_DIR="results/$version"
+        ./bench.sh run "$ARG1"
+        python3 results/lineprotocol.py "$RESULTS_DIR/$ARG1.json" >> "$lp_file"
+    done
+}
+
+main
diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py
index 75e09b662e3e1..5242ccfeaab6d 100644
--- a/benchmarks/lineprotocol.py
+++ b/benchmarks/lineprotocol.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 #!/usr/bin/env python
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.
See the NOTICE file
@@ -76,7 +78,6 @@
 }
 """
 
-from __future__ import annotations
 
 import json
 from dataclasses import dataclass
@@ -84,6 +85,7 @@
 from pathlib import Path
 from argparse import ArgumentParser
 import sys
+import subprocess
 
 print = sys.stdout.write
 
@@ -128,16 +130,28 @@ class Context:
     start_time: int
     arguments: List[str]
     name: str
+    commit_timestamp: int
 
     @classmethod
     def load_from(cls, data: Dict[str, Any]) -> Context:
+        # Commit time (Unix seconds, git's %ct) of the revision named by
+        # datafusion_version; check=True raises if git cannot resolve it.
+        get_timestamp = subprocess.run(
+            ["git", "log", "-1", "--format=%ct", data["datafusion_version"]],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        # Parse to int so the value matches the field's declared type.
+        commit_timestamp = int(get_timestamp.stdout.strip())
         return cls(
             benchmark_version=data["benchmark_version"],
             datafusion_version=data["datafusion_version"],
             num_cpus=data["num_cpus"],
             start_time=data["start_time"],
             arguments=data["arguments"],
-            name=data["arguments"][0]
+            name=data["arguments"][0],
+            commit_timestamp=commit_timestamp,
         )
 
 
@@ -164,7 +178,7 @@ def lineformat(
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline)
     context = baseline.context
-    benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}"
+    benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus},commit_timestamp={context.commit_timestamp}"
     for query in baseline.queries:
         query_str = f"query=\"{query.query}\""
         timestamp = f"{query.start_time*10**9}"
@@ -180,7 +194,7 @@ def main() -> None:
     )
     options = parser.parse_args()
 
-    lineformat(options.baseline_path)
+    lineformat(options.path)
 
 
 