From e935e813df805cff9ae9356614d7426a63e372a4 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 10:21:39 -0400 Subject: [PATCH 01/18] feat: run benchmark on latest 6 versions automatically --- benchmarks/collect_bench.sh | 69 +++++++++++++++++++++++++++++++++++++ benchmarks/lineprotocol.py | 9 +++-- benchmarks/src/util/run.rs | 16 +++++++++ 3 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 benchmarks/collect_bench.sh diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh new file mode 100644 index 0000000000000..ea8086e3dc191 --- /dev/null +++ b/benchmarks/collect_bench.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is meant for developers of DataFusion -- it is runnable +# from the standard DataFusion development environment and uses cargo, +# etc and orchestrates gathering data and run the benchmark binary to +# collect benchmarks from the current main and last 5 major releases. + +trap 'git checkout main' EXIT #checkout to main on exit +ARG1=$1 + +main(){ +timestamp=$(date +%s) +lp_file="results/$ARG1-$timestamp.lp" + +git fetch upstream main +git checkout main + +# get current major version +output=$(cargo metadata --format-version=1 --no-deps | jq '.packages[] | select(.name == "datafusion") | .version') +major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) + +# run for current main +echo "current major version: $major_version" +export RESULTS_DIR="results/$major_version.0.0" +./bench.sh run $ARG1 +python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file + +# run for last 5 major releases +for i in {1..5}; do + echo "running benchmark on $((major_version-i)).0.0" + # git fetch upstream $((major_version-i)).0.0 + git checkout $((major_version-i)).0.0 + export RESULTS_DIR="results/$((major_version-i)).0.0" + ./bench.sh run $ARG1 + python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file +done + +echo "[[inputs.file]] + files = [ \"$lp_file\" ] + data_format = \"influx\" + name_override = \"datafusion_benchmarks\" + +[[outputs.influxdb_v2]] + alias = \"monitor-tools\" + urls = [\"https://us-east-1-2.aws.cloud2.influxdata.com\"] + token = \"$INFLUX_TOKEN\" + organization = \"5d59ccc5163fc318\" + bucket = \"performance_metrics\" +" > results/telegraf.conf +telegraf --config results/telegraf.conf --once +} + +main \ No newline at end of file diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index 75e09b662e3e1..22138be69a33b 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -1,3 +1,4 @@ + #!/usr/bin/env python # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -16,6 +17,7 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations """ Converts a given json to LineProtocol format that can be @@ -76,7 +78,6 @@ } """ -from __future__ import annotations import json from dataclasses import dataclass @@ -124,6 +125,7 @@ def execution_time(self) -> float: class Context: benchmark_version: str datafusion_version: str + datafusion_commit_timestamp: int num_cpus: int start_time: int arguments: List[str] @@ -134,6 +136,7 @@ def load_from(cls, data: Dict[str, Any]) -> Context: return cls( benchmark_version=data["benchmark_version"], datafusion_version=data["datafusion_version"], + datafusion_commit_timestamp=data["datafusion_commit_timestamp"], num_cpus=data["num_cpus"], start_time=data["start_time"], arguments=data["arguments"], @@ -164,7 +167,7 @@ def lineformat( ) -> None: baseline = BenchmarkRun.load_from_file(baseline) context = baseline.context - benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}" + benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},datafusion_commit_timestamp={context.datafusion_commit_timestamp},num_cpus={context.num_cpus}" for query in baseline.queries: query_str = f"query=\"{query.query}\"" timestamp = f"{query.start_time*10**9}" @@ -180,7 +183,7 @@ def main() -> None: ) options = parser.parse_args() - lineformat(options.baseline_path) + lineformat(options.path) diff --git a/benchmarks/src/util/run.rs b/benchmarks/src/util/run.rs index 13969f4d39497..50092de88e4c2 100644 --- a/benchmarks/src/util/run.rs +++ b/benchmarks/src/util/run.rs @@ -24,6 +24,7 @@ use std::{ path::Path, time::{Duration, SystemTime}, }; +use std::process::Command; fn serialize_start_time(start_time: &SystemTime, ser: S) -> Result where @@ -49,6 +50,9 @@ pub struct RunContext { pub benchmark_version: String, /// DataFusion crate version pub datafusion_version: String, + /// DataFusion crate commit timestamp + #[serde(serialize_with = "serialize_start_time")] + pub datafusion_commit_timestamp: SystemTime, /// Number of CPU cores pub num_cpus: usize, /// Start time @@ -66,9 +70,21 @@ impl Default for RunContext { impl RunContext { pub fn new() -> Self { + let commit_timestamp = Command::new("git") + .args(&["log", "-1", "--format=%ct"]) + .output() + .expect("failed to execute git command") + .stdout; + let commit_timestamp = String::from_utf8(commit_timestamp) + .expect("failed to convert git output to string") + .trim() + .parse::() + .expect("failed to parse commit timestamp"); + Self { benchmark_version: env!("CARGO_PKG_VERSION").to_owned(), datafusion_version: DATAFUSION_VERSION.to_owned(), + datafusion_commit_timestamp: SystemTime::UNIX_EPOCH + Duration::from_secs(commit_timestamp), num_cpus: get_available_parallelism(), start_time: SystemTime::now(), arguments: std::env::args().skip(1).collect::>(), From 2ad1dbec549015e608e56ad342821843e5d2d355 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 10:23:17 -0400 Subject: [PATCH 02/18] fix: make collect_bench executable --- benchmarks/collect_bench.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 benchmarks/collect_bench.sh diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh old mode 100644 new mode 100755 From 65d551da2f9d4a3205a794e335ec989fc9083f21 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 12:10:00 -0400 Subject: [PATCH 03/18] fix: collect_bench fetch upstream --- benchmarks/collect_bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index ea8086e3dc191..3335db5bd0065 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -44,7 +44,7 @@ python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file # run for last 5 major releases for i in {1..5}; do echo "running benchmark on $((major_version-i)).0.0" - # git fetch upstream $((major_version-i)).0.0 + git fetch upstream $((major_version-i)).0.0 git checkout $((major_version-i)).0.0 export RESULTS_DIR="results/$((major_version-i)).0.0" ./bench.sh run $ARG1 From ae07e055144f63695817c67d4bd02fe8e4e36a23 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 12:42:12 -0400 Subject: [PATCH 04/18] fix: copy lineprotocol.py during benchmarks --- benchmarks/collect_bench.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 3335db5bd0065..8973fcb528fe9 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -35,11 +35,13 @@ git checkout main output=$(cargo metadata --format-version=1 --no-deps | jq '.packages[] | select(.name == "datafusion") | .version') major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) +cp lineprotocol.py results/lineprotocol.py + # run for current main echo "current major version: $major_version" export RESULTS_DIR="results/$major_version.0.0" ./bench.sh run $ARG1 -python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file +python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file # run for last 5 major releases for i in {1..5}; do @@ -48,7 +50,7 @@ for i in {1..5}; do git checkout $((major_version-i)).0.0 export RESULTS_DIR="results/$((major_version-i)).0.0" ./bench.sh run $ARG1 - python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file + python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file done echo "[[inputs.file]] From 8fed0aa75deecbabc020d4c67b2debaa2e643596 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 13:07:48 -0400 Subject: [PATCH 05/18] fix: copy lineprotocol.py --- benchmarks/collect_bench.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 8973fcb528fe9..85352e11ee1bd 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -35,7 +35,8 @@ git checkout main output=$(cargo metadata --format-version=1 --no-deps | jq '.packages[] | select(.name == "datafusion") | .version') major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) -cp lineprotocol.py results/lineprotocol.py +mkdir results +cp lineprotocol.py results/ # run for current main echo "current major version: $major_version" From 2f2ff253aad225879c0cbfc799e40fedc03aec25 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 13:18:18 -0400 Subject: [PATCH 06/18] fix: move python file import to the beginning --- benchmarks/lineprotocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index 22138be69a33b..feb9379475861 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -1,3 +1,4 @@ +from __future__ import annotations #!/usr/bin/env python # Licensed to the Apache Software Foundation (ASF) under one @@ -17,7 +18,6 @@ # specific language governing permissions and limitations # under the License. -from __future__ import annotations """ Converts a given json to LineProtocol format that can be From 19b020015c83cbb1c0dc211851ea4bcb23ff8da2 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 13:20:15 -0400 Subject: [PATCH 07/18] fix: copy lineprotocol.py before git checkout --- benchmarks/collect_bench.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 85352e11ee1bd..2fac368e9d64e 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -28,6 +28,9 @@ main(){ timestamp=$(date +%s) lp_file="results/$ARG1-$timestamp.lp" +mkdir results +cp lineprotocol.py results/ + git fetch upstream main git checkout main @@ -35,8 +38,6 @@ git checkout main output=$(cargo metadata --format-version=1 --no-deps | jq '.packages[] | select(.name == "datafusion") | .version') major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) -mkdir results -cp lineprotocol.py results/ # run for current main echo "current major version: $major_version" From ade635d8d9a26c45d5c7f84648ce8090f701b45f Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 13:58:34 -0400 Subject: [PATCH 08/18] fix: run collect_bench in working dir --- benchmarks/.gitignore | 1 + benchmarks/collect_bench.sh | 20 +++++++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore index c35b1a7c1944f..4dfc01cf0fa48 100644 --- a/benchmarks/.gitignore +++ b/benchmarks/.gitignore @@ -1,3 +1,4 @@ data results venv +working diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 2fac368e9d64e..1c8878ff05f62 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -21,15 +21,21 @@ # etc and orchestrates gathering data and run the benchmark binary to # collect benchmarks from the current main and last 5 major releases. -trap 'git checkout main' EXIT #checkout to main on exit +trap 'rm -rf working; git checkout main' EXIT #checkout to main on exit ARG1=$1 main(){ timestamp=$(date +%s) lp_file="results/$ARG1-$timestamp.lp" -mkdir results -cp lineprotocol.py results/ +mkdir working +cp bench.sh working/ +cp collect_bench.sh working/ +cp lineprotocol.py working/ +cp -r queries working/ +cp -r src working/ + +cd working git fetch upstream main git checkout main @@ -43,7 +49,7 @@ major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) echo "current major version: $major_version" export RESULTS_DIR="results/$major_version.0.0" ./bench.sh run $ARG1 -python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file +python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file # run for last 5 major releases for i in {1..5}; do @@ -52,7 +58,7 @@ for i in {1..5}; do git checkout $((major_version-i)).0.0 export RESULTS_DIR="results/$((major_version-i)).0.0" ./bench.sh run $ARG1 - python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file + python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file done echo "[[inputs.file]] @@ -66,8 +72,8 @@ echo "[[inputs.file]] token = \"$INFLUX_TOKEN\" organization = \"5d59ccc5163fc318\" bucket = \"performance_metrics\" -" > results/telegraf.conf -telegraf --config results/telegraf.conf --once +" > telegraf.conf +telegraf --config telegraf.conf --once } main \ No newline at end of file From 46e8e2bc1ee46ba9c771ea4f015f96722a6873b1 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 15:15:27 -0400 Subject: [PATCH 09/18] fix: copy data dir --- benchmarks/collect_bench.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 1c8878ff05f62..89705c51f9341 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -21,7 +21,7 @@ # etc and orchestrates gathering data and run the benchmark binary to # collect benchmarks from the current main and last 5 major releases. -trap 'rm -rf working; git checkout main' EXIT #checkout to main on exit +trap 'cd ..; rm -rf working; git checkout main' EXIT #checkout to main on exit ARG1=$1 main(){ @@ -32,6 +32,7 @@ mkdir working cp bench.sh working/ cp collect_bench.sh working/ cp lineprotocol.py working/ +cp -r data working/ cp -r queries working/ cp -r src working/ @@ -48,7 +49,7 @@ major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) # run for current main echo "current major version: $major_version" export RESULTS_DIR="results/$major_version.0.0" -./bench.sh run $ARG1 +DATAFUSION_DIR=../../ ./bench.sh run $ARG1 python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file # run for last 5 major releases @@ -57,7 +58,7 @@ for i in {1..5}; do git fetch upstream $((major_version-i)).0.0 git checkout $((major_version-i)).0.0 export RESULTS_DIR="results/$((major_version-i)).0.0" - ./bench.sh run $ARG1 + DATAFUSION_DIR=../../ ./bench.sh run $ARG1 python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file done From b9335fadfb8545eb78fa2f8fd1944728d1379973 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 15:32:02 -0400 Subject: [PATCH 10/18] fix: lp_file --- benchmarks/collect_bench.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 89705c51f9341..540fdfbb82492 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -26,7 +26,8 @@ ARG1=$1 main(){ timestamp=$(date +%s) -lp_file="results/$ARG1-$timestamp.lp" +lp_file="../results/$ARG1-$timestamp.lp" +touch $lp_file mkdir working cp bench.sh working/ From cb5ae08864ce4d999fa8eb517d905f429c5a8f11 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 15:36:52 -0400 Subject: [PATCH 11/18] fix: create lp file --- benchmarks/collect_bench.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 540fdfbb82492..71ec67648b823 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -25,9 +25,6 @@ trap 'cd ..; rm -rf working; git checkout main' EXIT #checkout to main on exit ARG1=$1 main(){ -timestamp=$(date +%s) -lp_file="../results/$ARG1-$timestamp.lp" -touch $lp_file mkdir working cp bench.sh working/ @@ -39,6 +36,10 @@ cp -r src working/ cd working +timestamp=$(date +%s) +lp_file="../results/$ARG1-$timestamp.lp" +touch $lp_file + git fetch upstream main git checkout main @@ -49,16 +50,16 @@ major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) # run for current main echo "current major version: $major_version" -export RESULTS_DIR="results/$major_version.0.0" +export RESULTS_DIR="../results/$major_version.0.0" DATAFUSION_DIR=../../ ./bench.sh run $ARG1 python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file -# run for last 5 major releases +run for last 5 major releases for i in {1..5}; do echo "running benchmark on $((major_version-i)).0.0" git fetch upstream $((major_version-i)).0.0 git checkout $((major_version-i)).0.0 - export RESULTS_DIR="results/$((major_version-i)).0.0" + export RESULTS_DIR="../results/$((major_version-i)).0.0" DATAFUSION_DIR=../../ ./bench.sh run $ARG1 python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file done From 5ca50fbcbb7d68691a1b842ad48e7a8d2e5b0b5b Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 15:59:09 -0400 Subject: [PATCH 12/18] fix: results dir --- benchmarks/collect_bench.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 71ec67648b823..b5efea66de4f8 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -26,7 +26,7 @@ ARG1=$1 main(){ -mkdir working +mkdir -p working cp bench.sh working/ cp collect_bench.sh working/ cp lineprotocol.py working/ @@ -38,6 +38,7 @@ cd working timestamp=$(date +%s) lp_file="../results/$ARG1-$timestamp.lp" +mkdir -p ../results touch $lp_file git fetch upstream main @@ -50,18 +51,18 @@ major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) # run for current main echo "current major version: $major_version" -export RESULTS_DIR="../results/$major_version.0.0" +export RESULTS_DIR="results/$major_version.0.0" DATAFUSION_DIR=../../ ./bench.sh run $ARG1 -python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file +python3 lineprotocol.py ../$RESULTS_DIR/$ARG1.json >> $lp_file run for last 5 major releases for i in {1..5}; do echo "running benchmark on $((major_version-i)).0.0" git fetch upstream $((major_version-i)).0.0 git checkout $((major_version-i)).0.0 - export RESULTS_DIR="../results/$((major_version-i)).0.0" + export RESULTS_DIR="results/$((major_version-i)).0.0" DATAFUSION_DIR=../../ ./bench.sh run $ARG1 - python3 lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file + python3 lineprotocol.py ../$RESULTS_DIR/$ARG1.json >> $lp_file done echo "[[inputs.file]] From f95d946257f0673b9d3dee7db62318cc1c5a0d36 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 21:07:55 -0400 Subject: [PATCH 13/18] feat: get the commit timestamp in lineprotocol.py --- benchmarks/.gitignore | 1 - benchmarks/collect_bench.sh | 25 ++++++++----------------- benchmarks/lineprotocol.py | 16 ++++++++++++---- benchmarks/src/util/run.rs | 16 ---------------- 4 files changed, 20 insertions(+), 38 deletions(-) diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore index 4dfc01cf0fa48..c35b1a7c1944f 100644 --- a/benchmarks/.gitignore +++ b/benchmarks/.gitignore @@ -1,4 +1,3 @@ data results venv -working diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index b5efea66de4f8..6ee50ba2cf472 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -25,22 +25,13 @@ trap 'cd ..; rm -rf working; git checkout main' EXIT #checkout to main on exit ARG1=$1 main(){ - -mkdir -p working -cp bench.sh working/ -cp collect_bench.sh working/ -cp lineprotocol.py working/ -cp -r data working/ -cp -r queries working/ -cp -r src working/ - -cd working - timestamp=$(date +%s) -lp_file="../results/$ARG1-$timestamp.lp" -mkdir -p ../results +lp_file="results/$ARG1-$timestamp.lp" +mkdir -p results touch $lp_file +cp lineprotocol.py results/ + git fetch upstream main git checkout main @@ -52,8 +43,8 @@ major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) # run for current main echo "current major version: $major_version" export RESULTS_DIR="results/$major_version.0.0" -DATAFUSION_DIR=../../ ./bench.sh run $ARG1 -python3 lineprotocol.py ../$RESULTS_DIR/$ARG1.json >> $lp_file +./bench.sh run $ARG1 +python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file run for last 5 major releases for i in {1..5}; do @@ -61,8 +52,8 @@ for i in {1..5}; do git fetch upstream $((major_version-i)).0.0 git checkout $((major_version-i)).0.0 export RESULTS_DIR="results/$((major_version-i)).0.0" - DATAFUSION_DIR=../../ ./bench.sh run $ARG1 - python3 lineprotocol.py ../$RESULTS_DIR/$ARG1.json >> $lp_file + ./bench.sh run $ARG1 + python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file done echo "[[inputs.file]] diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index feb9379475861..5242ccfeaab6d 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -85,6 +85,7 @@ from pathlib import Path from argparse import ArgumentParser import sys +import subprocess print = sys.stdout.write @@ -125,22 +126,29 @@ def execution_time(self) -> float: class Context: benchmark_version: str datafusion_version: str - datafusion_commit_timestamp: int num_cpus: int start_time: int arguments: List[str] name: str + commit_timestamp: int @classmethod def load_from(cls, data: Dict[str, Any]) -> Context: + get_timestamp = subprocess.run( + ["git", "log", "-1", "--format=%ct", data["datafusion_version"]], + capture_output=True, + text=True, + check=True + ) + commit_timestamp = get_timestamp.stdout.strip() return cls( benchmark_version=data["benchmark_version"], datafusion_version=data["datafusion_version"], - datafusion_commit_timestamp=data["datafusion_commit_timestamp"], num_cpus=data["num_cpus"], start_time=data["start_time"], arguments=data["arguments"], - name=data["arguments"][0] + name=data["arguments"][0], + commit_timestamp=commit_timestamp ) @@ -167,7 +175,7 @@ def lineformat( ) -> None: baseline = BenchmarkRun.load_from_file(baseline) context = baseline.context - benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},datafusion_commit_timestamp={context.datafusion_commit_timestamp},num_cpus={context.num_cpus}" + benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus},commit_timestamp={context.commit_timestamp}" for query in baseline.queries: query_str = f"query=\"{query.query}\"" timestamp = f"{query.start_time*10**9}" diff --git a/benchmarks/src/util/run.rs b/benchmarks/src/util/run.rs index 50092de88e4c2..13969f4d39497 100644 --- a/benchmarks/src/util/run.rs +++ b/benchmarks/src/util/run.rs @@ -24,7 +24,6 @@ use std::{ path::Path, time::{Duration, SystemTime}, }; -use std::process::Command; fn serialize_start_time(start_time: &SystemTime, ser: S) -> Result where @@ -50,9 +49,6 @@ pub struct RunContext { pub benchmark_version: String, /// DataFusion crate version pub datafusion_version: String, - /// DataFusion crate commit timestamp - #[serde(serialize_with = "serialize_start_time")] - pub datafusion_commit_timestamp: SystemTime, /// Number of CPU cores pub num_cpus: usize, /// Start time @@ -70,21 +66,9 @@ impl Default for RunContext { impl RunContext { pub fn new() -> Self { - let commit_timestamp = Command::new("git") - .args(&["log", "-1", "--format=%ct"]) - .output() - .expect("failed to execute git command") - .stdout; - let commit_timestamp = String::from_utf8(commit_timestamp) - .expect("failed to convert git output to string") - .trim() - .parse::() - .expect("failed to parse commit timestamp"); - Self { benchmark_version: env!("CARGO_PKG_VERSION").to_owned(), datafusion_version: DATAFUSION_VERSION.to_owned(), - datafusion_commit_timestamp: SystemTime::UNIX_EPOCH + Duration::from_secs(commit_timestamp), num_cpus: get_available_parallelism(), start_time: SystemTime::now(), arguments: std::env::args().skip(1).collect::>(), From 0661a9ae5ad4366743213ab0e998cd917ba1817d Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 21:24:13 -0400 Subject: [PATCH 14/18] fix: comment --- benchmarks/collect_bench.sh | 74 +++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 benchmarks/collect_bench.sh diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh new file mode 100644 index 0000000000000..83c717525b451 --- /dev/null +++ b/benchmarks/collect_bench.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is meant for developers of DataFusion -- it is runnable +# from the standard DataFusion development environment and uses cargo, +# etc and orchestrates gathering data and run the benchmark binary to +# collect benchmarks from the current main and last 5 major releases. + +trap 'cd ..; rm -rf working; git checkout main' EXIT #checkout to main on exit +ARG1=$1 + +main(){ +timestamp=$(date +%s) +lp_file="results/$ARG1-$timestamp.lp" +mkdir -p results +touch $lp_file + +cp lineprotocol.py results/ + +git fetch upstream main +git checkout main + +# get current major version +output=$(cargo metadata --format-version=1 --no-deps | jq '.packages[] | select(.name == "datafusion") | .version') +major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) + + +# run for current main +echo "current major version: $major_version" +export RESULTS_DIR="results/$major_version.0.0" +./bench.sh run $ARG1 +python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file + +# run for last 5 major releases +for i in {1..5}; do + echo "running benchmark on $((major_version-i)).0.0" + git fetch upstream $((major_version-i)).0.0 + git checkout $((major_version-i)).0.0 + export RESULTS_DIR="results/$((major_version-i)).0.0" + ./bench.sh run $ARG1 + python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file +done + +echo "[[inputs.file]] + files = [ \"$lp_file\" ] + data_format = \"influx\" + name_override = \"datafusion_benchmarks\" + +[[outputs.influxdb_v2]] + alias = \"monitor-tools\" + urls = [\"https://us-east-1-2.aws.cloud2.influxdata.com\"] + token = \"$INFLUX_TOKEN\" + organization = \"5d59ccc5163fc318\" + bucket = \"performance_metrics\" +" > telegraf.conf +telegraf --config telegraf.conf --once +} + +main \ No newline at end of file From 8ecd5479bfb1ecbe71e964e01cb68c358755c8fc Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 21:25:05 -0400 Subject: [PATCH 15/18] Revert "fix: comment" This reverts commit 0661a9ae5ad4366743213ab0e998cd917ba1817d. --- benchmarks/collect_bench.sh | 74 ------------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 benchmarks/collect_bench.sh diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh deleted file mode 100644 index 83c717525b451..0000000000000 --- a/benchmarks/collect_bench.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This script is meant for developers of DataFusion -- it is runnable -# from the standard DataFusion development environment and uses cargo, -# etc and orchestrates gathering data and run the benchmark binary to -# collect benchmarks from the current main and last 5 major releases. - -trap 'cd ..; rm -rf working; git checkout main' EXIT #checkout to main on exit -ARG1=$1 - -main(){ -timestamp=$(date +%s) -lp_file="results/$ARG1-$timestamp.lp" -mkdir -p results -touch $lp_file - -cp lineprotocol.py results/ - -git fetch upstream main -git checkout main - -# get current major version -output=$(cargo metadata --format-version=1 --no-deps | jq '.packages[] | select(.name == "datafusion") | .version') -major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) - - -# run for current main -echo "current major version: $major_version" -export RESULTS_DIR="results/$major_version.0.0" -./bench.sh run $ARG1 -python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file - -# run for last 5 major releases -for i in {1..5}; do - echo "running benchmark on $((major_version-i)).0.0" - git fetch upstream $((major_version-i)).0.0 - git checkout $((major_version-i)).0.0 - export RESULTS_DIR="results/$((major_version-i)).0.0" - ./bench.sh run $ARG1 - python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file -done - -echo "[[inputs.file]] - files = [ \"$lp_file\" ] - data_format = \"influx\" - name_override = \"datafusion_benchmarks\" - -[[outputs.influxdb_v2]] - alias = \"monitor-tools\" - urls = [\"https://us-east-1-2.aws.cloud2.influxdata.com\"] - token = \"$INFLUX_TOKEN\" - organization = \"5d59ccc5163fc318\" - bucket = \"performance_metrics\" -" > telegraf.conf -telegraf --config telegraf.conf --once -} - -main \ No newline at end of file From 27522a64f035a180a550bf5e140b191543926346 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 21:26:23 -0400 Subject: [PATCH 16/18] fix: commit --- benchmarks/collect_bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 6ee50ba2cf472..83c717525b451 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -46,7 +46,7 @@ export RESULTS_DIR="results/$major_version.0.0" ./bench.sh run $ARG1 python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file -run for last 5 major releases +# run for last 5 major releases for i in {1..5}; do echo "running benchmark on $((major_version-i)).0.0" git fetch upstream $((major_version-i)).0.0 From dea534c6bcd8424470d5a7b9b57769ba9b7aa3b7 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 22:31:17 -0400 Subject: [PATCH 17/18] chore: remove telegraf part --- benchmarks/collect_bench.sh | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 83c717525b451..5387cb9b91dd3 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -55,20 +55,6 @@ for i in {1..5}; do ./bench.sh run $ARG1 python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file done - -echo "[[inputs.file]] - files = [ \"$lp_file\" ] - data_format = \"influx\" - name_override = \"datafusion_benchmarks\" - -[[outputs.influxdb_v2]] - alias = \"monitor-tools\" - urls = [\"https://us-east-1-2.aws.cloud2.influxdata.com\"] - token = \"$INFLUX_TOKEN\" - organization = \"5d59ccc5163fc318\" - bucket = \"performance_metrics\" -" > telegraf.conf -telegraf --config telegraf.conf --once } main \ No newline at end of file From 9e96770e2d4b13ca25f54520a0ad665a2b74d01a Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Wed, 12 Mar 2025 22:31:50 -0400 Subject: [PATCH 18/18] chore: trap function --- benchmarks/collect_bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 5387cb9b91dd3..7bd661ab4a06e 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -21,7 +21,7 @@ # etc and orchestrates gathering data and run the benchmark binary to # collect benchmarks from the current main and last 5 major releases. -trap 'cd ..; rm -rf working; git checkout main' EXIT #checkout to main on exit +trap 'git checkout main' EXIT #checkout to main on exit ARG1=$1 main(){