Created
June 11, 2025 10:51
-
-
Save pepijnve/c5498e4762730bd68a2f6b188ed20f45 to your computer and use it in GitHub Desktop.
Datafusion benchmark script patch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh | |
| index b34c646c5..d8742c25f 100755 | |
| --- a/benchmarks/bench.sh | |
| +++ b/benchmarks/bench.sh | |
| @@ -43,6 +43,7 @@ DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data} | |
| CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"} | |
| PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true} | |
| VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv} | |
| +ITERATIONS=50 | |
| usage() { | |
| echo " | |
| @@ -446,7 +447,7 @@ run_tpch() { | |
| echo "Running tpch benchmark..." | |
| # Optional query filter to run specific query | |
| QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "") | |
| - debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" $QUERY | |
| + debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations $ITERATIONS --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" $QUERY | |
| } | |
| # Runs the tpch in memory | |
| @@ -464,7 +465,7 @@ run_tpch_mem() { | |
| # Optional query filter to run specific query | |
| QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "") | |
| # -m means in memory | |
| - debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" $QUERY | |
| + debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations $ITERATIONS --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" $QUERY | |
| } | |
| # Runs the cancellation benchmark | |
| @@ -472,7 +473,7 @@ run_cancellation() { | |
| RESULTS_FILE="${RESULTS_DIR}/cancellation.json" | |
| echo "RESULTS_FILE: ${RESULTS_FILE}" | |
| echo "Running cancellation benchmark..." | |
| - debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations 5 --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations $ITERATIONS --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}" | |
| } | |
| # Runs the parquet filter benchmark | |
| @@ -480,7 +481,7 @@ run_parquet() { | |
| RESULTS_FILE="${RESULTS_DIR}/parquet.json" | |
| echo "RESULTS_FILE: ${RESULTS_FILE}" | |
| echo "Running parquet filter benchmark..." | |
| - debug_run $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations $ITERATIONS -o "${RESULTS_FILE}" | |
| } | |
| # Runs the sort benchmark | |
| @@ -488,7 +489,7 @@ run_sort() { | |
| RESULTS_FILE="${RESULTS_DIR}/sort.json" | |
| echo "RESULTS_FILE: ${RESULTS_FILE}" | |
| echo "Running sort benchmark..." | |
| - debug_run $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations $ITERATIONS -o "${RESULTS_FILE}" | |
| } | |
| @@ -542,7 +543,7 @@ run_clickbench_1() { | |
| RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json" | |
| echo "RESULTS_FILE: ${RESULTS_FILE}" | |
| echo "Running clickbench (1 file) benchmark..." | |
| - debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations $ITERATIONS --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}" | |
| } | |
| # Runs the clickbench benchmark with the partitioned parquet files | |
| @@ -550,7 +551,7 @@ run_clickbench_partitioned() { | |
| RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json" | |
| echo "RESULTS_FILE: ${RESULTS_FILE}" | |
| echo "Running clickbench (partitioned, 100 files) benchmark..." | |
| - debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations $ITERATIONS --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}" | |
| } | |
| # Runs the clickbench "extended" benchmark with a single large parquet file | |
| @@ -558,7 +559,7 @@ run_clickbench_extended() { | |
| RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json" | |
| echo "RESULTS_FILE: ${RESULTS_FILE}" | |
| echo "Running clickbench (1 file) extended benchmark..." | |
| - debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations $ITERATIONS --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}" | |
| } | |
| # Downloads the csv.gz files IMDB datasets from Peter Boncz's homepage(one of the JOB paper authors) | |
| @@ -673,7 +674,7 @@ run_imdb() { | |
| RESULTS_FILE="${RESULTS_DIR}/imdb.json" | |
| echo "RESULTS_FILE: ${RESULTS_FILE}" | |
| echo "Running imdb benchmark..." | |
| - debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations $ITERATIONS --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" | |
| } | |
| data_h2o() { | |
| @@ -945,7 +946,7 @@ run_external_aggr() { | |
| # number-of-partitions), and by default `--partitions` is set to number of | |
| # CPU cores, we set a constant number of partitions to prevent this | |
| # benchmark to fail on some machines. | |
| - debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations $ITERATIONS --path "${TPCH_DIR}" -o "${RESULTS_FILE}" | |
| } | |
| # Runs the sort integration benchmark | |
| @@ -955,7 +956,7 @@ run_sort_tpch() { | |
| echo "RESULTS_FILE: ${RESULTS_FILE}" | |
| echo "Running sort tpch benchmark..." | |
| - debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" | |
| + debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations $ITERATIONS --path "${TPCH_DIR}" -o "${RESULTS_FILE}" | |
| } | |
| @@ -1000,3 +1001,4 @@ setup_venv() { | |
| # And start the process up | |
| main | |
| + |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment