Skip to content

Instantly share code, notes, and snippets.

@pepijnve
Created June 11, 2025 10:51
Show Gist options
  • Select an option

  • Save pepijnve/c5498e4762730bd68a2f6b188ed20f45 to your computer and use it in GitHub Desktop.

Select an option

Save pepijnve/c5498e4762730bd68a2f6b188ed20f45 to your computer and use it in GitHub Desktop.
Datafusion benchmark script patch
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index b34c646c5..d8742c25f 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -43,6 +43,7 @@ DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}
+ITERATIONS=50
usage() {
echo "
@@ -446,7 +447,7 @@ run_tpch() {
echo "Running tpch benchmark..."
# Optional query filter to run specific query
QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
- debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" $QUERY
+ debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations $ITERATIONS --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" $QUERY
}
# Runs the tpch in memory
@@ -464,7 +465,7 @@ run_tpch_mem() {
# Optional query filter to run specific query
QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
# -m means in memory
- debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" $QUERY
+ debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations $ITERATIONS --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" $QUERY
}
# Runs the cancellation benchmark
@@ -472,7 +473,7 @@ run_cancellation() {
RESULTS_FILE="${RESULTS_DIR}/cancellation.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running cancellation benchmark..."
- debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations 5 --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations $ITERATIONS --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}"
}
# Runs the parquet filter benchmark
@@ -480,7 +481,7 @@ run_parquet() {
RESULTS_FILE="${RESULTS_DIR}/parquet.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running parquet filter benchmark..."
- debug_run $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations $ITERATIONS -o "${RESULTS_FILE}"
}
# Runs the sort benchmark
@@ -488,7 +489,7 @@ run_sort() {
RESULTS_FILE="${RESULTS_DIR}/sort.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort benchmark..."
- debug_run $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations $ITERATIONS -o "${RESULTS_FILE}"
}
@@ -542,7 +543,7 @@ run_clickbench_1() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) benchmark..."
- debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations $ITERATIONS --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
}
# Runs the clickbench benchmark with the partitioned parquet files
@@ -550,7 +551,7 @@ run_clickbench_partitioned() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (partitioned, 100 files) benchmark..."
- debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations $ITERATIONS --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
}
# Runs the clickbench "extended" benchmark with a single large parquet file
@@ -558,7 +559,7 @@ run_clickbench_extended() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) extended benchmark..."
- debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations $ITERATIONS --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
}
# Downloads the csv.gz files IMDB datasets from Peter Boncz's homepage(one of the JOB paper authors)
@@ -673,7 +674,7 @@ run_imdb() {
RESULTS_FILE="${RESULTS_DIR}/imdb.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running imdb benchmark..."
- debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations $ITERATIONS --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
}
data_h2o() {
@@ -945,7 +946,7 @@ run_external_aggr() {
# number-of-partitions), and by default `--partitions` is set to number of
# CPU cores, we set a constant number of partitions to prevent this
# benchmark to fail on some machines.
- debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations $ITERATIONS --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
}
# Runs the sort integration benchmark
@@ -955,7 +956,7 @@ run_sort_tpch() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort tpch benchmark..."
- debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+ debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations $ITERATIONS --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
}
@@ -1000,3 +1001,4 @@ setup_venv() {
# And start the process up
main
+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment