-
-
Notifications
You must be signed in to change notification settings - Fork 131
/
Copy pathclickbench.rs
173 lines (152 loc) · 6.41 KB
/
clickbench.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
/*
* Parseable Server (C) 2022 - 2024 Parseable, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
use std::{collections::HashMap, env, fs, process::Command, time::Instant};
use actix_web::{web::Json, Responder};
use datafusion::{
common::plan_datafusion_err,
error::DataFusionError,
execution::{runtime_env::RuntimeEnvBuilder, SessionStateBuilder},
prelude::{ParquetReadOptions, SessionConfig, SessionContext},
sql::{parser::DFParser, sqlparser::dialect::dialect_from_str},
};
use serde_json::{json, Value};
use tracing::{info, warn};
/// Name of the environment variable that `run_benchmark` reads to locate the
/// Parquet file registered as the `hits` table.
static PARQUET_FILE: &str = "PARQUET_FILE";
/// Name of the environment variable that `run_benchmark` reads to locate the
/// queries file; that file is split into lines, each treated as one SQL query.
static QUERIES_FILE: &str = "QUERIES_FILE";
pub async fn clickbench_benchmark() -> Result<impl Responder, actix_web::Error> {
drop_system_caches()
.await
.map_err(actix_web::error::ErrorInternalServerError)?;
let results = tokio::task::spawn_blocking(run_benchmark)
.await
.map_err(actix_web::error::ErrorInternalServerError)?
.map_err(actix_web::error::ErrorInternalServerError)?;
Ok(results)
}
/// Flushes file-system buffers and asks the kernel to drop its caches so the
/// benchmark starts cold.
///
/// Runs `sync`, then `sudo sh -c "echo 3 > /proc/sys/vm/drop_caches"` (on Linux,
/// writing `3` requests that the page cache as well as dentries and inodes be
/// freed).
///
/// A failure of `sync` is only logged. A non-zero exit status from either
/// command is logged as a warning (with the captured stderr for the `sudo`
/// call) but is not turned into an error, so callers that previously succeeded
/// keep succeeding.
///
/// # Errors
///
/// Returns an error only when the `sudo` process cannot be spawned at all.
pub async fn drop_system_caches() -> Result<(), anyhow::Error> {
    // NOTE(review): `std::process::Command` blocks the calling thread while the
    // child runs; this function is `async` but contains no await points.

    // Sync to flush file system buffers
    match Command::new("sync").status() {
        Ok(status) if !status.success() => {
            warn!("sync command exited unsuccessfully: {}", status)
        }
        Ok(_) => {}
        Err(e) => warn!("Failed to execute sync command: {}", e),
    }

    let output = Command::new("sudo")
        .args(["sh", "-c", "echo 3 > /proc/sys/vm/drop_caches"])
        .output()
        .map_err(|e| {
            warn!("Failed to drop system caches: {}", e);
            anyhow::Error::msg("Failed to drop system caches. This might be expected if not running on Linux or without sudo privileges.")
        })?;

    // The process having been spawned does not mean the caches were dropped:
    // `sudo` or the shell redirect may still have failed. Surface that in the
    // log instead of silently reporting success.
    if !output.status.success() {
        warn!(
            "Dropping system caches exited unsuccessfully ({}): {}",
            output.status,
            String::from_utf8_lossy(&output.stderr).trim()
        );
    }

    Ok(())
}
/// Builds a DataFusion session, registers the `hits` Parquet table and runs
/// every query from the queries file, returning the timing report as JSON.
///
/// Because of `#[tokio::main]`, this compiles to a synchronous function that
/// starts its own multi-threaded Tokio runtime; `clickbench_benchmark` invokes
/// it through `spawn_blocking`.
///
/// Inputs come from the environment:
/// * `PARQUET_FILE` - path of the Parquet file registered as table `hits`.
/// * `QUERIES_FILE` - path of a text file with one SQL query per line. Blank
///   and whitespace-only lines are skipped.
///
/// # Errors
///
/// Fails when either environment variable is unset, the runtime environment
/// cannot be built, the Parquet file cannot be registered, the queries file
/// cannot be read, or any query fails.
///
/// # Panics
///
/// Panics if the session state does not contain its configured default catalog.
#[tokio::main(flavor = "multi_thread")]
pub async fn run_benchmark() -> Result<Json<Value>, anyhow::Error> {
    let session_config = SessionConfig::new()
        .with_information_schema(true)
        .with_batch_size(8192);

    // Default runtime environment; no custom memory pool is configured here.
    let runtime_env = RuntimeEnvBuilder::new().build_arc()?;

    let state = SessionStateBuilder::new()
        .with_default_features()
        .with_config(session_config)
        .with_runtime_env(runtime_env)
        .build();

    // Sanity check only: the returned catalog handle is intentionally discarded.
    state
        .catalog_list()
        .catalog(&state.config_options().catalog.default_catalog)
        .expect("default catalog is provided by datafusion");

    let ctx = SessionContext::new_with_state(state);

    // NOTE(review): this map is populated but never handed to the session or to
    // the Parquet reader, so `binary_as_string` currently has no effect. Confirm
    // whether it was meant to be applied to the session configuration.
    let mut table_options = HashMap::new();
    table_options.insert("binary_as_string", "true");

    let parquet_file = env::var(PARQUET_FILE)
        .map_err(|_| anyhow::anyhow!("PARQUET_FILE environment variable not set. Please set it to the path of the hits.parquet file."))?;
    register_hits(&ctx, &parquet_file).await?;
    info!("hits.parquet registered");

    let queries_file = env::var(QUERIES_FILE)
        .map_err(|_| anyhow::anyhow!("QUERIES_FILE environment variable not set. Please set it to the path of the queries file."))?;
    let queries = fs::read_to_string(queries_file)?;

    // A blank line parses to zero statements, which would abort the whole run
    // with "No SQL statement found"; skip such lines up front.
    let query_list: Vec<String> = queries
        .lines()
        .filter(|line| !line.trim().is_empty())
        .map(str::to_owned)
        .collect();

    execute_queries(&ctx, query_list).await
}
/// Registers the Parquet file at `parquet_file` as the table `hits` on `ctx`,
/// using default Parquet read options.
///
/// # Errors
///
/// A registration failure is wrapped in `DataFusionError::Context` naming the
/// file, then converted into `anyhow::Error`.
async fn register_hits(ctx: &SessionContext, parquet_file: &str) -> Result<(), anyhow::Error> {
    let read_options: ParquetReadOptions<'_> = ParquetReadOptions::default();

    match ctx.register_parquet("hits", parquet_file, read_options).await {
        Ok(_) => Ok(()),
        Err(cause) => {
            let description = format!("Registering 'hits' as {parquet_file}");
            Err(DataFusionError::Context(description, Box::new(cause)).into())
        }
    }
}
/// Runs every query in `query_list` three times against `ctx` and reports the
/// wall-clock timings as JSON.
///
/// Each timed run covers parsing the SQL text, planning the first statement,
/// executing it and collecting all result batches. Only the first statement of
/// each query string is executed.
///
/// The returned JSON object has two keys:
/// * `"results"` - one entry per (query, iteration) with `query_index`,
///   `query`, `iteration` (1-based) and `elapsed_time` in seconds.
/// * `"summary"` - one entry per iteration with the `total_elapsed` seconds
///   summed over all queries.
///
/// # Errors
///
/// Fails on the first error encountered: an unsupported SQL dialect in the
/// session configuration, a parse error, a query string containing no
/// statement, or a planning/execution error.
pub async fn execute_queries(
    ctx: &SessionContext,
    query_list: Vec<String>,
) -> Result<Json<Value>, anyhow::Error> {
    const TRIES: usize = 3;

    // The dialect depends only on the session configuration, so resolve it once
    // instead of once per timed run.
    let task_ctx = ctx.task_ctx();
    let dialect = &task_ctx.session_config().options().sql_parser.dialect;
    let dialect = dialect_from_str(dialect).ok_or_else(|| {
        plan_datafusion_err!(
            "Unsupported SQL dialect: {dialect}. Available dialects: \
             Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, \
             MsSQL, ClickHouse, BigQuery, Ansi."
        )
    })?;

    // Every query contributes one result entry per iteration.
    let mut results = Vec::with_capacity(query_list.len() * TRIES);
    let mut total_elapsed_per_iteration = [0.0_f64; TRIES];

    for (query_index, sql) in query_list.iter().enumerate() {
        for (slot, iteration_total) in total_elapsed_per_iteration.iter_mut().enumerate() {
            // Iterations are reported 1-based.
            let iteration = slot + 1;

            let start = Instant::now();
            let mut statements = DFParser::parse_sql_with_dialect(sql, dialect.as_ref())?;
            // Take ownership of the first statement rather than cloning it.
            let statement = statements
                .pop_front()
                .ok_or_else(|| anyhow::anyhow!("No SQL statement found in query: {}", sql))?;
            let plan = ctx.state().statement_to_plan(statement).await?;
            let df = ctx.execute_logical_plan(plan).await?;
            // Results are materialized so the timing includes full execution,
            // then discarded.
            let _ = df.collect().await?;
            let elapsed = start.elapsed().as_secs_f64();

            *iteration_total += elapsed;
            info!("query {query_index} iteration {iteration} completed in {elapsed} secs");
            results.push(json!({
                "query_index": query_index,
                "query": sql,
                "iteration": iteration,
                "elapsed_time": elapsed
            }));
        }
    }

    let summary: Vec<Value> = total_elapsed_per_iteration
        .iter()
        .enumerate()
        .map(|(iteration, &total_elapsed)| {
            json!({
                "iteration": iteration + 1,
                "total_elapsed": total_elapsed
            })
        })
        .collect();
    info!("summary: {:?}", summary);

    let result_json = json!({
        "summary": summary,
        "results": results
    });
    Ok(Json(result_json))
}