New extract-queries command + other improvements (#115)
The new `extract-queries` command extracts all queries from the server log and writes them to a TSV file that can be processed with the `example-queries` command. If the server log contains lines matching `Alive check with message "..."`, the message is used as the base name for the query descriptions. For each query with that base name, `Query #<index>` is appended to the description.
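
A minimal sketch of that extraction loop, for illustration only: it assumes queries appear URL-encoded after `?query=` in the log and that the base name and `Query #<index>` are joined with a comma (both details are assumptions; the actual implementation may differ):

```python
import re
import sys
from urllib.parse import unquote_plus

# Hypothetical log patterns; the real command may match the log differently.
ALIVE_CHECK_RE = re.compile(r'Alive check with message "([^"]*)"')
QUERY_RE = re.compile(r"\?query=([^\s&]+)")


def extract_queries(log_lines):
    """Yield (description, query) pairs, one per TSV line."""
    base_name, index = "Query", 0
    for line in log_lines:
        if alive := ALIVE_CHECK_RE.search(line):
            # A new alive-check message starts a new group of queries.
            base_name, index = alive.group(1), 0
        elif query := QUERY_RE.search(line):
            index += 1
            # Collapse whitespace so the query fits on a single TSV line.
            one_line = re.sub(r"\s+", " ", unquote_plus(query.group(1)))
            yield f"{base_name}, Query #{index}", one_line


if __name__ == "__main__":
    for description, query in extract_queries(sys.stdin):
        print(f"{description}\t{query}")
```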

In this context, the following small improvements were made to the `example-queries` command (items 1-4) and the `settings` command (item 5):

1. The default for `--download-or-count` is now the more natural `download` (was: `count`).
2. The option `--accept` now has two more choices, `application/qlever-results+json` and `AUTO`, where the latter picks the Accept header automatically based on the query type; see the sketch after this list.
3. There is a new option `--add-query-type-to-description`, which appends the query type (SELECT, ASK, CONSTRUCT, DESCRIBE, or UNKNOWN) to each query description.
4. Long query descriptions are now truncated in the middle, shown as `Beginning ... end`.
5. The runtime parameters supported by the `settings` command now also include `always-multiply-unions`.
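
Condensed from the diff below, the `AUTO` logic of item 2 amounts to the following sketch (the helper names are illustrative; the type detection and header choice mirror the new code):

```python
import re


def sparql_query_type(query: str) -> str:
    # The first SELECT/ASK/CONSTRUCT/DESCRIBE keyword determines the type.
    match = re.search(
        r"(SELECT|ASK|CONSTRUCT|DESCRIBE)\s", query, re.IGNORECASE
    )
    return match.group(1).upper() if match else "UNKNOWN"


def resolve_auto_accept_header(query: str) -> str:
    # AUTO: Turtle for CONSTRUCT and DESCRIBE queries, SPARQL JSON otherwise.
    if sparql_query_type(query) in ("CONSTRUCT", "DESCRIBE"):
        return "text/turtle"
    return "application/sparql-results+json"


assert resolve_auto_accept_header("CONSTRUCT WHERE { ?s ?p ?o }") == "text/turtle"
assert (
    resolve_auto_accept_header("SELECT * WHERE { ?s ?p ?o }")
    == "application/sparql-results+json"
)
```

On the command line this corresponds to passing `--accept AUTO` to the `example-queries` command.
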
hannahbast authored Jan 23, 2025
1 parent dd95a05 commit 94c0f79
Showing 3 changed files with 249 additions and 44 deletions.
179 changes: 135 additions & 44 deletions src/qlever/commands/example_queries.py
@@ -51,13 +51,15 @@ def additional_arguments(self, subparser) -> None:
subparser.add_argument(
"--get-queries-cmd",
type=str,
help="Command to get example queries as TSV " "(description, query)",
help="Command to get example queries as TSV "
"(description, query)",
)
subparser.add_argument(
"--query-ids",
type=str,
default="1-$",
help="Query IDs as comma-separated list of " "ranges (e.g., 1-5,7,12-$)",
help="Query IDs as comma-separated list of "
"ranges (e.g., 1-5,7,12-$)",
)
subparser.add_argument(
"--query-regex",
@@ -68,7 +70,7 @@ def additional_arguments(self, subparser) -> None:
subparser.add_argument(
"--download-or-count",
choices=["download", "count"],
default="count",
default="download",
help="Whether to download the full result "
"or just compute the size of the result",
)
@@ -88,10 +90,14 @@ def additional_arguments(self, subparser) -> None:
"text/tab-separated-values",
"text/csv",
"application/sparql-results+json",
"application/qlever-results+json",
"text/turtle",
"AUTO",
],
default="application/sparql-results+json",
help="Accept header for the SPARQL query",
help="Accept header for the SPARQL query; AUTO means "
"`text/turtle` for CONSTRUCT AND DESCRIBE queries, "
"`application/sparql-results+json` for all others",
)
subparser.add_argument(
"--clear-cache",
@@ -117,6 +123,13 @@ def additional_arguments(self, subparser) -> None:
default=14,
help="Width for printing the result size",
)
subparser.add_argument(
"--add-query-type-to-description",
action="store_true",
default=False,
help="Add the query type (SELECT, ASK, CONSTRUCT, DESCRIBE, "
"UNKNOWN) to the description",
)
subparser.add_argument(
"--show-query",
choices=["always", "never", "on-error"],
@@ -130,28 +143,49 @@ def additional_arguments(self, subparser) -> None:
help="When showing the query, also show the prefixes",
)

def pretty_print_query(self, query: str, show_prefixes: bool) -> None:
remove_prefixes_cmd = " | sed '/^PREFIX /Id'" if not show_prefixes else ""
def pretty_printed_query(self, query: str, show_prefixes: bool) -> str:
remove_prefixes_cmd = (
" | sed '/^PREFIX /Id'" if not show_prefixes else ""
)
pretty_print_query_cmd = (
f"echo {shlex.quote(query)}"
f" | docker run -i --rm sparqling/sparql-formatter"
f"{remove_prefixes_cmd} | grep -v '^$'"
)
try:
query_pp = run_command(pretty_print_query_cmd, return_output=True)
log.info(colored(query_pp.rstrip(), "cyan"))
except Exception as e:
log.error(f"Failed to pretty-print query: {e}")
log.info(colored(query.rstrip(), "cyan"))
query_pretty_printed = run_command(
pretty_print_query_cmd, return_output=True
)
return query_pretty_printed.rstrip()
except Exception as e:
log.error(
f"Failed to pretty-print query, "
f"returning original query: {e}"
)
return query.rstrip()

def sparql_query_type(self, query: str) -> str:
match = re.search(
r"(SELECT|ASK|CONSTRUCT|DESCRIBE)\s", query, re.IGNORECASE
)
if match:
return match.group(1).upper()
else:
return "UNKNOWN"

def execute(self, args) -> bool:
# We can't have both `--remove-offset-and-limit` and `--limit`.
if args.remove_offset_and_limit and args.limit:
log.error("Cannot have both --remove-offset-and-limit and --limit")
return False

# If `args.accept` is `application/sparql-results+json`, we need `jq`.
if args.accept == "application/sparql-results+json":
# If `args.accept` is `application/sparql-results+json` or
# `application/qlever-results+json` or `AUTO`, we need `jq`.
if (
args.accept == "application/sparql-results+json"
or args.accept == "application/qlever-results+json"
or args.accept == "AUTO"
):
try:
subprocess.run(
"jq --version",
@@ -174,8 +208,9 @@ def execute(self, args) -> bool:
return False

# Clear cache only works for QLever.
is_qlever = not args.sparql_endpoint or args.sparql_endpoint.startswith(
"https://qlever"
is_qlever = (
not args.sparql_endpoint
or args.sparql_endpoint.startswith("https://qlever")
)
if args.clear_cache == "yes" and not is_qlever:
log.warning("Clearing the cache only works for QLever")
@@ -193,7 +228,9 @@ def execute(self, args) -> bool:
if args.query_regex:
get_queries_cmd += f" | grep -Pi {shlex.quote(args.query_regex)}"
sparql_endpoint = (
args.sparql_endpoint if args.sparql_endpoint else f"localhost:{args.port}"
args.sparql_endpoint
if args.sparql_endpoint
else f"localhost:{args.port}"
)
self.show(
f"Obtain queries via: {get_queries_cmd}\n"
@@ -211,7 +248,9 @@ def execute(self, args) -> bool:

# Get the example queries.
try:
example_query_lines = run_command(get_queries_cmd, return_output=True)
example_query_lines = run_command(
get_queries_cmd, return_output=True
)
if len(example_query_lines) == 0:
log.error("No example queries matching the criteria found")
return False
@@ -220,20 +259,29 @@ def execute(self, args) -> bool:
log.error(f"Failed to get example queries: {e}")
return False

# We want the width of the query description to be an odd number (in
# case we have to truncate it, in which case we want to have a " ... "
# in the middle).
width_query_description_half = args.width_query_description // 2
width_query_description = 2 * width_query_description_half + 1

# Launch the queries one after the other and for each print: the
# description, the result size (number of rows), and the query
# processing time (seconds).
query_times = []
result_sizes = []
num_failed = 0
for example_query_line in example_query_lines:
# Parse description and query.
# Parse description and query, and determine query type.
description, query = example_query_line.split("\t")
if len(query) == 0:
log.error("Could not parse description and query, line is:")
log.info("")
log.info(example_query_line)
return False
query_type = self.sparql_query_type(query)
if args.add_query_type_to_description or args.accept == "AUTO":
description = f"{description} [{query_type}]"

# Clear the cache.
if args.clear_cache == "yes":
@@ -267,7 +315,9 @@ def execute(self, args) -> bool:
# Count query.
if args.download_or_count == "count":
# First find out if there is a FROM clause.
regex_from_clause = re.compile(r"\s*FROM\s+<[^>]+>\s*", re.IGNORECASE)
regex_from_clause = re.compile(
r"\s*FROM\s+<[^>]+>\s*", re.IGNORECASE
)
match_from_clause = re.search(regex_from_clause, query)
from_clause = " "
if match_from_clause:
@@ -296,24 +346,39 @@ def execute(self, args) -> bool:
query = re.sub(r"\s*\.\s*\}", " }", query)
if args.show_query == "always":
log.info("")
self.pretty_print_query(query, args.show_prefixes)
log.info(
colored(
self.pretty_printed_query(query, args.show_prefixes),
"cyan",
)
)

# Accept header. For "AUTO", use `text/turtle` for CONSTRUCT and
# DESCRIBE queries and `application/sparql-results+json` for all others.
accept_header = args.accept
if accept_header == "AUTO":
if query_type == "CONSTRUCT" or query_type == "DESCRIBE":
accept_header = "text/turtle"
else:
accept_header = "application/sparql-results+json"

# Launch query.
try:
curl_cmd = (
f"curl -s {sparql_endpoint}"
f' -w "HTTP code: %{{http_code}}\\n"'
f' -H "Accept: {args.accept}"'
f' -H "Accept: {accept_header}"'
f" --data-urlencode query={shlex.quote(query)}"
)
log.debug(curl_cmd)
result_file = (
f"qlever.example_queries.result." f"{abs(hash(curl_cmd))}.tmp"
f"qlever.example_queries.result."
f"{abs(hash(curl_cmd))}.tmp"
)
start_time = time.time()
http_code = run_curl_command(
sparql_endpoint,
headers={"Accept": args.accept},
headers={"Accept": accept_header},
params={"query": query},
result_file=result_file,
).strip()
@@ -323,7 +388,9 @@ def execute(self, args) -> bool:
else:
error_msg = {
"short": f"HTTP code: {http_code}",
"long": re.sub(r"\s+", " ", Path(result_file).read_text()),
"long": re.sub(
r"\s+", " ", Path(result_file).read_text()
),
}
except Exception as e:
if args.log_level == "DEBUG":
@@ -336,8 +403,12 @@ def execute(self, args) -> bool:
# Get result size (via the command line, in order to avoid loading
# a potentially large JSON file into Python, which is slow).
if error_msg is None:
# CASE 0: Rhe result is empty despite a 200 HTTP code.
if Path(result_file).stat().st_size == 0:
# CASE 0: The result is empty despite a 200 HTTP code (not a
# problem for CONSTRUCT and DESCRIBE queries).
if Path(result_file).stat().st_size == 0 and (
not query_type == "CONSTRUCT"
and not query_type == "DESCRIBE"
):
result_size = 0
error_msg = {
"short": "Empty result",
@@ -347,7 +418,7 @@ def execute(self, args) -> bool:

# CASE 1: Just counting the size of the result (TSV or JSON).
elif args.download_or_count == "count":
if args.accept == "text/tab-separated-values":
if accept_header == "text/tab-separated-values":
result_size = run_command(
f"sed 1d {result_file}", return_output=True
)
@@ -370,21 +441,28 @@ def execute(self, args) -> bool:
# CASE 2: Downloading the full result (TSV, CSV, Turtle, JSON).
else:
if (
args.accept == "text/tab-separated-values"
or args.accept == "text/csv"
accept_header == "text/tab-separated-values"
or accept_header == "text/csv"
):
result_size = run_command(
f"sed 1d {result_file} | wc -l", return_output=True
)
elif args.accept == "text/turtle":
elif accept_header == "text/turtle":
result_size = run_command(
f"sed '1d;/^@prefix/d;/^\\s*$/d' " f"{result_file} | wc -l",
f"sed '1d;/^@prefix/d;/^\\s*$/d' "
f"{result_file} | wc -l",
return_output=True,
)
elif accept_header == "application/qlever-results+json":
result_size = run_command(
f'jq -r ".resultsize" {result_file}',
return_output=True,
)
else:
try:
result_size = run_command(
f'jq -r ".results.bindings | length"' f" {result_file}",
f'jq -r ".results.bindings | length"'
f" {result_file}",
return_output=True,
)
except Exception as e:
@@ -398,13 +476,16 @@ def execute(self, args) -> bool:
Path(result_file).unlink(missing_ok=True)

# Print description, time, result in tabular form.
if len(description) > args.width_query_description:
description = description[: args.width_query_description - 3]
description += "..."
if len(description) > width_query_description:
description = (
description[: width_query_description_half - 2]
+ " ... "
+ description[-width_query_description_half + 2 :]
)
if error_msg is None:
result_size = int(result_size)
log.info(
f"{description:<{args.width_query_description}} "
f"{description:<{width_query_description}} "
f"{time_seconds:6.2f} s "
f"{result_size:>{args.width_result_size},}"
)
@@ -419,18 +500,28 @@ def execute(self, args) -> bool:
and args.show_query != "on-error"
):
error_msg["long"] = (
error_msg["long"][: args.width_error_message - 3] + "..."
error_msg["long"][: args.width_error_message - 3]
+ "..."
)
separator_short_long = "\n" if args.show_query == "on-error" else " "
separator_short_long = (
"\n" if args.show_query == "on-error" else " "
)
log.info(
f"{description:<{args.width_query_description}} "
f"{description:<{width_query_description}} "
f"{colored('FAILED ', 'red')}"
f"{colored(error_msg['short'], 'red'):>{args.width_result_size}}"
f"{seperator_short_long}"
f"{colored(error_msg['long'], 'red')}"
)
if args.show_query == "on-error":
self.pretty_print_query(query, args.show_prefixes)
log.info(
colored(
self.pretty_printed_query(
query, args.show_prefixes
),
"cyan",
)
)
log.info("")

# Check that each query has a time and a result size, or it failed.
@@ -450,19 +541,19 @@ def execute(self, args) -> bool:
description = f"TOTAL for {n} {query_or_queries}"
log.info("")
log.info(
f"{description:<{args.width_query_description}} "
f"{description:<{width_query_description}} "
f"{total_query_time:6.2f} s "
f"{total_result_size:>14,}"
)
description = f"AVERAGE for {n} {query_or_queries}"
log.info(
f"{description:<{args.width_query_description}} "
f"{description:<{width_query_description}} "
f"{average_query_time:6.2f} s "
f"{average_result_size:>14,}"
)
description = f"MEDIAN for {n} {query_or_queries}"
log.info(
f"{description:<{args.width_query_description}} "
f"{description:<{width_query_description}} "
f"{median_query_time:6.2f} s "
f"{median_result_size:>14,}"
)
@@ -476,7 +567,7 @@ def execute(self, args) -> bool:
num_failed_string += " [all]"
log.info(
colored(
f"{description:<{args.width_query_description}} "
f"{description:<{width_query_description}} "
f"{num_failed:>24}",
"red",
)