-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcmdline_args.py
481 lines (461 loc) · 18.2 KB
/
cmdline_args.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
import configargparse
import argparse
import sys
import re
import rich.table
import rich.console
import json
from pinecone import ServerlessSpec
from vsb.databases import Database
from vsb.workloads import Workload, WorkloadSequence
from vsb.vsb_types import DistanceMetric
from vsb import default_cache_dir, logger
import numpy as np
class WorkloadHelpAction(argparse.Action):
    """argparse Action for ``--workload`` which treats the literal value
    ``"help"`` specially: it prints a rich table describing every available
    workload and exits the process. Any other value is stored on the
    namespace exactly like the default "store" action.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        if values != "help":
            # Ordinary value: store it and carry on parsing.
            setattr(namespace, self.dest, values)
            return
        table = rich.table.Table(title="Available Workloads")
        for heading, column_kwargs in (
            ("Name", {"justify": "left", "no_wrap": True}),
            ("Record Count", {"justify": "right", "style": "green"}),
            ("Dimensions", {}),
            ("Distance Metric", {"justify": "center"}),
            ("Request Count", {"justify": "right", "style": "red"}),
        ):
            table.add_column(heading, **column_kwargs)
        for workload in Workload:
            if workload == Workload.Synthetic:
                # Don't describe synthetic workload, static methods are not available
                table.add_row(
                    "synthetic", "<varies>", "<varies>", "<varies>", "<varies>"
                )
            else:
                table.add_row(*(str(field) for field in workload.describe()))
        rich.console.Console().print(table)
        parser.exit(0)
def json_to_pinecone_spec(spec_string: str) -> dict:
    """Parse and validate a Pinecone index spec given as a JSON string.

    The spec must be a JSON object with exactly one top-level key: either
    "pod" (whose value must be an object containing "environment" and
    "pod_type") or "serverless" (whose value must be an object containing
    "cloud" and "region").

    :param spec_string: JSON-encoded index spec (e.g. from the command line).
    :return: The decoded spec as a dict.
    :raises ValueError: If the string is not valid JSON or does not match the
        expected structure. argparse treats ValueError from a ``type=``
        callable as an invalid argument value, so this surfaces as a normal
        usage error.
    """
    try:
        spec = json.loads(spec_string)
    except json.JSONDecodeError as e:
        raise ValueError(f"Index spec is not valid JSON: {e}") from e
    # Explicit checks (not `assert`) so validation still runs under `python -O`.
    if not isinstance(spec, dict) or len(spec) != 1:
        raise ValueError(
            "Index spec must be a JSON object with exactly one of 'pod' or "
            "'serverless' as its top-level key"
        )
    # Map each spec kind to the fields its sub-object must contain.
    required_fields = {
        "pod": ("environment", "pod_type"),
        "serverless": ("cloud", "region"),
    }
    for kind, fields in required_fields.items():
        if kind in spec:
            body = spec[kind]
            if not isinstance(body, dict) or not all(f in body for f in fields):
                raise ValueError(
                    f"'{kind}' spec must be an object containing "
                    + " and ".join(f"'{f}'" for f in fields)
                )
            return spec
    raise ValueError(
        "Index spec must be a JSON object with exactly one of 'pod' or "
        "'serverless' as its top-level key"
    )
def add_vsb_cmdline_args(
    parser: configargparse.ArgumentParser, include_locust_args: bool
) -> None:
    """
    Add VSB's command-line arguments to `parser`.
    :param parser: Parser to add arguments to.
    :param include_locust_args: If True then also include existing locust
        arguments which VSB also supports.
    """
    # --- Main (required) arguments ---
    main_group = parser.add_argument_group("Main arguments")
    main_group.add_argument(
        "--database",
        required=True,
        choices=tuple(e.value for e in Database),
        help="The vector search database to test",
    )
    main_group.add_argument(
        "--workload",
        # Custom action: the literal value "help" prints a workload table
        # and exits instead of being stored.
        action=WorkloadHelpAction,
        required=True,
        choices=tuple(e.value for e in Workload)
        + tuple(e.value for e in WorkloadSequence)
        + ("help",),
        help="The workload to run",
    )
    # --- General options ---
    general_group = parser.add_argument_group("General options")
    general_group.add_argument(
        "--cache_dir",
        type=str,
        default=default_cache_dir,
        help="Directory to store downloaded datasets. Default is %(default)s.",
    )
    general_group.add_argument(
        "--log_dir",
        "-o",
        default="reports",
        help="Directory to write logs to. Default is %(default)s.",
    )
    general_group.add_argument(
        "--skip_populate",
        action="store_true",
        help="Skip the populate phase (useful if workload has already been loaded and is static)",
    )
    general_group.add_argument(
        "--overwrite",
        action=argparse.BooleanOptionalAction,
        help="Overwrite the existing index if it already exists. Default is %(default)s.",
    )
    general_group.add_argument(
        "--requests_per_sec",
        type=float,
        default=0,
        help="Target requests per second for the Run phase. If using multiple users, "
        "then the target will be distributed across all users. "
        "Specify 0 for unlimited. Default is %(default)s.",
    )
    if include_locust_args:
        # These duplicate locust's own options so they appear in VSB's help;
        # only add them when locust isn't supplying them itself.
        general_group.add_argument(
            "--loglevel",
            "-L",
            default="INFO",
            help="Choose between DEBUG/INFO/WARNING/ERROR/CRITICAL. Default is INFO",
            metavar="<level>",
        )
        general_group.add_argument(
            "--users",
            type=int,
            metavar="<int>",
            dest="num_users",
            default=1,
            help="Number of database clients to execute the workload. Default is %("
            "default)s",
        )
        general_group.add_argument(
            "--processes",
            type=int,
            help="Number of VSB subprocesses to fork and generate load from. Default "
            "is to run in a single process",
        )
    # --- Synthetic workload options ---
    synthetic_group = parser.add_argument_group(
        "Options specific to synthetic workloads"
    )
    synthetic_group.add_argument(
        "--synthetic_records",
        "-N",
        type=int,
        default=1000,
        help="Number of records to generate for the synthetic workload. For synthetic proportional "
        "workloads, this is the initial number of records before queries. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_requests",
        "-c",
        type=int,
        default=100,
        help="Number of requests to generate for the synthetic workload. For synthetic proportional "
        "workloads, this is the number of requests (including upserts) to run after the initial "
        "population. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_dimensions",
        type=int,
        default=192,
        help="Number of dimensions for the synthetic workload. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_metric",
        type=str,
        default="cosine",
        choices=tuple(e.value for e in DistanceMetric),
        help="Distance metric to use for the synthetic workload. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_top_k",
        type=int,
        default=10,
        help="Top-k value to use for the synthetic workload. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_metadata",
        "--sm",
        action="append",
        type=str,
        default=None,
        help="Metadata key-value template, in the form of <key:value>. Each flag specifies one pair; "
        "keys are strings, and values can be formatted as <# digits>n (number), <# chars>s (string), "
        "<# chars>s<# strings>l (list of strings), or b (boolean). Default is no metadata.",
    )
    synthetic_group.add_argument(
        "--synthetic_seed",
        type=str,
        # Generated once at parser construction so all workers share the seed.
        default=str(np.random.SeedSequence().entropy),
        help="Seed to use for the synthetic workload. If not specified, a random seed will be generated.",
    )
    synthetic_group.add_argument(
        "--synthetic_steps",
        type=int,
        default=2,
        help="Number of steps to use for the synthetic workload. The total record/request set will be "
        "evenly split amongst these steps, such that one portion of the records is upserted, then "
        "one portion of the requests is run, and so forth. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_no_aggregate_stats",
        action="store_true",
        help="Do not aggregate statistics for the synthetic workload. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_insert_ratio",
        "--si",
        type=float,
        default=0,
        help="Proportion of insert operations for synthetic proportional workloads. Default is %(default)s. ",
    )
    synthetic_group.add_argument(
        "--synthetic_update_ratio",
        "--su",
        type=float,
        default=0.2,
        help="Proportion of update operations for synthetic proportional workloads. Default is %(default)s. ",
    )
    synthetic_group.add_argument(
        "--synthetic_query_ratio",
        "--sq",
        type=float,
        default=0.8,
        help="Proportion of query operations for synthetic proportional workloads. Default is %(default)s. ",
    )
    synthetic_group.add_argument(
        "--synthetic_delete_ratio",
        "--sd",
        type=float,
        default=0,
        help="Proportion of delete operations for synthetic proportional workloads. Default is %(default)s. ",
    )
    synthetic_group.add_argument(
        "--synthetic_fetch_ratio",
        "--sf",
        type=float,
        default=0,
        help="Proportion of fetch operations for synthetic proportional workloads. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_batch_size",
        type=int,
        default=1,
        help="For synthetic proportional workload requests, how many operations are scheduled per cycle."
        " Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_record_distribution",
        "--sdist",
        type=str,
        default="normal",
        choices=["uniform", "normal"],
        help="Distribution of record vectors in space for synthetic proportional workloads. "
        "For the euclidean metric, vectors are spread from [0, 255]. For cosine and dotproduct "
        "metrics, vectors are spread from [-1, 1]. Default is %(default)s.",
    )
    synthetic_group.add_argument(
        "--synthetic_query_distribution",
        "--qdist",
        type=str,
        default="zipfian",
        choices=["uniform", "zipfian"],
        help="Distribution of query/fetch IDs for synthetic proportional workloads. Default is %(default)s.",
    )
    # --- Pinecone options ---
    pinecone_group = parser.add_argument_group("Options specific to pinecone database")
    pinecone_group.add_argument(
        "--pinecone_api_key",
        type=str,
        help="API Key to connect to Pinecone index",
        env_var="VSB__PINECONE_API_KEY",
    )
    pinecone_group.add_argument(
        "--pinecone_index_name",
        type=str,
        default=None,
        help="Name of Pinecone index to connect to. One will be created if it does not exist. Default is vsb-<workload>.",
        env_var="VSB__PINECONE_INDEX_NAME",
    )
    pinecone_group.add_argument(
        "--pinecone_index_spec",
        type=json_to_pinecone_spec,
        default={"serverless": {"cloud": "aws", "region": "us-east-1"}},
        help="JSON spec of Pinecone index to create (if it does not exist). Default is %(default)s.",
    )
    # --- pgvector options ---
    pgvector_group = parser.add_argument_group("Options specific to pgvector database")
    pgvector_group.add_argument(
        "--pgvector_host",
        type=str,
        default="localhost",
        help="pgvector host to connect to. Default is %(default)s.",
    )
    pgvector_group.add_argument(
        "--pgvector_port",
        type=str,
        default="5432",
        help="pgvector port to connect to. Default is %(default)s.",
    )
    pgvector_group.add_argument(
        "--pgvector_database",
        type=str,
        help="pgvector database to use",
    )
    pgvector_group.add_argument(
        "--pgvector_username",
        type=str,
        default="postgres",
        help="Username to connect to pgvector index. Default is %(default)s.",
        env_var="VSB__PGVECTOR_USERNAME",
    )
    pgvector_group.add_argument(
        "--pgvector_password",
        type=str,
        default="postgres",
        help="Password to connect to pgvector index. Default is %(default)s.",
        env_var="VSB__PGVECTOR_PASSWORD",
    )
    pgvector_group.add_argument(
        "--pgvector_index_type",
        type=str,
        choices=["none", "ivfflat", "hnsw", "gin", "hnsw+gin", "ivfflat+gin"],
        default="hnsw",
        help="Index type to use for pgvector. Specifying 'none' will not create an "
        "ANN index, instead brute-force kNN search will be performed."
        "Default is %(default)s.",
    )
    pgvector_group.add_argument(
        "--pgvector_ivfflat_lists",
        type=int,
        default=0,
        help="For pgvector IVFFLAT indexes the number of lists to create. A value of "
        "0 (default) means to automatically calculate based on the number of "
        "records R: R/1000 for up to 1M records, sqrt(R) for over 1M records.",
    )
    pgvector_group.add_argument(
        "--pgvector_search_candidates",
        type=int,
        default=0,  # 0 represents pgvector-recommended defaults (2*top_k for HNSW, sqrt(pgvector_ivfflat_lists) for IVFFLAT)
        help="Specify the size of the dynamic candidate list (ef_search for HNSW, probes for IVFFLAT). A higher value provides better recall at the cost of speed. Default is 2*top_k for HNSW and sqrt(pgvector_ivfflat_lists) for IVFFLAT",
    )
    pgvector_group.add_argument(
        "--pgvector_maintenance_work_mem",
        type=str,
        default="4GB",
        help=(
            "Set the postgres 'maintenance_work_mem' parameter - the amount of memory "
            "to use for maintenance operations such as CREATE INDEX. This should be "
            "at least as large as the index size. Specify as a string with size "
            "suffix (e.g. '2GB'). Default is %(default)s."
        ),
    )
def get_action(parser, argument_name):
    """Return the parser Action whose `dest` equals `argument_name`, or None
    if no such argument has been registered."""
    matches = (a for a in parser._actions if a.dest == argument_name)
    return next(matches, None)
def validate_parsed_args(
parser: configargparse.ArgumentParser, args: configargparse.Namespace
):
"""Perform additional validation on parsed arguments, checking that any
conditionally required arguments are present (e.g. --database=pinecone makes
--pinecone_api_key required).
If validation fails then parser.error() is called with an appropriate
message, which will terminate the process.
"""
match args.database:
case "pinecone":
required = (
"pinecone_api_key",
"pinecone_index_spec",
)
missing = list()
for name in required:
if not getattr(args, name):
missing.append(name)
if missing:
formatter = configargparse.HelpFormatter(".")
formatter.start_section("")
formatter.add_text("")
for name in missing:
formatter.add_argument(get_action(parser, name))
formatter.end_section()
formatter.add_text(
"Please ensure all missing arguments are specified " "and re-run."
)
# Needed to ensure env var names are included in the actions'
# help messages.
parser.format_help()
parser.error(
"The following arguments must be specified when --database is "
"'pinecone'" + formatter.format_help(),
)
case "pgvector":
pass
case _:
pass
match args.workload:
case "synthetic" | "synthetic-proportional" | "synthetic-runbook":
required = (
"synthetic_records",
"synthetic_requests",
"synthetic_dimensions",
"synthetic_metric",
"synthetic_top_k",
)
missing = list()
for name in required:
if not getattr(args, name):
missing.append(name)
if missing:
formatter = configargparse.HelpFormatter(".")
formatter.start_section("")
formatter.add_text("")
for name in missing:
formatter.add_argument(get_action(parser, name))
formatter.end_section()
formatter.add_text(
"Please ensure all missing arguments are specified " "and re-run."
)
# Needed to ensure env var names are included in the actions'
# help messages.
parser.format_help()
parser.error(
"The following arguments must be specified when --workload is "
"'synthetic'" + formatter.format_help(),
)
if (
args.synthetic_query_ratio == 0
and args.synthetic_insert_ratio == 0
and args.synthetic_update_ratio == 0
and args.synthetic_delete_ratio == 0
and args.synthetic_fetch_ratio == 0
):
parser.error(
"At least one of --synthetic_query_ratio, --synthetic_insert_ratio, "
"--synthetic_update_ratio, --synthetic_delete_ratio, or --synthetic_fetch_ratio "
"must be non-zero."
)
if args.synthetic_metadata:
for entry in args.synthetic_metadata:
if not re.search(r"(\w+):(\w+)", entry):
parser.error(
f"Metadata key-value pair '{entry}' must be formatted as <key:value>."
)
entry = entry.split(":")[-1]
match entry[-1]:
case "s":
if not re.search(r"(\d+)s", entry):
parser.error(
f"Metadata string value '{entry}' must be formatted as <# chars>s."
)
case "l":
if not re.search(r"(\d+)s(\d+)l", entry):
parser.error(
f"Metadata string list value '{entry}' must be formatted as <# chars>s<# strings>l."
)
case "n":
if not re.search(r"(\d+)n", entry):
parser.error(
f"Metadata number value '{entry}' must be formatted as <# digits>n."
)
case "b":
if entry != "b":
parser.error(
f"Metadata boolean value '{entry}' must be formatted as b."
)
case _:
parser.error(
f"Metadata value '{entry}' must be formatted as <# chars>s, <# digits>n, <# chars>s<# strings>l, or b."
)
pass