
Commit cd711c4

[V1][Metrics] Handle preemptions (vllm-project#13169)
1 parent 378b3ef commit cd711c4

File tree

5 files changed: +48 -19 lines changed

  tests/entrypoints/openai/test_metrics.py
  vllm/v1/core/scheduler.py
  vllm/v1/engine/__init__.py
  vllm/v1/metrics/loggers.py
  vllm/v1/metrics/stats.py

tests/entrypoints/openai/test_metrics.py (+1)

@@ -227,6 +227,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
     "vllm:gpu_prefix_cache_hits",
+    "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
     "vllm:iteration_tokens_total",

vllm/v1/core/scheduler.py (+9 -1)

@@ -164,6 +164,7 @@ def schedule(self) -> "SchedulerOutput":
                     self.kv_cache_manager.free(preempted_req)
                     preempted_req.status = RequestStatus.PREEMPTED
                     preempted_req.num_computed_tokens = 0
+                    self.request_preempted(preempted_req, scheduled_timestamp)

                     self.waiting.appendleft(preempted_req)
                     preempted_reqs.append(preempted_req)
@@ -281,9 +282,9 @@ def schedule(self) -> "SchedulerOutput":
                 self.waiting.popleft()
                 self.running.append(request)
                 self.scheduled_req_ids.add(request.request_id)
+                self.request_scheduled(request, scheduled_timestamp)
                 if request.status == RequestStatus.WAITING:
                     scheduled_new_reqs.append(request)
-                    self.request_scheduled(request, scheduled_timestamp)
                 elif request.status == RequestStatus.PREEMPTED:
                     scheduled_resumed_reqs.append(request)
                 else:
@@ -675,6 +676,13 @@ def request_scheduled(self, request: Request, timestamp: float):
             EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED,
                                       timestamp))

+    def request_preempted(self, request: Request, timestamp: float):
+        if not self.log_stats:
+            return
+        request.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.PREEMPTED,
+                                      timestamp))
+
     def make_stats(self) -> Optional[SchedulerStats]:
         if not self.log_stats:
             return None
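
For context, the scheduler change means a preempted request has its progress reset and is requeued at the front of the waiting deque, and a SCHEDULED event is now recorded for resumed requests as well as new ones. A small self-contained sketch of that queue behaviour (ToyRequest and the string events are illustrative stand-ins, not vLLM's classes):

from collections import deque
from dataclasses import dataclass, field
from typing import List

@dataclass
class ToyRequest:
    request_id: str
    num_computed_tokens: int = 0
    events: List[str] = field(default_factory=list)

waiting: deque = deque()
running: List[ToyRequest] = []

req = ToyRequest("req-0", num_computed_tokens=128)
running.append(req)

# Preemption: progress is reset, the event is recorded, and the request
# goes back to the *front* of the waiting queue (KV-cache freeing not
# modelled here).
running.remove(req)
req.num_computed_tokens = 0
req.events.append("PREEMPTED")
waiting.appendleft(req)

# On a later scheduling pass it is resumed, and a SCHEDULED event is
# recorded again; the stats code treats this second SCHEDULED as a
# resumption rather than the start of the request.
resumed = waiting.popleft()
running.append(resumed)
resumed.events.append("SCHEDULED")
print(resumed.events)  # ['PREEMPTED', 'SCHEDULED']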

vllm/v1/engine/__init__.py (+1)

@@ -65,6 +65,7 @@ class EngineCoreEventType(enum.IntEnum):
     """The type of engine core request event."""
     QUEUED = 1
     SCHEDULED = 2
+    PREEMPTED = 3


 class EngineCoreEvent(msgspec.Struct):
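
With the new member, a request that is preempted once would carry an event sequence like QUEUED, SCHEDULED, PREEMPTED, SCHEDULED. A standalone illustration that mirrors the enum rather than importing vLLM:

import enum

class EngineCoreEventType(enum.IntEnum):
    """Mirror of the engine core request event types after this commit."""
    QUEUED = 1
    SCHEDULED = 2
    PREEMPTED = 3

# Typical event stream for a request that is preempted once and resumed.
events = [
    EngineCoreEventType.QUEUED,
    EngineCoreEventType.SCHEDULED,
    EngineCoreEventType.PREEMPTED,
    EngineCoreEventType.SCHEDULED,  # resumption after preemption
]
assert events.count(EngineCoreEventType.PREEMPTED) == 1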

vllm/v1/metrics/loggers.py (+15 -9)

@@ -132,6 +132,11 @@ def __init__(self, vllm_config: VllmConfig):
             "GPU prefix cache hits, in terms of number of cached blocks.",
             labelnames=labelnames).labels(*labelvalues)

+        self.counter_num_preempted_reqs = prometheus_client.Counter(
+            name="vllm:num_preemptions_total",
+            documentation="Cumulative number of preemption from the engine.",
+            labelnames=labelnames).labels(*labelvalues)
+
         self.counter_prompt_tokens = prometheus_client.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
@@ -282,17 +287,27 @@ def log(self, scheduler_stats: SchedulerStats,
         self.counter_gpu_prefix_cache_hits.inc(
             scheduler_stats.prefix_cache_stats.hits)

+        self.counter_num_preempted_reqs.inc(iteration_stats.num_preempted_reqs)
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
         self.histogram_iteration_tokens.observe(
             iteration_stats.num_prompt_tokens + \
             iteration_stats.num_generation_tokens)

+        for ttft in iteration_stats.time_to_first_tokens_iter:
+            self.histogram_time_to_first_token.observe(ttft)
+        for tpot in iteration_stats.time_per_output_tokens_iter:
+            self.histogram_time_per_output_token.observe(tpot)
+
         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[finished_request.finish_reason].inc()
             self.histogram_e2e_time_request.observe(
                 finished_request.e2e_latency)
+            self.histogram_queue_time_request.observe(
+                finished_request.queued_time)
+            self.histogram_prefill_time_request.observe(
+                finished_request.prefill_time)
             self.histogram_inference_time_request.observe(
                 finished_request.inference_time)
             self.histogram_decode_time_request.observe(
@@ -302,15 +317,6 @@ def log(self, scheduler_stats: SchedulerStats,
             self.histogram_num_generation_tokens_request.observe(
                 finished_request.num_generation_tokens)

-        for ttft in iteration_stats.time_to_first_tokens_iter:
-            self.histogram_time_to_first_token.observe(ttft)
-        for tpot in iteration_stats.time_per_output_tokens_iter:
-            self.histogram_time_per_output_token.observe(tpot)
-        for queue_time in iteration_stats.queue_times_iter:
-            self.histogram_queue_time_request.observe(queue_time)
-        for prefill_time in iteration_stats.prefill_times_iter:
-            self.histogram_prefill_time_request.observe(prefill_time)
-
         if self.gauge_lora_info is not None:
             running_lora_adapters = \
                 ",".join(iteration_stats.running_lora_adapters.keys())

vllm/v1/metrics/stats.py (+22 -9)

@@ -67,6 +67,8 @@ class FinishedRequestStats:
     e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
+    queued_time: float = 0.0
+    prefill_time: float = 0.0
     inference_time: float = 0.0
     decode_time: float = 0.0

@@ -78,11 +80,10 @@ def __init__(self):
         self.iteration_timestamp = time.time()
         self.num_generation_tokens = 0
         self.num_prompt_tokens = 0
+        self.num_preempted_reqs = 0
         self.finished_requests: List[FinishedRequestStats] = []
         self.time_to_first_tokens_iter: List[float] = []
         self.time_per_output_tokens_iter: List[float] = []
-        self.queue_times_iter: List[float] = []
-        self.prefill_times_iter: List[float] = []
         self.waiting_lora_adapters: Dict[str, int] = {}
         self.running_lora_adapters: Dict[str, int] = {}

@@ -122,9 +123,6 @@ def update_from_output(self, output: "EngineCoreOutput",
         if is_prefilling:
             # TODO: re-enable no-output-for-partial-prefills invariant as above
             if num_new_generation_tokens > 0:
-                prefill_interval = \
-                    engine_core_timestamp - req_stats.scheduled_ts
-                self.prefill_times_iter.append(prefill_interval)
                 req_stats.first_token_ts = engine_core_timestamp
         else:
             tpot = engine_core_timestamp - req_stats.last_token_ts
@@ -145,24 +143,39 @@ def update_from_events(self, req_id: str, events: List["EngineCoreEvent"],
                 if lora_stats is not None:
                     lora_stats.waiting_requests.add(req_id)
             elif event.type == EngineCoreEventType.SCHEDULED:
-                queued_interval = event.timestamp - req_stats.queued_ts
-                self.queue_times_iter.append(queued_interval)
-                req_stats.scheduled_ts = event.timestamp
+                if req_stats.scheduled_ts == 0.0:  # ignore preemptions
+                    req_stats.scheduled_ts = event.timestamp
                 LoRARequestStates.scheduled_request(lora_stats, req_id)
+            elif event.type == EngineCoreEventType.PREEMPTED:
+                self.num_preempted_reqs += 1

     def update_from_finished_request(self, finish_reason: "FinishReason",
                                      request_output: "RequestOutput",
                                      req_stats: RequestStateStats):
         e2e_latency = self._time_since(req_stats.arrival_time)

-        inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
+        # Queued interval is from first QUEUED event to first SCHEDULED
+        queued_time = req_stats.scheduled_ts - req_stats.queued_ts
+
+        # Prefill interval is from first SCHEDULED to first NEW_TOKEN
+        # Any preemptions during prefill is included in the interval
+        prefill_time = req_stats.first_token_ts - req_stats.scheduled_ts
+
+        # Decode interval is from first NEW_TOKEN to last NEW_TOKEN
+        # Any preemptions during decode are included
         decode_time = req_stats.last_token_ts - req_stats.first_token_ts

+        # Inference interval is from first SCHEDULED to last NEW_TOKEN
+        # Any preemptions during prefill or decode are included
+        inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
+
         finished_req = \
             FinishedRequestStats(finish_reason=finish_reason,
                                  e2e_latency=e2e_latency,
                                  num_prompt_tokens=len(request_output.prompt_token_ids),
                                  num_generation_tokens=req_stats.num_generation_tokens,
+                                 queued_time=queued_time,
+                                 prefill_time=prefill_time,
                                  inference_time=inference_time,
                                  decode_time=decode_time)
         self.finished_requests.append(finished_req)
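
A small worked example of how the four intervals relate, using made-up timestamps (seconds; the numbers are illustrative only):

# Illustrative timestamps for one request's lifecycle (all values made up).
queued_ts = 10.0        # QUEUED event
scheduled_ts = 10.5     # first SCHEDULED event (later SCHEDULED events after
                        # a preemption are ignored for this timestamp)
first_token_ts = 11.5   # first NEW_TOKEN
last_token_ts = 14.0    # last NEW_TOKEN

queued_time = scheduled_ts - queued_ts          # 0.5 s waiting in the queue
prefill_time = first_token_ts - scheduled_ts    # 1.0 s, preemptions included
decode_time = last_token_ts - first_token_ts    # 2.5 s, preemptions included
inference_time = last_token_ts - scheduled_ts   # 3.5 s = prefill + decode

assert abs(inference_time - (prefill_time + decode_time)) < 1e-9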
