@@ -67,6 +67,8 @@ class FinishedRequestStats:
67
67
e2e_latency : float = 0.0
68
68
num_prompt_tokens : int = 0
69
69
num_generation_tokens : int = 0
70
+ queued_time : float = 0.0
71
+ prefill_time : float = 0.0
70
72
inference_time : float = 0.0
71
73
decode_time : float = 0.0
72
74
@@ -78,11 +80,10 @@ def __init__(self):
78
80
self .iteration_timestamp = time .time ()
79
81
self .num_generation_tokens = 0
80
82
self .num_prompt_tokens = 0
83
+ self .num_preempted_reqs = 0
81
84
self .finished_requests : List [FinishedRequestStats ] = []
82
85
self .time_to_first_tokens_iter : List [float ] = []
83
86
self .time_per_output_tokens_iter : List [float ] = []
84
- self .queue_times_iter : List [float ] = []
85
- self .prefill_times_iter : List [float ] = []
86
87
self .waiting_lora_adapters : Dict [str , int ] = {}
87
88
self .running_lora_adapters : Dict [str , int ] = {}
88
89
@@ -122,9 +123,6 @@ def update_from_output(self, output: "EngineCoreOutput",
122
123
if is_prefilling :
123
124
# TODO: re-enable no-output-for-partial-prefills invariant as above
124
125
if num_new_generation_tokens > 0 :
125
- prefill_interval = \
126
- engine_core_timestamp - req_stats .scheduled_ts
127
- self .prefill_times_iter .append (prefill_interval )
128
126
req_stats .first_token_ts = engine_core_timestamp
129
127
else :
130
128
tpot = engine_core_timestamp - req_stats .last_token_ts
@@ -145,24 +143,39 @@ def update_from_events(self, req_id: str, events: List["EngineCoreEvent"],
145
143
if lora_stats is not None :
146
144
lora_stats .waiting_requests .add (req_id )
147
145
elif event .type == EngineCoreEventType .SCHEDULED :
148
- queued_interval = event .timestamp - req_stats .queued_ts
149
- self .queue_times_iter .append (queued_interval )
150
- req_stats .scheduled_ts = event .timestamp
146
+ if req_stats .scheduled_ts == 0.0 : # ignore preemptions
147
+ req_stats .scheduled_ts = event .timestamp
151
148
LoRARequestStates .scheduled_request (lora_stats , req_id )
149
+ elif event .type == EngineCoreEventType .PREEMPTED :
150
+ self .num_preempted_reqs += 1
152
151
153
152
def update_from_finished_request (self , finish_reason : "FinishReason" ,
154
153
request_output : "RequestOutput" ,
155
154
req_stats : RequestStateStats ):
156
155
e2e_latency = self ._time_since (req_stats .arrival_time )
157
156
158
- inference_time = req_stats .last_token_ts - req_stats .scheduled_ts
157
+ # Queued interval is from first QUEUED event to first SCHEDULED
158
+ queued_time = req_stats .scheduled_ts - req_stats .queued_ts
159
+
160
+ # Prefill interval is from first SCHEDULED to first NEW_TOKEN
161
+ # Any preemptions during prefill are included in the interval
162
+ prefill_time = req_stats .first_token_ts - req_stats .scheduled_ts
163
+
164
+ # Decode interval is from first NEW_TOKEN to last NEW_TOKEN
165
+ # Any preemptions during decode are included
159
166
decode_time = req_stats .last_token_ts - req_stats .first_token_ts
160
167
168
+ # Inference interval is from first SCHEDULED to last NEW_TOKEN
169
+ # Any preemptions during prefill or decode are included
170
+ inference_time = req_stats .last_token_ts - req_stats .scheduled_ts
171
+
161
172
finished_req = \
162
173
FinishedRequestStats (finish_reason = finish_reason ,
163
174
e2e_latency = e2e_latency ,
164
175
num_prompt_tokens = len (request_output .prompt_token_ids ),
165
176
num_generation_tokens = req_stats .num_generation_tokens ,
177
+ queued_time = queued_time ,
178
+ prefill_time = prefill_time ,
166
179
inference_time = inference_time ,
167
180
decode_time = decode_time )
168
181
self .finished_requests .append (finished_req )
0 commit comments