import random
import time
import traceback
+ import warnings
from contextlib import contextmanager
from typing import TYPE_CHECKING, NamedTuple
@@ -66,11 +67,20 @@ class FluidExecutor(TrialExecutor):
    def __init__(self, **kwargs):
        super().__init__(queue_trials=True)  # type: ignore

+         # whether we are in a testing environment without GPUs
+         self._fake_gpus = False
+
        # resources
        self._avail_resources = Resources(cpu=0, gpu=0)
        self._committed_resources = Resources(cpu=0, gpu=0)
        self._resources_initialized = False
        self._last_resource_refresh = float("-inf")
+         # set of trials that have resources committed.
+         # These are usually the trials in jobs_running, but a trial may be in
+         # _trials_running and not in jobs_running, because fetch_result was called on it.
+         # This set is maintained solely by _commit_resources/_return_resources.
+         self._trials_running: Set[Trial] = set()

        # make sure our own GPU resources are created first in the cluster
        create_custom_gpu_res()
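The `_trials_running` set added above records which trials currently hold committed resources, so a second return for the same trial becomes a no-op. A minimal standalone sketch of that bookkeeping pattern, using only plain Python (the `ResourceLedger` name and its fields are illustrative, not part of this patch):

class ResourceLedger:
    """Toy ledger mirroring the _trials_running / _committed_resources pairing."""

    def __init__(self):
        self.committed_cpu = 0.0
        self.holders = set()  # plays the role of FluidExecutor._trials_running

    def commit(self, trial_id: str, cpus: float):
        self.holders.add(trial_id)
        self.committed_cpu += cpus

    def give_back(self, trial_id: str, cpus: float):
        if trial_id not in self.holders:
            return  # already returned, e.g. stopped after fetch_result
        self.holders.remove(trial_id)
        self.committed_cpu -= cpus
        assert self.committed_cpu >= 0


ledger = ResourceLedger()
ledger.commit("trial_a", 2)
ledger.give_back("trial_a", 2)
ledger.give_back("trial_a", 2)  # second return is safely ignored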
@@ -169,6 +179,9 @@ def _fluid(self, meta: TrialGroupMeta):
        self._dump_groups()
        # set of trials to consider
        A = {trial.trial_id for trial in self._trial_group(meta.grp)}
+         logger.debug(
+             f"_fluid: meta.perf.trials_missing_info={meta.perf.trials_missing_info} meta.trials={meta.trials}, meta.grp={meta.grp}, trial_groups={self.trial_groups}, A={A}"
+         )
        # assignment of resources
        W: Dict[str, Resources] = {}
        # compute the new idle resources if every trial in this group were stopped
@@ -209,8 +222,8 @@ def _fluid(self, meta: TrialGroupMeta):
        # \frac{1}{c}),
        # d
        # )$$
-         c = 1 / 2  # TODO: calc c
-         d = 4  # TODO: calc d
+         c = 1 / 2
+         d = 4
        w = np.minimum(
            np.maximum(np.floor(H1 * np.size(H1) / np.sum(H1)), 1 / c), d
        )
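The clamp above is the tail of the commented formula: each trial's share is floor(H1_i * n / sum(H1)), bounded below by 1/c = 2 and above by d = 4. A small numeric sketch with made-up scores (the H1 values are illustrative only, not real measurements):

import numpy as np

# Illustrative per-trial scores for a four-trial group.
H1 = np.array([20.0, 2.0, 1.0, 1.0])
c = 1 / 2
d = 4
w = np.minimum(np.maximum(np.floor(H1 * np.size(H1) / np.sum(H1)), 1 / c), d)
print(w)  # [3. 2. 2. 2.] -- shares clamped to the range [1/c, d] = [2, 4]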
@@ -224,7 +237,7 @@ def _fluid(self, meta: TrialGroupMeta):

    def _ensure_W(self, W: Dict[str, Resources], meta: TrialGroupMeta):
        """Adjust group resources given in W"""
-         logger.debug(f"ensure_W: meta.trials={meta.trials}")
+         logger.debug(f"ensure_W: W={W} meta.trials={meta.trials}")
        # stop any trials with 0 res
        # this has to be done first to free up resources for others to use
        for trial_id, res in W.items():
@@ -233,31 +246,37 @@ def _ensure_W(self, W: Dict[str, Resources], meta: TrialGroupMeta):
                # add to paused, then ensure_stop, we do not change trial's status which is visible outside
                running = self._find_running(trial)
                if running is not None:
+                     # don't call pause_trial, which will trigger another fluid reschedule
                    self.jobs_paused[running.in_flight_future] = running
-                     self._ensure_stop(running.trial)
-                 else:
-                     trial.resources = res
-                     self.start_trial(trial)
+                     self._ensure_stop(running.trial)
+                 trial.resources = res
+                 # add to pending
+                 self.start_trial(trial)
        # adjust any trials with different res, including any not already running
        for trial_id, res in W.items():
            # use trial group to map trial_id to trial
            trial = self.trial_groups[trial_id].trial

            if res.cpu_total() + res.gpu_total() == 0:
+                 # already handled in the loop above
                continue

-             running = self._find_running(trial)
-             if running is not None and (
-                 # trial.resources != res
+             if (
+                 # current_res != res
                Resources.subtract(trial.resources, res).is_nonnegative()
                != Resources.subtract(res, trial.resources).is_nonnegative()
            ):
-                 self.jobs_paused[running.in_flight_future] = running
-                 self._ensure_stop(running.trial)
+                 running = self._find_running(trial)
+                 if running is not None:
+                     # don't call pause_trial, which will trigger another fluid reschedule
+                     self.jobs_paused[running.in_flight_future] = running

-             # construct PendingJob and use _kickoff to start the trial
-             pending = PendingJob(trial, None, True)
-             self._kickoff(pending, res)
+                 self._ensure_stop(trial)
+
+             # at this point, the job is always stopped but not in the pending queue,
+             # because fluid clears the pending queue.
+             trial.resources = res
+             self._kickoff(PendingJob(trial, None, True), res)

    def _find_group(self, trial: Trial) -> TrialGroupMeta:
        return self.trial_group_meta[self.trial_groups[trial.trial_id].group]
@@ -280,6 +299,9 @@ def _find_running(self, trial: Trial) -> Optional[RunningJob]:
        for _, job in self.jobs_running.items():
            if job.trial == trial:
                return job
+         logger.debug(
+             f"Could not find running trial: {trial}, currently running ones are {[job for _, job in self.jobs_running.items()]}"
+         )

    def _find_pending(self, trial: Trial) -> Optional[PendingJob]:
        for job in self.jobs_pending:
@@ -296,7 +318,7 @@ def _setup_remote_runner(

        cls = ray.remote(
            num_cpus=res.cpu,
-             num_gpus=res.gpu,
+             num_gpus=0 if self._fake_gpus else res.gpu,
            memory=res.memory,
            object_store_memory=res.object_store_memory,
            resources=res.custom_resources,
@@ -335,12 +357,11 @@ def _kickoff(self, pending: PendingJob, res: Resources) -> Optional[RunningJob]:
        May return None if failed to start
        """
        trial = pending.trial
-         self._commit_resources(res)
-
        # this is needed for the Trainer to setup distributed training
        # TODO: figure what config key is also needed to set resource info
        trial.resources = res

+         self._commit_resources(trial)
        try:
            reuse_allowed = pending.checkpoint is not None or trial.has_checkpoint()
            runner = self._setup_remote_runner(trial, res, reuse_allowed)
@@ -382,6 +403,7 @@ def _kickoff(self, pending: PendingJob, res: Resources) -> Optional[RunningJob]:
                stop_logger=True,
                # NOTE that we don't return the resources, since they may have been lost.
                release_resources=False,
+                 update_status=True,
            )

    def _ensure_train(self, trial: Trial) -> RunningJob:
@@ -395,31 +417,40 @@ def _ensure_train(self, trial: Trial) -> RunningJob:
            fut = _LocalWrapper(fut)
        running = RunningJob(trial, fut)
        self.jobs_running[fut] = running
+         logger.debug(f"Set trial to running: {trial}, jobs_running={self.jobs_running}")
        return running

    def _ensure_stop(
-         self, trial, error=False, error_msg="", stop_logger=True, release_resources=True
+         self,
+         trial,
+         error=False,
+         error_msg="",
+         stop_logger=True,
+         release_resources=True,
+         update_status=False,
    ):
        """Stops the trial and its logger
        Handles any error
        """
+         logger.debug(f"_ensure_stop: trial.resources={trial.resources}")
        if stop_logger:
            trial.close_logger()

        prior_status = trial.status
-         self.set_status(trial, Trial.ERROR if error else Trial.TERMINATED)
        trial.set_location(Location())
+         if update_status:
+             self.set_status(trial, Trial.ERROR if error else Trial.TERMINATED)

        # remove from running
        in_flight = [j for _, j in self.jobs_running.items() if j.trial == trial]
        for j in in_flight:
            self.jobs_running.pop(j.in_flight_future)
-         if release_resources:
-             logger.debug("Trial %s: Returning resources.", trial)
-             self._return_resources(trial.resources)
        if in_flight:
            if prior_status not in [Trial.RUNNING, Trial.ERROR]:
                assert False, "trial status invalid"
+         # release resources
+         if release_resources:
+             self._return_resources(trial)

        # remove from trial group
        # del self.trial_groups[trial.trial_id]
@@ -451,7 +482,7 @@ def start_trial(self, trial, checkpoint=None, train=True):
    def stop_trial(self, trial, error=False, error_msg=None, stop_logger=True):
        """Add to to-stop queue and reschedule"""
        logger.debug("stop_trial %s", trial)
-         self._ensure_stop(trial, error, error_msg, stop_logger)
+         self._ensure_stop(trial, error, error_msg, stop_logger, update_status=True)
        meta = self._find_group(trial)
        self._fluid(meta)
@@ -540,6 +571,13 @@ def get_next_failed_trial(self) -> Optional[Trial]:
        return None

    def fetch_result(self, trial):
+         """
+         Note that this removes the trial from the running queue, so the caller
+         must later continue_training/stop/pause the trial to keep the system
+         state consistent.
+
+         This is usually called from the runner once it knows the future for this
+         trial is ready.
+         """
        running_job = self._find_running(trial)
        assert running_job, "Trial was not running"
        self.jobs_running.pop(running_job.in_flight_future)
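The docstring added above describes a caller contract: once fetch_result has popped the trial from jobs_running, the caller must immediately continue, stop, or pause it. A hedged sketch of such a caller, where handle_ready_trial and the should_pause callback are hypothetical helpers and the result is assumed to be a Tune-style dict (not code from this patch):

def handle_ready_trial(executor, trial, should_pause):
    result = executor.fetch_result(trial)   # trial leaves jobs_running here
    # Exactly one of the branches below must run to keep executor state consistent.
    if result.get("done"):
        executor.stop_trial(trial)          # terminate; _fluid reschedules the group
    elif should_pause(trial, result):
        executor.pause_trial(trial)         # give the resources back for rescheduling
    else:
        executor.continue_training(trial)   # queue the next training step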
@@ -675,11 +713,6 @@ def export_trial_if_needed(self, trial: Trial):
    def cleanup(self):
        self._trial_cleanup.cleanup(partial=False)

-     def has_gpus(self):
-         if not self._resources_initialized:
-             self._update_avail_resources()
-         return self._avail_resources.gpu > 0
-
    def on_step_begin(self, trial_runner):
        """Before step() called, update the available resources."""
        self._update_avail_resources()
@@ -722,6 +755,16 @@ def _update_avail_resources(self, num_retries=5):
        )
        custom_resources = resources

+         if num_gpus == 0:
+             warnings.warn(
+                 "No GPU resources found, assuming local test, using CPU resources instead"
+             )
+             # local test
+             num_gpus = num_cpus
+             self._fake_gpus = True
+         else:
+             self._fake_gpus = False
+
        avail_resources = Resources(
            int(num_cpus),
            int(num_gpus),
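A hedged usage sketch of the fallback above: on a machine with no GPUs, Ray reports zero GPU capacity, the executor warns and counts each CPU as a stand-in GPU slot, and _setup_remote_runner later creates the actor with num_gpus=0. The cluster size here is illustrative:

import ray

ray.init(num_cpus=4, num_gpus=0)   # local test cluster with no GPUs
print(ray.cluster_resources())     # CPUs present, no usable GPU capacity
# _update_avail_resources() would now warn, set num_gpus = num_cpus = 4 and
# self._fake_gpus = True, so trials are still scheduled as if 4 GPUs existed,
# while the actors themselves request no real GPUs.
ray.shutdown()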
@@ -742,7 +785,10 @@ def _update_avail_resources(self, num_retries=5):
    def idle_resources(self) -> Resources:
        return Resources.subtract(self._avail_resources, self._committed_resources)

-     def _commit_resources(self, resources):
+     def _commit_resources(self, trial: Trial):
+         resources = trial.resources
+         self._trials_running.add(trial)
+
        committed = self._committed_resources
        all_keys = set(resources.custom_resources).union(
            set(committed.custom_resources)
@@ -759,8 +805,15 @@ def _commit_resources(self, resources):
            committed.object_store_memory + resources.object_store_memory_total(),
            custom_resources=custom_resources,
        )
+         logger.debug(f"Committed res={resources} -> {self._committed_resources}")
+
+     def _return_resources(self, trial: Trial):
+         if trial not in self._trials_running:
+             return
+         logger.debug("Trial %s: Returning resources.", trial)
+         self._trials_running.remove(trial)
+         resources = trial.resources

-     def _return_resources(self, resources):
        committed = self._committed_resources

        all_keys = set(resources.custom_resources).union(
@@ -778,7 +831,9 @@ def _return_resources(self, resources):

        assert (
            self._committed_resources.is_nonnegative()
-         ), "Resource invalid: {}".format(resources)
+         ), "Resource invalid: {} - {} = {}".format(
+             committed, resources, self._committed_resources
+         )

    def on_no_available_trials(self, trial_runner):
        """This is called when we have gotten all trials in a batch from the search algo"""