From 13c2a7ded61e423c7ca6b8cac964f6badcebe2e3 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 11 Nov 2022 11:59:44 -0500 Subject: [PATCH] testiso: improve error when QEMU is terminated Right now when QEMU gets killed during a testiso run, the error is: FAIL: pxe-offline-install (bios + metal) (1m10.277s) Got EOF from completion channel, coreos-installer-test-OK expected This is accurate but doesn't hint well enough at the underlying cause. Rework the two spots in which we wait for virtio-serial strings to also check if QEMU was killed to provide a better error. E.g.: FAIL: pxe-install (bios + metal) (34.483s) QEMU unexpectedly exited while awaiting completion: process killed Related: https://github.com/coreos/fedora-coreos-tracker/issues/1339 --- mantle/cmd/kola/testiso.go | 10 +++++++++- mantle/platform/metal.go | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/mantle/cmd/kola/testiso.go b/mantle/cmd/kola/testiso.go index 0ceba99b88..a848e7d845 100644 --- a/mantle/cmd/kola/testiso.go +++ b/mantle/cmd/kola/testiso.go @@ -537,8 +537,12 @@ func awaitCompletion(ctx context.Context, inst *platform.QemuInstance, outdir st } go func() { err := inst.Wait() + // only one Wait() gets process data, so also manually check for signal + if err == nil && inst.Signaled() { + err = errors.New("process killed") + } if err != nil { - errchan <- err + errchan <- errors.Wrapf(err, "QEMU unexpectedly exited while awaiting completion") } time.Sleep(1 * time.Minute) errchan <- fmt.Errorf("QEMU exited; timed out waiting for completion") @@ -549,6 +553,10 @@ func awaitCompletion(ctx context.Context, inst *platform.QemuInstance, outdir st l, err := r.ReadString('\n') if err != nil { if err == io.EOF { + // this may be from QEMU getting killed or exiting; wait a bit + // to give a chance for .Wait() above to feed the channel with a + // better error + time.Sleep(1 * time.Second) errchan <- fmt.Errorf("Got EOF from completion channel, %s 
expected", exp) } else { errchan <- errors.Wrapf(err, "reading from completion channel") diff --git a/mantle/platform/metal.go b/mantle/platform/metal.go index 89d8cb88ab..fbcbc32825 100644 --- a/mantle/platform/metal.go +++ b/mantle/platform/metal.go @@ -24,6 +24,7 @@ import ( "os" "path/filepath" "strings" + "time" coreosarch "github.com/coreos/stream-metadata-go/arch" "github.com/pkg/errors" @@ -435,11 +436,25 @@ func (t *installerRun) completePxeSetup(kargs []string) error { func switchBootOrderSignal(qinst *QemuInstance, bootstartedchan *os.File, booterrchan *chan error) { *booterrchan = make(chan error) + go func() { + err := qinst.Wait() + // only one Wait() gets process data, so also manually check for signal + if err == nil && qinst.Signaled() { + err = errors.New("process killed") + } + if err != nil { + *booterrchan <- errors.Wrapf(err, "QEMU unexpectedly exited while waiting for %s", bootStartedSignal) + } + }() go func() { r := bufio.NewReader(bootstartedchan) l, err := r.ReadString('\n') if err != nil { if err == io.EOF { + // this may be from QEMU getting killed or exiting; wait a bit + // to give a chance for .Wait() above to feed the channel with a + // better error + time.Sleep(1 * time.Second) *booterrchan <- fmt.Errorf("Got EOF from boot started channel, %s expected", bootStartedSignal) } else { *booterrchan <- errors.Wrapf(err, "reading from boot started channel")