Skip to content

Commit d942389

Browse files
committed
Use px/agent_status_diagnostics script within px cli to detect missing
kernel headers
1 parent 9effb34 commit d942389

File tree

8 files changed

+331
-146
lines changed

8 files changed

+331
-146
lines changed

src/pixie_cli/pkg/cmd/collect_logs.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ import (
2727
"github.com/spf13/viper"
2828

2929
"px.dev/pixie/src/pixie_cli/pkg/utils"
30-
"px.dev/pixie/src/utils/shared/k8s"
30+
"px.dev/pixie/src/pixie_cli/pkg/vizier"
3131
)
3232

3333
func init() {
@@ -42,7 +42,7 @@ var CollectLogsCmd = &cobra.Command{
4242
viper.BindPFlag("namespace", cmd.Flags().Lookup("namespace"))
4343
},
4444
Run: func(cmd *cobra.Command, args []string) {
45-
c := k8s.NewLogCollector()
45+
c := vizier.NewLogCollector(mustCreateBundleReader(), viper.GetString("cloud_addr"))
4646
fName := fmt.Sprintf("pixie_logs_%s.zip", time.Now().Format("20060102150405"))
4747
err := c.CollectPixieLogs(fName)
4848
if err != nil {

src/pixie_cli/pkg/cmd/deploy.go

+20-64
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"context"
2323
"errors"
2424
"fmt"
25-
"io"
2625
"os"
2726
"strings"
2827
"time"
@@ -72,6 +71,7 @@ var BlockListedLabels = []string{
7271
}
7372

7473
func init() {
74+
DeployCmd.Flags().StringP("bundle", "b", "", "Path/URL to bundle file")
7575
DeployCmd.Flags().StringP("extract_yaml", "e", "", "Directory to extract the Pixie yamls to")
7676
DeployCmd.Flags().StringP("vizier_version", "v", "", "Pixie version to deploy")
7777
DeployCmd.Flags().BoolP("check", "c", true, "Check whether the cluster can run Pixie")
@@ -106,6 +106,7 @@ var DeployCmd = &cobra.Command{
106106
Use: "deploy",
107107
Short: "Deploys Pixie on the current K8s cluster",
108108
PreRun: func(cmd *cobra.Command, args []string) {
109+
viper.BindPFlag("bundle", cmd.Flags().Lookup("bundle"))
109110
viper.BindPFlag("extract_yaml", cmd.Flags().Lookup("extract_yaml"))
110111
viper.BindPFlag("vizier_version", cmd.Flags().Lookup("vizier_version"))
111112
viper.BindPFlag("check", cmd.Flags().Lookup("check"))
@@ -604,61 +605,6 @@ func deploy(cloudConn *grpc.ClientConn, clientset *kubernetes.Clientset, vzClien
604605
return clusterID
605606
}
606607

607-
func runSimpleHealthCheckScript(cloudAddr string, clusterID uuid.UUID) error {
608-
v, err := vizier.ConnectionToVizierByID(cloudAddr, clusterID)
609-
br := mustCreateBundleReader()
610-
if err != nil {
611-
return err
612-
}
613-
execScript := br.MustGetScript(script.AgentStatusScript)
614-
615-
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
616-
defer cancel()
617-
618-
resp, err := v.ExecuteScriptStream(ctx, execScript, nil)
619-
if err != nil {
620-
return err
621-
}
622-
623-
// TODO(zasgar): Make this use the Null output. We can't right now
624-
// because of fatal message on vizier failure.
625-
errCh := make(chan error)
626-
// Eat all responses.
627-
go func() {
628-
for {
629-
select {
630-
case <-ctx.Done():
631-
if ctx.Err() != nil {
632-
errCh <- ctx.Err()
633-
return
634-
}
635-
errCh <- nil
636-
return
637-
case msg := <-resp:
638-
if msg == nil {
639-
errCh <- nil
640-
return
641-
}
642-
if msg.Err != nil {
643-
if msg.Err == io.EOF {
644-
errCh <- nil
645-
return
646-
}
647-
errCh <- msg.Err
648-
return
649-
}
650-
if msg.Resp.Status != nil && msg.Resp.Status.Code != 0 {
651-
errCh <- errors.New(msg.Resp.Status.Message)
652-
}
653-
// Eat messages.
654-
}
655-
}
656-
}()
657-
658-
err = <-errCh
659-
return err
660-
}
661-
662608
func waitForHealthCheckTaskGenerator(cloudAddr string, clusterID uuid.UUID) func() error {
663609
return func() error {
664610
timeout := time.NewTimer(5 * time.Minute)
@@ -668,9 +614,15 @@ func waitForHealthCheckTaskGenerator(cloudAddr string, clusterID uuid.UUID) func
668614
case <-timeout.C:
669615
return errors.New("timeout waiting for healthcheck (it is possible that Pixie stabilized after the healthcheck timeout. To check if Pixie successfully deployed, run `px debug pods`)")
670616
default:
671-
err := runSimpleHealthCheckScript(cloudAddr, clusterID)
617+
_, err := vizier.RunSimpleHealthCheckScript(mustCreateBundleReader(), cloudAddr, clusterID)
672618
if err == nil {
673619
return nil
620+
} else {
621+
// The health check warning error indicates the cluster successfully deployed, but there are some warnings.
622+
// Return the error to surface them to the end user.
623+
if _, ok := err.(*vizier.HealthCheckWarning); ok {
624+
return err
625+
}
674626
}
675627
time.Sleep(5 * time.Second)
676628
}
@@ -691,13 +643,17 @@ func waitForHealthCheck(cloudAddr string, clusterID uuid.UUID, clientset *kubern
691643
hc := utils.NewSerialTaskRunner(healthCheckJobs)
692644
err := hc.RunAndMonitor()
693645
if err != nil {
694-
_ = pxanalytics.Client().Enqueue(&analytics.Track{
695-
UserId: pxconfig.Cfg().UniqueClientID,
696-
Event: "Deploy Healthcheck Failed",
697-
Properties: analytics.NewProperties().
698-
Set("err", err.Error()),
699-
})
700-
utils.WithError(err).Fatal("Failed Pixie healthcheck")
646+
if _, ok := err.(*vizier.HealthCheckWarning); ok {
647+
utils.WithError(err).Error("Pixie healthcheck detected the following warnings:")
648+
} else {
649+
_ = pxanalytics.Client().Enqueue(&analytics.Track{
650+
UserId: pxconfig.Cfg().UniqueClientID,
651+
Event: "Deploy Healthcheck Failed",
652+
Properties: analytics.NewProperties().
653+
Set("err", err.Error()),
654+
})
655+
utils.WithError(err).Fatal("Failed Pixie healthcheck")
656+
}
701657
}
702658
_ = pxanalytics.Client().Enqueue(&analytics.Track{
703659
UserId: pxconfig.Cfg().UniqueClientID,

src/pixie_cli/pkg/cmd/root.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,6 @@ var RootCmd = &cobra.Command{
203203

204204
// Name a variable to store a slice of commands that don't require cloudAddr
205205
var cmdsCloudAddrNotReqd = []*cobra.Command{
206-
CollectLogsCmd,
207206
VersionCmd,
208207
}
209208

@@ -245,7 +244,7 @@ func checkAuthForCmd(c *cobra.Command) {
245244
os.Exit(1)
246245
}
247246
switch c {
248-
case DeployCmd, UpdateCmd, GetCmd, DeployKeyCmd, APIKeyCmd:
247+
case CollectLogsCmd, DeployCmd, UpdateCmd, GetCmd, DeployKeyCmd, APIKeyCmd:
249248
utils.Errorf("These commands are unsupported in Direct Vizier mode.")
250249
os.Exit(1)
251250
default:
@@ -254,7 +253,7 @@ func checkAuthForCmd(c *cobra.Command) {
254253
}
255254

256255
switch c {
257-
case DeployCmd, UpdateCmd, RunCmd, LiveCmd, GetCmd, ScriptCmd, DeployKeyCmd, APIKeyCmd:
256+
case CollectLogsCmd, DeployCmd, UpdateCmd, RunCmd, LiveCmd, GetCmd, ScriptCmd, DeployKeyCmd, APIKeyCmd:
258257
authenticated := auth.IsAuthenticated(viper.GetString("cloud_addr"))
259258
if !authenticated {
260259
utils.Errorf("Failed to authenticate. Please retry `px auth login`.")

src/pixie_cli/pkg/vizier/BUILD.bazel

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ go_library(
2525
"data_formatter.go",
2626
"errors.go",
2727
"lister.go",
28+
"logs.go",
2829
"script.go",
2930
"stream_adapter.go",
3031
"utils.go",

src/pixie_cli/pkg/vizier/logs.go

+144
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/*
2+
* Copyright 2018- The Pixie Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
* SPDX-License-Identifier: Apache-2.0
17+
*/
18+
19+
package vizier
20+
21+
import (
22+
"archive/zip"
23+
"context"
24+
"errors"
25+
"os"
26+
"strings"
27+
28+
log "github.com/sirupsen/logrus"
29+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+
"k8s.io/client-go/kubernetes"
31+
"k8s.io/client-go/rest"
32+
33+
"px.dev/pixie/src/utils/script"
34+
"px.dev/pixie/src/utils/shared/k8s"
35+
)
36+
37+
// LogCollector collects logs for Pixie together with cluster setup
// information. It embeds k8s.LogCollector, whose methods (for example
// LogPodInfoToZipFile, WritePodDescription and LogKubeCmd) are called
// directly on this type by CollectPixieLogs.
38+
type LogCollector struct {
39+
// k8sConfig is the REST config returned by k8s.GetConfig at construction.
k8sConfig *rest.Config
40+
// k8sClientSet is the clientset used to list Pixie pods across namespaces.
k8sClientSet *kubernetes.Clientset
41+
// cloudAddr is the Pixie cloud address used to look up the current Vizier
// and to run the healthcheck script.
cloudAddr string
42+
// br is the script bundle handed to RunSimpleHealthCheckScript.
br *script.BundleManager
43+
// Embedded Kubernetes log collector that performs the actual zip writes.
k8s.LogCollector
44+
}
45+
46+
// NewLogCollector creates a new log collector.
47+
func NewLogCollector(br *script.BundleManager, cloudAddr string) *LogCollector {
48+
cfg := k8s.GetConfig()
49+
cs := k8s.GetClientset(cfg)
50+
return &LogCollector{
51+
cfg,
52+
cs,
53+
cloudAddr,
54+
br,
55+
*k8s.NewLogCollector(),
56+
}
57+
}
58+
59+
// CollectPixieLogs collects logs for all Pixie pods and write them to the zip file fName.
60+
func (c *LogCollector) CollectPixieLogs(fName string) error {
61+
if !strings.HasSuffix(fName, ".zip") {
62+
return errors.New("fname must have .zip suffix")
63+
}
64+
f, err := os.Create(fName)
65+
if err != nil {
66+
return err
67+
}
68+
defer f.Close()
69+
70+
zf := zip.NewWriter(f)
71+
defer zf.Close()
72+
73+
vls := k8s.VizierLabelSelector()
74+
vizierLabelSelector := metav1.FormatLabelSelector(&vls)
75+
76+
// We check across all namespaces for the matching pixie pods.
77+
vizierPodList, err := c.k8sClientSet.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{LabelSelector: vizierLabelSelector})
78+
if err != nil {
79+
return err
80+
}
81+
82+
// We also need to get the logs the operator logs.
83+
// As the LabelSelectors are ANDed, we need to make a new query and merge
84+
// the results.
85+
ols := k8s.OperatorLabelSelector()
86+
operatorLabelSelector := metav1.FormatLabelSelector(&ols)
87+
88+
operatorPodList, err := c.k8sClientSet.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{LabelSelector: operatorLabelSelector})
89+
if err != nil {
90+
return err
91+
}
92+
93+
// Merge the two pod lists
94+
pods := append(vizierPodList.Items, operatorPodList.Items...)
95+
96+
for _, pod := range pods {
97+
for _, containerStatus := range pod.Status.ContainerStatuses {
98+
// Ignore prev logs, they might not exist.
99+
_ = c.LogPodInfoToZipFile(zf, pod, containerStatus.Name, true)
100+
101+
err := c.LogPodInfoToZipFile(zf, pod, containerStatus.Name, false)
102+
if err != nil {
103+
log.WithError(err).Warnf("Failed to log pod: %s", pod.Name)
104+
}
105+
}
106+
err = c.WritePodDescription(zf, pod)
107+
if err != nil {
108+
log.WithError(err).Warnf("failed to write pod description")
109+
}
110+
}
111+
112+
err = c.LogKubeCmd(zf, "nodes.log", "describe", "node")
113+
if err != nil {
114+
log.WithError(err).Warn("failed to log node info")
115+
}
116+
117+
err = c.LogKubeCmd(zf, "services.log", "describe", "services", "--all-namespaces", "-l", vizierLabelSelector)
118+
if err != nil {
119+
log.WithError(err).Warnf("failed to log services")
120+
}
121+
122+
// Describe vizier and write it to vizier.log
123+
err = c.LogKubeCmd(zf, "vizier.log", "describe", "vizier", "--all-namespaces")
124+
if err != nil {
125+
log.WithError(err).Warnf("failed to log vizier crd")
126+
}
127+
128+
clusterID, err := GetCurrentVizier(c.cloudAddr)
129+
if err != nil {
130+
log.WithError(err).Warnf("failed to get cluster ID")
131+
}
132+
outputCh, err := RunSimpleHealthCheckScript(c.br, c.cloudAddr, clusterID)
133+
134+
if err != nil {
135+
entry := log.WithError(err)
136+
if _, ok := err.(*HealthCheckWarning); ok {
137+
entry.Warn("healthcheck script detected the following warnings:")
138+
} else {
139+
entry.Warn("failed to run healthcheck script")
140+
}
141+
}
142+
143+
return c.LogOutputToZipFile(zf, "px_agent_diagnostics.txt", <-outputCh)
144+
}

0 commit comments

Comments
 (0)