-
Notifications
You must be signed in to change notification settings - Fork 215
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(telemetry): Application Insights Shutdown on Retina Operator #1227
base: main
Are you sure you want to change the base?
Changes from all commits
cbf4179
e2dcffb
75dcf89
ae09d71
e91b496
75f2ee7
094e24c
21af2b2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -8,7 +8,6 @@ import ( | |||||||||||||
"fmt" | ||||||||||||||
"net/http" | ||||||||||||||
"net/http/pprof" | ||||||||||||||
"os" | ||||||||||||||
"time" | ||||||||||||||
|
||||||||||||||
"go.uber.org/zap/zapcore" | ||||||||||||||
|
@@ -40,6 +39,7 @@ import ( | |||||||||||||
retinaendpointcontroller "github.com/microsoft/retina/pkg/controllers/operator/retinaendpoint" | ||||||||||||||
"github.com/microsoft/retina/pkg/log" | ||||||||||||||
"github.com/microsoft/retina/pkg/telemetry" | ||||||||||||||
"github.com/pkg/errors" | ||||||||||||||
) | ||||||||||||||
|
||||||||||||||
var ( | ||||||||||||||
|
@@ -81,7 +81,7 @@ func NewOperator(metricsAddr, probeAddr, configFile string, enableLeaderElection | |||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
func (o *Operator) Start() { | ||||||||||||||
func (o *Operator) Start() error { | ||||||||||||||
mainLogger = log.Logger().Named("main") | ||||||||||||||
|
||||||||||||||
mainLogger.Sugar().Infof("Starting legacy operator") | ||||||||||||||
|
@@ -93,8 +93,7 @@ func (o *Operator) Start() { | |||||||||||||
var err error | ||||||||||||||
oconfig, err = config.GetConfig(o.configFile) | ||||||||||||||
if err != nil { | ||||||||||||||
fmt.Printf("failed to load config with err %s", err.Error()) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "failed to load config") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
mainLogger.Sugar().Infof("Operator configuration", zap.Any("configuration", oconfig)) | ||||||||||||||
|
@@ -103,15 +102,9 @@ func (o *Operator) Start() { | |||||||||||||
oconfig.CaptureConfig.CaptureImageVersion = buildinfo.Version | ||||||||||||||
oconfig.CaptureConfig.CaptureImageVersionSource = captureUtils.VersionSourceOperatorImageVersion | ||||||||||||||
|
||||||||||||||
if err != nil { | ||||||||||||||
fmt.Printf("failed to load config with err %s", err.Error()) | ||||||||||||||
os.Exit(1) | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
err = initLogging(oconfig, buildinfo.ApplicationInsightsID) | ||||||||||||||
if err != nil { | ||||||||||||||
fmt.Printf("failed to initialize logging with err %s", err.Error()) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "failed to initialize logging") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
ctrl.SetLogger(crzap.New(crzap.UseFlagOptions(opts), crzap.Encoder(zapcore.NewConsoleEncoder(log.EncoderConfig())))) | ||||||||||||||
|
@@ -138,15 +131,13 @@ func (o *Operator) Start() { | |||||||||||||
// LeaderElectionReleaseOnCancel: true, | ||||||||||||||
}) | ||||||||||||||
if err != nil { | ||||||||||||||
mainLogger.Error("Unable to start manager", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "unable to start manager") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
ctx := context.Background() | ||||||||||||||
clientset, err := apiextv1.NewForConfig(mgr.GetConfig()) | ||||||||||||||
if err != nil { | ||||||||||||||
mainLogger.Error("Failed to get apiextension clientset", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "failed to get apiextension clientset") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if oconfig.InstallCRDs { | ||||||||||||||
|
@@ -155,8 +146,7 @@ func (o *Operator) Start() { | |||||||||||||
var crds map[string]*v1.CustomResourceDefinition | ||||||||||||||
crds, err = deploy.InstallOrUpdateCRDs(ctx, oconfig.EnableRetinaEndpoint, clientset) | ||||||||||||||
if err != nil { | ||||||||||||||
mainLogger.Error("unable to register CRDs", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "unable to register CRDs") | ||||||||||||||
} | ||||||||||||||
for name := range crds { | ||||||||||||||
mainLogger.Info("CRD registered", zap.String("name", name)) | ||||||||||||||
|
@@ -165,8 +155,7 @@ func (o *Operator) Start() { | |||||||||||||
|
||||||||||||||
apiserverURL, err := telemetry.GetK8SApiserverURLFromKubeConfig() | ||||||||||||||
if err != nil { | ||||||||||||||
mainLogger.Error("Apiserver URL is cannot be found", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "apiserver URL is cannot be found") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
var tel telemetry.Telemetry | ||||||||||||||
|
@@ -176,10 +165,14 @@ func (o *Operator) Start() { | |||||||||||||
"version": buildinfo.Version, | ||||||||||||||
telemetry.PropertyApiserver: apiserverURL, | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
telemetry.InitAppInsights(buildinfo.ApplicationInsightsID, buildinfo.Version) | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought we verified in the past that operator was sending logs to app insights. I'm confused what this function does We also seem to init app insights here and with a different config: Lines 95 to 100 in a2c261f
@jimassa / @matmerr what's the difference between configuring zap with AI vs initializing AI client? What's the impact of having different configs? |
||||||||||||||
defer telemetry.ShutdownAppInsights() | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: let's shutdown wherever we handle the error? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand the request @huntergregory . The defer will run when we return the errors? |
||||||||||||||
defer telemetry.TrackPanic() | ||||||||||||||
|
||||||||||||||
tel, err = telemetry.NewAppInsightsTelemetryClient("retina-operator", properties) | ||||||||||||||
if err != nil { | ||||||||||||||
mainLogger.Error("failed to create telemetry client", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "failed to create telemetry client") | ||||||||||||||
} | ||||||||||||||
} else { | ||||||||||||||
mainLogger.Info("telemetry disabled", zap.String("apiserver", apiserverURL)) | ||||||||||||||
|
@@ -188,20 +181,17 @@ func (o *Operator) Start() { | |||||||||||||
|
||||||||||||||
kubeClient, err := kubernetes.NewForConfig(mgr.GetConfig()) | ||||||||||||||
if err != nil { | ||||||||||||||
mainLogger.Error("Failed to get clientset", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "failed to get clientset") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
captureReconciler, err := captureController.NewCaptureReconciler( | ||||||||||||||
mgr.GetClient(), mgr.GetScheme(), kubeClient, oconfig.CaptureConfig, | ||||||||||||||
) | ||||||||||||||
if err != nil { | ||||||||||||||
mainLogger.Error("Unable to create capture reconciler", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "unable to create capture reconciler") | ||||||||||||||
} | ||||||||||||||
if err = captureReconciler.SetupWithManager(mgr); err != nil { | ||||||||||||||
mainLogger.Error("Unable to setup retina capture controller with manager", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "unable to setup retina capture controller with manager") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
ctrlCtx := ctrl.SetupSignalHandler() | ||||||||||||||
|
@@ -225,37 +215,34 @@ func (o *Operator) Start() { | |||||||||||||
|
||||||||||||||
pc := podcontroller.New(mgr.GetClient(), mgr.GetScheme(), retinaendpointchannel) | ||||||||||||||
if err = (pc).SetupWithManager(mgr); err != nil { | ||||||||||||||
mainLogger.Error("Unable to create controller", zap.String("controller", "podcontroller"), zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "unable to create controller - podcontroller") | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
mc := metricsconfiguration.New(mgr.GetClient(), mgr.GetScheme()) | ||||||||||||||
if err = (mc).SetupWithManager(mgr); err != nil { | ||||||||||||||
mainLogger.Error("Unable to create controller", zap.String("controller", "metricsconfiguration"), zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "unable to create controller - metricsconfiguration") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
//+kubebuilder:scaffold:builder | ||||||||||||||
|
||||||||||||||
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { | ||||||||||||||
mainLogger.Error("Unable to set up health check", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "unable to set up health check") | ||||||||||||||
} | ||||||||||||||
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { | ||||||||||||||
mainLogger.Error("Unable to set up ready check", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "unable to set up ready check") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
mainLogger.Info("Starting manager") | ||||||||||||||
if err := mgr.Start(ctrlCtx); err != nil { | ||||||||||||||
mainLogger.Error("Problem running manager", zap.Error(err)) | ||||||||||||||
os.Exit(1) | ||||||||||||||
return errors.Wrap(err, "problem running manager") | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
// start heartbeat goroutine for application insights | ||||||||||||||
go tel.Heartbeat(ctx, HeartbeatFrequency) | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think Start is a blocking call, so it never gets reached. Heartbeat should be called above |
||||||||||||||
|
||||||||||||||
return nil | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
func EnablePProf() { | ||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ import ( | |
"os" | ||
|
||
"github.com/microsoft/retina/operator/cmd/legacy" | ||
"github.com/pkg/errors" | ||
"github.com/spf13/cobra" | ||
) | ||
|
||
|
@@ -25,10 +26,14 @@ var ( | |
Use: "retina-operator", | ||
Short: "Retina Operator", | ||
Long: "Start Retina Operator", | ||
Run: func(cmd *cobra.Command, args []string) { | ||
RunE: func(_ *cobra.Command, _ []string) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does RunE do? Another option is to leave as Run and os.exit on err? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, and importantly it will return a non-zero exit on |
||
fmt.Println("Starting Retina Operator") | ||
d := legacy.NewOperator(metricsAddr, probeAddr, cfgFile, enableLeaderElection) | ||
d.Start() | ||
|
||
if err := d.Start(); err != nil { | ||
return errors.Wrap(err, "failed to start retina-operator") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's log the error before the program exits There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm going to amend that. "Let's assure that any error returned is logged immediately before Log in-situ and return error is redundant and carries a lot of mental overhead both in reading and writing. The simple rule of "if an error makes it to the top of the call stack, it gets logged, loudly" is semantically cleaner and does the right thing by default. When combined with "all errors are wrapped," you also get detailed context to track down what went wrong. |
||
} | ||
return nil | ||
}, | ||
} | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.