Skip to content

Commit

Permalink
SYSENG-1875 / add http_client_request metrics for the anexia engine r…
Browse files Browse the repository at this point in the history
…equests (#390)

<!--- Please leave a helpful description of the pull request here. --->

### Checklist

* [x] added release notes to `Unreleased` section in
[CHANGELOG.md](CHANGELOG.md)

### References

This MR enables the anexia ccm to export http_client_request_* metrics
for requests towards the anexia engine. This will help us identify rate
limiting errors, which have occurred more frequently recently when
accessing the anexia engine.

Started by [this ticket](https://ats.anexia-it.com/browse/SYSENG-1875)

<!---
Are there any other GitHub issues (open or closed) or pull requests that
should be linked here? Vendor blog posts or documentation?
--->
### Community Note
<!--- Please keep this note for the community --->
* Please vote on this issue by adding a 👍
[reaction](https://blog.github.com/2016-03-10-add-reactions-to-pull-requests-issues-and-comments/)
to the original issue to help the community and maintainers prioritize
this request
  • Loading branch information
koflanx authored Mar 6, 2025
1 parent b26281f commit d20a8b6
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 28 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
Please add your changelog entry under this comment in the correct category (Security, Fixed, Added, Changed, Deprecated, Removed - in this order).
-->

### Added

* Add HTTP client metrics for requests towards the Anexia Engine (#390, @koflanx)

### Fixed

* Handle rate-limiting errors from the Anexia Engine (#382, @nachtjasmin)
Expand Down
87 changes: 83 additions & 4 deletions anx/provider/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ package metrics

import (
"fmt"
anxclient "go.anx.io/go-anxcloud/pkg/client"
"path"
"strings"
"sync"

"github.com/blang/semver/v4"
Expand Down Expand Up @@ -40,6 +43,8 @@ type ProviderMetrics struct {
ReconciliationRetrievedResourcesTotal *k8smetrics.CounterVec
featureState map[string]prometheus.Metric
descriptions []*prometheus.Desc
HttpClientRequestCount *k8smetrics.CounterVec
HttpClientRequestInFlight *k8smetrics.GaugeVec
}

func getCounterOpts(metricName string, helpMessage string) *k8smetrics.CounterOpts {
Expand Down Expand Up @@ -95,13 +100,27 @@ func setReconcileMetrics(providerMetrics *ProviderMetrics) {

providerMetrics.ReconciliationPendingResources = k8smetrics.NewGaugeVec(&k8smetrics.GaugeOpts{
Name: getFQMetricName("reconcile_resources_pending"),
Help: "Gauge of pending creation or deletion operations of resources",
}, []string{"service", "operation"})
Help: "Gauge of pending creation or deletion operations of resources"},
[]string{"service", "operation"},
)

providerMetrics.ReconciliationRetrievedResourcesTotal = k8smetrics.NewCounterVec(&k8smetrics.CounterOpts{
Name: getFQMetricName("reconcile_retrieved_resources_total"),
Help: "Counter of total numbers of resources retrieved grouped by type",
}, []string{"service", "type"})
Help: "Counter of total numbers of resources retrieved grouped by type"},
[]string{"service", "type"},
)

providerMetrics.HttpClientRequestCount = k8smetrics.NewCounterVec(&k8smetrics.CounterOpts{
Name: getFQMetricName("http_client_requests_total"),
Help: "Total amount of requests sent to Anexia Engine"},
[]string{"resource", "method", "status"},
)

providerMetrics.HttpClientRequestInFlight = k8smetrics.NewGaugeVec(&k8smetrics.GaugeOpts{
Name: getFQMetricName("http_client_requests_in_flight"),
Help: "Amount of requests sent to Anexia Engine currently waiting for response"},
[]string{"resource", "method"},
)
}

// NewProviderMetrics returns a prometheus.Collector for Provider Metrics.
Expand Down Expand Up @@ -171,3 +190,63 @@ func (p *ProviderMetrics) FQName() string {
// getFQMetricName builds the fully qualified name of a metric by prefixing metricName with the collector
// name (fqCollectorName), separated by an underscore.
func getFQMetricName(metricName string) string {
	const fqNameFormat = "%s_%s"

	return fmt.Sprintf(fqNameFormat, fqCollectorName, metricName)
}

// MetricReceiver takes the metrics and labels reported by the go-anxcloud client and forwards them to the
// HTTP client request metrics of the provider.
//
// Only the resource, method and status labels are evaluated; a label missing from labels is recorded as an
// empty string. The resource label is passed through filterResourceLabel before it is used. The request
// count is added to HttpClientRequestCount (labelled by resource, method and status), the in-flight value is
// added to HttpClientRequestInFlight (labelled by resource and method). Any other metric is ignored.
func (p *ProviderMetrics) MetricReceiver(metrics map[anxclient.Metric]float64, labels map[anxclient.MetricLabel]string) {
	// Only filter the resource label when the client actually delivered one.
	resource := ""
	if rawResource, found := labels[anxclient.MetricLabelResource]; found {
		resource = filterResourceLabel(rawResource)
	}

	method := labels[anxclient.MetricLabelMethod]
	status := labels[anxclient.MetricLabelStatus]

	if requestCount, found := metrics[anxclient.MetricRequestCount]; found {
		p.HttpClientRequestCount.WithLabelValues(resource, method, status).Add(requestCount)
	}

	// The value is added to the gauge as delivered.
	// NOTE(review): presumably go-anxcloud reports positive and negative deltas here - confirm.
	if inFlight, found := metrics[anxclient.MetricRequestInflight]; found {
		p.HttpClientRequestInFlight.WithLabelValues(resource, method).Add(inFlight)
	}
}

// filterResourceLabel takes the resource label given to the MetricReceiver by go-anxcloud client and tries to
// prevent swamping Prometheus with high-cardinality labels by
//   - removing the /api/ prefix (not high-cardinality relevant, but still nice)
//   - checking if the second to last path element ends with ".json", truncating the last path element in this case
//   - for metrics we do not care for the exact resource but the type of resource
//   - "it takes X seconds to retrieve VM infos"
//   - some resource-specific handling
//
// Only the trailing identifier is removed; the API path in front of it is kept. This way a request for a
// single resource ("vlan/v1/vlan.json/$identifier") gets the same label as a request for the collection
// ("vlan/v1/vlan.json"), and equally named endpoints of different APIs do not collapse into one label.
//
// Having this here is of course not ideal, but it's the least-invasive way to add metrics to go-anxcloud and use
// them here. Once we have the new generic client in go-anxcloud for everything, this will get better as we then
// just generate metrics by Object type and Operation, not by URL.
func filterResourceLabel(resource string) string {
	resource = strings.TrimPrefix(resource, "/api/")

	// path.Dir drops the last path element; if what remains ends with ".json", the dropped element was an
	// identifier and the remaining path names the resource type.
	if identifierRemoved := path.Dir(resource); strings.HasSuffix(identifierRemoved, ".json") {
		resource = identifierRemoved
	}

	// the vsphere info API endpoint is at "vsphere/v1/info.json/$identifier/info" for some reason, so the
	// identifier stripping above does not catch it
	const vsphereInfo = "vsphere/v1/info.json"
	if strings.HasPrefix(resource, vsphereInfo+"/") {
		resource = vsphereInfo
	}

	// the vsphere provisioning API endpoint is at
	// "vsphere/v1/provisioning/vm.json/$location/$template_type/$template", which again prevents the identifier
	// stripping above from catching it
	const vsphereProvisioning = "vsphere/v1/provisioning/vm.json"
	if strings.HasPrefix(resource, vsphereProvisioning+"/") {
		resource = vsphereProvisioning
	}

	return resource
}
65 changes: 41 additions & 24 deletions anx/provider/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ package provider
import (
"fmt"
"io"
"net/http"
"os"
"sync"
"time"

"github.com/anexia-it/k8s-anexia-ccm/anx/provider/metrics"
"github.com/go-logr/logr"
Expand Down Expand Up @@ -58,14 +61,23 @@ func newAnxProvider(config configuration.ProviderConfig) (*anxProvider, error) {

logger := klog.NewKlogr()

legacyClient, err := client.New(client.TokenFromString(config.Token))
httpClient := http.Client{Timeout: 30 * time.Second}

providerMetrics := setupProviderMetrics()
legacyClient, err := client.New(
client.TokenFromString(config.Token),
client.WithMetricReceiver(providerMetrics.MetricReceiver),
client.HTTPClient(&httpClient),
)
if err != nil {
return nil, fmt.Errorf("could not create legacy anexia client. %w", err)
}

genericClient, err := api.NewAPI(
api.WithClientOptions(
client.TokenFromString(config.Token),
client.WithMetricReceiver(providerMetrics.MetricReceiver),
client.HTTPClient(&httpClient),
),
api.WithLogger(logger.WithName("go-anxcloud")),
)
Expand All @@ -74,19 +86,18 @@ func newAnxProvider(config configuration.ProviderConfig) (*anxProvider, error) {
}

return &anxProvider{
API: anexia.NewAPI(legacyClient),
genericClient: genericClient,
legacyClient: legacyClient,
logger: logger.WithName("anx/provider"),
config: &config,
API: anexia.NewAPI(legacyClient),
genericClient: genericClient,
legacyClient: legacyClient,
logger: logger.WithName("anx/provider"),
config: &config,
providerMetrics: providerMetrics,
}, nil
}

func (a *anxProvider) Initialize(builder cloudprovider.ControllerClientBuilder, stop <-chan struct{}) {
a.logger.Info("Anexia provider initializing", "version", Version)

a.setupProviderMetrics()

a.initializeLoadBalancerManager(builder)
a.instanceManager = &instanceManager{Provider: a}

Expand All @@ -108,7 +119,6 @@ func (a *anxProvider) initializeLoadBalancerManager(builder cloudprovider.Contro
k8sClient = c
}
}

config := a.Config()
logger := a.logger.WithName("LoadBalancer")

Expand Down Expand Up @@ -165,21 +175,28 @@ func (a anxProvider) Config() *configuration.ProviderConfig {
return a.config
}

func (a *anxProvider) setupProviderMetrics() {
a.providerMetrics = metrics.NewProviderMetrics("anexia", Version)
legacyregistry.MustRegister(&a.providerMetrics)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationTotalDuration)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationCreateErrorsTotal)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationDeleteRetriesTotal)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationDeleteErrorsTotal)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationCreatedTotal)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationDeletedTotal)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationCreateResources)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationPendingResources)
legacyregistry.MustRegister(a.providerMetrics.ReconciliationRetrievedResourcesTotal)

a.providerMetrics.MarkFeatureDisabled(featureNameLoadBalancer)
a.providerMetrics.MarkFeatureDisabled(featureNameInstancesV2)
var registerOnce sync.Once

// setupProviderMetrics creates the provider metrics for the "anexia" provider in the current Version,
// registers the collector and all of its metric vectors with the legacy registry and calls MarkFeatureDisabled
// for the load balancer and instances v2 features before returning the metrics.
//
// Registration is guarded by registerOnce, so it is executed by the first call only.
// NOTE(review): every call creates a new metrics instance, but only the one created by the first call gets
// registered - presumably instances returned by later calls are never exported. Confirm that this function is
// called more than once in tests only.
func setupProviderMetrics() metrics.ProviderMetrics {
	pm := metrics.NewProviderMetrics("anexia", Version)

	registerAll := func() {
		// the collector itself
		legacyregistry.MustRegister(&pm)

		// reconciliation metrics
		legacyregistry.MustRegister(pm.ReconciliationTotalDuration)
		legacyregistry.MustRegister(pm.ReconciliationCreateErrorsTotal)
		legacyregistry.MustRegister(pm.ReconciliationDeleteRetriesTotal)
		legacyregistry.MustRegister(pm.ReconciliationDeleteErrorsTotal)
		legacyregistry.MustRegister(pm.ReconciliationCreatedTotal)
		legacyregistry.MustRegister(pm.ReconciliationDeletedTotal)
		legacyregistry.MustRegister(pm.ReconciliationCreateResources)
		legacyregistry.MustRegister(pm.ReconciliationPendingResources)
		legacyregistry.MustRegister(pm.ReconciliationRetrievedResourcesTotal)

		// HTTP client metrics for requests towards the Anexia Engine
		legacyregistry.MustRegister(pm.HttpClientRequestCount)
		legacyregistry.MustRegister(pm.HttpClientRequestInFlight)
	}
	registerOnce.Do(registerAll)

	pm.MarkFeatureDisabled(featureNameLoadBalancer)
	pm.MarkFeatureDisabled(featureNameInstancesV2)

	return pm
}

func registerCloudProvider() {
Expand Down

0 comments on commit d20a8b6

Please sign in to comment.