Skip to content

Commit

Permalink
Merge pull request #643 from assafad/metrics-names
Browse files Browse the repository at this point in the history
Change metrics names

Signed-off-by: Felix Matouschek <[email protected]>
  • Loading branch information
0xFelix authored Aug 10, 2023
2 parents e531d6c + 7756c30 commit 7fff3eb
Show file tree
Hide file tree
Showing 16 changed files with 91 additions and 68 deletions.
6 changes: 3 additions & 3 deletions controllers/ssp_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func (r *sspReconciler) setupController(mgr ctrl.Manager) error {
func (r *sspReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) {
defer func() {
if err != nil {
common.SSPOperatorReconcilingProperly.Set(0)
common.SSPOperatorReconcileSucceeded.Set(0)
}
}()
reqLogger := r.log.WithValues("ssp", req.NamespacedName)
Expand Down Expand Up @@ -227,9 +227,9 @@ func (r *sspReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ct
sspRequest.Logger.Info("CR status updated")

if sspRequest.Instance.Status.Phase == lifecycleapi.PhaseDeployed {
common.SSPOperatorReconcilingProperly.Set(1)
common.SSPOperatorReconcileSucceeded.Set(1)
} else {
common.SSPOperatorReconcilingProperly.Set(0)
common.SSPOperatorReconcileSucceeded.Set(0)
}

return ctrl.Result{}, nil
Expand Down
16 changes: 11 additions & 5 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,21 @@ This document aims to help users that are not familiar with metrics exposed by t
All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflect exactly what is being exposed.

## SSP Operator Metrics List
### kubevirt_ssp_common_templates_restored_increase
The increase in the number of common templates restored by the operator back to their original state, over the last hour. Type: Gauge.
### kubevirt_ssp_common_templates_restored_total
The total number of common templates restored by the operator back to their original state. Type: Counter.
### kubevirt_ssp_num_of_operator_reconciling_properly
### kubevirt_ssp_operator_reconcile_succeeded
Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise. Type: Gauge.
### kubevirt_ssp_operator_reconcile_succeeded_aggregated
The total number of ssp-operator pods reconciling with no errors. Type: Gauge.
### kubevirt_ssp_operator_up_total
### kubevirt_ssp_operator_up
The total number of running ssp-operator pods. Type: Gauge.
### kubevirt_ssp_rejected_vms_total
The total number of vms rejected by virt-template-validator. Type: Counter.
### kubevirt_ssp_template_validator_up_total
### kubevirt_ssp_template_validator_rejected_increase
The increase in the number of rejected template validators, over the last hour. Type: Gauge.
### kubevirt_ssp_template_validator_rejected_total
The total number of rejected template validators. Type: Counter.
### kubevirt_ssp_template_validator_up
The total number of running virt-template-validator pods. Type: Gauge.
## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.
4 changes: 2 additions & 2 deletions internal/common/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ type reconcileBuilder struct {
var _ ReconcileBuilder = &reconcileBuilder{}

var (
SSPOperatorReconcilingProperly = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "ssp_operator_reconciling_properly",
SSPOperatorReconcileSucceeded = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "kubevirt_ssp_operator_reconcile_succeeded",
Help: "Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise",
})
)
Expand Down
2 changes: 1 addition & 1 deletion internal/operands/common-templates/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (

var (
CommonTemplatesRestored = prometheus.NewCounter(prometheus.CounterOpts{
Name: "total_restored_common_templates",
Name: "kubevirt_ssp_common_templates_restored_total",
Help: "The total number of common templates restored by the operator back to their original state",
})
)
Expand Down
12 changes: 6 additions & 6 deletions internal/operands/common-templates/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ var _ = Describe("Common-Templates operand", func() {
}

desc, value := getCommonTemplatesRestoredMetric()
Expect(desc).To(ContainSubstring("total_restored_common_templates"))
Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total"))
Expect(value).To(BeZero())
})

Expand Down Expand Up @@ -132,7 +132,7 @@ var _ = Describe("Common-Templates operand", func() {
}

desc, value := getCommonTemplatesRestoredMetric()
Expect(desc).To(ContainSubstring("total_restored_common_templates"))
Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total"))
Expect(value).To(Equal(float64(len(testTemplates))))
})

Expand Down Expand Up @@ -278,7 +278,7 @@ var _ = Describe("Common-Templates operand", func() {
})
})

Context("total_restored_common_templates metric", func() {
Context("kubevirt_ssp_common_templates_restored_total metric", func() {
var template *templatev1.Template
var initialMetricValue float64

Expand All @@ -290,7 +290,7 @@ var _ = Describe("Common-Templates operand", func() {
template.Namespace = namespace

desc, value := getCommonTemplatesRestoredMetric()
Expect(desc).To(ContainSubstring("total_restored_common_templates"))
Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total"))
initialMetricValue = value
})

Expand All @@ -306,7 +306,7 @@ var _ = Describe("Common-Templates operand", func() {
Expect(updatedTpl.Labels[TemplateTypeLabel]).To(Equal(testTemplates[0].Labels[TemplateTypeLabel]))

desc, value := getCommonTemplatesRestoredMetric()
Expect(desc).To(ContainSubstring("total_restored_common_templates"))
Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total"))
Expect(value).To(Equal(initialMetricValue + 1))
})

Expand All @@ -323,7 +323,7 @@ var _ = Describe("Common-Templates operand", func() {
Expect(updatedTpl.Labels[TemplateTypeLabel]).To(Equal(testTemplates[0].Labels[TemplateTypeLabel]))

desc, value := getCommonTemplatesRestoredMetric()
Expect(desc).To(ContainSubstring("total_restored_common_templates"))
Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total"))
Expect(value).To(Equal(initialMetricValue))
})
})
Expand Down
38 changes: 19 additions & 19 deletions internal/operands/metrics/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ const (
)

const (
Total_restored_common_templates_increase_query = "sum(increase(total_restored_common_templates{pod=~'ssp-operator.*'}[1h]))"
Total_rejected_vms_increase_query = "sum(increase(total_rejected_vms{pod=~'virt-template-validator.*'}[1h]))"
CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))"
TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))"
)

// RecordRulesDesc represent SSP Operator Prometheus Record Rules
Expand All @@ -47,34 +47,34 @@ type RecordRulesDesc struct {
// RecordRulesDescList lists all SSP Operator Prometheus Record Rules
var RecordRulesDescList = []RecordRulesDesc{
{
Name: "kubevirt_ssp_operator_up_total",
Name: "kubevirt_ssp_operator_up",
Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"),
Description: "The total number of running ssp-operator pods",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_template_validator_up_total",
Name: "kubevirt_ssp_template_validator_up",
Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"),
Description: "The total number of running virt-template-validator pods",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_num_of_operator_reconciling_properly",
Expr: intstr.FromString("sum(ssp_operator_reconciling_properly)"),
Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated",
Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"),
Description: "The total number of ssp-operator pods reconciling with no errors",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_rejected_vms_total",
Expr: intstr.FromString(Total_rejected_vms_increase_query + " OR on() vector(0)"),
Description: "The total number of vms rejected by virt-template-validator",
Type: "Counter",
Name: "kubevirt_ssp_template_validator_rejected_increase",
Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of rejected template validators, over the last hour",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_common_templates_restored_total",
Expr: intstr.FromString(Total_restored_common_templates_increase_query + " OR on() vector(0)"),
Description: "The total number of common templates restored by the operator back to their original state",
Type: "Counter",
Name: "kubevirt_ssp_common_templates_restored_increase",
Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour",
Type: "Gauge",
},
}

Expand All @@ -91,7 +91,7 @@ func getAlertRules() ([]promv1.Rule, error) {
},
{
Alert: "SSPDown",
Expr: intstr.FromString("kubevirt_ssp_operator_up_total == 0"),
Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"),
For: "5m",
Annotations: map[string]string{
"summary": "All SSP operator pods are down.",
Expand All @@ -106,7 +106,7 @@ func getAlertRules() ([]promv1.Rule, error) {
},
{
Alert: "SSPTemplateValidatorDown",
Expr: intstr.FromString("kubevirt_ssp_template_validator_up_total == 0"),
Expr: intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
For: "5m",
Annotations: map[string]string{
"summary": "All Template Validator pods are down.",
Expand All @@ -121,7 +121,7 @@ func getAlertRules() ([]promv1.Rule, error) {
},
{
Alert: "SSPFailingToReconcile",
Expr: intstr.FromString("(kubevirt_ssp_num_of_operator_reconciling_properly == 0) and (kubevirt_ssp_operator_up_total > 0)"),
Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
For: "5m",
Annotations: map[string]string{
"summary": "The ssp-operator pod is up but failing to reconcile",
Expand All @@ -136,7 +136,7 @@ func getAlertRules() ([]promv1.Rule, error) {
},
{
Alert: "SSPHighRateRejectedVms",
Expr: intstr.FromString("kubevirt_ssp_rejected_vms_total > 5"),
Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
For: "5m",
Annotations: map[string]string{
"summary": "High rate of rejected Vms",
Expand All @@ -151,7 +151,7 @@ func getAlertRules() ([]promv1.Rule, error) {
},
{
Alert: "SSPCommonTemplatesModificationReverted",
Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_total > 0"),
Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
For: "0m",
Annotations: map[string]string{
"summary": "Common Templates manual modifications were reverted by the operator",
Expand Down
8 changes: 4 additions & 4 deletions internal/template-validator/webhooks/hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ import (
)

var (
vmsRejected = promauto.NewCounter(prometheus.CounterOpts{
Name: "total_rejected_vms",
Help: "The total number of rejected vms",
templateValidatorRejected = promauto.NewCounter(prometheus.CounterOpts{
Name: "kubevirt_ssp_template_validator_rejected_total",
Help: "The total number of rejected template validators",
})
)

Expand Down Expand Up @@ -102,7 +102,7 @@ func (w *webhooks) admitVm(ar *admissionv1.AdmissionReview) *admissionv1.Admissi

causes := ValidateVm(rules, vm)
if len(causes) > 0 {
vmsRejected.Inc()
templateValidatorRejected.Inc()
return ToAdmissionResponse(causes)
}

Expand Down
2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ const (
func runPrometheusServer(metricsAddr string, tlsOptions common.SSPTLSOptions) error {
setupLog.Info("Starting Prometheus metrics endpoint server with TLS")
metrics.Registry.MustRegister(common_templates.CommonTemplatesRestored)
metrics.Registry.MustRegister(common.SSPOperatorReconcilingProperly)
metrics.Registry.MustRegister(common.SSPOperatorReconcileSucceeded)
handler := promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{})
mux := http.NewServeMux()
mux.Handle("/metrics", handler)
Expand Down
4 changes: 2 additions & 2 deletions tests/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ var _ = Describe("Metrics", func() {
template = &getTemplates().Items[0]
})

It("[test_id:TODO]should increment total_restored_common_templates during normal reconcile", func() {
It("[test_id:TODO]should increment kubevirt_ssp_common_templates_restored_total during normal reconcile", func() {
skipIfUpgradeLane()

restoredCount := totalRestoredTemplatesCount()
Expand All @@ -179,7 +179,7 @@ var _ = Describe("Metrics", func() {
}, 5*time.Minute, 10*time.Second).Should(Equal(restoredCount + 1))
})

It("[test_id:TODO]should not increment total_restored_common_templates during upgrades", func() {
It("[test_id:TODO]should not increment kubevirt_ssp_common_templates_restored_total during upgrades", func() {
restoredCount := totalRestoredTemplatesCount()

template.Labels[common_templates.TemplateTypeLabel] = "test"
Expand Down
6 changes: 3 additions & 3 deletions tests/metrics_test_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ import (
)

var regexpForMetrics = map[string]*regexp.Regexp{
"total_rejected_vms": regexp.MustCompile(`total_rejected_vms ([0-9]+)`),
"total_restored_common_templates": regexp.MustCompile(`total_restored_common_templates ([0-9]+)`),
"ssp_operator_reconciling_properly": regexp.MustCompile(`ssp_operator_reconciling_properly ([0-9]+)`),
"kubevirt_ssp_template_validator_rejected_total": regexp.MustCompile(`kubevirt_ssp_template_validator_rejected_total ([0-9]+)`),
"kubevirt_ssp_common_templates_restored_total": regexp.MustCompile(`kubevirt_ssp_common_templates_restored_total ([0-9]+)`),
"kubevirt_ssp_operator_reconcile_succeeded": regexp.MustCompile(`kubevirt_ssp_operator_reconcile_succeeded ([0-9]+)`),
}

func intMetricValue(metricName string, metricsPort uint16, pod *v1.Pod) int {
Expand Down
8 changes: 4 additions & 4 deletions tests/misc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ var _ = Describe("Observed generation", func() {
})
})

var _ = Describe("SSPOperatorReconcilingProperly metric", func() {
var _ = Describe("SSPOperatorReconcileSucceeded metric", func() {
var (
deploymentRes testResource
finalizerName = "ssp.kubernetes.io/temp-protection"
Expand All @@ -117,7 +117,7 @@ var _ = Describe("SSPOperatorReconcilingProperly metric", func() {
waitUntilDeployed()
})

It("[test_id:7369] should set SSPOperatorReconcilingProperly metrics to 0 on failing to reconcile", func() {
It("[test_id:7369] should set SSPOperatorReconcileSucceeded metrics to 0 on failing to reconcile", func() {
// add a finalizer to the validator deployment, so that it can't be deleted
addFinalizer(deploymentRes, finalizerName)
// send a request to delete the validator deployment
Expand Down Expand Up @@ -166,9 +166,9 @@ func validateSspIsFailingToReconcileMetric() {
Replicas: &newValidatorReplicas,
}
})
// the reconcile cycle should now be failing, so the ssp_operator_reconciling_properly metric should be 0
// the reconcile cycle should now be failing, so the kubevirt_ssp_operator_reconcile_succeeded metric should be 0
Eventually(func() int {
return sspOperatorReconcilingProperlyCount()
return sspOperatorReconcileSucceededCount()
}, env.ShortTimeout(), time.Second).Should(Equal(0))
}

Expand Down
6 changes: 3 additions & 3 deletions tests/monitoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ var _ = Describe("Prometheus Alerts", func() {
})
It("[test_id:8363] Should fire SSPCommonTemplatesModificationReverted", func() {
// we have to wait for prometheus to pick up the series before we increase it.
waitForSeriesToBeDetected(metrics.Total_restored_common_templates_increase_query)
waitForSeriesToBeDetected(metrics.CommonTemplatesRestoredIncreaseQuery)
expectTemplateUpdateToIncreaseTotalRestoredTemplatesCount(testTemplate)
waitForAlertToActivate("SSPCommonTemplatesModificationReverted")
})
Expand All @@ -66,7 +66,7 @@ var _ = Describe("Prometheus Alerts", func() {
deploymentRes = testDeploymentResource()
})

It("[test_id:8364] should set SSPOperatorReconcilingProperly metrics to 0 on failing to reconcile", func() {
It("[test_id:8364] should set SSPOperatorReconcileSucceeded metrics to 0 on failing to reconcile", func() {
// add a finalizer to the validator deployment, so that it can't be deleted
addFinalizer(deploymentRes, finalizerName)
// send a request to delete the validator deployment
Expand Down Expand Up @@ -108,7 +108,7 @@ var _ = Describe("Prometheus Alerts", func() {
})

It("[test_id:8377] Should fire SSPHighRateRejectedVms", func() {
waitForSeriesToBeDetected(metrics.Total_rejected_vms_increase_query)
waitForSeriesToBeDetected(metrics.TemplateValidatorRejectedIncreaseQuery)
Expect(apiClient.Create(ctx, template)).ToNot(HaveOccurred(), "Failed to create template: %s", template.Name)
for range [6]int{} {
time.Sleep(time.Second * 5)
Expand Down
6 changes: 3 additions & 3 deletions tests/tests_common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,18 +113,18 @@ func expectRecreateAfterDelete(res *testResource) {
Expect(err).ToNot(HaveOccurred())
}

func sspOperatorReconcilingProperlyCount() (sum int) {
func sspOperatorReconcileSucceededCount() (sum int) {
operatorPods, operatorMetricsPort := operatorPodsWithMetricsPort()
for _, sspOperator := range operatorPods {
sum += intMetricValue("ssp_operator_reconciling_properly", operatorMetricsPort, &sspOperator)
sum += intMetricValue("kubevirt_ssp_operator_reconcile_succeeded", operatorMetricsPort, &sspOperator)
}
return
}

func totalRestoredTemplatesCount() (sum int) {
operatorPods, operatorMetricsPort := operatorPodsWithMetricsPort()
for _, sspOperator := range operatorPods {
sum += intMetricValue("total_restored_common_templates", operatorMetricsPort, &sspOperator)
sum += intMetricValue("kubevirt_ssp_common_templates_restored_total", operatorMetricsPort, &sspOperator)
}
return
}
Expand Down
2 changes: 1 addition & 1 deletion tests/validator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -943,7 +943,7 @@ func totalRejectedVmsMetricsValue() (sum int) {
Expect(err).ToNot(HaveOccurred(), "Could not find the validator pods")
Expect(pods.Items).ToNot(BeEmpty())
for _, validatorPod := range pods.Items {
sum += intMetricValue("total_rejected_vms", validator.MetricsPort, &validatorPod)
sum += intMetricValue("kubevirt_ssp_template_validator_rejected_total", validator.MetricsPort, &validatorPod)
}
return
}
Expand Down
Loading

0 comments on commit 7fff3eb

Please sign in to comment.