From 1d32d3c59eac00dc02d09b5803eed8249d720a48 Mon Sep 17 00:00:00 2001 From: Sergio Ballesteros Date: Mon, 28 Nov 2022 16:15:51 +0100 Subject: [PATCH 1/3] Refactor log levels to give more visibility to healer logs --- operator/redisfailover/service/heal.go | 33 +++++++++++++------------- service/k8s/configmap.go | 4 ++-- service/k8s/deployment.go | 4 ++-- service/k8s/pod.go | 4 ++-- service/k8s/poddisruptionbudget.go | 4 ++-- service/k8s/rbac.go | 12 +++++----- service/k8s/service.go | 4 ++-- service/k8s/statefulset.go | 4 ++-- 8 files changed, 35 insertions(+), 34 deletions(-) diff --git a/operator/redisfailover/service/heal.go b/operator/redisfailover/service/heal.go index 934915d9d..22a3ffd9f 100644 --- a/operator/redisfailover/service/heal.go +++ b/operator/redisfailover/service/heal.go @@ -35,6 +35,7 @@ type RedisFailoverHealer struct { // NewRedisFailoverHealer creates an object of the RedisFailoverChecker struct func NewRedisFailoverHealer(k8sService k8s.Services, redisClient redis.Client, logger log.Logger) *RedisFailoverHealer { + logger = logger.With("service", "redis.healer") return &RedisFailoverHealer{ k8sService: k8sService, redisClient: redisClient, @@ -109,9 +110,9 @@ func (r *RedisFailoverHealer) SetOldestAsMaster(rf *redisfailoverv1.RedisFailove for _, pod := range ssp.Items { if newMasterIP == "" { newMasterIP = pod.Status.PodIP - r.logger.Debugf("New master is %s with ip %s", pod.Name, newMasterIP) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("New master is %s with ip %s", pod.Name, newMasterIP) if err := r.redisClient.MakeMaster(newMasterIP, port, password); err != nil { - r.logger.Errorf("Make new master failed, master ip: %s, error: %v", pod.Status.PodIP, err) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make new master failed, master ip: %s, error: %v", pod.Status.PodIP, err) continue } @@ -122,9 +123,9 @@ func (r *RedisFailoverHealer) SetOldestAsMaster(rf *redisfailoverv1.RedisFailove newMasterIP = pod.Status.PodIP } else { - r.logger.Debugf("Making pod %s slave of %s", pod.Name, newMasterIP) + r.logger.Infof("Making pod %s slave of %s", pod.Name, newMasterIP) if err := r.redisClient.MakeSlaveOfWithPort(pod.Status.PodIP, newMasterIP, port, password); err != nil { - r.logger.Errorf("Make slave failed, slave pod ip: %s, master ip: %s, error: %v", pod.Status.PodIP, newMasterIP, err) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make slave failed, slave pod ip: %s, master ip: %s, error: %v", pod.Status.PodIP, newMasterIP, err) } err = r.setSlaveLabelIfNecessary(rf.Namespace, pod) @@ -151,9 +152,9 @@ func (r *RedisFailoverHealer) SetMasterOnAll(masterIP string, rf *redisfailoverv port := getRedisPort(rf.Spec.Redis.Port) for _, pod := range ssp.Items { if pod.Status.PodIP == masterIP { - r.logger.Debugf("Ensure pod %s is master", pod.Name) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Ensure pod %s is master", pod.Name) if err := r.redisClient.MakeMaster(masterIP, port, password); err != nil { - r.logger.Errorf("Make master failed, master ip: %s, error: %v", masterIP, err) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make master failed, master ip: %s, error: %v", masterIP, err) return err } @@ -162,9 +163,9 @@ func (r *RedisFailoverHealer) SetMasterOnAll(masterIP string, rf *redisfailoverv return err } } else { - r.logger.Debugf("Making pod %s slave of %s", pod.Name, masterIP) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Making pod %s slave of %s", pod.Name, masterIP) if err := r.redisClient.MakeSlaveOfWithPort(pod.Status.PodIP, masterIP, port, password); err != nil { - r.logger.Errorf("Make slave failed, slave ip: %s, master ip: %s, error: %v", pod.Status.PodIP, masterIP, err) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make slave failed, slave ip: %s, master ip: %s, error: %v", pod.Status.PodIP, masterIP, err) return err } @@ -191,7 +192,7 @@ func (r *RedisFailoverHealer) SetExternalMasterOnAll(masterIP, masterPort string } for _, pod := range ssp.Items { - r.logger.Debugf("Making pod %s slave of %s:%s", pod.Name, masterIP, masterPort) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Making pod %s slave of %s:%s", pod.Name, masterIP, masterPort) if err := r.redisClient.MakeSlaveOfWithPort(pod.Status.PodIP, masterIP, masterPort, password); err != nil { return err } @@ -202,7 +203,7 @@ func (r *RedisFailoverHealer) SetExternalMasterOnAll(masterIP, masterPort string // NewSentinelMonitor changes the master that Sentinel has to monitor func (r *RedisFailoverHealer) NewSentinelMonitor(ip string, monitor string, rf *redisfailoverv1.RedisFailover) error { - r.logger.Debug("Sentinel is not monitoring the correct master, changing...") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Sentinel is not monitoring the correct master, changing...") quorum := strconv.Itoa(int(getQuorum(rf))) password, err := k8s.GetRedisPassword(r.k8sService, rf) @@ -216,7 +217,7 @@ func (r *RedisFailoverHealer) NewSentinelMonitor(ip string, monitor string, rf * // NewSentinelMonitorWithPort changes the master that Sentinel has to monitor by the provided IP and Port func (r *RedisFailoverHealer) NewSentinelMonitorWithPort(ip string, monitor string, monitorPort string, rf *redisfailoverv1.RedisFailover) error { - r.logger.Debug("Sentinel is not monitoring the correct master, changing...") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Sentinel is not monitoring the correct master, changing...") quorum := strconv.Itoa(int(getQuorum(rf))) password, err := k8s.GetRedisPassword(r.k8sService, rf) @@ -229,19 +230,19 @@ func (r *RedisFailoverHealer) NewSentinelMonitorWithPort(ip string, monitor stri // RestoreSentinel clear the number of sentinels on memory func (r *RedisFailoverHealer) RestoreSentinel(ip string) error { - r.logger.Debugf("Restoring sentinel %s...", ip) + r.logger.Infof("Restoring sentinel %s...", ip) return r.redisClient.ResetSentinel(ip) } // SetSentinelCustomConfig will call sentinel to set the configuration given in config func (r *RedisFailoverHealer) SetSentinelCustomConfig(ip string, rf *redisfailoverv1.RedisFailover) error { - r.logger.Debugf("Setting the custom config on sentinel %s...", ip) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Setting the custom config on sentinel %s...", ip) return r.redisClient.SetCustomSentinelConfig(ip, rf.Spec.Sentinel.CustomConfig) } // SetRedisCustomConfig will call redis to set the configuration given in config func (r *RedisFailoverHealer) SetRedisCustomConfig(ip string, rf *redisfailoverv1.RedisFailover) error { - r.logger.Debugf("Setting the custom config on redis %s...", ip) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Setting the custom config on redis %s...", ip) password, err := k8s.GetRedisPassword(r.k8sService, rf) if err != nil { @@ -252,8 +253,8 @@ func (r *RedisFailoverHealer) SetRedisCustomConfig(ip string, rf *redisfailoverv return r.redisClient.SetCustomRedisConfig(ip, port, rf.Spec.Redis.CustomConfig, password) } -//DeletePod delete a failing pod so kubernetes relaunch it again +// DeletePod delete a failing pod so kubernetes relaunch it again func (r *RedisFailoverHealer) DeletePod(podName string, rFailover *redisfailoverv1.RedisFailover) error { - r.logger.Debugf("Deleting pods %s...", podName) + r.logger.WithField("redisfailover", rFailover.ObjectMeta.Name).WithField("namespace", rFailover.ObjectMeta.Namespace).Infof("Deleting pods %s...", podName) return r.k8sService.DeletePod(rFailover.Namespace, podName) } diff --git a/service/k8s/configmap.go b/service/k8s/configmap.go index 3b9d77a8f..1b6fc1424 100644 --- a/service/k8s/configmap.go +++ b/service/k8s/configmap.go @@ -54,7 +54,7 @@ func (p *ConfigMapService) CreateConfigMap(namespace string, configMap *corev1.C if err != nil { return err } - p.logger.WithField("namespace", namespace).WithField("configMap", configMap.Name).Infof("configMap created") + p.logger.WithField("namespace", namespace).WithField("configMap", configMap.Name).Debugf("configMap created") return nil } func (p *ConfigMapService) UpdateConfigMap(namespace string, configMap *corev1.ConfigMap) error { @@ -63,7 +63,7 @@ func (p *ConfigMapService) UpdateConfigMap(namespace string, configMap *corev1.C if err != nil { return err } - p.logger.WithField("namespace", namespace).WithField("configMap", configMap.Name).Infof("configMap updated") + p.logger.WithField("namespace", namespace).WithField("configMap", configMap.Name).Debugf("configMap updated") return nil } func (p *ConfigMapService) CreateOrUpdateConfigMap(namespace string, configMap *corev1.ConfigMap) error { diff --git a/service/k8s/deployment.go b/service/k8s/deployment.go index b73cc984f..46d63bb93 100644 --- a/service/k8s/deployment.go +++ b/service/k8s/deployment.go @@ -75,7 +75,7 @@ func (d *DeploymentService) CreateDeployment(namespace string, deployment *appsv if err != nil { return err } - d.logger.WithField("namespace", namespace).WithField("deployment", deployment.ObjectMeta.Name).Infof("deployment created") + d.logger.WithField("namespace", namespace).WithField("deployment", deployment.ObjectMeta.Name).Debugf("deployment created") return err } @@ -86,7 +86,7 @@ func (d *DeploymentService) UpdateDeployment(namespace string, deployment *appsv if err != nil { return err } - d.logger.WithField("namespace", namespace).WithField("deployment", deployment.ObjectMeta.Name).Infof("deployment updated") + d.logger.WithField("namespace", namespace).WithField("deployment", deployment.ObjectMeta.Name).Debugf("deployment updated") return err } diff --git a/service/k8s/pod.go b/service/k8s/pod.go index 5a6f9c8e2..0583302ce 100644 --- a/service/k8s/pod.go +++ b/service/k8s/pod.go @@ -58,7 +58,7 @@ func (p *PodService) CreatePod(namespace string, pod *corev1.Pod) error { if err != nil { return err } - p.logger.WithField("namespace", namespace).WithField("pod", pod.Name).Infof("pod created") + p.logger.WithField("namespace", namespace).WithField("pod", pod.Name).Debugf("pod created") return nil } func (p *PodService) UpdatePod(namespace string, pod *corev1.Pod) error { @@ -67,7 +67,7 @@ func (p *PodService) UpdatePod(namespace string, pod *corev1.Pod) error { if err != nil { return err } - p.logger.WithField("namespace", namespace).WithField("pod", pod.Name).Infof("pod updated") + p.logger.WithField("namespace", namespace).WithField("pod", pod.Name).Debugf("pod updated") return nil } func (p *PodService) CreateOrUpdatePod(namespace string, pod *corev1.Pod) error { diff --git a/service/k8s/poddisruptionbudget.go b/service/k8s/poddisruptionbudget.go index df24685df..48350bc43 100644 --- a/service/k8s/poddisruptionbudget.go +++ b/service/k8s/poddisruptionbudget.go @@ -53,7 +53,7 @@ func (p *PodDisruptionBudgetService) CreatePodDisruptionBudget(namespace string, if err != nil { return err } - p.logger.WithField("namespace", namespace).WithField("podDisruptionBudget", podDisruptionBudget.Name).Infof("podDisruptionBudget created") + p.logger.WithField("namespace", namespace).WithField("podDisruptionBudget", podDisruptionBudget.Name).Debugf("podDisruptionBudget created") return nil } @@ -63,7 +63,7 @@ func (p *PodDisruptionBudgetService) UpdatePodDisruptionBudget(namespace string, if err != nil { return err } - p.logger.WithField("namespace", namespace).WithField("podDisruptionBudget", podDisruptionBudget.Name).Infof("podDisruptionBudget updated") + p.logger.WithField("namespace", namespace).WithField("podDisruptionBudget", podDisruptionBudget.Name).Debugf("podDisruptionBudget updated") return nil } diff --git a/service/k8s/rbac.go b/service/k8s/rbac.go index 2ca90bb7e..a5534b445 100644 --- a/service/k8s/rbac.go +++ b/service/k8s/rbac.go @@ -66,7 +66,7 @@ func (r *RBACService) DeleteRole(namespace, name string) error { if err != nil { return err } - r.logger.WithField("namespace", namespace).WithField("role", name).Infof("role deleted") + r.logger.WithField("namespace", namespace).WithField("role", name).Debugf("role deleted") return nil } @@ -76,7 +76,7 @@ func (r *RBACService) CreateRole(namespace string, role *rbacv1.Role) error { if err != nil { return err } - r.logger.WithField("namespace", namespace).WithField("role", role.Name).Infof("role created") + r.logger.WithField("namespace", namespace).WithField("role", role.Name).Debugf("role created") return nil } @@ -86,7 +86,7 @@ func (s *RBACService) UpdateRole(namespace string, role *rbacv1.Role) error { if err != nil { return err } - s.logger.WithField("namespace", namespace).WithField("role", role.ObjectMeta.Name).Infof("role updated") + s.logger.WithField("namespace", namespace).WithField("role", role.ObjectMeta.Name).Debugf("role updated") return err } @@ -114,7 +114,7 @@ func (r *RBACService) DeleteRoleBinding(namespace, name string) error { if err != nil { return err } - r.logger.WithField("namespace", namespace).WithField("binding", name).Infof("role binding deleted") + r.logger.WithField("namespace", namespace).WithField("binding", name).Debugf("role binding deleted") return nil } @@ -124,7 +124,7 @@ func (r *RBACService) CreateRoleBinding(namespace string, binding *rbacv1.RoleBi if err != nil { return err } - r.logger.WithField("namespace", namespace).WithField("binding", binding.Name).Infof("role binding created") + r.logger.WithField("namespace", namespace).WithField("binding", binding.Name).Debugf("role binding created") return nil } @@ -134,7 +134,7 @@ func (r *RBACService) UpdateRoleBinding(namespace string, binding *rbacv1.RoleBi if err != nil { return err } - r.logger.WithField("namespace", namespace).WithField("binding", binding.Name).Infof("role binding updated") + r.logger.WithField("namespace", namespace).WithField("binding", binding.Name).Debugf("role binding updated") return nil } diff --git a/service/k8s/service.go b/service/k8s/service.go index d907b9f85..8cd49ed47 100644 --- a/service/k8s/service.go +++ b/service/k8s/service.go @@ -56,7 +56,7 @@ func (s *ServiceService) CreateService(namespace string, service *corev1.Service if err != nil { return err } - s.logger.WithField("namespace", namespace).WithField("serviceName", service.Name).Infof("service created") + s.logger.WithField("namespace", namespace).WithField("serviceName", service.Name).Debugf("service created") return nil } @@ -77,7 +77,7 @@ func (s *ServiceService) UpdateService(namespace string, service *corev1.Service if err != nil { return err } - s.logger.WithField("namespace", namespace).WithField("serviceName", service.Name).Infof("service updated") + s.logger.WithField("namespace", namespace).WithField("serviceName", service.Name).Debugf("service updated") return nil } func (s *ServiceService) CreateOrUpdateService(namespace string, service *corev1.Service) error { diff --git a/service/k8s/statefulset.go b/service/k8s/statefulset.go index 8de666ba4..2eebc52c8 100644 --- a/service/k8s/statefulset.go +++ b/service/k8s/statefulset.go @@ -76,7 +76,7 @@ func (s *StatefulSetService) CreateStatefulSet(namespace string, statefulSet *ap if err != nil { return err } - s.logger.WithField("namespace", namespace).WithField("statefulSet", statefulSet.ObjectMeta.Name).Infof("statefulSet created") + s.logger.WithField("namespace", namespace).WithField("statefulSet", statefulSet.ObjectMeta.Name).Debugf("statefulSet created") return err } @@ -87,7 +87,7 @@ func (s *StatefulSetService) UpdateStatefulSet(namespace string, statefulSet *ap if err != nil { return err } - s.logger.WithField("namespace", namespace).WithField("statefulSet", statefulSet.ObjectMeta.Name).Infof("statefulSet updated") + s.logger.WithField("namespace", namespace).WithField("statefulSet", statefulSet.ObjectMeta.Name).Debugf("statefulSet updated") return err } From 5f76b6ece2d6aaf1377948ad66bab54c5bd52732 Mon Sep 17 00:00:00 2001 From: Sergio Ballesteros Date: Mon, 28 Nov 2022 16:51:47 +0100 Subject: [PATCH 2/3] Refactor logs for domain logic and more context --- operator/redisfailover/checker.go | 46 ++++++++++++------------- operator/redisfailover/service/check.go | 6 ++-- operator/redisfailover/service/heal.go | 6 ++-- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index b51755a8f..78690e4aa 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -102,17 +102,15 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e // Sentinel has not death nodes // Sentinel knows the correct slave number - err := r.rfChecker.CheckRedisNumber(rf) - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) - if err != nil { - r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") + if !r.rfChecker.IsRedisRunning(rf) { + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") return nil } - err = r.rfChecker.CheckSentinelNumber(rf) - setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) - if err != nil { - r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") + if !r.rfChecker.IsSentinelRunning(rf) { + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") return nil } @@ -122,7 +120,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } switch nMasters { case 0: - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("No masters detected")) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected")) redisesIP, err := r.rfChecker.GetRedisesIPs(rf) if err != nil { return err @@ -138,21 +136,21 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e return err2 } if minTime > timeToPrepare { - r.logger.Debugf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds()) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds()) // We can consider there's an error if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil { return err2 } } else { // We'll wait until failover is done - r.logger.Debug("No master found, wait until failover") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("No master found, wait until failover") return nil } case 1: setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil) default: - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("Multiple masters detected")) - return errors.New("More than one master, fix manually") + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("multiple masters detected")) + return errors.New("more than one master, fix manually") } master, err := r.rfChecker.GetMasterIP(rf) @@ -160,12 +158,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e return err } - err2 := r.rfChecker.CheckAllSlavesFromMaster(master, rf) + err = r.rfChecker.CheckAllSlavesFromMaster(master, rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE, err) - if err2 != nil { - r.logger.Debug("Not all slaves have the same master") - if err3 := r.rfHealer.SetMasterOnAll(master, rf); err3 != nil { - return err3 + if err != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Slave not associated to master: %s", err.Error()) + if err = r.rfHealer.SetMasterOnAll(master, rf); err != nil { + return err } } @@ -190,7 +188,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e err = r.rfChecker.CheckSentinelMonitor(sip, master, port) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err) if err != nil { - r.logger.Debug("Sentinel is not monitoring the correct master") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error()) if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil { return err } @@ -203,7 +201,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red err := r.rfChecker.CheckRedisNumber(rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) if err != nil { - r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") return nil } @@ -226,7 +224,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red err = r.rfChecker.CheckSentinelNumber(rf) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) if err != nil { - r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") return nil } @@ -238,7 +236,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red err = r.rfChecker.CheckSentinelMonitor(sip, bootstrapSettings.Host, bootstrapSettings.Port) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err) if err != nil { - r.logger.Debug("Sentinel is not monitoring the correct master") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error()) if err := r.rfHealer.NewSentinelMonitorWithPort(sip, bootstrapSettings.Host, bootstrapSettings.Port, rf); err != nil { return err } @@ -267,7 +265,7 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFa err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, sip, err) if err != nil { - r.logger.Debug("Sentinel has more sentinel in memory than spected") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of sentinels in memory. resetting", sip) if err := r.rfHealer.RestoreSentinel(sip); err != nil { return err } @@ -278,7 +276,7 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFa err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, sip, err) if err != nil { - r.logger.Debug("Sentinel has more slaves in memory than spected") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of expected slaves in memory. resetting", sip) if err := r.rfHealer.RestoreSentinel(sip); err != nil { return err } diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index ad86e858a..63946a255 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -168,7 +168,7 @@ func (r *RedisFailoverChecker) CheckSentinelMonitor(sentinel string, monitor ... return err } if actualMonitorIP != monitorIP || (monitorPort != "" && monitorPort != actualMonitorPort) { - return errors.New("the monitor on the sentinel config does not match with the expected one") + return fmt.Errorf("sentinel monitoring %s:%s instead %s:%s", actualMonitorIP, actualMonitorPort, monitorIP, monitorPort) } return nil } @@ -209,11 +209,13 @@ func (r *RedisFailoverChecker) GetNumberMasters(rf *redisfailoverv1.RedisFailove nMasters := 0 rips, err := r.GetRedisesIPs(rf) if err != nil { + r.logger.Errorf(err.Error()) return nMasters, err } password, err := k8s.GetRedisPassword(r.k8sService, rf) if err != nil { + r.logger.Errorf("Error getting password: %s", err.Error()) return nMasters, err } @@ -274,7 +276,7 @@ func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisF } start := redisNode.Status.StartTime.Round(time.Second) alive := time.Since(start) - r.logger.Debugf("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) + r.logger.Infof("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) if alive < minTime { minTime = alive } diff --git a/operator/redisfailover/service/heal.go b/operator/redisfailover/service/heal.go index 22a3ffd9f..7c87da427 100644 --- a/operator/redisfailover/service/heal.go +++ b/operator/redisfailover/service/heal.go @@ -203,7 +203,6 @@ func (r *RedisFailoverHealer) SetExternalMasterOnAll(masterIP, masterPort string // NewSentinelMonitor changes the master that Sentinel has to monitor func (r *RedisFailoverHealer) NewSentinelMonitor(ip string, monitor string, rf *redisfailoverv1.RedisFailover) error { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Sentinel is not monitoring the correct master, changing...") quorum := strconv.Itoa(int(getQuorum(rf))) password, err := k8s.GetRedisPassword(r.k8sService, rf) @@ -217,7 +216,6 @@ func (r *RedisFailoverHealer) NewSentinelMonitor(ip string, monitor string, rf * // NewSentinelMonitorWithPort changes the master that Sentinel has to monitor by the provided IP and Port func (r *RedisFailoverHealer) NewSentinelMonitorWithPort(ip string, monitor string, monitorPort string, rf *redisfailoverv1.RedisFailover) error { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Sentinel is not monitoring the correct master, changing...") quorum := strconv.Itoa(int(getQuorum(rf))) password, err := k8s.GetRedisPassword(r.k8sService, rf) @@ -230,13 +228,13 @@ func (r *RedisFailoverHealer) NewSentinelMonitorWithPort(ip string, monitor stri // RestoreSentinel clear the number of sentinels on memory func (r *RedisFailoverHealer) RestoreSentinel(ip string) error { - r.logger.Infof("Restoring sentinel %s...", ip) + r.logger.Debugf("Restoring sentinel %s", ip) return r.redisClient.ResetSentinel(ip) } // SetSentinelCustomConfig will call sentinel to set the configuration given in config func (r *RedisFailoverHealer) SetSentinelCustomConfig(ip string, rf *redisfailoverv1.RedisFailover) error { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Setting the custom config on sentinel %s...", ip) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Setting the custom config on sentinel %s...", ip) return r.redisClient.SetCustomSentinelConfig(ip, rf.Spec.Sentinel.CustomConfig) } From 1e46182984c0dee22b95382acd7eb3c1c7cac6d8 Mon Sep 17 00:00:00 2001 From: Sergio Ballesteros Date: Thu, 1 Dec 2022 09:27:27 +0100 Subject: [PATCH 3/3] Add IsRedisRunning checks to ensure that pods are up and running --- .../service/RedisFailoverCheck.go | 42 +++++++++ operator/redisfailover/checker.go | 15 ++-- operator/redisfailover/checker_test.go | 6 +- operator/redisfailover/service/check.go | 29 +++++++ operator/redisfailover/service/check_test.go | 87 +++++++++++++++++++ 5 files changed, 168 insertions(+), 11 deletions(-) diff --git a/mocks/operator/redisfailover/service/RedisFailoverCheck.go b/mocks/operator/redisfailover/service/RedisFailoverCheck.go index 80bb06610..c21c37f1a 100644 --- a/mocks/operator/redisfailover/service/RedisFailoverCheck.go +++ b/mocks/operator/redisfailover/service/RedisFailoverCheck.go @@ -322,6 +322,48 @@ func (_m *RedisFailoverCheck) GetStatefulSetUpdateRevision(rFailover *v1.RedisFa return r0, r1 } +// IsClusterRunning provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) IsClusterRunning(rFailover *v1.RedisFailover) bool { + ret := _m.Called(rFailover) + + var r0 bool + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) bool); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsRedisRunning provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) IsRedisRunning(rFailover *v1.RedisFailover) bool { + ret := _m.Called(rFailover) + + var r0 bool + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) bool); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsSentinelRunning provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) IsSentinelRunning(rFailover *v1.RedisFailover) bool { + ret := _m.Called(rFailover) + + var r0 bool + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) bool); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + type mockConstructorTestingTNewRedisFailoverCheck interface { mock.TestingT Cleanup(func()) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 78690e4aa..0324b341b 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -198,14 +198,14 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error { - err := r.rfChecker.CheckRedisNumber(rf) - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) - if err != nil { + + if !r.rfChecker.IsRedisRunning(rf) { + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") return nil } - err = r.UpdateRedisesPods(rf) + err := r.UpdateRedisesPods(rf) if err != nil { return err } @@ -221,10 +221,9 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red } if rf.SentinelsAllowed() { - err = r.rfChecker.CheckSentinelNumber(rf) - setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) - if err != nil { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") + if !r.rfChecker.IsSentinelRunning(rf) { + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") return nil } diff --git a/operator/redisfailover/checker_test.go b/operator/redisfailover/checker_test.go index 02031e724..c8ae26970 100644 --- a/operator/redisfailover/checker_test.go +++ b/operator/redisfailover/checker_test.go @@ -265,14 +265,14 @@ func TestCheckAndHeal(t *testing.T) { mrfh := &mRFService.RedisFailoverHeal{} if test.redisCheckNumberOK { - mrfc.On("CheckRedisNumber", rf).Once().Return(nil) + mrfc.On("IsRedisRunning", rf).Once().Return(true) } else { continueTests = false - mrfc.On("CheckRedisNumber", rf).Once().Return(errors.New("")) + mrfc.On("IsRedisRunning", rf).Once().Return(false) } if allowSentinels { - mrfc.On("CheckSentinelNumber", rf).Once().Return(nil) + mrfc.On("IsSentinelRunning", rf).Once().Return(true) } if bootstrappingTests && continueTests { diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index 63946a255..1a2a94914 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -34,6 +34,9 @@ type RedisFailoverCheck interface { GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error) GetRedisRevisionHash(podName string, rFailover *redisfailoverv1.RedisFailover) (string, error) CheckRedisSlavesReady(slaveIP string, rFailover *redisfailoverv1.RedisFailover) (bool, error) + IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool + IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool + IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool } // RedisFailoverChecker is our implementation of RedisFailoverCheck interface @@ -385,6 +388,32 @@ func (r *RedisFailoverChecker) CheckRedisSlavesReady(ip string, rFailover *redis return r.redisClient.SlaveIsReady(ip, port, password) } +// IsRedisRunning returns true if all the pods are Running +func (r *RedisFailoverChecker) IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool { + dp, err := r.k8sService.GetStatefulSetPods(rFailover.Namespace, GetRedisName(rFailover)) + return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp) +} + +// IsSentinelRunning returns true if all the pods are Running +func (r *RedisFailoverChecker) IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool { + dp, err := r.k8sService.GetDeploymentPods(rFailover.Namespace, GetSentinelName(rFailover)) + return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp) +} + +// IsClusterRunning returns true if all the pods in the given redisfailover are Running +func (r *RedisFailoverChecker) IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool { + return r.IsSentinelRunning(rFailover) && r.IsRedisRunning(rFailover) +} + func getRedisPort(p int32) string { return strconv.Itoa(int(p)) } + +func AreAllRunning(pods *corev1.PodList) bool { + for _, pod := range pods.Items { + if pod.Status.Phase != corev1.PodRunning || pod.DeletionTimestamp != nil { + return false + } + } + return true +} diff --git a/operator/redisfailover/service/check_test.go b/operator/redisfailover/service/check_test.go index 79ceb84a7..cd81c901a 100644 --- a/operator/redisfailover/service/check_test.go +++ b/operator/redisfailover/service/check_test.go @@ -869,3 +869,90 @@ func TestGetRedisRevisionHash(t *testing.T) { } } + +func TestClusterRunning(t *testing.T) { + assert := assert.New(t) + + rf := generateRF() + + allRunning := &corev1.PodList{ + Items: []corev1.Pod{ + { + Status: corev1.PodStatus{ + PodIP: "0.0.0.0", + Phase: corev1.PodRunning, + }, + }, + { + Status: corev1.PodStatus{ + PodIP: "1.1.1.1", + Phase: corev1.PodRunning, + }, + }, + { + Status: corev1.PodStatus{ + PodIP: "1.1.1.1", + Phase: corev1.PodRunning, + }, + }, + }, + } + + notAllRunning := &corev1.PodList{ + Items: []corev1.Pod{ + { + Status: corev1.PodStatus{ + PodIP: "0.0.0.0", + Phase: corev1.PodRunning, + }, + }, + { + Status: corev1.PodStatus{ + PodIP: "1.1.1.1", + Phase: corev1.PodPending, + }, + }, + { + Status: corev1.PodStatus{ + PodIP: "1.1.1.1", + Phase: corev1.PodRunning, + }, + }, + }, + } + + notAllReplicas := &corev1.PodList{ + Items: []corev1.Pod{ + { + Status: corev1.PodStatus{ + PodIP: "0.0.0.0", + Phase: corev1.PodRunning, + }, + }, + { + Status: corev1.PodStatus{ + PodIP: "1.1.1.1", + Phase: corev1.PodRunning, + }, + }, + }, + } + + ms := &mK8SService.Services{} + ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(allRunning, nil) + ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(allRunning, nil) + mr := &mRedisService.Client{} + + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) + + assert.True(checker.IsClusterRunning(rf)) + + ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(allRunning, nil) + ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(notAllReplicas, nil) + assert.False(checker.IsClusterRunning(rf)) + + ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(notAllRunning, nil) + ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(allRunning, nil) + assert.False(checker.IsClusterRunning(rf)) + +}