Commit 3ceb1b2

Merge pull request #533 from spotahome/logging

Refactor logging to give more visibility to check and heal services
ese authored Dec 1, 2022
2 parents 716d0d6 + 1e46182 commit 3ceb1b2
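
The refactor follows one pattern throughout: bare r.logger.Debug(...) calls in the check-and-heal loop become leveled, structured entries that identify the RedisFailover by name and namespace, and actual heal actions are raised from Debug to Warning. A minimal before/after sketch of the pattern (a logrus-style logger, which the operator's log package wraps; the field names mirror the diff, while the field values here are hypothetical):

    package main

    import (
        log "github.com/sirupsen/logrus"
    )

    func main() {
        // Before: a bare debug line, with no indication of which
        // RedisFailover or namespace the message refers to.
        log.Debug("Sentinel is not monitoring the correct master")

        // After: every check/heal entry carries the failover name and
        // namespace as structured fields, and heal actions log at Warning.
        log.WithField("redisfailover", "my-failover"). // hypothetical name
            WithField("namespace", "default").         // hypothetical namespace
            Warningf("Fixing sentinel not monitoring expected master: %s", "monitor mismatch")
    }
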
Showing 13 changed files with 226 additions and 70 deletions.
42 changes: 42 additions & 0 deletions mocks/operator/redisfailover/service/RedisFailoverCheck.go

(Generated mock file; diff not rendered.)

59 changes: 28 additions & 31 deletions operator/redisfailover/checker.go
@@ -102,17 +102,15 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
 	// Sentinel has not death nodes
 	// Sentinel knows the correct slave number
 
-	err := r.rfChecker.CheckRedisNumber(rf)
-	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err)
-	if err != nil {
-		r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset")
+	if !r.rfChecker.IsRedisRunning(rf) {
+		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
+		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile")
 		return nil
 	}
 
-	err = r.rfChecker.CheckSentinelNumber(rf)
-	setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err)
-	if err != nil {
-		r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment")
+	if !r.rfChecker.IsSentinelRunning(rf) {
+		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
+		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile")
 		return nil
 	}

@@ -122,7 +120,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
 	}
 	switch nMasters {
 	case 0:
-		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("No masters detected"))
+		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected"))
 		redisesIP, err := r.rfChecker.GetRedisesIPs(rf)
 		if err != nil {
 			return err
@@ -138,34 +136,34 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
 			return err2
 		}
 		if minTime > timeToPrepare {
-			r.logger.Debugf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds())
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds())
 			// We can consider there's an error
 			if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil {
 				return err2
 			}
 		} else {
 			// We'll wait until failover is done
-			r.logger.Debug("No master found, wait until failover")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("No master found, wait until failover")
 			return nil
 		}
 	case 1:
 		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil)
 	default:
-		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("Multiple masters detected"))
-		return errors.New("More than one master, fix manually")
+		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("multiple masters detected"))
+		return errors.New("more than one master, fix manually")
 	}
 
 	master, err := r.rfChecker.GetMasterIP(rf)
 	if err != nil {
 		return err
 	}
 
-	err2 := r.rfChecker.CheckAllSlavesFromMaster(master, rf)
+	err = r.rfChecker.CheckAllSlavesFromMaster(master, rf)
 	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE, err)
-	if err2 != nil {
-		r.logger.Debug("Not all slaves have the same master")
-		if err3 := r.rfHealer.SetMasterOnAll(master, rf); err3 != nil {
-			return err3
+	if err != nil {
+		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Slave not associated to master: %s", err.Error())
+		if err = r.rfHealer.SetMasterOnAll(master, rf); err != nil {
+			return err
 		}
 	}

@@ -190,7 +188,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
 		err = r.rfChecker.CheckSentinelMonitor(sip, master, port)
 		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err)
 		if err != nil {
-			r.logger.Debug("Sentinel is not monitoring the correct master")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error())
 			if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil {
 				return err
 			}
@@ -200,14 +198,14 @@
 }
 
 func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error {
-	err := r.rfChecker.CheckRedisNumber(rf)
-	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err)
-	if err != nil {
-		r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset")
+	if !r.rfChecker.IsRedisRunning(rf) {
+		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
+		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile")
 		return nil
 	}
 
-	err = r.UpdateRedisesPods(rf)
+	err := r.UpdateRedisesPods(rf)
 	if err != nil {
 		return err
 	}
@@ -223,10 +221,9 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red
 	}
 
 	if rf.SentinelsAllowed() {
-		err = r.rfChecker.CheckSentinelNumber(rf)
-		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err)
-		if err != nil {
-			r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment")
+		if !r.rfChecker.IsSentinelRunning(rf) {
+			setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile")
 			return nil
 		}

@@ -238,7 +235,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red
 			err = r.rfChecker.CheckSentinelMonitor(sip, bootstrapSettings.Host, bootstrapSettings.Port)
 			setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err)
 			if err != nil {
-				r.logger.Debug("Sentinel is not monitoring the correct master")
+				r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error())
 				if err := r.rfHealer.NewSentinelMonitorWithPort(sip, bootstrapSettings.Host, bootstrapSettings.Port, rf); err != nil {
 					return err
 				}
@@ -267,7 +264,7 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFa
 		err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf)
 		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, sip, err)
 		if err != nil {
-			r.logger.Debug("Sentinel has more sentinel in memory than spected")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of sentinels in memory. resetting", sip)
 			if err := r.rfHealer.RestoreSentinel(sip); err != nil {
 				return err
 			}
@@ -278,7 +275,7 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFa
 		err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf)
 		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, sip, err)
 		if err != nil {
-			r.logger.Debug("Sentinel has more slaves in memory than spected")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of expected slaves in memory. resetting", sip)
 			if err := r.rfHealer.RestoreSentinel(sip); err != nil {
 				return err
 			}
6 changes: 3 additions & 3 deletions operator/redisfailover/checker_test.go
@@ -265,14 +265,14 @@ func TestCheckAndHeal(t *testing.T) {
 			mrfh := &mRFService.RedisFailoverHeal{}
 
 			if test.redisCheckNumberOK {
-				mrfc.On("CheckRedisNumber", rf).Once().Return(nil)
+				mrfc.On("IsRedisRunning", rf).Once().Return(true)
 			} else {
 				continueTests = false
-				mrfc.On("CheckRedisNumber", rf).Once().Return(errors.New(""))
+				mrfc.On("IsRedisRunning", rf).Once().Return(false)
 			}
 
 			if allowSentinels {
-				mrfc.On("CheckSentinelNumber", rf).Once().Return(nil)
+				mrfc.On("IsSentinelRunning", rf).Once().Return(true)
 			}
 
 			if bootstrappingTests && continueTests {
35 changes: 33 additions & 2 deletions operator/redisfailover/service/check.go
@@ -34,6 +34,9 @@ type RedisFailoverCheck interface {
 	GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error)
 	GetRedisRevisionHash(podName string, rFailover *redisfailoverv1.RedisFailover) (string, error)
 	CheckRedisSlavesReady(slaveIP string, rFailover *redisfailoverv1.RedisFailover) (bool, error)
+	IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool
+	IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool
+	IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool
 }
 
 // RedisFailoverChecker is our implementation of RedisFailoverCheck interface
@@ -168,7 +171,7 @@ func (r *RedisFailoverChecker) CheckSentinelMonitor(sentinel string, monitor ...
 		return err
 	}
 	if actualMonitorIP != monitorIP || (monitorPort != "" && monitorPort != actualMonitorPort) {
-		return errors.New("the monitor on the sentinel config does not match with the expected one")
+		return fmt.Errorf("sentinel monitoring %s:%s instead %s:%s", actualMonitorIP, actualMonitorPort, monitorIP, monitorPort)
 	}
 	return nil
 }
@@ -209,11 +212,13 @@ func (r *RedisFailoverChecker) GetNumberMasters(rf *redisfailoverv1.RedisFailove
 	nMasters := 0
 	rips, err := r.GetRedisesIPs(rf)
 	if err != nil {
+		r.logger.Errorf(err.Error())
 		return nMasters, err
 	}
 
 	password, err := k8s.GetRedisPassword(r.k8sService, rf)
 	if err != nil {
+		r.logger.Errorf("Error getting password: %s", err.Error())
 		return nMasters, err
 	}

@@ -274,7 +279,7 @@ func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisF
 		}
 		start := redisNode.Status.StartTime.Round(time.Second)
 		alive := time.Since(start)
-		r.logger.Debugf("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds())
+		r.logger.Infof("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds())
 		if alive < minTime {
 			minTime = alive
 		}
@@ -383,6 +388,32 @@ func (r *RedisFailoverChecker) CheckRedisSlavesReady(ip string, rFailover *redis
 	return r.redisClient.SlaveIsReady(ip, port, password)
 }
 
+// IsRedisRunning returns true if all the pods are Running
+func (r *RedisFailoverChecker) IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool {
+	dp, err := r.k8sService.GetStatefulSetPods(rFailover.Namespace, GetRedisName(rFailover))
+	return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp)
+}
+
+// IsSentinelRunning returns true if all the pods are Running
+func (r *RedisFailoverChecker) IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool {
+	dp, err := r.k8sService.GetDeploymentPods(rFailover.Namespace, GetSentinelName(rFailover))
+	return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp)
+}
+
+// IsClusterRunning returns true if all the pods in the given redisfailover are Running
+func (r *RedisFailoverChecker) IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool {
+	return r.IsSentinelRunning(rFailover) && r.IsRedisRunning(rFailover)
+}
+
 func getRedisPort(p int32) string {
 	return strconv.Itoa(int(p))
 }
+
+func AreAllRunning(pods *corev1.PodList) bool {
+	for _, pod := range pods.Items {
+		if pod.Status.Phase != corev1.PodRunning || pod.DeletionTimestamp != nil {
+			return false
+		}
+	}
+	return true
+}
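
Together these helpers change the gate from an exact replica-count match to a pod-phase check: a component counts as running once at least the expected number of pods exist, every pod is in phase Running, and none is terminating. A self-contained sketch of that predicate against the k8s.io/api/core/v1 types (a standalone reimplementation for illustration, not the operator's own code):

    package main

    import (
        "fmt"

        corev1 "k8s.io/api/core/v1"
    )

    // areAllRunning mirrors the exported AreAllRunning helper above: a pod
    // list passes only if every pod is Running and not being deleted.
    func areAllRunning(pods *corev1.PodList) bool {
        for _, pod := range pods.Items {
            if pod.Status.Phase != corev1.PodRunning || pod.DeletionTimestamp != nil {
                return false
            }
        }
        return true
    }

    func main() {
        pods := &corev1.PodList{Items: []corev1.Pod{
            {Status: corev1.PodStatus{Phase: corev1.PodRunning}},
            {Status: corev1.PodStatus{Phase: corev1.PodPending}}, // still scheduling
        }}
        fmt.Println(areAllRunning(pods)) // false: one pod is still Pending
    }

Note also the replica comparison in the helpers above: len(dp.Items) > int(replicas-1) is simply len(dp.Items) >= replicas, i.e. the component must report at least as many pods as the spec requests.
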
87 changes: 87 additions & 0 deletions operator/redisfailover/service/check_test.go
@@ -869,3 +869,90 @@ func TestGetRedisRevisionHash(t *testing.T) {
 	}
 
 }
+
+func TestClusterRunning(t *testing.T) {
+	assert := assert.New(t)
+
+	rf := generateRF()
+
+	allRunning := &corev1.PodList{
+		Items: []corev1.Pod{
+			{
+				Status: corev1.PodStatus{
+					PodIP: "0.0.0.0",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+		},
+	}
+
+	notAllRunning := &corev1.PodList{
+		Items: []corev1.Pod{
+			{
+				Status: corev1.PodStatus{
+					PodIP: "0.0.0.0",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodPending,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+		},
+	}
+
+	notAllReplicas := &corev1.PodList{
+		Items: []corev1.Pod{
+			{
+				Status: corev1.PodStatus{
+					PodIP: "0.0.0.0",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+		},
+	}
+
+	ms := &mK8SService.Services{}
+	ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(allRunning, nil)
+	ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(allRunning, nil)
+	mr := &mRedisService.Client{}
+
+	checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy)
+
+	assert.True(checker.IsClusterRunning(rf))
+
+	ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(allRunning, nil)
+	ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(notAllReplicas, nil)
+	assert.False(checker.IsClusterRunning(rf))
+
+	ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(notAllRunning, nil)
+	ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(allRunning, nil)
+	assert.False(checker.IsClusterRunning(rf))
+}
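
The new predicates are covered by TestClusterRunning above; assuming standard Go tooling from the repository root, it can be run in isolation with:

    go test ./operator/redisfailover/service/ -run TestClusterRunning -v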