From 45537f3e50c68ef7664a5a44054d04f8388717be Mon Sep 17 00:00:00 2001 From: Ryan Old Date: Fri, 22 Nov 2024 13:15:43 -0800 Subject: [PATCH 1/3] Enable stats in common recommendations --- test/kubernetes/e2e/tests/manifests/common-recommendations.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/kubernetes/e2e/tests/manifests/common-recommendations.yaml b/test/kubernetes/e2e/tests/manifests/common-recommendations.yaml index 2529898d53a..d3d707f480b 100644 --- a/test/kubernetes/e2e/tests/manifests/common-recommendations.yaml +++ b/test/kubernetes/e2e/tests/manifests/common-recommendations.yaml @@ -74,6 +74,8 @@ gloo: limits: cpu: 1000m memory: 10Gi + stats: + enabled: true # enable stats server for gloo so we can collect the metrics in CI # Configuration for the statically deployed gateway-proxy that ships by default with Gloo Gateway gatewayProxies: From ff88db6454d48c7697e0fd7d1bb71bb30ac555f9 Mon Sep 17 00:00:00 2001 From: Ryan Old Date: Fri, 22 Nov 2024 17:29:56 -0800 Subject: [PATCH 2/3] Fetch and save Gloo metrics and some snapshots on failure --- .../virtualhost_options/vhost_opt_suite.go | 2 + test/kubernetes/e2e/test.go | 54 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/test/kubernetes/e2e/features/virtualhost_options/vhost_opt_suite.go b/test/kubernetes/e2e/features/virtualhost_options/vhost_opt_suite.go index bf0f3bb0fc4..d7fd61e838f 100644 --- a/test/kubernetes/e2e/features/virtualhost_options/vhost_opt_suite.go +++ b/test/kubernetes/e2e/features/virtualhost_options/vhost_opt_suite.go @@ -225,6 +225,8 @@ func (s *testingSuite) TestConfigureVirtualHostOptionsWithSectionNameManualSetup []string{"conflict with more specific or older VirtualHostOptions"}, defaults.KubeGatewayReporter, ) + + s.Assert().Equal(true, false, "intentionally failing to trigger drump, remove when done debugging") } // The goal here is to test the behavior when multiple VHOs are targeting a gateway without sectionName. 
The expected diff --git a/test/kubernetes/e2e/test.go b/test/kubernetes/e2e/test.go index 7690d5cb474..bf800c10363 100644 --- a/test/kubernetes/e2e/test.go +++ b/test/kubernetes/e2e/test.go @@ -1,6 +1,7 @@ package e2e import ( + "bytes" "context" "errors" "fmt" @@ -314,6 +315,59 @@ func (i *TestInstallation) PreFailHandler(ctx context.Context) { kubectlGetResourcesCmd := i.Actions.Kubectl().Command(ctx, "get", strings.Join(resourcesToGet, ","), "-A", "-owide") _ = kubectlGetResourcesCmd.WithStdout(clusterStateFile).WithStderr(clusterStateFile).Run() clusterStateFile.WriteString("\n") + + podStdOut := bytes.NewBuffer(nil) + podStdErr := bytes.NewBuffer(nil) + + // Fetch the name of the Gloo Gateway controller pod + getGlooPodNameCmd := i.Actions.Kubectl().Command(ctx, "get", "pod", "-n", i.Metadata.InstallNamespace, + "--selector", "gloo=gloo", "--output", "jsonpath='{.items[0].metadata.name}'") + _ = getGlooPodNameCmd.WithStdout(podStdOut).WithStderr(podStdErr).Run() + + // Clean up and check the output + glooPodName := strings.Trim(podStdOut.String(), "'") + if glooPodName == "" { + fmt.Printf("Failed to get the name of the Gloo Gateway controller pod: %s\n", podStdErr.String()) + return + } + + // Get the metrics from the Gloo Gateway controller pod and write them to a file + metricsFilePath := filepath.Join(failureDir, "metrics.log") + metricsFile, err := os.OpenFile(metricsFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, os.ModePerm) + i.Assertions.Require.NoError(err) + + // Using an ephemeral debug pod fetch the metrics from the Gloo Gateway controller + metricsCmd := i.Actions.Kubectl().Command(ctx, "debug", "-n", i.Metadata.InstallNamespace, + "-it", "--image=curlimages/curl:7.83.1", glooPodName, "--", + "curl", "http://localhost:9091/metrics") + _ = metricsCmd.WithStdout(metricsFile).WithStderr(metricsFile).Run() + metricsFile.Close() + + // Get krt snapshot from the Gloo Gateway controller pod and write it to a file + krtSnapshotFilePath := 
filepath.Join(failureDir, "krt_snapshot.log") + krtSnapshotFile, err := os.OpenFile(krtSnapshotFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, os.ModePerm) + i.Assertions.Require.NoError(err) + + // Using an ephemeral debug pod fetch the krt snapshot from the Gloo Gateway controller + krtSnapshotCmd := i.Actions.Kubectl().Command(ctx, "debug", "-n", i.Metadata.InstallNamespace, + "-it", "--image=curlimages/curl:7.83.1", glooPodName, "--", + "curl", "http://localhost:9095/snapshots/krt") + _ = krtSnapshotCmd.WithStdout(krtSnapshotFile).WithStderr(krtSnapshotFile).Run() + krtSnapshotFile.Close() + + // Get xds snapshot from the Gloo Gateway controller pod and write it to a file + xdsSnapshotFilePath := filepath.Join(failureDir, "xds_snapshot.log") + xdsSnapshotFile, err := os.OpenFile(xdsSnapshotFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, os.ModePerm) + i.Assertions.Require.NoError(err) + + // Using an ephemeral debug pod fetch the xds snapshot from the Gloo Gateway controller + xdsSnapshotCmd := i.Actions.Kubectl().Command(ctx, "debug", "-n", i.Metadata.InstallNamespace, + "-it", "--image=curlimages/curl:7.83.1", glooPodName, "--", + "curl", "http://localhost:9095/snapshots/xds") + _ = xdsSnapshotCmd.WithStdout(xdsSnapshotFile).WithStderr(xdsSnapshotFile).Run() + xdsSnapshotFile.Close() + + fmt.Printf("Test failed. 
Logs and cluster state are available in %s\n", failureDir) } // GeneratedFiles is a collection of files that are generated during the execution of a set of tests From e2b43ec2db5129ee1e39afcdf269a793770837d8 Mon Sep 17 00:00:00 2001 From: Ryan Old Date: Fri, 22 Nov 2024 17:41:57 -0800 Subject: [PATCH 3/3] Added changelog --- .../collect-more-artifacts-on-ci-failure.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 changelog/v1.18.0-rc2/collect-more-artifacts-on-ci-failure.yaml diff --git a/changelog/v1.18.0-rc2/collect-more-artifacts-on-ci-failure.yaml b/changelog/v1.18.0-rc2/collect-more-artifacts-on-ci-failure.yaml new file mode 100644 index 00000000000..6e1cb3b66c0 --- /dev/null +++ b/changelog/v1.18.0-rc2/collect-more-artifacts-on-ci-failure.yaml @@ -0,0 +1,9 @@ +changelog: + - type: NON_USER_FACING + description: >- + Gloo Gateway controller metrics and xds/krt snapshots are now collected and included in + the test failure artifacts. + After encountering some test failures that proved difficult to debug without knowing more + about the state of the cluster, we have added additional artifacts to be collected when + a test fails. + This will help us to more easily diagnose the cause of test failures.