From 304919acd7d6d829d9861b17340867ef3d35d54b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Mon, 25 Nov 2024 11:36:10 +0100 Subject: [PATCH 01/24] Use metrics macros in persisted queries layer Notes: - Fixed a typo in the not found attribute: `persisted_quieries.not_found` -> `persisted_queries.not_found`. - Added description, it would be useful for someone to check it. It reads to me like *every* request is measured? --- apollo-router/src/metrics/mod.rs | 31 ++++++++++++++- .../services/layers/persisted_queries/mod.rs | 38 +++++++++++++------ 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/apollo-router/src/metrics/mod.rs b/apollo-router/src/metrics/mod.rs index e24317cd06..4b2aa0b888 100644 --- a/apollo-router/src/metrics/mod.rs +++ b/apollo-router/src/metrics/mod.rs @@ -496,7 +496,9 @@ pub(crate) fn meter_provider() -> AggregateMeterProvider { } #[macro_export] -/// Get or create a u64 monotonic counter metric and add a value to it +/// Get or create a `u64` monotonic counter metric and add a value to it. +/// +/// Each metric needs a description. /// /// This macro is a replacement for the telemetry crate's MetricsLayer. We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. /// The reason for this is that the MetricsLayer has: @@ -506,6 +508,33 @@ pub(crate) fn meter_provider() -> AggregateMeterProvider { /// * Imperfect mapping to metrics API that can only be checked at runtime. /// /// New metrics should be added using these macros. +/// +/// # Examples +/// ```ignore +/// // Count a thing: +/// u64_counter!( +/// "apollo.router.operations.frobbles", +/// "The amount of frobbles we've operated on", +/// 1 +/// ); +/// // Count a thing with attributes: +/// u64_counter!( +/// "apollo.router.operations.frobbles", +/// "The amount of frobbles we've operated on", +/// 1, +/// frobbles.color = "blue" +/// ); +/// // Count a thing with dynamic attributes: +/// let attributes = [ +/// opentelemetry::KeyValue::new("frobbles.color".to_string(), "blue".into()), +/// ]; +/// u64_counter!( +/// "apollo.router.operations.frobbles", +/// "The amount of frobbles we've operated on", +/// 1, +/// attributes +/// ); +/// ``` #[allow(unused_macros)] macro_rules! u64_counter { ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { diff --git a/apollo-router/src/services/layers/persisted_queries/mod.rs b/apollo-router/src/services/layers/persisted_queries/mod.rs index 457ea67820..48c4dcba1d 100644 --- a/apollo-router/src/services/layers/persisted_queries/mod.rs +++ b/apollo-router/src/services/layers/persisted_queries/mod.rs @@ -122,7 +122,11 @@ impl PersistedQueryLayer { .context .extensions() .with_lock(|mut lock| lock.insert(UsedQueryIdFromManifest)); - tracing::info!(monotonic_counter.apollo.router.operations.persisted_queries = 1u64); + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1 + ); Ok(request) } else if manifest_poller.augmenting_apq_with_pre_registration_and_no_safelisting() { // The query ID isn't in our manifest, but we have APQ enabled @@ -131,8 +135,10 @@ impl PersistedQueryLayer { // safelist later for log_unknown!) Ok(request) } else { - tracing::info!( - monotonic_counter.apollo.router.operations.persisted_queries = 1u64, + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1, persisted_queries.not_found = true ); // if APQ is not enabled, return an error indicating the query was not found @@ -209,28 +215,38 @@ impl PersistedQueryLayer { match manifest_poller.action_for_freeform_graphql(Ok(&doc.ast)) { FreeformGraphQLAction::Allow => { - tracing::info!(monotonic_counter.apollo.router.operations.persisted_queries = 1u64,); + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1 + ); Ok(request) } FreeformGraphQLAction::Deny => { - tracing::info!( - monotonic_counter.apollo.router.operations.persisted_queries = 1u64, - persisted_queries.safelist.rejected.unknown = false, + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1, + persisted_queries.safelist.rejected.unknown = false ); Err(supergraph_err_operation_not_in_safelist(request)) } // Note that this might even include complaining about an operation that came via APQs. FreeformGraphQLAction::AllowAndLog => { - tracing::info!( - monotonic_counter.apollo.router.operations.persisted_queries = 1u64, + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1, persisted_queries.logged = true ); log_unknown_operation(operation_body); Ok(request) } FreeformGraphQLAction::DenyAndLog => { - tracing::info!( - monotonic_counter.apollo.router.operations.persisted_queries = 1u64, + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1, persisted_queries.safelist.rejected.unknown = true, persisted_queries.logged = true ); From 30fc35e07618820ab218bd7a2063e939a12b66cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Mon, 25 Nov 2024 11:42:43 +0100 Subject: [PATCH 02/24] Use new metrics macro for counters in subgraph service Notes: - Removes the `apollo_router_deduplicated_subscriptions_total` metric. This is already captured by `apollo.router.operations.subscriptions` in the `subscriptions.deduplicated` attribute. - The `apollo.router.operations.batching` metric appears to use an older style of attribute naming? --- .../src/services/subgraph_service.rs | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/apollo-router/src/services/subgraph_service.rs b/apollo-router/src/services/subgraph_service.rs index 9dbb9fb773..09e25090f0 100644 --- a/apollo-router/src/services/subgraph_service.rs +++ b/apollo-router/src/services/subgraph_service.rs @@ -300,17 +300,15 @@ impl tower::Service for SubgraphService { })?; stream_tx.send(Box::pin(handle.into_stream())).await?; - tracing::info!( - monotonic_counter.apollo.router.operations.subscriptions = 1u64, - subscriptions.mode = %"callback", + u64_counter!( + "apollo.router.operations.subscriptions", + "Total requests with subscription operations", + 1, + subscriptions.mode = "callback", subscriptions.deduplicated = !created, - subgraph.service.name = service_name, + subgraph.service.name = service_name.clone() ); if !created { - tracing::info!( - monotonic_counter.apollo_router_deduplicated_subscriptions_total = 1u64, - mode = %"callback", - ); // Dedup happens here return Ok(SubgraphResponse::builder() .subgraph_name(service_name.clone()) @@ -507,20 +505,18 @@ async fn call_websocket( let (handle, created) = notify .create_or_subscribe(subscription_hash.clone(), false) .await?; - tracing::info!( - monotonic_counter.apollo.router.operations.subscriptions = 1u64, - subscriptions.mode = %"passthrough", + u64_counter!( + "apollo.router.operations.subscriptions", + "Total requests with subscription operations", + 1, + subscriptions.mode = "passthrough", subscriptions.deduplicated = !created, - subgraph.service.name = service_name, + subgraph.service.name = service_name.clone() ); if !created { subscription_stream_tx .send(Box::pin(handle.into_stream())) .await?; - tracing::info!( - monotonic_counter.apollo_router_deduplicated_subscriptions_total = 1u64, - mode = %"passthrough", - ); // Dedup happens here return Ok(SubgraphResponse::builder() @@ -868,9 +864,14 @@ pub(crate) async fn process_batch( subgraph = &service ); - tracing::info!(monotonic_counter.apollo.router.operations.batching = 1u64, - mode = %BatchingMode::BatchHttpLink, // Only supported mode right now - subgraph = &service + u64_counter!( + "apollo.router.operations.batching", + "Total requests with batched operations", + 1, + // XXX(@goto-bus-stop): Should these be `batching.mode`, `batching.subgraph`? + // Also, other metrics use a different convention to report the subgraph name + mode = BatchingMode::BatchHttpLink.to_string(), // Only supported mode right now + subgraph = service.clone() ); // Perform the actual fetch. If this fails then we didn't manage to make the call at all, so we can't do anything with it. From 148d9c4473e8f4b936ab8cd6d4c02be77ecdbb6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Mon, 25 Nov 2024 14:53:05 +0100 Subject: [PATCH 03/24] Use new metrics macro for counters in graphql::Request --- apollo-router/src/graphql/request.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/apollo-router/src/graphql/request.rs b/apollo-router/src/graphql/request.rs index 1e51262dbf..fc572f70cb 100644 --- a/apollo-router/src/graphql/request.rs +++ b/apollo-router/src/graphql/request.rs @@ -202,9 +202,11 @@ impl Request { mode = %BatchingMode::BatchHttpLink // Only supported mode right now ); - tracing::info!( - monotonic_counter.apollo.router.operations.batching = 1u64, - mode = %BatchingMode::BatchHttpLink // Only supported mode right now + u64_counter!( + "apollo.router.operations.batching", + "Total requests with batched operations", + 1, + mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now ); for entry in value .as_array() @@ -229,9 +231,11 @@ impl Request { mode = "batch_http_link" // Only supported mode right now ); - tracing::info!( - monotonic_counter.apollo.router.operations.batching = 1u64, - mode = "batch_http_link" // Only supported mode right now + u64_counter!( + "apollo.router.operations.batching", + "Total requests with batched operations", + 1, + mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now ); for entry in value .as_array() From be452c99ee71accf89b0275b519c4b1e359217a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Mon, 25 Nov 2024 14:58:10 +0100 Subject: [PATCH 04/24] Use new metrics macro for counting defer requests and responses --- apollo-router/src/query_planner/execution.rs | 6 +++++- apollo-router/src/query_planner/fetch.rs | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/apollo-router/src/query_planner/execution.rs b/apollo-router/src/query_planner/execution.rs index bd23f549ea..9a5b647350 100644 --- a/apollo-router/src/query_planner/execution.rs +++ b/apollo-router/src/query_planner/execution.rs @@ -82,7 +82,11 @@ impl QueryPlan { ) .await; if !deferred_fetches.is_empty() { - tracing::info!(monotonic_counter.apollo.router.operations.defer = 1u64); + u64_counter!( + "apollo.router.operations.defer", + "Number of requests that request deferred data", + 1 + ); } Response::builder().data(value).errors(errors).build() diff --git a/apollo-router/src/query_planner/fetch.rs b/apollo-router/src/query_planner/fetch.rs index 47069283ca..3ec89c1e95 100644 --- a/apollo-router/src/query_planner/fetch.rs +++ b/apollo-router/src/query_planner/fetch.rs @@ -521,7 +521,11 @@ impl FetchNode { self.response_at_path(parameters.schema, current_dir, paths, response); if let Some(id) = &self.id { if let Some(sender) = parameters.deferred_fetches.get(id.as_str()) { - tracing::info!(monotonic_counter.apollo.router.operations.defer.fetch = 1u64); + u64_counter!( + "apollo.router.operations.defer.fetch", + "Number of deferred responses fetched from subgraphs", + 1 + ); if let Err(e) = sender.clone().send((value.clone(), errors.clone())) { tracing::error!("error sending fetch result at path {} and id {:?} for deferred response building: {}", current_dir, self.id, e); } From 7f744e98b82636e2d47e485655ea245c85464a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Mon, 25 Nov 2024 15:01:06 +0100 Subject: [PATCH 05/24] Use new metrics macro for counting uplink fetches --- apollo-router/src/uplink/mod.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/apollo-router/src/uplink/mod.rs b/apollo-router/src/uplink/mod.rs index 6a8974699e..2ea483daf4 100644 --- a/apollo-router/src/uplink/mod.rs +++ b/apollo-router/src/uplink/mod.rs @@ -210,7 +210,7 @@ where Response: Send + 'static + Debug, TransformedResponse: Send + 'static + Debug, { - let query = query_name::(); + let query_name = query_name::(); let (sender, receiver) = channel(2); let client = match reqwest::Client::builder() .no_gzip() @@ -245,10 +245,12 @@ where .await { Ok(response) => { - tracing::info!( - monotonic_counter.apollo_router_uplink_fetch_count_total = 1u64, + u64_counter!( + "apollo_router_uplink_fetch_count_total", + "Total number of requests to Apollo Uplink", + 1u64, status = "success", - query + query = query_name ); match response { UplinkResponse::New { @@ -294,10 +296,12 @@ where } } Err(err) => { - tracing::info!( - monotonic_counter.apollo_router_uplink_fetch_count_total = 1u64, + u64_counter!( + "apollo_router_uplink_fetch_count_total", + "Total number of requests to Apollo Uplink", + 1u64, status = "failure", - query + query = query_name ); if let Err(e) = sender.send(Err(err)).await { tracing::debug!("failed to send error to uplink stream. This is likely to be because the router is shutting down: {e}"); From eb3a8f7f0c10f7ae3e49a42467561a1911024565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Mon, 25 Nov 2024 15:03:14 +0100 Subject: [PATCH 06/24] Use new metrics macro for counting cache hits --- apollo-router/src/cache/storage.rs | 40 ++++++++++++++++++------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index 15f452ed28..4408d24c5c 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -170,10 +170,12 @@ where match res { Some(v) => { - tracing::info!( - monotonic_counter.apollo_router_cache_hit_count = 1u64, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Memory), + u64_counter!( + "apollo_router_cache_hit_count", + "Number of cache hits", + 1, + kind = self.caller, + storage = CacheStorageName::Memory.to_string() ); let duration = instant_memory.elapsed().as_secs_f64(); tracing::info!( @@ -190,10 +192,12 @@ where kind = %self.caller, storage = &tracing::field::display(CacheStorageName::Memory), ); - tracing::info!( - monotonic_counter.apollo_router_cache_miss_count = 1u64, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Memory), + u64_counter!( + "apollo_router_cache_miss_count", + "Number of cache misses", + 1, + kind = self.caller, + storage = CacheStorageName::Memory.to_string() ); let instant_redis = Instant::now(); @@ -214,10 +218,12 @@ where Some(v) => { self.insert_in_memory(key.clone(), v.0.clone()).await; - tracing::info!( - monotonic_counter.apollo_router_cache_hit_count = 1u64, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Redis), + u64_counter!( + "apollo_router_cache_hit_count", + "Number of cache hits", + 1, + kind = self.caller, + storage = CacheStorageName::Redis.to_string() ); let duration = instant_redis.elapsed().as_secs_f64(); tracing::info!( @@ -228,10 +234,12 @@ where Some(v.0) } None => { - tracing::info!( - monotonic_counter.apollo_router_cache_miss_count = 1u64, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Redis), + u64_counter!( + "apollo_router_cache_miss_count", + "Number of cache misses", + 1, + kind = self.caller, + storage = CacheStorageName::Redis.to_string() ); let duration = instant_redis.elapsed().as_secs_f64(); tracing::info!( From a93408daf1ba4f149312acb4ca9ba39bcae4f74e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Mon, 25 Nov 2024 16:38:22 +0100 Subject: [PATCH 07/24] Use new metrics macro for counting authentication uses --- .../src/plugins/authentication/mod.rs | 25 ++++++------------- .../src/plugins/authentication/subgraph.rs | 16 +++++++----- .../src/plugins/authorization/mod.rs | 14 +++++++---- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/apollo-router/src/plugins/authentication/mod.rs b/apollo-router/src/plugins/authentication/mod.rs index 9f9abc8040..bf3889adf2 100644 --- a/apollo-router/src/plugins/authentication/mod.rs +++ b/apollo-router/src/plugins/authentication/mod.rs @@ -539,8 +539,6 @@ fn authenticate( jwks_manager: &JwksManager, request: router::Request, ) -> ControlFlow { - const AUTHENTICATION_KIND: &str = "JWT"; - // We are going to do a lot of similar checking so let's define a local function // to help reduce repetition fn failure_message( @@ -549,17 +547,10 @@ fn authenticate( status: StatusCode, ) -> ControlFlow { // This is a metric and will not appear in the logs - tracing::info!( - monotonic_counter.apollo_authentication_failure_count = 1u64, - kind = %AUTHENTICATION_KIND - ); - tracing::info!( - monotonic_counter - .apollo - .router - .operations - .authentication - .jwt = 1, + u64_counter!( + "apollo.router.operations.authentication.jwt", + "Number of requests with JWT authentication", + 1, authentication.jwt.failed = true ); tracing::info!(message = %error, "jwt authentication failure"); @@ -662,11 +653,11 @@ fn authenticate( ); } // This is a metric and will not appear in the logs - tracing::info!( - monotonic_counter.apollo_authentication_success_count = 1u64, - kind = %AUTHENTICATION_KIND + u64_counter!( + "apollo.router.operations.jwt", + "Number of requests with JWT authentication", + 1 ); - tracing::info!(monotonic_counter.apollo.router.operations.jwt = 1u64); return ControlFlow::Continue(request); } diff --git a/apollo-router/src/plugins/authentication/subgraph.rs b/apollo-router/src/plugins/authentication/subgraph.rs index 568aa9c8ac..fb43467d83 100644 --- a/apollo-router/src/plugins/authentication/subgraph.rs +++ b/apollo-router/src/plugins/authentication/subgraph.rs @@ -409,17 +409,21 @@ impl SigningParamsConfig { } fn increment_success_counter(subgraph_name: &str) { - tracing::info!( - monotonic_counter.apollo.router.operations.authentication.aws.sigv4 = 1u64, + u64_counter!( + "apollo.router.operations.authentication.aws.sigv4", + "", + 1, authentication.aws.sigv4.failed = false, - subgraph.service.name = %subgraph_name, + subgraph.service.name = subgraph_name.to_string() ); } fn increment_failure_counter(subgraph_name: &str) { - tracing::info!( - monotonic_counter.apollo.router.operations.authentication.aws.sigv4 = 1u64, + u64_counter!( + "apollo.router.operations.authentication.aws.sigv4", + "", + 1, authentication.aws.sigv4.failed = true, - subgraph.service.name = %subgraph_name, + subgraph.service.name = subgraph_name.to_string() ); } diff --git a/apollo-router/src/plugins/authorization/mod.rs b/apollo-router/src/plugins/authorization/mod.rs index 331641a726..4da5c4ef99 100644 --- a/apollo-router/src/plugins/authorization/mod.rs +++ b/apollo-router/src/plugins/authorization/mod.rs @@ -556,8 +556,10 @@ impl Plugin for AuthorizationPlugin { Ok(ControlFlow::Continue(request)) } else { // This is a metric and will not appear in the logs - tracing::info!( - monotonic_counter.apollo_require_authentication_failure_count = 1u64, + u64_counter!( + "apollo_require_authentication_failure_count", + "", + 1 ); tracing::error!("rejecting unauthenticated request"); let response = supergraph::Response::error_builder() @@ -588,11 +590,13 @@ impl Plugin for AuthorizationPlugin { let needs_requires_scopes = request.context.contains_key(REQUIRED_SCOPES_KEY); if needs_authenticated || needs_requires_scopes { - tracing::info!( - monotonic_counter.apollo.router.operations.authorization = 1u64, + u64_counter!( + "apollo.router.operations.authorization", + "", + 1, authorization.filtered = filtered, authorization.needs_authenticated = needs_authenticated, - authorization.needs_requires_scopes = needs_requires_scopes, + authorization.needs_requires_scopes = needs_requires_scopes ); } From f3bc20b00ae39971a4ef0d210dca5f0a08edea1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Wed, 27 Nov 2024 12:00:33 +0100 Subject: [PATCH 08/24] Use new metrics macros in timeout layer --- apollo-router/src/plugins/traffic_shaping/timeout/future.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apollo-router/src/plugins/traffic_shaping/timeout/future.rs b/apollo-router/src/plugins/traffic_shaping/timeout/future.rs index 8a390b393e..99e32e6767 100644 --- a/apollo-router/src/plugins/traffic_shaping/timeout/future.rs +++ b/apollo-router/src/plugins/traffic_shaping/timeout/future.rs @@ -49,7 +49,7 @@ where match Pin::new(&mut this.sleep).poll(cx) { Poll::Pending => Poll::Pending, Poll::Ready(_) => { - tracing::info!(monotonic_counter.apollo_router_timeout = 1u64,); + u64_counter!("apollo_router_timeout", "Number of timed out client requests", 1); Poll::Ready(Err(Elapsed::new().into())) } } From 9130dbc5eb99d8880c66f7fa2e276931d5c02abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Wed, 27 Nov 2024 12:02:40 +0100 Subject: [PATCH 09/24] Use new metrics macros for reporting graphql error counts --- apollo-router/src/services/router/service.rs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/apollo-router/src/services/router/service.rs b/apollo-router/src/services/router/service.rs index e5792c1a4d..acc67e332d 100644 --- a/apollo-router/src/services/router/service.rs +++ b/apollo-router/src/services/router/service.rs @@ -378,8 +378,10 @@ impl RouterService { Ok(RouterResponse { response, context }) } else { - tracing::info!( - monotonic_counter.apollo.router.graphql_error = 1u64, + u64_counter!( + "apollo.router.graphql_error", + "Number of GraphQL error responses returned by the router", + 1, code = "INVALID_ACCEPT_HEADER" ); // Useful for selector in spans/instruments/events @@ -799,12 +801,18 @@ impl RouterService { for (code, count) in map { match code { None => { - tracing::info!(monotonic_counter.apollo.router.graphql_error = count,); + u64_counter!( + "apollo.router.graphql_error", + "Number of GraphQL error responses returned by the router", + count + ); } Some(code) => { - tracing::info!( - monotonic_counter.apollo.router.graphql_error = count, - code = code + u64_counter!( + "apollo.router.graphql_error", + "Number of GraphQL error responses returned by the router", + count, + code = code.to_string() ); } } From 551170f90dc4aa70a22531641f9e1d5db1ac293f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Wed, 27 Nov 2024 12:03:51 +0100 Subject: [PATCH 10/24] Use new metrics macros for reporting subscription events Notes: - The description for `apollo_router_skipped_event_count` may not entirely be correct? --- apollo-router/src/notification.rs | 2 +- apollo-router/src/plugins/subscription.rs | 20 ++++++++++++-------- apollo-router/src/protocols/websocket.rs | 22 ++++++++-------------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/apollo-router/src/notification.rs b/apollo-router/src/notification.rs index 7cfba87e7a..1536a8a0de 100644 --- a/apollo-router/src/notification.rs +++ b/apollo-router/src/notification.rs @@ -510,7 +510,7 @@ where match Pin::new(&mut this.msg_receiver).poll_next(cx) { Poll::Ready(Some(Err(BroadcastStreamRecvError::Lagged(_)))) => { - tracing::info!(monotonic_counter.apollo_router_skipped_event_count = 1u64,); + u64_counter!("apollo_router_skipped_event_count", "Amount of times the router missed subscription events", 1u64); self.poll_next(cx) } Poll::Ready(None) => Poll::Ready(None), diff --git a/apollo-router/src/plugins/subscription.rs b/apollo-router/src/plugins/subscription.rs index 50d5e78ead..1d342f586f 100644 --- a/apollo-router/src/plugins/subscription.rs +++ b/apollo-router/src/plugins/subscription.rs @@ -503,10 +503,12 @@ impl Service for CallbackService { }; // Keep the subscription to the client opened payload.subscribed = Some(true); - tracing::info!( - monotonic_counter.apollo.router.operations.subscriptions.events = 1u64, - subscriptions.mode="callback" - ); + u64_counter!( + "apollo.router.operations.subscriptions.events", + "Number of subscription events", + 1, + subscriptions.mode = "callback" + ); handle.send_sync(payload)?; Ok(router::Response { @@ -626,10 +628,12 @@ impl Service for CallbackService { }); } }; - tracing::info!( - monotonic_counter.apollo.router.operations.subscriptions.events = 1u64, - subscriptions.mode="callback", - subscriptions.complete=true + u64_counter!( + "apollo.router.operations.subscriptions.events", + "Number of subscription events", + 1, + subscriptions.mode = "callback", + subscriptions.complete = true ); if let Err(_err) = handle.send_sync( graphql::Response::builder().errors(errors).build(), diff --git a/apollo-router/src/protocols/websocket.rs b/apollo-router/src/protocols/websocket.rs index bd556232ac..13700e84ce 100644 --- a/apollo-router/src/protocols/websocket.rs +++ b/apollo-router/src/protocols/websocket.rs @@ -300,13 +300,10 @@ where request: graphql::Request, heartbeat_interval: Option, ) -> Result, graphql::Error> { - tracing::info!( - monotonic_counter - .apollo - .router - .operations - .subscriptions - .events = 1u64, + u64_counter!( + "apollo.router.operations.subscriptions.events", + "Number of subscription events", + 1, subscriptions.mode = "passthrough" ); @@ -443,13 +440,10 @@ where tracing::trace!("cannot shutdown sink: {err:?}"); }; - tracing::info!( - monotonic_counter - .apollo - .router - .operations - .subscriptions - .events = 1u64, + u64_counter!( + "apollo.router.operations.subscriptions.events", + "Number of subscription events", + 1, subscriptions.mode = "passthrough", subscriptions.complete = true ); From b5497c65934bd1296f553c84706d6283afa67dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Wed, 27 Nov 2024 12:09:31 +0100 Subject: [PATCH 11/24] Use new metrics macros for reporting state changes Notes: - This combined a log message and a metric: now they are separate. --- apollo-router/src/state_machine.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/apollo-router/src/state_machine.rs b/apollo-router/src/state_machine.rs index e3ce6c3a67..ed5765d4e3 100644 --- a/apollo-router/src/state_machine.rs +++ b/apollo-router/src/state_machine.rs @@ -557,13 +557,20 @@ where #[cfg(test)] self.notify_updated.notify_one(); - tracing::debug!( - monotonic_counter.apollo_router_state_change_total = 1u64, + tracing::info!( event = event_name, state = ?state, previous_state, "state machine transitioned" ); + u64_counter!( + "apollo_router_state_change_total", + "Router state changes", + 1, + event = event_name, + state = format!("{state:?}"), + previous_state = previous_state + ); // If we've errored then exit even if there are potentially more messages if matches!(&state, Stopped | Errored(_)) { From 69e71b18f75310b38ef7ed9146ada89f011ac70b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Wed, 27 Nov 2024 12:16:23 +0100 Subject: [PATCH 12/24] Use new metrics macros for reporting which telemetry system is enabled --- apollo-router/src/plugins/telemetry/mod.rs | 33 ++++++++++++++++------ 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index b6a0e7a1a2..1ac923654b 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -131,7 +131,6 @@ use crate::plugins::telemetry::reload::OPENTELEMETRY_TRACER_HANDLE; use crate::plugins::telemetry::tracing::apollo_telemetry::decode_ftv1_trace; use crate::plugins::telemetry::tracing::apollo_telemetry::APOLLO_PRIVATE_OPERATION_SIGNATURE; use crate::plugins::telemetry::tracing::TracingConfigurator; -use crate::plugins::telemetry::utils::TracingUtils; use crate::query_planner::OperationKind; use crate::register_plugin; use crate::router_factory::Endpoint; @@ -1761,14 +1760,30 @@ impl Telemetry { || tracing_zipkin_used || tracing_datadog_used { - ::tracing::info!( - monotonic_counter.apollo.router.operations.telemetry = 1u64, - telemetry.metrics.otlp = metrics_otlp_used.or_empty(), - telemetry.metrics.prometheus = metrics_prom_used.or_empty(), - telemetry.tracing.otlp = tracing_otlp_used.or_empty(), - telemetry.tracing.datadog = tracing_datadog_used.or_empty(), - telemetry.tracing.jaeger = tracing_jaeger_used.or_empty(), - telemetry.tracing.zipkin = tracing_zipkin_used.or_empty(), + let mut attributes = Vec::new(); + if metrics_otlp_used { + attributes.push(KeyValue::new("telemetry.metrics.otlp", true)); + } + if metrics_prom_used { + attributes.push(KeyValue::new("telemetry.metrics.prometheus", true)); + } + if tracing_otlp_used { + attributes.push(KeyValue::new("telemetry.tracing.otlp", true)); + } + if tracing_datadog_used { + attributes.push(KeyValue::new("telemetry.tracing.datadog", true)); + } + if tracing_jaeger_used { + attributes.push(KeyValue::new("telemetry.tracing.jaeger", true)); + } + if tracing_zipkin_used { + attributes.push(KeyValue::new("telemetry.tracing.zipkin", true)); + } + u64_counter!( + "apollo.router.operations.telemetry", + "Telemetry systems in use", + 1, + &attributes ); } } From 4858d2400db8d17d5b135d0574834eddb45a2709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Wed, 27 Nov 2024 14:58:29 +0100 Subject: [PATCH 13/24] Add back the deprecated metrics --- apollo-router/src/plugins/authentication/mod.rs | 12 ++++++++++++ apollo-router/src/plugins/authorization/mod.rs | 2 +- apollo-router/src/services/subgraph_service.rs | 12 ++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/apollo-router/src/plugins/authentication/mod.rs b/apollo-router/src/plugins/authentication/mod.rs index bf3889adf2..0239e1f005 100644 --- a/apollo-router/src/plugins/authentication/mod.rs +++ b/apollo-router/src/plugins/authentication/mod.rs @@ -547,6 +547,12 @@ fn authenticate( status: StatusCode, ) -> ControlFlow { // This is a metric and will not appear in the logs + u64_counter!( + "apollo_authentication_failure_count", + "Number of requests with failed JWT authentication (deprecated)", + 1, + kind = "JWT" + ); u64_counter!( "apollo.router.operations.authentication.jwt", "Number of requests with JWT authentication", @@ -653,6 +659,12 @@ fn authenticate( ); } // This is a metric and will not appear in the logs + u64_counter!( + "apollo_authentication_success_count", + "Number of requests with successful JWT authentication (deprecated)", + 1, + kind = "JWT" + ); u64_counter!( "apollo.router.operations.jwt", "Number of requests with JWT authentication", diff --git a/apollo-router/src/plugins/authorization/mod.rs b/apollo-router/src/plugins/authorization/mod.rs index 4da5c4ef99..e4c5a77d86 100644 --- a/apollo-router/src/plugins/authorization/mod.rs +++ b/apollo-router/src/plugins/authorization/mod.rs @@ -558,7 +558,7 @@ impl Plugin for AuthorizationPlugin { // This is a metric and will not appear in the logs u64_counter!( "apollo_require_authentication_failure_count", - "", + "Number of unauthenticated requests (deprecated)", 1 ); tracing::error!("rejecting unauthenticated request"); diff --git a/apollo-router/src/services/subgraph_service.rs b/apollo-router/src/services/subgraph_service.rs index 09e25090f0..dc93499143 100644 --- a/apollo-router/src/services/subgraph_service.rs +++ b/apollo-router/src/services/subgraph_service.rs @@ -309,6 +309,12 @@ impl tower::Service for SubgraphService { subgraph.service.name = service_name.clone() ); if !created { + u64_counter!( + "apollo_router_deduplicated_subscriptions_total", + "Total deduplicated subscription requests (deprecated)", + 1, + mode = "callback" + ); // Dedup happens here return Ok(SubgraphResponse::builder() .subgraph_name(service_name.clone()) @@ -517,6 +523,12 @@ async fn call_websocket( subscription_stream_tx .send(Box::pin(handle.into_stream())) .await?; + u64_counter!( + "apollo_router_deduplicated_subscriptions_total", + "Total deduplicated subscription requests (deprecated)", + 1, + mode = "passthrough" + ); // Dedup happens here return Ok(SubgraphResponse::builder() From fce826c471a882c39e85b6914e979ef18ec5e290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Wed, 27 Nov 2024 15:04:15 +0100 Subject: [PATCH 14/24] fmt --- apollo-router/src/notification.rs | 6 +++++- apollo-router/src/plugins/traffic_shaping/timeout/future.rs | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/apollo-router/src/notification.rs b/apollo-router/src/notification.rs index 1536a8a0de..667ee797ed 100644 --- a/apollo-router/src/notification.rs +++ b/apollo-router/src/notification.rs @@ -510,7 +510,11 @@ where match Pin::new(&mut this.msg_receiver).poll_next(cx) { Poll::Ready(Some(Err(BroadcastStreamRecvError::Lagged(_)))) => { - u64_counter!("apollo_router_skipped_event_count", "Amount of times the router missed subscription events", 1u64); + u64_counter!( + "apollo_router_skipped_event_count", + "Amount of times the router missed subscription events", + 1u64 + ); self.poll_next(cx) } Poll::Ready(None) => Poll::Ready(None), diff --git a/apollo-router/src/plugins/traffic_shaping/timeout/future.rs b/apollo-router/src/plugins/traffic_shaping/timeout/future.rs index 99e32e6767..eda4100198 100644 --- a/apollo-router/src/plugins/traffic_shaping/timeout/future.rs +++ b/apollo-router/src/plugins/traffic_shaping/timeout/future.rs @@ -49,7 +49,11 @@ where match Pin::new(&mut this.sleep).poll(cx) { Poll::Pending => Poll::Pending, Poll::Ready(_) => { - u64_counter!("apollo_router_timeout", "Number of timed out client requests", 1); + u64_counter!( + "apollo_router_timeout", + "Number of timed out client requests", + 1 + ); Poll::Ready(Err(Elapsed::new().into())) } } From 3c43714b642ad558237f289e018efa15c713b338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Thu, 28 Nov 2024 11:53:25 +0100 Subject: [PATCH 15/24] Simplify plugin_metrics --- apollo-router/src/plugins/telemetry/mod.rs | 56 +++++++++------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 1ac923654b..c76dae4aa2 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -1746,44 +1746,32 @@ impl Telemetry { } fn plugin_metrics(config: &Arc) { - let metrics_prom_used = config.exporters.metrics.prometheus.enabled; - let metrics_otlp_used = MetricsConfigurator::enabled(&config.exporters.metrics.otlp); - let tracing_otlp_used = TracingConfigurator::enabled(&config.exporters.tracing.otlp); - let tracing_datadog_used = config.exporters.tracing.datadog.enabled(); - let tracing_jaeger_used = config.exporters.tracing.jaeger.enabled(); - let tracing_zipkin_used = config.exporters.tracing.zipkin.enabled(); - - if metrics_prom_used - || metrics_otlp_used - || tracing_jaeger_used - || tracing_otlp_used - || tracing_zipkin_used - || tracing_datadog_used - { - let mut attributes = Vec::new(); - if metrics_otlp_used { - attributes.push(KeyValue::new("telemetry.metrics.otlp", true)); - } - if metrics_prom_used { - attributes.push(KeyValue::new("telemetry.metrics.prometheus", true)); - } - if tracing_otlp_used { - attributes.push(KeyValue::new("telemetry.tracing.otlp", true)); - } - if tracing_datadog_used { - attributes.push(KeyValue::new("telemetry.tracing.datadog", true)); - } - if tracing_jaeger_used { - attributes.push(KeyValue::new("telemetry.tracing.jaeger", true)); - } - if tracing_zipkin_used { - attributes.push(KeyValue::new("telemetry.tracing.zipkin", true)); - } + let mut attributes = Vec::new(); + if MetricsConfigurator::enabled(&config.exporters.metrics.otlp) { + attributes.push(KeyValue::new("telemetry.metrics.otlp", true)); + } + if config.exporters.metrics.prometheus.enabled { + attributes.push(KeyValue::new("telemetry.metrics.prometheus", true)); + } + if TracingConfigurator::enabled(&config.exporters.tracing.otlp) { + attributes.push(KeyValue::new("telemetry.tracing.otlp", true)); + } + if config.exporters.tracing.datadog.enabled() { + attributes.push(KeyValue::new("telemetry.tracing.datadog", true)); + } + if config.exporters.tracing.jaeger.enabled() { + attributes.push(KeyValue::new("telemetry.tracing.jaeger", true)); + } + if config.exporters.tracing.zipkin.enabled() { + attributes.push(KeyValue::new("telemetry.tracing.zipkin", true)); + } + + if !attributes.is_empty() { u64_counter!( "apollo.router.operations.telemetry", "Telemetry systems in use", 1, - &attributes + attributes ); } } From fd39d9b973773e5cf3f23529d442223aa55b31b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e?= Date: Thu, 28 Nov 2024 15:23:36 +0100 Subject: [PATCH 16/24] Tweak `apollo.router.operations.telemetry` description Co-authored-by: Coenen Benjamin --- apollo-router/src/plugins/telemetry/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index c76dae4aa2..5ffbb2e5f6 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -1769,7 +1769,7 @@ impl Telemetry { if !attributes.is_empty() { u64_counter!( "apollo.router.operations.telemetry", - "Telemetry systems in use", + "Telemetry exporters enabled", 1, attributes ); From 0d6e222c88525c5b9d1b272ab966d67b5c80469f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Thu, 28 Nov 2024 15:36:05 +0100 Subject: [PATCH 17/24] docs: list the newly deprecated metrics --- .../telemetry/instrumentation/standard-instruments.mdx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx index da9b655036..1cfe504680 100644 --- a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx @@ -125,4 +125,8 @@ The initial call to Uplink during router startup is not reflected in metrics. The following metrics have been deprecated and should not be used. -- `apollo_router_span` - **Deprecated**—use `apollo_router_processing_time` instead. +- `apollo_router_span` - **Deprecated**: use `apollo_router_processing_time` instead. +- `apollo_router_deduplicated_subscriptions_total` - **Deprecated**: use the `apollo.router.operations.subscriptions` metric's `subscriptions.deduplicated` attribute. +- `apollo_authentication_failure_count` - **Deprecated**: use the `apollo.router.operations.authentication.jwt` metric's `authentication.jwt.failed` attribute. +- `apollo_authentication_success_count` - **Deprecated**: use the `apollo.router.operations.authentication.jwt` metric instead. If the `authentication.jwt.failed` attribute is *absent* or `false`, the authentication succeeded. +- `apollo_require_authentication_failure_count` - **Deprecated**: TODO @goto-bus-stop: no replacement? From 01ebdef7b8c55dd42a4caaa9dd11c3a2f6800418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Thu, 28 Nov 2024 15:43:39 +0100 Subject: [PATCH 18/24] Add a description for subgraph SigV4 signing --- apollo-router/src/plugins/authentication/subgraph.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apollo-router/src/plugins/authentication/subgraph.rs b/apollo-router/src/plugins/authentication/subgraph.rs index fb43467d83..4a0bcd4d65 100644 --- a/apollo-router/src/plugins/authentication/subgraph.rs +++ b/apollo-router/src/plugins/authentication/subgraph.rs @@ -411,7 +411,7 @@ impl SigningParamsConfig { fn increment_success_counter(subgraph_name: &str) { u64_counter!( "apollo.router.operations.authentication.aws.sigv4", - "", + "Number of subgraph requests signed with AWS SigV4", 1, authentication.aws.sigv4.failed = false, subgraph.service.name = subgraph_name.to_string() @@ -420,7 +420,7 @@ fn increment_success_counter(subgraph_name: &str) { fn increment_failure_counter(subgraph_name: &str) { u64_counter!( "apollo.router.operations.authentication.aws.sigv4", - "", + "Number of subgraph requests signed with AWS SigV4", 1, authentication.aws.sigv4.failed = true, subgraph.service.name = subgraph_name.to_string() From aca60241dae63773e4095fd91aeb878833855aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Thu, 28 Nov 2024 15:45:05 +0100 Subject: [PATCH 19/24] Add apollo.router.operations.authorization description --- apollo-router/src/plugins/authorization/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apollo-router/src/plugins/authorization/mod.rs b/apollo-router/src/plugins/authorization/mod.rs index e4c5a77d86..63b2062a05 100644 --- a/apollo-router/src/plugins/authorization/mod.rs +++ b/apollo-router/src/plugins/authorization/mod.rs @@ -592,7 +592,7 @@ impl Plugin for AuthorizationPlugin { if needs_authenticated || needs_requires_scopes { u64_counter!( "apollo.router.operations.authorization", - "", + "Number of subgraph requests requiring authorization", 1, authorization.filtered = filtered, authorization.needs_authenticated = needs_authenticated, From cdb79407da4f671db688cb8edc052d451697aa2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e?= Date: Thu, 28 Nov 2024 16:40:15 +0100 Subject: [PATCH 20/24] Tweak `apollo_router_skipped_event_count` description --- apollo-router/src/notification.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apollo-router/src/notification.rs b/apollo-router/src/notification.rs index 667ee797ed..ac1f33645d 100644 --- a/apollo-router/src/notification.rs +++ b/apollo-router/src/notification.rs @@ -512,7 +512,7 @@ where Poll::Ready(Some(Err(BroadcastStreamRecvError::Lagged(_)))) => { u64_counter!( "apollo_router_skipped_event_count", - "Amount of times the router missed subscription events", + "Amount of events dropped from the internal message queue", 1u64 ); self.poll_next(cx) From 54b62dd56d98490b0fca24d68a2faedbcb967bf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Fri, 29 Nov 2024 09:57:42 +0100 Subject: [PATCH 21/24] docs: add migration instructions for metrics deprecated in favour of custom instrumentation --- .../instrumentation/standard-instruments.mdx | 81 ++++++++++++++++++- 1 file changed, 77 insertions(+), 4 deletions(-) diff --git a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx index 1cfe504680..f50cbbe638 100644 --- a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx @@ -16,14 +16,11 @@ These instruments can be consumed by configuring a [metrics exporter](/router/co - `apollo_router_http_request_duration_seconds_bucket` - HTTP subgraph request duration, attributes: - `subgraph`: (Optional) The subgraph being queried - `apollo_router_http_requests_total` - Total number of HTTP requests by HTTP status -- `apollo_router_timeout` - Number of triggered timeouts - `apollo_router_http_request_retry_total` - Number of subgraph requests retried, attributes: - `subgraph`: The subgraph being queried - `status` : If the retry was aborted (`aborted`) -### GraphQL - -- `apollo_router_graphql_error` - counts GraphQL errors in responses, attributes: +- `apollo.router.graphql_error` - counts GraphQL errors in responses, attributes: - `code`: error code ### Session @@ -130,3 +127,79 @@ The following metrics have been deprecated and should not be used. - `apollo_authentication_failure_count` - **Deprecated**: use the `apollo.router.operations.authentication.jwt` metric's `authentication.jwt.failed` attribute. - `apollo_authentication_success_count` - **Deprecated**: use the `apollo.router.operations.authentication.jwt` metric instead. If the `authentication.jwt.failed` attribute is *absent* or `false`, the authentication succeeded. - `apollo_require_authentication_failure_count` - **Deprecated**: TODO @goto-bus-stop: no replacement? +- `apollo_router_timeout` - **Deprecated**: this metric conflates timed-out requests from client to the router, and requests from the router to subgraphs. See [migration instructions](#migrating-from-apollo_router_timeout). +- `apollo.router.graphql_error`- **Deprecated**: this metric can significantly overcount errors and is not correlated with particular requests. See [migration instructions](#migrating-from-apollo-router-graphql_error). + +#### Migrating from `apollo_router_timeout` + +Reporting timed-out requests can be done with a [custom instrumentation] attribute. This will report router request timeouts and subgraph request timeouts separately. + +```yaml title="router.yaml" +telemetry: + instrumentation: + instruments: + router: + http.server.request.duration: + # Adding response status code from the router + attributes: + # If status code == 504 then the router HTTP request timed out. + http.response.status_code: true + subgraph: + # Adding subgraph name & response status code from the subgraph + http.client.request.duration: + attributes: + subgraph.name: true + # If status code == 504 then the subgraph HTTP request timed out. + http.response.status_code: + subgraph_response_status: code +``` + +#### Migrating from `apollo_router_graphql_error` + +The `apollo.router.graphql_error` counts all individual GraphQL errors in router responses, which can be problematic. For example, a single invalid GraphQL document may have many validation errors, and these would all be counted individually. The metric could report dozens of validation errors, that actually all originate from one request. + +There is no exact equivalent for this deprecated metric. Instead, use a [custom instrumentation] attribute and pick a strategy for how you'd like the errors to be reported. An instrument at the `supergraph` stage can inspect JSON error responses using the `response_errors` selector. + +This example reports only the first error code for each error response: + +```yaml title="router.yaml" +telemetry: + instrumentation: + instruments: + supergraph: + supergraph.response.errors: + type: counter + value: event_unit + description: Number of requests with GraphQL errors + unit: req + attributes: + graphql.operation.name: true + errors.code: + response_errors: $[0].extensions.code + condition: + exists: + response_errors: $.*.extensions.code +``` + +This example reports all error codes in each error response as an array attribute: + +```yaml title="router.yaml" +telemetry: + instrumentation: + instruments: + supergraph: + supergraph.response.errors: + type: counter + value: event_unit + description: Number of requests with GraphQL errors + unit: req + attributes: + graphql.operation.name: true + errors.codes: + response_errors: $.*.extensions.code + condition: + exists: + response_errors: $.*.extensions.code +``` + +[custom instrumentation]: https://www.apollographql.com/docs/graphos/reference/router/telemetry/instrumentation/instruments#custom-instruments From 7e025844bc56dc70d6d3b063255e6b85288ba38d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Fri, 29 Nov 2024 09:59:53 +0100 Subject: [PATCH 22/24] Docs-deprecate `apollo.router.graphql_error` metric --- apollo-router/src/services/router/service.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apollo-router/src/services/router/service.rs b/apollo-router/src/services/router/service.rs index acc67e332d..05f5c5c289 100644 --- a/apollo-router/src/services/router/service.rs +++ b/apollo-router/src/services/router/service.rs @@ -380,7 +380,7 @@ impl RouterService { } else { u64_counter!( "apollo.router.graphql_error", - "Number of GraphQL error responses returned by the router", + "Number of GraphQL error responses returned by the router (deprecated)", 1, code = "INVALID_ACCEPT_HEADER" ); @@ -803,14 +803,14 @@ impl RouterService { None => { u64_counter!( "apollo.router.graphql_error", - "Number of GraphQL error responses returned by the router", + "Number of GraphQL error responses returned by the router (deprecated)", count ); } Some(code) => { u64_counter!( "apollo.router.graphql_error", - "Number of GraphQL error responses returned by the router", + "Number of GraphQL error responses returned by the router (deprecated)", count, code = code.to_string() ); From 35a24bbdd0be2370fb26c88f5db9928912136083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Fri, 29 Nov 2024 10:21:57 +0100 Subject: [PATCH 23/24] Revert "Docs-deprecate `apollo.router.graphql_error` metric" This reverts commit 7e025844bc56dc70d6d3b063255e6b85288ba38d. --- apollo-router/src/services/router/service.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apollo-router/src/services/router/service.rs b/apollo-router/src/services/router/service.rs index 05f5c5c289..acc67e332d 100644 --- a/apollo-router/src/services/router/service.rs +++ b/apollo-router/src/services/router/service.rs @@ -380,7 +380,7 @@ impl RouterService { } else { u64_counter!( "apollo.router.graphql_error", - "Number of GraphQL error responses returned by the router (deprecated)", + "Number of GraphQL error responses returned by the router", 1, code = "INVALID_ACCEPT_HEADER" ); @@ -803,14 +803,14 @@ impl RouterService { None => { u64_counter!( "apollo.router.graphql_error", - "Number of GraphQL error responses returned by the router (deprecated)", + "Number of GraphQL error responses returned by the router", count ); } Some(code) => { u64_counter!( "apollo.router.graphql_error", - "Number of GraphQL error responses returned by the router (deprecated)", + "Number of GraphQL error responses returned by the router", count, code = code.to_string() ); From 947cb6dd100d622e3295d0c5281380cf3c56b9f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Fri, 29 Nov 2024 10:22:10 +0100 Subject: [PATCH 24/24] Update deprecated metrics docs --- .../instrumentation/standard-instruments.mdx | 77 +------------------ 1 file changed, 1 insertion(+), 76 deletions(-) diff --git a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx index f50cbbe638..88e30ad096 100644 --- a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx @@ -127,79 +127,4 @@ The following metrics have been deprecated and should not be used. - `apollo_authentication_failure_count` - **Deprecated**: use the `apollo.router.operations.authentication.jwt` metric's `authentication.jwt.failed` attribute. - `apollo_authentication_success_count` - **Deprecated**: use the `apollo.router.operations.authentication.jwt` metric instead. If the `authentication.jwt.failed` attribute is *absent* or `false`, the authentication succeeded. - `apollo_require_authentication_failure_count` - **Deprecated**: TODO @goto-bus-stop: no replacement? -- `apollo_router_timeout` - **Deprecated**: this metric conflates timed-out requests from client to the router, and requests from the router to subgraphs. See [migration instructions](#migrating-from-apollo_router_timeout). -- `apollo.router.graphql_error`- **Deprecated**: this metric can significantly overcount errors and is not correlated with particular requests. See [migration instructions](#migrating-from-apollo-router-graphql_error). - -#### Migrating from `apollo_router_timeout` - -Reporting timed-out requests can be done with a [custom instrumentation] attribute. This will report router request timeouts and subgraph request timeouts separately. - -```yaml title="router.yaml" -telemetry: - instrumentation: - instruments: - router: - http.server.request.duration: - # Adding response status code from the router - attributes: - # If status code == 504 then the router HTTP request timed out. - http.response.status_code: true - subgraph: - # Adding subgraph name & response status code from the subgraph - http.client.request.duration: - attributes: - subgraph.name: true - # If status code == 504 then the subgraph HTTP request timed out. - http.response.status_code: - subgraph_response_status: code -``` - -#### Migrating from `apollo_router_graphql_error` - -The `apollo.router.graphql_error` counts all individual GraphQL errors in router responses, which can be problematic. For example, a single invalid GraphQL document may have many validation errors, and these would all be counted individually. The metric could report dozens of validation errors, that actually all originate from one request. - -There is no exact equivalent for this deprecated metric. Instead, use a [custom instrumentation] attribute and pick a strategy for how you'd like the errors to be reported. An instrument at the `supergraph` stage can inspect JSON error responses using the `response_errors` selector. - -This example reports only the first error code for each error response: - -```yaml title="router.yaml" -telemetry: - instrumentation: - instruments: - supergraph: - supergraph.response.errors: - type: counter - value: event_unit - description: Number of requests with GraphQL errors - unit: req - attributes: - graphql.operation.name: true - errors.code: - response_errors: $[0].extensions.code - condition: - exists: - response_errors: $.*.extensions.code -``` - -This example reports all error codes in each error response as an array attribute: - -```yaml title="router.yaml" -telemetry: - instrumentation: - instruments: - supergraph: - supergraph.response.errors: - type: counter - value: event_unit - description: Number of requests with GraphQL errors - unit: req - attributes: - graphql.operation.name: true - errors.codes: - response_errors: $.*.extensions.code - condition: - exists: - response_errors: $.*.extensions.code -``` - -[custom instrumentation]: https://www.apollographql.com/docs/graphos/reference/router/telemetry/instrumentation/instruments#custom-instruments +- `apollo_router_timeout` - **Deprecated**: this metric conflates timed-out requests from client to the router, and requests from the router to subgraphs. Timed-out requests have HTTP status code 504. Use the `http.response.status_code` attribute on the `http.server.request.duration` metric to identify timed-out router requests, and on the `http.client.request.duration` metric to identify timed-out subgraph requests.