Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate monotonic counter metrics to u64_counter! #6350

Open
wants to merge 20 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
304919a
Use metrics macros in persisted queries layer
goto-bus-stop Nov 25, 2024
30fc35e
Use new metrics macro for counters in subgraph service
goto-bus-stop Nov 25, 2024
148d9c4
Use new metrics macro for counters in graphql::Request
goto-bus-stop Nov 25, 2024
be452c9
Use new metrics macro for counting defer requests and responses
goto-bus-stop Nov 25, 2024
7f744e9
Use new metrics macro for counting uplink fetches
goto-bus-stop Nov 25, 2024
eb3a8f7
Use new metrics macro for counting cache hits
goto-bus-stop Nov 25, 2024
a93408d
Use new metrics macro for counting authentication uses
goto-bus-stop Nov 25, 2024
f3bc20b
Use new metrics macros in timeout layer
goto-bus-stop Nov 27, 2024
9130dbc
Use new metrics macros for reporting graphql error counts
goto-bus-stop Nov 27, 2024
551170f
Use new metrics macros for reporting subscription events
goto-bus-stop Nov 27, 2024
b5497c6
Use new metrics macros for reporting state changes
goto-bus-stop Nov 27, 2024
69e71b1
Use new metrics macros for reporting which telemetry system is enabled
goto-bus-stop Nov 27, 2024
4858d24
Add back the deprecated metrics
goto-bus-stop Nov 27, 2024
fce826c
fmt
goto-bus-stop Nov 27, 2024
3c43714
Simplify plugin_metrics
goto-bus-stop Nov 28, 2024
fd39d9b
Tweak `apollo.router.operations.telemetry` description
goto-bus-stop Nov 28, 2024
0d6e222
docs: list the newly deprecated metrics
goto-bus-stop Nov 28, 2024
01ebdef
Add a description for subgraph SigV4 signing
goto-bus-stop Nov 28, 2024
aca6024
Add apollo.router.operations.authorization description
goto-bus-stop Nov 28, 2024
cdb7940
Tweak `apollo_router_skipped_event_count` description
goto-bus-stop Nov 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 24 additions & 16 deletions apollo-router/src/cache/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,12 @@ where

match res {
Some(v) => {
tracing::info!(
monotonic_counter.apollo_router_cache_hit_count = 1u64,
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Memory),
u64_counter!(
"apollo_router_cache_hit_count",
"Number of cache hits",
1,
kind = self.caller,
storage = CacheStorageName::Memory.to_string()
);
let duration = instant_memory.elapsed().as_secs_f64();
tracing::info!(
Expand All @@ -190,10 +192,12 @@ where
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Memory),
);
tracing::info!(
monotonic_counter.apollo_router_cache_miss_count = 1u64,
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Memory),
u64_counter!(
"apollo_router_cache_miss_count",
"Number of cache misses",
1,
kind = self.caller,
storage = CacheStorageName::Memory.to_string()
);

let instant_redis = Instant::now();
Expand All @@ -214,10 +218,12 @@ where
Some(v) => {
self.insert_in_memory(key.clone(), v.0.clone()).await;

tracing::info!(
monotonic_counter.apollo_router_cache_hit_count = 1u64,
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Redis),
u64_counter!(
"apollo_router_cache_hit_count",
"Number of cache hits",
1,
kind = self.caller,
storage = CacheStorageName::Redis.to_string()
);
let duration = instant_redis.elapsed().as_secs_f64();
tracing::info!(
Expand All @@ -228,10 +234,12 @@ where
Some(v.0)
}
None => {
tracing::info!(
monotonic_counter.apollo_router_cache_miss_count = 1u64,
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Redis),
u64_counter!(
"apollo_router_cache_miss_count",
"Number of cache misses",
1,
kind = self.caller,
storage = CacheStorageName::Redis.to_string()
);
let duration = instant_redis.elapsed().as_secs_f64();
tracing::info!(
Expand Down
16 changes: 10 additions & 6 deletions apollo-router/src/graphql/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,11 @@ impl Request {
mode = %BatchingMode::BatchHttpLink // Only supported mode right now
);

tracing::info!(
monotonic_counter.apollo.router.operations.batching = 1u64,
mode = %BatchingMode::BatchHttpLink // Only supported mode right now
u64_counter!(
"apollo.router.operations.batching",
"Total requests with batched operations",
1,
mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now
);
for entry in value
.as_array()
Expand All @@ -229,9 +231,11 @@ impl Request {
mode = "batch_http_link" // Only supported mode right now
);

tracing::info!(
monotonic_counter.apollo.router.operations.batching = 1u64,
mode = "batch_http_link" // Only supported mode right now
u64_counter!(
"apollo.router.operations.batching",
"Total requests with batched operations",
1,
mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now
);
for entry in value
.as_array()
Expand Down
31 changes: 30 additions & 1 deletion apollo-router/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,9 @@ pub(crate) fn meter_provider() -> AggregateMeterProvider {
}

#[macro_export]
/// Get or create a u64 monotonic counter metric and add a value to it
/// Get or create a `u64` monotonic counter metric and add a value to it.
///
/// Each metric needs a description.
///
/// This macro is a replacement for the telemetry crate's MetricsLayer. We will eventually convert all metrics to use these macros and deprecate the MetricsLayer.
/// The reason for this is that the MetricsLayer has:
Expand All @@ -506,6 +508,33 @@ pub(crate) fn meter_provider() -> AggregateMeterProvider {
/// * Imperfect mapping to metrics API that can only be checked at runtime.
///
/// New metrics should be added using these macros.
///
/// # Examples
/// ```ignore
/// // Count a thing:
/// u64_counter!(
/// "apollo.router.operations.frobbles",
/// "The amount of frobbles we've operated on",
/// 1
/// );
/// // Count a thing with attributes:
/// u64_counter!(
/// "apollo.router.operations.frobbles",
/// "The amount of frobbles we've operated on",
/// 1,
/// frobbles.color = "blue"
/// );
/// // Count a thing with dynamic attributes:
/// let attributes = [
/// opentelemetry::KeyValue::new("frobbles.color".to_string(), "blue".into()),
/// ];
/// u64_counter!(
/// "apollo.router.operations.frobbles",
/// "The amount of frobbles we've operated on",
/// 1,
/// attributes
/// );
/// ```
#[allow(unused_macros)]
macro_rules! u64_counter {
($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => {
Expand Down
6 changes: 5 additions & 1 deletion apollo-router/src/notification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,11 @@ where

match Pin::new(&mut this.msg_receiver).poll_next(cx) {
Poll::Ready(Some(Err(BroadcastStreamRecvError::Lagged(_)))) => {
tracing::info!(monotonic_counter.apollo_router_skipped_event_count = 1u64,);
u64_counter!(
"apollo_router_skipped_event_count",
"Amount of events dropped from the internal message queue",
1u64
);
self.poll_next(cx)
}
Poll::Ready(None) => Poll::Ready(None),
Expand Down
35 changes: 19 additions & 16 deletions apollo-router/src/plugins/authentication/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -539,8 +539,6 @@ fn authenticate(
jwks_manager: &JwksManager,
request: router::Request,
) -> ControlFlow<router::Response, router::Request> {
const AUTHENTICATION_KIND: &str = "JWT";

// We are going to do a lot of similar checking so let's define a local function
// to help reduce repetition
fn failure_message(
Expand All @@ -549,17 +547,16 @@ fn authenticate(
status: StatusCode,
) -> ControlFlow<router::Response, router::Request> {
// This is a metric and will not appear in the logs
tracing::info!(
monotonic_counter.apollo_authentication_failure_count = 1u64,
kind = %AUTHENTICATION_KIND
u64_counter!(
"apollo_authentication_failure_count",
"Number of requests with failed JWT authentication (deprecated)",
1,
kind = "JWT"
);
tracing::info!(
monotonic_counter
.apollo
.router
.operations
.authentication
.jwt = 1,
u64_counter!(
"apollo.router.operations.authentication.jwt",
"Number of requests with JWT authentication",
1,
authentication.jwt.failed = true
);
tracing::info!(message = %error, "jwt authentication failure");
Expand Down Expand Up @@ -662,11 +659,17 @@ fn authenticate(
);
}
// This is a metric and will not appear in the logs
tracing::info!(
monotonic_counter.apollo_authentication_success_count = 1u64,
kind = %AUTHENTICATION_KIND
u64_counter!(
"apollo_authentication_success_count",
"Number of requests with successful JWT authentication (deprecated)",
1,
kind = "JWT"
);
u64_counter!(
"apollo.router.operations.jwt",
"Number of requests with JWT authentication",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if we should not add authentication.jwt.failed = false in the future (for 2.0) to be consistent with what sigv4 is doing for example

1
);
tracing::info!(monotonic_counter.apollo.router.operations.jwt = 1u64);
return ControlFlow::Continue(request);
}

Expand Down
16 changes: 10 additions & 6 deletions apollo-router/src/plugins/authentication/subgraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,17 +409,21 @@ impl SigningParamsConfig {
}

fn increment_success_counter(subgraph_name: &str) {
tracing::info!(
monotonic_counter.apollo.router.operations.authentication.aws.sigv4 = 1u64,
u64_counter!(
"apollo.router.operations.authentication.aws.sigv4",
"Number of subgraph requests signed with AWS SigV4",
1,
authentication.aws.sigv4.failed = false,
subgraph.service.name = %subgraph_name,
subgraph.service.name = subgraph_name.to_string()
);
}
fn increment_failure_counter(subgraph_name: &str) {
tracing::info!(
monotonic_counter.apollo.router.operations.authentication.aws.sigv4 = 1u64,
u64_counter!(
"apollo.router.operations.authentication.aws.sigv4",
"Number of subgraph requests signed with AWS SigV4",
1,
authentication.aws.sigv4.failed = true,
subgraph.service.name = %subgraph_name,
subgraph.service.name = subgraph_name.to_string()
);
}

Expand Down
14 changes: 9 additions & 5 deletions apollo-router/src/plugins/authorization/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,10 @@ impl Plugin for AuthorizationPlugin {
Ok(ControlFlow::Continue(request))
} else {
// This is a metric and will not appear in the logs
tracing::info!(
monotonic_counter.apollo_require_authentication_failure_count = 1u64,
u64_counter!(
"apollo_require_authentication_failure_count",
"Number of unauthenticated requests (deprecated)",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I marked this as deprecated because the naming doesn't seem to follow convention, but I don't know if we have a proper alternative for this? We have apollo.router.operations.authorization below but I'm not sure it reports the same thing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's the right thing to do. cc @BrynCooke you probably have more context

1
);
tracing::error!("rejecting unauthenticated request");
let response = supergraph::Response::error_builder()
Expand Down Expand Up @@ -588,11 +590,13 @@ impl Plugin for AuthorizationPlugin {
let needs_requires_scopes = request.context.contains_key(REQUIRED_SCOPES_KEY);

if needs_authenticated || needs_requires_scopes {
tracing::info!(
monotonic_counter.apollo.router.operations.authorization = 1u64,
u64_counter!(
"apollo.router.operations.authorization",
"Number of subgraph requests requiring authorization",
1,
authorization.filtered = filtered,
authorization.needs_authenticated = needs_authenticated,
authorization.needs_requires_scopes = needs_requires_scopes,
authorization.needs_requires_scopes = needs_requires_scopes
);
}

Expand Down
20 changes: 12 additions & 8 deletions apollo-router/src/plugins/subscription.rs
Original file line number Diff line number Diff line change
Expand Up @@ -503,10 +503,12 @@ impl Service<router::Request> for CallbackService {
};
// Keep the subscription to the client opened
payload.subscribed = Some(true);
tracing::info!(
monotonic_counter.apollo.router.operations.subscriptions.events = 1u64,
subscriptions.mode="callback"
);
u64_counter!(
"apollo.router.operations.subscriptions.events",
"Number of subscription events",
1,
subscriptions.mode = "callback"
);
handle.send_sync(payload)?;

Ok(router::Response {
Expand Down Expand Up @@ -626,10 +628,12 @@ impl Service<router::Request> for CallbackService {
});
}
};
tracing::info!(
monotonic_counter.apollo.router.operations.subscriptions.events = 1u64,
subscriptions.mode="callback",
subscriptions.complete=true
u64_counter!(
"apollo.router.operations.subscriptions.events",
"Number of subscription events",
1,
subscriptions.mode = "callback",
subscriptions.complete = true
);
if let Err(_err) = handle.send_sync(
graphql::Response::builder().errors(errors).build(),
Expand Down
49 changes: 26 additions & 23 deletions apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ use crate::plugins::telemetry::reload::OPENTELEMETRY_TRACER_HANDLE;
use crate::plugins::telemetry::tracing::apollo_telemetry::decode_ftv1_trace;
use crate::plugins::telemetry::tracing::apollo_telemetry::APOLLO_PRIVATE_OPERATION_SIGNATURE;
use crate::plugins::telemetry::tracing::TracingConfigurator;
use crate::plugins::telemetry::utils::TracingUtils;
use crate::query_planner::OperationKind;
use crate::register_plugin;
use crate::router_factory::Endpoint;
Expand Down Expand Up @@ -1747,28 +1746,32 @@ impl Telemetry {
}

fn plugin_metrics(config: &Arc<Conf>) {
let metrics_prom_used = config.exporters.metrics.prometheus.enabled;
let metrics_otlp_used = MetricsConfigurator::enabled(&config.exporters.metrics.otlp);
let tracing_otlp_used = TracingConfigurator::enabled(&config.exporters.tracing.otlp);
let tracing_datadog_used = config.exporters.tracing.datadog.enabled();
let tracing_jaeger_used = config.exporters.tracing.jaeger.enabled();
let tracing_zipkin_used = config.exporters.tracing.zipkin.enabled();

if metrics_prom_used
|| metrics_otlp_used
|| tracing_jaeger_used
|| tracing_otlp_used
|| tracing_zipkin_used
|| tracing_datadog_used
{
::tracing::info!(
monotonic_counter.apollo.router.operations.telemetry = 1u64,
telemetry.metrics.otlp = metrics_otlp_used.or_empty(),
telemetry.metrics.prometheus = metrics_prom_used.or_empty(),
telemetry.tracing.otlp = tracing_otlp_used.or_empty(),
telemetry.tracing.datadog = tracing_datadog_used.or_empty(),
telemetry.tracing.jaeger = tracing_jaeger_used.or_empty(),
telemetry.tracing.zipkin = tracing_zipkin_used.or_empty(),
let mut attributes = Vec::new();
if MetricsConfigurator::enabled(&config.exporters.metrics.otlp) {
attributes.push(KeyValue::new("telemetry.metrics.otlp", true));
}
if config.exporters.metrics.prometheus.enabled {
attributes.push(KeyValue::new("telemetry.metrics.prometheus", true));
}
if TracingConfigurator::enabled(&config.exporters.tracing.otlp) {
attributes.push(KeyValue::new("telemetry.tracing.otlp", true));
}
if config.exporters.tracing.datadog.enabled() {
attributes.push(KeyValue::new("telemetry.tracing.datadog", true));
}
if config.exporters.tracing.jaeger.enabled() {
attributes.push(KeyValue::new("telemetry.tracing.jaeger", true));
}
if config.exporters.tracing.zipkin.enabled() {
attributes.push(KeyValue::new("telemetry.tracing.zipkin", true));
}

if !attributes.is_empty() {
u64_counter!(
"apollo.router.operations.telemetry",
"Telemetry exporters enabled",
1,
attributes
);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,11 @@ where
match Pin::new(&mut this.sleep).poll(cx) {
Poll::Pending => Poll::Pending,
Poll::Ready(_) => {
tracing::info!(monotonic_counter.apollo_router_timeout = 1u64,);
u64_counter!(
"apollo_router_timeout",
"Number of timed out client requests",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can mark it as deprecated

Copy link
Member Author

@goto-bus-stop goto-bus-stop Nov 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The alternative is to use a custom instrument, right? So that I can document it in the deprecated metrics section

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

telemetry:
  instrumentation:
    instruments:
      router:
        http.server.request.duration:
          # Adding subgraph name, response status code from the router and the operation name
          attributes:
            http.response.status_code: true # If status code == 504 then it's a timeout at the router http request level
            graphql.operation.name:
              operation_name: string
      subgraph:
        # Adding subgraph name, response status code from the subgraph and original operation name from the supergraph
        http.client.request.duration:
          attributes:
            subgraph.name: true
            http.response.status_code: # If status code == 504 then it's a timeout at the subgraph http request level
              subgraph_response_status: code

1
);
Poll::Ready(Err(Elapsed::new().into()))
}
}
Expand Down
Loading