Skip to content

Commit b936580

Browse files
lqiu96blakeli0
andauthored
feat: Introduce OpenTelemetry Metrics Recording (#2500)
Builds off #2433, based on the design in go/java-gapic-otel-metrics-design. Discovered two issues via showcase tests: 1. #2502 2. #2503 These issues are not blocking for this PR. --------- Co-authored-by: Blake Li <[email protected]>
1 parent ff56a20 commit b936580

File tree

15 files changed

+1315
-69
lines changed

15 files changed

+1315
-69
lines changed

gapic-generator-java-bom/pom.xml

+7
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,13 @@
7070
<type>pom</type>
7171
<scope>import</scope>
7272
</dependency>
73+
<dependency>
74+
<groupId>io.opentelemetry</groupId>
75+
<artifactId>opentelemetry-bom</artifactId>
76+
<version>${opentelemetry.version}</version>
77+
<type>pom</type>
78+
<scope>import</scope>
79+
</dependency>
7380

7481
<!-- Libraries published from this repositories -->
7582
<dependency>

gapic-generator-java-pom-parent/pom.xml

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
<gson.version>2.10.1</gson.version>
3333
<guava.version>32.1.3-jre</guava.version>
3434
<protobuf.version>3.25.2</protobuf.version>
35+
<opentelemetry.version>1.35.0</opentelemetry.version>
3536
<maven.compiler.release>8</maven.compiler.release>
3637
</properties>
3738

gax-java/dependencies.properties

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ maven.com_google_api_grpc_proto_google_common_protos=com.google.api.grpc:proto-g
3939
maven.com_google_api_grpc_grpc_google_common_protos=com.google.api.grpc:grpc-google-common-protos:2.36.0
4040
maven.com_google_auth_google_auth_library_oauth2_http=com.google.auth:google-auth-library-oauth2-http:1.23.0
4141
maven.com_google_auth_google_auth_library_credentials=com.google.auth:google-auth-library-credentials:1.23.0
42+
maven.io_opentelemetry_opentelemetry_api=io.opentelemetry:opentelemetry-api:1.35.0
4243
maven.io_opencensus_opencensus_api=io.opencensus:opencensus-api:0.31.1
4344
maven.io_opencensus_opencensus_contrib_grpc_metrics=io.opencensus:opencensus-contrib-grpc-metrics:0.31.1
4445
maven.io_opencensus_opencensus_contrib_http_util=io.opencensus:opencensus-contrib-http-util:0.31.1

gax-java/gax/BUILD.bazel

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ _COMPILE_DEPS = [
1818
"@com_google_code_findbugs_jsr305//jar",
1919
"@com_google_errorprone_error_prone_annotations//jar",
2020
"@com_google_guava_guava//jar",
21+
"@io_opentelemetry_opentelemetry_api//jar",
2122
"@io_opencensus_opencensus_api//jar",
2223
"@io_opencensus_opencensus_contrib_http_util//jar",
2324
"@io_grpc_grpc_java//context:context",

gax-java/gax/pom.xml

+4
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@
5757
<artifactId>graal-sdk</artifactId>
5858
<scope>provided</scope>
5959
</dependency>
60+
<dependency>
61+
<groupId>io.opentelemetry</groupId>
62+
<artifactId>opentelemetry-api</artifactId>
63+
</dependency>
6064
</dependencies>
6165

6266
<build>

gax-java/gax/src/main/java/com/google/api/gax/tracing/MetricsTracer.java

+28-13
Original file line numberDiff line numberDiff line change
@@ -40,40 +40,50 @@
4040
import java.util.Map;
4141
import java.util.concurrent.CancellationException;
4242
import java.util.concurrent.TimeUnit;
43+
import java.util.concurrent.atomic.AtomicBoolean;
4344
import javax.annotation.Nullable;
4445
import org.threeten.bp.Duration;
4546

4647
/**
4748
* This class computes generic metrics that can be observed in the lifecycle of an RPC operation.
4849
* The responsibility of recording metrics should delegate to {@link MetricsRecorder}, hence this
4950
* class should not have any knowledge about the observability framework used for metrics recording.
51+
* method_name and language will be autopopulated attributes. Default value of language is 'Java'.
5052
*/
5153
@BetaApi
5254
@InternalApi
5355
public class MetricsTracer implements ApiTracer {
54-
55-
private static final String STATUS_ATTRIBUTE = "status";
56-
56+
public static final String METHOD_NAME_ATTRIBUTE = "method_name";
57+
public static final String LANGUAGE_ATTRIBUTE = "language";
58+
public static final String STATUS_ATTRIBUTE = "status";
59+
public static final String DEFAULT_LANGUAGE = "Java";
60+
private static final String OPERATION_FINISHED_STATUS_MESSAGE =
61+
"Operation has already been completed";
5762
private Stopwatch attemptTimer;
58-
5963
private final Stopwatch operationTimer = Stopwatch.createStarted();
60-
6164
private final Map<String, String> attributes = new HashMap<>();
62-
63-
private MetricsRecorder metricsRecorder;
65+
private final MetricsRecorder metricsRecorder;
66+
private final AtomicBoolean operationFinished;
6467

6568
public MetricsTracer(MethodName methodName, MetricsRecorder metricsRecorder) {
66-
this.attributes.put("method_name", methodName.toString());
69+
this.attributes.put(METHOD_NAME_ATTRIBUTE, methodName.toString());
70+
this.attributes.put(LANGUAGE_ATTRIBUTE, DEFAULT_LANGUAGE);
6771
this.metricsRecorder = metricsRecorder;
72+
this.operationFinished = new AtomicBoolean();
6873
}
6974

7075
/**
7176
* Signals that the overall operation has finished successfully. The tracer is now considered
7277
* closed and should no longer be used. Successful operation adds "OK" value to the status
7378
* attribute key.
79+
*
80+
* @throws IllegalStateException if an operation completion call has already been invoked
7481
*/
7582
@Override
7683
public void operationSucceeded() {
84+
if (operationFinished.getAndSet(true)) {
85+
throw new IllegalStateException(OPERATION_FINISHED_STATUS_MESSAGE);
86+
}
7787
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.OK.toString());
7888
metricsRecorder.recordOperationLatency(
7989
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
@@ -84,9 +94,14 @@ public void operationSucceeded() {
8494
* Signals that the operation was cancelled by the user. The tracer is now considered closed and
8595
* should no longer be used. Cancelled operation adds "CANCELLED" value to the status attribute
8696
* key.
97+
*
98+
* @throws IllegalStateException if an operation completion call has already been invoked
8799
*/
88100
@Override
89101
public void operationCancelled() {
102+
if (operationFinished.getAndSet(true)) {
103+
throw new IllegalStateException(OPERATION_FINISHED_STATUS_MESSAGE);
104+
}
90105
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.CANCELLED.toString());
91106
metricsRecorder.recordOperationLatency(
92107
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
@@ -97,9 +112,14 @@ public void operationCancelled() {
97112
* Signals that the operation was cancelled by the user. The tracer is now considered closed and
98113
* should no longer be used. Failed operation extracts the error from the throwable and adds it to
99114
* the status attribute key.
115+
*
116+
* @throws IllegalStateException if an operation completion call has already been invoked
100117
*/
101118
@Override
102119
public void operationFailed(Throwable error) {
120+
if (operationFinished.getAndSet(true)) {
121+
throw new IllegalStateException(OPERATION_FINISHED_STATUS_MESSAGE);
122+
}
103123
attributes.put(STATUS_ATTRIBUTE, extractStatus(error));
104124
metricsRecorder.recordOperationLatency(
105125
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
@@ -126,7 +146,6 @@ public void attemptStarted(Object request, int attemptNumber) {
126146
*/
127147
@Override
128148
public void attemptSucceeded() {
129-
130149
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.OK.toString());
131150
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
132151
metricsRecorder.recordAttemptCount(1, attributes);
@@ -138,7 +157,6 @@ public void attemptSucceeded() {
138157
*/
139158
@Override
140159
public void attemptCancelled() {
141-
142160
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.CANCELLED.toString());
143161
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
144162
metricsRecorder.recordAttemptCount(1, attributes);
@@ -154,7 +172,6 @@ public void attemptCancelled() {
154172
*/
155173
@Override
156174
public void attemptFailed(Throwable error, Duration delay) {
157-
158175
attributes.put(STATUS_ATTRIBUTE, extractStatus(error));
159176
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
160177
metricsRecorder.recordAttemptCount(1, attributes);
@@ -169,7 +186,6 @@ public void attemptFailed(Throwable error, Duration delay) {
169186
*/
170187
@Override
171188
public void attemptFailedRetriesExhausted(Throwable error) {
172-
173189
attributes.put(STATUS_ATTRIBUTE, extractStatus(error));
174190
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
175191
metricsRecorder.recordAttemptCount(1, attributes);
@@ -184,7 +200,6 @@ public void attemptFailedRetriesExhausted(Throwable error) {
184200
*/
185201
@Override
186202
public void attemptPermanentFailure(Throwable error) {
187-
188203
attributes.put(STATUS_ATTRIBUTE, extractStatus(error));
189204
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
190205
metricsRecorder.recordAttemptCount(1, attributes);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
/*
2+
* Copyright 2024 Google LLC
3+
*
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions are
6+
* met:
7+
*
8+
* * Redistributions of source code must retain the above copyright
9+
* notice, this list of conditions and the following disclaimer.
10+
* * Redistributions in binary form must reproduce the above
11+
* copyright notice, this list of conditions and the following disclaimer
12+
* in the documentation and/or other materials provided with the
13+
* distribution.
14+
* * Neither the name of Google LLC nor the names of its
15+
* contributors may be used to endorse or promote products derived from
16+
* this software without specific prior written permission.
17+
*
18+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21+
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22+
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23+
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24+
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
*/
30+
31+
package com.google.api.gax.tracing;
32+
33+
import com.google.api.core.BetaApi;
34+
import com.google.api.core.InternalApi;
35+
import com.google.api.gax.core.GaxProperties;
36+
import com.google.common.annotations.VisibleForTesting;
37+
import com.google.common.base.Preconditions;
38+
import io.opentelemetry.api.OpenTelemetry;
39+
import io.opentelemetry.api.common.Attributes;
40+
import io.opentelemetry.api.common.AttributesBuilder;
41+
import io.opentelemetry.api.metrics.DoubleHistogram;
42+
import io.opentelemetry.api.metrics.LongCounter;
43+
import io.opentelemetry.api.metrics.Meter;
44+
import java.util.Map;
45+
46+
/**
47+
* OpenTelemetry implementation of recording metrics. This implementation collections the
48+
* measurements related to the lifecyle of an RPC.
49+
*
50+
* <p>For the Otel implementation, an attempt is a single RPC invocation and an operation is the
51+
* collection of all the attempts made before a response is returned (either as a success or an
52+
* error). A single call (i.e. `EchoClient.echo()`) should have an operation_count of 1 and may have
53+
* an attempt_count of 1+ (depending on the retry configurations).
54+
*/
55+
@BetaApi
56+
@InternalApi
57+
public class OpenTelemetryMetricsRecorder implements MetricsRecorder {
58+
private final DoubleHistogram attemptLatencyRecorder;
59+
private final DoubleHistogram operationLatencyRecorder;
60+
private final LongCounter operationCountRecorder;
61+
private final LongCounter attemptCountRecorder;
62+
63+
/**
64+
* Creates the following instruments for the following metrics:
65+
*
66+
* <ul>
67+
* <li>Attempt Latency: Histogram
68+
* <li>Operation Latency: Histogram
69+
* <li>Attempt Count: Counter
70+
* <li>Operation Count: Counter
71+
* </ul>
72+
*
73+
* @param openTelemetry OpenTelemetry instance
74+
* @param serviceName Service Name
75+
*/
76+
public OpenTelemetryMetricsRecorder(OpenTelemetry openTelemetry, String serviceName) {
77+
Meter meter =
78+
openTelemetry
79+
.meterBuilder("gax-java")
80+
.setInstrumentationVersion(GaxProperties.getGaxVersion())
81+
.build();
82+
this.attemptLatencyRecorder =
83+
meter
84+
.histogramBuilder(serviceName + "/attempt_latency")
85+
.setDescription("Time an individual attempt took")
86+
.setUnit("ms")
87+
.build();
88+
this.operationLatencyRecorder =
89+
meter
90+
.histogramBuilder(serviceName + "/operation_latency")
91+
.setDescription(
92+
"Total time until final operation success or failure, including retries and backoff.")
93+
.setUnit("ms")
94+
.build();
95+
this.attemptCountRecorder =
96+
meter
97+
.counterBuilder(serviceName + "/attempt_count")
98+
.setDescription("Number of Attempts")
99+
.setUnit("1")
100+
.build();
101+
this.operationCountRecorder =
102+
meter
103+
.counterBuilder(serviceName + "/operation_count")
104+
.setDescription("Number of Operations")
105+
.setUnit("1")
106+
.build();
107+
}
108+
109+
/**
110+
* Record the latency for an individual attempt. Data is stored in a Histogram.
111+
*
112+
* @param attemptLatency Attempt Latency in ms
113+
* @param attributes Map of the attributes to store
114+
*/
115+
@Override
116+
public void recordAttemptLatency(double attemptLatency, Map<String, String> attributes) {
117+
attemptLatencyRecorder.record(attemptLatency, toOtelAttributes(attributes));
118+
}
119+
120+
/**
121+
* Record an attempt made. The attempt count number is stored in a LongCounter.
122+
*
123+
* <p>The count should be set as 1 every time this is invoked (each retry attempt)
124+
*
125+
* @param count The number of attempts made
126+
* @param attributes Map of the attributes to store
127+
*/
128+
@Override
129+
public void recordAttemptCount(long count, Map<String, String> attributes) {
130+
attemptCountRecorder.add(count, toOtelAttributes(attributes));
131+
}
132+
133+
/**
134+
* Record the latency for the entire operation. This is the latency for the entire RPC, including
135+
* all the retry attempts
136+
*
137+
* @param operationLatency Operation Latency in ms
138+
* @param attributes Map of the attributes to store
139+
*/
140+
@Override
141+
public void recordOperationLatency(double operationLatency, Map<String, String> attributes) {
142+
operationLatencyRecorder.record(operationLatency, toOtelAttributes(attributes));
143+
}
144+
145+
/**
146+
* Record an operation made. The operation count number is stored in a LongCounter.
147+
*
148+
* <p>The operation count should always be 1 and this should be invoked once.
149+
*
150+
* @param count The number of operations made
151+
* @param attributes Map of the attributes to store
152+
*/
153+
@Override
154+
public void recordOperationCount(long count, Map<String, String> attributes) {
155+
operationCountRecorder.add(count, toOtelAttributes(attributes));
156+
}
157+
158+
@VisibleForTesting
159+
Attributes toOtelAttributes(Map<String, String> attributes) {
160+
Preconditions.checkNotNull(attributes, "Attributes map cannot be null");
161+
AttributesBuilder attributesBuilder = Attributes.builder();
162+
attributes.forEach(attributesBuilder::put);
163+
return attributesBuilder.build();
164+
}
165+
}

0 commit comments

Comments
 (0)