
Commit 23083a5

jmacd, bogdandrutu, and codeboten authored
OTel-Arrow receiver timeout propagation (#34742)
**Description:** Receiver side of open-telemetry/otel-arrow#227. The exporter side is #34733.

**Link to tracking Issue:** open-telemetry/otel-arrow#227

**Testing:** A new end-to-end integration test. ✅

**Documentation:** Since this is expected of gRPC receivers, no docs are changed.

---------

Signed-off-by: Alex Boten <[email protected]>
Co-authored-by: Bogdan Drutu <[email protected]>
Co-authored-by: Alex Boten <[email protected]>
1 parent 41e26ab commit 23083a5
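
For background on the mechanism described above: gRPC carries a caller's deadline on the wire as a `grpc-timeout` request header (a decimal value with a one-letter unit, per the gRPC-over-HTTP/2 specification), and this change has the receiver re-apply that value to the per-request context so downstream pipeline components observe the same deadline. Below is a minimal, illustrative sketch of the decode-and-apply step; `decodeGRPCTimeout` is a hypothetical stand-in for the `internal/grpcutil.DecodeTimeout` helper the commit actually wires in, and it handles only the standard unit suffixes.

```go
package main

import (
	"context"
	"fmt"
	"strconv"
	"time"
)

// decodeGRPCTimeout is an illustrative decoder for the grpc-timeout header
// format ("<digits><unit>", e.g. "1000m" is 1000 milliseconds). The actual
// change uses the contrib internal/grpcutil.DecodeTimeout helper.
func decodeGRPCTimeout(s string) (time.Duration, error) {
	if len(s) < 2 {
		return 0, fmt.Errorf("grpc-timeout too short: %q", s)
	}
	units := map[byte]time.Duration{
		'H': time.Hour, 'M': time.Minute, 'S': time.Second,
		'm': time.Millisecond, 'u': time.Microsecond, 'n': time.Nanosecond,
	}
	unit, ok := units[s[len(s)-1]]
	if !ok {
		return 0, fmt.Errorf("unknown grpc-timeout unit in %q", s)
	}
	n, err := strconv.ParseInt(s[:len(s)-1], 10, 64)
	if err != nil {
		return 0, err
	}
	return time.Duration(n) * unit, nil
}

func main() {
	// Decode the caller's timeout from the header value and re-apply it to
	// the request context, so downstream consumers observe the deadline.
	timeout, err := decodeGRPCTimeout("1000m")
	if err != nil {
		panic(err)
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	deadline, _ := ctx.Deadline()
	fmt.Println("propagated deadline in about", time.Until(deadline).Round(time.Millisecond))
}
```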

5 files changed: +222 −34 lines changed
New changelog entry (+27)

@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: otelarrowreceiver
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Add gRPC timeout propagation.
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [34742]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: []

internal/otelarrow/test/e2e_test.go (+56 −2)

@@ -48,9 +48,15 @@ import (
 type testParams struct {
 	threadCount  int
 	requestUntil func(*testConsumer) bool
+
+	// missingDeadline is configured so the zero value implies a deadline,
+	// which is the default.
+	missingDeadline bool
 }
 
 type testConsumer struct {
+	t *testing.T
+
 	sink      consumertest.TracesSink
 	sentSpans atomic.Int64
 
@@ -62,6 +68,8 @@ type testConsumer struct {
 
 	recvSpans *tracetest.InMemoryExporter
 	expSpans  *tracetest.InMemoryExporter
+
+	expectDeadline bool
 }
 
 var _ consumer.Traces = &testConsumer{}
@@ -80,6 +88,19 @@ func (*testConsumer) Capabilities() consumer.Capabilities {
 
 func (tc *testConsumer) ConsumeTraces(ctx context.Context, td ptrace.Traces) error {
 	time.Sleep(time.Duration(float64(time.Millisecond) * (1 + rand.Float64())))
+
+	dead, hasDeadline := ctx.Deadline()
+	timeout := time.Until(dead)
+
+	require.Equal(tc.t, tc.expectDeadline, hasDeadline, "deadline set or not set: %v", timeout)
+	if tc.expectDeadline {
+		// expect allows 1/6 of the deadline to elapse in transit,
+		// so 1m becomes 50s.
+		expect := tc.expCfg.TimeoutSettings.Timeout * 5 / 6
+		require.Less(tc.t, expect, timeout)
+		require.Greater(tc.t, tc.expCfg.TimeoutSettings.Timeout, timeout)
+	}
+
 	return tc.sink.ConsumeTraces(ctx, td)
 }
 
@@ -100,7 +121,7 @@ func testLoggerSettings(_ *testing.T) (component.TelemetrySettings, *observer.Ob
 	return tset, obslogs, exp
 }
 
-func basicTestConfig(t *testing.T, cfgF CfgFunc) (*testConsumer, exporter.Traces, receiver.Traces) {
+func basicTestConfig(t *testing.T, tp testParams, cfgF CfgFunc) (*testConsumer, exporter.Traces, receiver.Traces) {
 	ctx := context.Background()
 
 	efact := otelarrowexporter.NewFactory()
@@ -115,6 +136,7 @@ func basicTestConfig(t *testing.T, cfgF CfgFunc) (*testConsumer, exporter.Traces
 	addr := testutil.GetAvailableLocalAddress(t)
 
 	receiverCfg.Protocols.GRPC.NetAddr.Endpoint = addr
+
 	exporterCfg.ClientConfig.Endpoint = addr
 	exporterCfg.ClientConfig.WaitForReady = true
 	exporterCfg.ClientConfig.TLSSetting.Insecure = true
@@ -123,6 +145,7 @@ func basicTestConfig(t *testing.T, cfgF CfgFunc) (*testConsumer, exporter.Traces
 	exporterCfg.RetryConfig.Enabled = true
 	exporterCfg.Arrow.NumStreams = 1
 	exporterCfg.Arrow.MaxStreamLifetime = 5 * time.Second
+	exporterCfg.Arrow.DisableDowngrade = true
 
 	if cfgF != nil {
 		cfgF(exporterCfg, receiverCfg)
@@ -132,6 +155,8 @@ func basicTestConfig(t *testing.T, cfgF CfgFunc) (*testConsumer, exporter.Traces
 	recvTset, recvLogs, recvSpans := testLoggerSettings(t)
 
 	testCon := &testConsumer{
+		t: t,
+
 		recvCfg: receiverCfg,
 		expCfg:  exporterCfg,
 
@@ -140,6 +165,8 @@ func basicTestConfig(t *testing.T, cfgF CfgFunc) (*testConsumer, exporter.Traces
 
 		recvSpans: recvSpans,
 		expSpans:  expSpans,
+
+		expectDeadline: !tp.missingDeadline,
 	}
 
 	receiver, err := rfact.CreateTracesReceiver(ctx, receiver.Settings{
@@ -161,7 +188,7 @@ func basicTestConfig(t *testing.T, cfgF CfgFunc) (*testConsumer, exporter.Traces
 func testIntegrationTraces(ctx context.Context, t *testing.T, tp testParams, cfgf CfgFunc, mkgen MkGen, errf ConsumerErrFunc, endf EndFunc) {
 	host := componenttest.NewNopHost()
 
-	testCon, exporter, receiver := basicTestConfig(t, cfgf)
+	testCon, exporter, receiver := basicTestConfig(t, tp, cfgf)
 
 	var startWG sync.WaitGroup
 	var exporterShutdownWG sync.WaitGroup
@@ -426,6 +453,33 @@ func TestIntegrationTracesSimple(t *testing.T) {
 	}
 }
 
+func TestIntegrationDeadlinePropagation(t *testing.T) {
+	for _, hasDeadline := range []bool{false, true} {
+		t.Run(fmt.Sprint("deadline=", hasDeadline), func(t *testing.T) {
+			ctx, cancel := context.WithCancel(context.Background())
+			defer cancel()
+
+			// Until at least one span is written.
+			var params = testParams{
+				threadCount: 1,
+				requestUntil: func(test *testConsumer) bool {
+					return test.sink.SpanCount() < 1
+				},
+				missingDeadline: !hasDeadline,
+			}
+
+			testIntegrationTraces(ctx, t, params, func(ecfg *ExpConfig, _ *RecvConfig) {
+				if !hasDeadline {
+					// 0 disables the exporthelper-set timeout.
+					ecfg.TimeoutSettings.Timeout = 0
+				} else {
+					ecfg.TimeoutSettings.Timeout = 37 * time.Minute
+				}
+			}, func() GenFunc { return makeTestTraces }, consumerSuccess, standardEnding)
+		})
+	}
+}
+
 func TestIntegrationMemoryLimited(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
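
As a usage note for component authors: the propagated deadline is visible through the standard `context` API, which is exactly what the `ConsumeTraces` assertions earlier in this file rely on. The sketch below is self-contained and uses illustrative names (`remainingBudget`) that do not come from the test code.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// remainingBudget reports how much of the caller's deadline is left, or
// false when no deadline was propagated to this context.
func remainingBudget(ctx context.Context) (time.Duration, bool) {
	deadline, ok := ctx.Deadline()
	if !ok {
		return 0, false
	}
	return time.Until(deadline), true
}

func main() {
	// Simulate a context carrying a propagated one-minute deadline.
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	if budget, ok := remainingBudget(ctx); ok {
		fmt.Println("time left to process this batch:", budget.Round(time.Second))
	} else {
		fmt.Println("no deadline propagated; process without a time budget")
	}
}
```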

receiver/otelarrowreceiver/go.mod (+1)

@@ -3,6 +3,7 @@ module github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelar
 go 1.22.0
 
 require (
+	github.com/open-telemetry/opentelemetry-collector-contrib/internal/grpcutil v0.109.0
 	github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow v0.109.0
 	github.com/open-telemetry/opentelemetry-collector-contrib/internal/sharedcomponent v0.109.0
 	github.com/open-telemetry/otel-arrow v0.26.0

receiver/otelarrowreceiver/internal/arrow/arrow.go (+71 −32)

@@ -41,6 +41,7 @@ import (
 	"google.golang.org/grpc/status"
 	"google.golang.org/protobuf/proto"
 
+	"github.com/open-telemetry/opentelemetry-collector-contrib/internal/grpcutil"
 	"github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow/admission"
 	"github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow/netstats"
 	internalmetadata "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelarrowreceiver/internal/metadata"
@@ -173,7 +174,9 @@ func newHeaderReceiver(streamCtx context.Context, as auth.Server, includeMetadat
 // client.Info with additional key:values associated with the arrow batch.
 func (h *headerReceiver) combineHeaders(ctx context.Context, hdrsBytes []byte) (context.Context, map[string][]string, error) {
 	if len(hdrsBytes) == 0 && len(h.streamHdrs) == 0 {
-		return ctx, nil, nil
+		// Note: call newContext in this case to ensure that
+		// connInfo is added to the context, for Auth.
+		return h.newContext(ctx, nil), nil, nil
 	}
 
 	if len(hdrsBytes) == 0 {
@@ -420,8 +423,8 @@ func (r *Receiver) anyStream(serverStream anyStreamServer, method string) (retEr
 	}
 }
 
-func (r *receiverStream) newInFlightData(ctx context.Context, method string, batchID int64, pendingCh chan<- batchResp) (context.Context, *inFlightData) {
-	ctx, span := r.tracer.Start(ctx, "otel_arrow_stream_inflight")
+func (r *receiverStream) newInFlightData(ctx context.Context, method string, batchID int64, pendingCh chan<- batchResp) *inFlightData {
+	_, span := r.tracer.Start(ctx, "otel_arrow_stream_inflight")
 
 	r.inFlightWG.Add(1)
 	r.telemetryBuilder.OtelArrowReceiverInFlightRequests.Add(ctx, 1)
@@ -433,7 +436,7 @@ func (r *receiverStream) newInFlightData(ctx context.Context, method string, bat
 		span: span,
 	}
 	id.refs.Add(1)
-	return ctx, id
+	return id
 }
 
 // inFlightData is responsible for storing the resources held by one request.
@@ -549,35 +552,43 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
 
 	// Receive a batch corresponding with one ptrace.Traces, pmetric.Metrics,
 	// or plog.Logs item.
-	req, err := serverStream.Recv()
+	req, recvErr := serverStream.Recv()
+
+	// the incoming stream context is the parent of the in-flight context, which
+	// carries a span covering sequential stream-processing work. the context
+	// is severed at this point, with flight.span a contextless child that will be
+	// finished in recvDone().
+	flight := r.newInFlightData(streamCtx, method, req.GetBatchId(), pendingCh)
 
 	// inflightCtx is carried through into consumeAndProcess on the success path.
-	inflightCtx, flight := r.newInFlightData(streamCtx, method, req.GetBatchId(), pendingCh)
+	// this inherits the stream context so that its auth headers are present
+	// when the per-data Auth call is made.
+	inflightCtx := streamCtx
 	defer flight.recvDone(inflightCtx, &retErr)
 
-	if err != nil {
-		if errors.Is(err, io.EOF) {
-			return err
+	if recvErr != nil {
+		if errors.Is(recvErr, io.EOF) {
+			return recvErr
 
-		} else if errors.Is(err, context.Canceled) {
+		} else if errors.Is(recvErr, context.Canceled) {
 			// This is a special case to avoid introducing a span error
 			// for a canceled operation.
 			return io.EOF
 
-		} else if status, ok := status.FromError(err); ok && status.Code() == codes.Canceled {
+		} else if status, ok := status.FromError(recvErr); ok && status.Code() == codes.Canceled {
 			// This is a special case to avoid introducing a span error
 			// for a canceled operation.
 			return io.EOF
 		}
 		// Note: err is directly from gRPC, should already have status.
-		return err
+		return recvErr
 	}
 
 	// Check for optional headers and set the incoming context.
-	inflightCtx, authHdrs, err := hrcv.combineHeaders(inflightCtx, req.GetHeaders())
-	if err != nil {
+	inflightCtx, authHdrs, hdrErr := hrcv.combineHeaders(inflightCtx, req.GetHeaders())
+	if hdrErr != nil {
 		// Failing to parse the incoming headers breaks the stream.
-		return status.Errorf(codes.Internal, "arrow metadata error: %v", err)
+		return status.Errorf(codes.Internal, "arrow metadata error: %v", hdrErr)
 	}
 
 	// start this span after hrcv.combineHeaders returns extracted context. This will allow this span
@@ -601,9 +612,29 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
 		// This is a compressed size so make sure to acquire the difference when request is decompressed.
 		prevAcquiredBytes = int64(proto.Size(req))
 	} else {
-		prevAcquiredBytes, err = strconv.ParseInt(uncompSizeHeaderStr[0], 10, 64)
-		if err != nil {
-			return status.Errorf(codes.Internal, "failed to convert string to request size: %v", err)
+		var parseErr error
+		prevAcquiredBytes, parseErr = strconv.ParseInt(uncompSizeHeaderStr[0], 10, 64)
+		if parseErr != nil {
+			return status.Errorf(codes.Internal, "failed to convert string to request size: %v", parseErr)
+		}
+	}
+
+	var callerCancel context.CancelFunc
+	if encodedTimeout, has := authHdrs["grpc-timeout"]; has && len(encodedTimeout) == 1 {
+		if timeout, decodeErr := grpcutil.DecodeTimeout(encodedTimeout[0]); decodeErr != nil {
+			r.telemetry.Logger.Debug("grpc-timeout parse error", zap.Error(decodeErr))
+		} else {
+			// timeout parsed successfully
+			inflightCtx, callerCancel = context.WithTimeout(inflightCtx, timeout)
+
+			// if we return before the new goroutine is started below
+			// cancel the context. callerCancel will be non-nil until
+			// the new goroutine is created at the end of this function.
+			defer func() {
+				if callerCancel != nil {
+					callerCancel()
+				}
+			}()
 		}
 	}
 
@@ -612,19 +643,19 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
 	// immediately if there are too many waiters, or will
 	// otherwise block until timeout or enough memory becomes
 	// available.
-	err = r.boundedQueue.Acquire(inflightCtx, prevAcquiredBytes)
-	if err != nil {
-		return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue: %v", err)
+	acquireErr := r.boundedQueue.Acquire(inflightCtx, prevAcquiredBytes)
+	if acquireErr != nil {
+		return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue: %v", acquireErr)
 	}
 	flight.numAcquired = prevAcquiredBytes
 
-	data, numItems, uncompSize, err := r.consumeBatch(ac, req)
+	data, numItems, uncompSize, consumeErr := r.consumeBatch(ac, req)
 
-	if err != nil {
-		if errors.Is(err, arrowRecord.ErrConsumerMemoryLimit) {
-			return status.Errorf(codes.ResourceExhausted, "otel-arrow decode: %v", err)
+	if consumeErr != nil {
+		if errors.Is(consumeErr, arrowRecord.ErrConsumerMemoryLimit) {
+			return status.Errorf(codes.ResourceExhausted, "otel-arrow decode: %v", consumeErr)
 		}
-		return status.Errorf(codes.Internal, "otel-arrow decode: %v", err)
+		return status.Errorf(codes.Internal, "otel-arrow decode: %v", consumeErr)
 	}
 
 	flight.uncompSize = uncompSize
@@ -633,27 +664,35 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
 	r.telemetryBuilder.OtelArrowReceiverInFlightBytes.Add(inflightCtx, uncompSize)
 	r.telemetryBuilder.OtelArrowReceiverInFlightItems.Add(inflightCtx, int64(numItems))
 
-	numAcquired, err := r.acquireAdditionalBytes(inflightCtx, prevAcquiredBytes, uncompSize, hrcv.connInfo.Addr, uncompSizeHeaderFound)
+	numAcquired, secondAcquireErr := r.acquireAdditionalBytes(inflightCtx, prevAcquiredBytes, uncompSize, hrcv.connInfo.Addr, uncompSizeHeaderFound)
 
 	flight.numAcquired = numAcquired
-	if err != nil {
-		return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue re-acquire: %v", err)
+	if secondAcquireErr != nil {
+		return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue re-acquire: %v", secondAcquireErr)
 	}
 
 	// Recognize that the request is still in-flight via consumeAndRespond()
 	flight.refs.Add(1)
 
 	// consumeAndRespond consumes the data and returns control to the sender loop.
-	go r.consumeAndRespond(inflightCtx, data, flight)
+	go func(callerCancel context.CancelFunc) {
+		if callerCancel != nil {
+			defer callerCancel()
+		}
+		r.consumeAndRespond(inflightCtx, streamCtx, data, flight)
+	}(callerCancel)
+
+	// Reset callerCancel so the deferred function above does not call it here.
+	callerCancel = nil
 
 	return nil
 }
 
 // consumeAndRespond finishes the span started in recvOne and logs the
 // result after invoking the pipeline to consume the data.
-func (r *Receiver) consumeAndRespond(ctx context.Context, data any, flight *inFlightData) {
+func (r *Receiver) consumeAndRespond(ctx, streamCtx context.Context, data any, flight *inFlightData) {
 	var err error
-	defer flight.consumeDone(ctx, &err)
+	defer flight.consumeDone(streamCtx, &err)
 
 	// recoverErr is a special function because it recovers panics, so we
 	// keep it in a separate defer than the processing above, which will
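
The `callerCancel` handling in the hunks above follows a cancel-ownership hand-off pattern: the deferred cleanup in `recvOne` fires only if the consuming goroutine was never started, and otherwise the `context.CancelFunc` is handed to that goroutine, which cancels the timeout context when consumption finishes. A stripped-down sketch of the same pattern, using hypothetical names (`startWork`, `preflightCheck`) that are not from this file:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// startWork launches fn on a new goroutine and hands it ownership of the
// timeout context's cancel function. If startWork returns early, the
// deferred cleanup releases the context instead.
func startWork(parent context.Context, timeout time.Duration, fn func(context.Context)) error {
	ctx, cancel := context.WithTimeout(parent, timeout)
	// Fires only when cancel has not been handed off to the goroutine below.
	defer func() {
		if cancel != nil {
			cancel()
		}
	}()

	if err := preflightCheck(); err != nil {
		return err // the deferred cleanup cancels the context
	}

	go func(cancel context.CancelFunc) {
		defer cancel() // the goroutine now owns the context's lifetime
		fn(ctx)
	}(cancel)

	// Prevent the deferred cleanup from canceling the context we just
	// handed to the goroutine.
	cancel = nil
	return nil
}

func preflightCheck() error { return nil } // placeholder; always succeeds here

func main() {
	if err := startWork(context.Background(), time.Second, func(ctx context.Context) {
		<-ctx.Done()
		fmt.Println("work stopped:", ctx.Err())
	}); err != nil {
		fmt.Println("failed to start:", err)
	}
	time.Sleep(2 * time.Second) // give the goroutine time to observe the timeout
}
```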
