From cd183c2c700a1a07a1b8798305b976fd9f5c0000 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sun, 3 Jul 2022 21:50:41 +0800 Subject: [PATCH 01/40] feat: add internalSingleSend to send single message --- pulsar/internal/commands.go | 23 +++++++- pulsar/producer_partition.go | 109 +++++++++++++++++++++++------------ 2 files changed, 94 insertions(+), 38 deletions(-) diff --git a/pulsar/internal/commands.go b/pulsar/internal/commands.go index 7fd18850f4..cba97ddd18 100644 --- a/pulsar/internal/commands.go +++ b/pulsar/internal/commands.go @@ -21,7 +21,6 @@ import ( "encoding/binary" "errors" "fmt" - "github.com/gogo/protobuf/proto" "github.com/apache/pulsar-client-go/pulsar/internal/compression" @@ -297,6 +296,28 @@ func serializeBatch(wb Buffer, return nil } +func SingleSend(wb Buffer, + producerID, sequenceID uint64, + msgMetadata *pb.MessageMetadata, + uncompressedPayload Buffer, + compressionType pb.CompressionType, + level compression.Level, + encryptor crypto.Encryptor) error { + cmdSend := baseCommand( + pb.BaseCommand_SEND, + &pb.CommandSend{ + ProducerId: &producerID, + }, + ) + cmdSend.Send.SequenceId = &sequenceID + if msgMetadata.GetTotalChunkMsgSize() > 1 { + isChunk := true + cmdSend.Send.IsChunk = &isChunk + } + // todo: rename serializeBatch + return serializeBatch(wb, cmdSend, msgMetadata, uncompressedPayload, getCompressionProvider(compressionType, level), encryptor) +} + // ConvertFromStringMap convert a string map to a KeyValue []byte func ConvertFromStringMap(m map[string]string) []*pb.KeyValue { list := make([]*pb.KeyValue, len(m)) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 5a2694c02c..a0a80ba7c3 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -74,6 +74,7 @@ type partitionProducer struct { batchBuilder internal.BatchBuilder sequenceIDGenerator *uint64 batchFlushTicker *time.Ticker + encryptor internalcrypto.Encryptor // Channel where app is posting messages to be published eventsChan chan interface{} @@ -219,27 +220,18 @@ func (p *partitionProducer) grabCnx() error { p.producerName = res.Response.ProducerSuccess.GetProducerName() - var encryptor internalcrypto.Encryptor + //var encryptor internalcrypto.Encryptor if p.options.Encryption != nil { - encryptor = internalcrypto.NewProducerEncryptor(p.options.Encryption.Keys, + p.encryptor = internalcrypto.NewProducerEncryptor(p.options.Encryption.Keys, p.options.Encryption.KeyReader, p.options.Encryption.MessageCrypto, p.options.Encryption.ProducerCryptoFailureAction, p.log) } else { - encryptor = internalcrypto.NewNoopEncryptor() + p.encryptor = internalcrypto.NewNoopEncryptor() } if p.options.DisableBatching { - provider, _ := GetBatcherBuilderProvider(DefaultBatchBuilder) - p.batchBuilder, err = provider(p.options.BatchingMaxMessages, p.options.BatchingMaxSize, - p.producerName, p.producerID, pb.CompressionType(p.options.CompressionType), - compression.Level(p.options.CompressionLevel), - p, - p.log, - encryptor) - if err != nil { - return err - } + // removed BatchBuilder creation } else if p.batchBuilder == nil { provider, err := GetBatcherBuilderProvider(p.options.BatcherBuilderType) if err != nil { @@ -251,7 +243,7 @@ func (p *partitionProducer) grabCnx() error { compression.Level(p.options.CompressionLevel), p, p.log, - encryptor) + p.encryptor) if err != nil { return err } @@ -469,38 +461,79 @@ func (p *partitionProducer) internalSend(request *sendRequest) { smm.SequenceId = proto.Uint64(sequenceID) } - if !sendAsBatch { - p.internalFlushCurrentBatch() - } - if msg.DisableReplication { msg.ReplicationClusters = []string{"__local__"} } - added := p.batchBuilder.Add(smm, p.sequenceIDGenerator, payload, request, - msg.ReplicationClusters, deliverAt) - if !added { - // The current batch is full.. flush it and retry + if !sendAsBatch { + p.internalSingleSend(smm, payload, request) + } else { + added := p.batchBuilder.Add(smm, p.sequenceIDGenerator, payload, request, + msg.ReplicationClusters, deliverAt) + if !added { + // The current batch is full.. flush it and retry - p.internalFlushCurrentBatch() + p.internalFlushCurrentBatch() - // after flushing try again to add the current payload - if ok := p.batchBuilder.Add(smm, p.sequenceIDGenerator, payload, request, - msg.ReplicationClusters, deliverAt); !ok { - p.publishSemaphore.Release() - request.callback(nil, request.msg, errFailAddToBatch) - p.log.WithField("size", len(payload)). - WithField("properties", msg.Properties). - Error("unable to add message to batch") - return + // after flushing try again to add the current payload + if ok := p.batchBuilder.Add(smm, p.sequenceIDGenerator, payload, request, + msg.ReplicationClusters, deliverAt); !ok { + p.publishSemaphore.Release() + request.callback(nil, request.msg, errFailAddToBatch) + p.log.WithField("size", len(payload)). + WithField("properties", msg.Properties). + Error("unable to add message to batch") + return + } } - } + if request.flushImmediately { - if !sendAsBatch || request.flushImmediately { + p.internalFlushCurrentBatch() - p.internalFlushCurrentBatch() + } + } +} +func (p *partitionProducer) internalSingleSend(smm *pb.SingleMessageMetadata, payload []byte, request *sendRequest) { + msg := request.msg + sid := internal.GetAndAdd(p.sequenceIDGenerator, 1) + mm := &pb.MessageMetadata{ + ProducerName: &p.producerName, + SequenceId: proto.Uint64(sid), + PublishTime: proto.Uint64(internal.TimestampMillis(time.Now())), + Properties: smm.Properties, + PartitionKey: smm.PartitionKey, + ReplicateTo: msg.ReplicationClusters, + } + if msg.DeliverAt.UnixNano() > 0 { + mm.DeliverAtTime = proto.Int64(int64(internal.TimestampMillis(msg.DeliverAt))) + } + + payloadBuf := internal.NewBuffer(len(payload)) + payloadBuf.Write(payload) + + buffer := p.GetBuffer() + if buffer == nil { + buffer = internal.NewBuffer(int(payloadBuf.ReadableBytes() * 3 / 2)) + } + + if err := internal.SingleSend( + buffer, p.producerID, sid, mm, payloadBuf, + pb.CompressionType(p.options.CompressionType), + compression.Level(p.options.CompressionLevel), + p.encryptor, + ); err != nil { + p.log.WithError(err).Errorf("Single message serialize failed %s", msg.Value) + return } + + p.pendingQueue.Put(&pendingItem{ + sentAt: time.Now(), + batchData: buffer, + sequenceID: sid, + sendRequests: []interface{}{request}, + }) + p._getConn().WriteData(buffer) } type pendingItem struct { @@ -860,8 +893,10 @@ func (p *partitionProducer) internalClose(req *closeProducer) { p.log.Info("Closed producer") } - if err = p.batchBuilder.Close(); err != nil { - p.log.WithError(err).Warn("Failed to close batch builder") + if p.batchBuilder != nil { + if err = p.batchBuilder.Close(); err != nil { + p.log.WithError(err).Warn("Failed to close batch builder") + } } p.setProducerState(producerClosed) From 342e8e567176321ac5fa8f2b9fabd218b0d5c450 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sun, 10 Jul 2022 15:01:48 +0800 Subject: [PATCH 02/40] feat: support chunking for producer --- pulsar/internal/batch_builder.go | 8 +- pulsar/internal/commands.go | 22 ++- pulsar/producer.go | 9 + pulsar/producer_impl.go | 4 + pulsar/producer_partition.go | 291 +++++++++++++++++++++++-------- pulsar/producer_test.go | 29 +++ 6 files changed, 273 insertions(+), 90 deletions(-) diff --git a/pulsar/internal/batch_builder.go b/pulsar/internal/batch_builder.go index fb7598e96f..eb36e0766b 100644 --- a/pulsar/internal/batch_builder.go +++ b/pulsar/internal/batch_builder.go @@ -122,7 +122,7 @@ func newBatchContainer( ProducerName: &producerName, }, callbacks: []interface{}{}, - compressionProvider: getCompressionProvider(compressionType, level), + compressionProvider: GetCompressionProvider(compressionType, level), buffersPool: bufferPool, log: logger, encryptor: encryptor, @@ -241,8 +241,8 @@ func (bc *batchContainer) Flush() ( buffer = NewBuffer(int(uncompressedSize * 3 / 2)) } - if err = serializeBatch( - buffer, bc.cmdSend, bc.msgMetadata, bc.buffer, bc.compressionProvider, bc.encryptor, + if err = serializeMessage( + buffer, bc.cmdSend, bc.msgMetadata, bc.buffer, bc.compressionProvider, bc.encryptor, true, ); err == nil { // no error in serializing Batch sequenceID = bc.cmdSend.Send.GetSequenceId() } @@ -268,7 +268,7 @@ func (bc *batchContainer) Close() error { return bc.compressionProvider.Close() } -func getCompressionProvider( +func GetCompressionProvider( compressionType pb.CompressionType, level compression.Level, ) compression.Provider { diff --git a/pulsar/internal/commands.go b/pulsar/internal/commands.go index cba97ddd18..6ea3018a34 100644 --- a/pulsar/internal/commands.go +++ b/pulsar/internal/commands.go @@ -233,17 +233,23 @@ func addSingleMessageToBatch(wb Buffer, smm *pb.SingleMessageMetadata, payload [ wb.Write(payload) } -func serializeBatch(wb Buffer, +func serializeMessage(wb Buffer, cmdSend *pb.BaseCommand, msgMetadata *pb.MessageMetadata, - uncompressedPayload Buffer, + payload Buffer, compressionProvider compression.Provider, - encryptor crypto.Encryptor) error { + encryptor crypto.Encryptor, + doCompress bool) error { // Wire format // [TOTAL_SIZE] [CMD_SIZE][CMD] [MAGIC_NUMBER][CHECKSUM] [METADATA_SIZE][METADATA] [PAYLOAD] // compress the payload - compressedPayload := compressionProvider.Compress(nil, uncompressedPayload.ReadableSlice()) + var compressedPayload []byte + if doCompress { + compressedPayload = compressionProvider.Compress(nil, payload.ReadableSlice()) + } else { + compressedPayload = payload.ReadableSlice() + } // encrypt the compressed payload encryptedPayload, err := encryptor.Encrypt(compressedPayload, msgMetadata) @@ -299,9 +305,7 @@ func serializeBatch(wb Buffer, func SingleSend(wb Buffer, producerID, sequenceID uint64, msgMetadata *pb.MessageMetadata, - uncompressedPayload Buffer, - compressionType pb.CompressionType, - level compression.Level, + compressedPayload Buffer, encryptor crypto.Encryptor) error { cmdSend := baseCommand( pb.BaseCommand_SEND, @@ -314,8 +318,8 @@ func SingleSend(wb Buffer, isChunk := true cmdSend.Send.IsChunk = &isChunk } - // todo: rename serializeBatch - return serializeBatch(wb, cmdSend, msgMetadata, uncompressedPayload, getCompressionProvider(compressionType, level), encryptor) + // payload has been compressed so compressionProvider can be nil + return serializeMessage(wb, cmdSend, msgMetadata, compressedPayload, nil, encryptor, false) } // ConvertFromStringMap convert a string map to a KeyValue []byte diff --git a/pulsar/producer.go b/pulsar/producer.go index d9b2307a33..aea2ea40fd 100644 --- a/pulsar/producer.go +++ b/pulsar/producer.go @@ -168,6 +168,15 @@ type ProducerOptions struct { // Encryption specifies the fields required to encrypt a message Encryption *ProducerEncryptionInfo + + // EnableChunking controls whether automatic chunking of messages is enabled for the producer. By default, chunking + // is disabled. + // Chunking can not be enabled when batching is not closed. + EnableChunking bool + + // ChunkMaxMessageSize is the max size of single chunk payload. + // It will actually only take effect if it is smaller than broker.MaxMessageSize + ChunkMaxMessageSize uint } // Producer is used to publish messages on a topic diff --git a/pulsar/producer_impl.go b/pulsar/producer_impl.go index 9bbfccbd74..3c45b597d0 100644 --- a/pulsar/producer_impl.go +++ b/pulsar/producer_impl.go @@ -94,6 +94,10 @@ func newProducer(client *client, options *ProducerOptions) (*producer, error) { options.PartitionsAutoDiscoveryInterval = defaultPartitionsAutoDiscoveryInterval } + if !options.DisableBatching && options.EnableChunking { + return nil, fmt.Errorf("batching and chunking can not be enabled together") + } + p := &producer{ options: options, topic: options.Topic, diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index a0a80ba7c3..5dd12a3da0 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -19,6 +19,8 @@ package pulsar import ( "context" + "fmt" + "math" "strings" "sync" "sync/atomic" @@ -52,6 +54,7 @@ var ( errSendQueueIsFull = newError(ProducerQueueIsFull, "producer send queue is full") errContextExpired = newError(TimeoutError, "message send context expired") errMessageTooLarge = newError(MessageTooBig, "message size exceeds MaxMessageSize") + errMetaTooLarge = newError(InvalidMessage, "message metadata size exceeds MaxMessageSize") errProducerClosed = newError(ProducerClosed, "producer already been closed") buffersPool sync.Pool @@ -75,6 +78,7 @@ type partitionProducer struct { sequenceIDGenerator *uint64 batchFlushTicker *time.Ticker encryptor internalcrypto.Encryptor + compressionProvider compression.Provider // Channel where app is posting messages to be published eventsChan chan interface{} @@ -120,6 +124,8 @@ func newPartitionProducer(client *client, topic string, options *ProducerOptions connectClosedCh: make(chan connectionClosed, 10), closeCh: make(chan struct{}), batchFlushTicker: time.NewTicker(batchingMaxPublishDelay), + compressionProvider: internal.GetCompressionProvider(pb.CompressionType(options.CompressionType), + compression.Level(options.CompressionLevel)), publishSemaphore: internal.NewSemaphore(int32(maxPendingMessages)), pendingQueue: internal.NewBlockingQueue(maxPendingMessages), lastSequenceID: -1, @@ -282,7 +288,7 @@ func (p *partitionProducer) grabCnx() error { pi.sentAt = time.Now() pi.Unlock() p.pendingQueue.Put(pi) - p._getConn().WriteData(pi.batchData) + p._getConn().WriteData(pi.buffer) if pi == lastViewItem { break @@ -397,13 +403,17 @@ func (p *partitionProducer) internalSend(request *sendRequest) { msg := request.msg // read payload from message - payload := msg.Payload + uncompressedPayload := msg.Payload var err error + if !p.canAddToQueue(request) { + return + } + // payload and schema are mutually exclusive // try to get payload from schema value only if payload is not set - if payload == nil && p.options.Schema != nil { + if uncompressedPayload == nil && p.options.Schema != nil { var schemaPayload []byte schemaPayload, err = p.options.Schema.Encode(msg.Value) if err != nil { @@ -412,63 +422,106 @@ func (p *partitionProducer) internalSend(request *sendRequest) { p.log.WithError(err).Errorf("Schema encode message failed %s", msg.Value) return } - payload = schemaPayload + uncompressedPayload = schemaPayload } - // if msg is too large - if len(payload) > int(p._getConn().GetMaxMessageSize()) { + uncompressedSize := len(uncompressedPayload) + + // compress payload + compressedPayload := p.compressionProvider.Compress(nil, uncompressedPayload) + compressedSize := len(compressedPayload) + + // if msg is too large and chunking is disabled + if compressedSize > int(p._getConn().GetMaxMessageSize()) && !p.options.EnableChunking { p.publishSemaphore.Release() request.callback(nil, request.msg, errMessageTooLarge) p.log.WithError(errMessageTooLarge). - WithField("size", len(payload)). + WithField("size", compressedSize). WithField("properties", msg.Properties). Errorf("MaxMessageSize %d", int(p._getConn().GetMaxMessageSize())) p.metrics.PublishErrorsMsgTooLarge.Inc() return } + mm := p.genMetadata(msg, uncompressedSize) + + // set default ReplicationClusters when DisableReplication + if msg.DisableReplication { + msg.ReplicationClusters = []string{"__local__"} + } + + // todo: deliverAt has calculated in genMetadata but it's not a good idea to make genMetadata() return it. deliverAt := msg.DeliverAt if msg.DeliverAfter.Nanoseconds() > 0 { deliverAt = time.Now().Add(msg.DeliverAfter) } - sendAsBatch := !p.options.DisableBatching && msg.ReplicationClusters == nil && deliverAt.UnixNano() < 0 - smm := &pb.SingleMessageMetadata{ - PayloadSize: proto.Int(len(payload)), - } - - if msg.EventTime.UnixNano() != 0 { - smm.EventTime = proto.Uint64(internal.TimestampMillis(msg.EventTime)) - } - - if msg.Key != "" { - smm.PartitionKey = proto.String(msg.Key) - } - - if len(msg.OrderingKey) != 0 { - smm.OrderingKey = []byte(msg.OrderingKey) - } - - if msg.Properties != nil { - smm.Properties = internal.ConvertFromStringMap(msg.Properties) + var totalChunks int + // max chunk payload size + var payloadChunkSize int + if sendAsBatch || !p.options.EnableChunking { + totalChunks = 1 + payloadChunkSize = int(p._getConn().GetMaxMessageSize()) + } else { + payloadChunkSize = int(p._getConn().GetMaxMessageSize()) - mm.Size() + if payloadChunkSize <= 0 { + request.callback(nil, msg, errMetaTooLarge) + p.log.WithError(errMetaTooLarge). + WithField("metadata size", mm.Size()). + WithField("properties", msg.Properties). + Errorf("MaxMessageSize %d", int(p._getConn().GetMaxMessageSize())) + p.metrics.PublishErrorsMsgTooLarge.Inc() + return + } + // set ChunkMaxMessageSize + if p.options.ChunkMaxMessageSize != 0 { + payloadChunkSize = int(math.Min(float64(payloadChunkSize), float64(p.options.ChunkMaxMessageSize))) + } + totalChunks = int(math.Max(1, math.Ceil(float64(compressedSize)/float64(payloadChunkSize)))) } - if msg.SequenceID != nil { - sequenceID := uint64(*msg.SequenceID) - smm.SequenceId = proto.Uint64(sequenceID) + // correct limit queue when chunked + for i := 0; i < totalChunks-1; i++ { + if !p.canAddToQueue(request) { + return + } } - if msg.DisableReplication { - msg.ReplicationClusters = []string{"__local__"} - } + // set total chunks to send request + request.totalChunks = totalChunks if !sendAsBatch { - p.internalSingleSend(smm, payload, request) + if totalChunks > 1 { + var lhs, rhs int + uuid := fmt.Sprintf("%s-%d", p.producerName, mm.SequenceId) + mm.Uuid = proto.String(uuid) + mm.NumChunksFromMsg = proto.Int(totalChunks) + mm.TotalChunkMsgSize = proto.Int(compressedSize) + for chunkId := 0; chunkId < totalChunks; chunkId++ { + lhs = chunkId * payloadChunkSize + if rhs = lhs + payloadChunkSize; rhs > compressedSize { + rhs = compressedSize + } + // update chunk id + mm.ChunkId = proto.Int(chunkId) + nsr := &sendRequest{ + ctx: request.ctx, + msg: request.msg, + callback: request.callback, + publishTime: request.publishTime, + chunkId: chunkId, + } + p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr) + } + } else { + p.internalSingleSend(mm, compressedPayload, request) + } } else { - added := p.batchBuilder.Add(smm, p.sequenceIDGenerator, payload, request, + smm := p.genSingleMetaMessage(msg, uncompressedSize) + added := p.batchBuilder.Add(smm, p.sequenceIDGenerator, uncompressedPayload, request, msg.ReplicationClusters, deliverAt) if !added { // The current batch is full.. flush it and retry @@ -476,11 +529,11 @@ func (p *partitionProducer) internalSend(request *sendRequest) { p.internalFlushCurrentBatch() // after flushing try again to add the current payload - if ok := p.batchBuilder.Add(smm, p.sequenceIDGenerator, payload, request, + if ok := p.batchBuilder.Add(smm, p.sequenceIDGenerator, uncompressedPayload, request, msg.ReplicationClusters, deliverAt); !ok { p.publishSemaphore.Release() request.callback(nil, request.msg, errFailAddToBatch) - p.log.WithField("size", len(payload)). + p.log.WithField("size", uncompressedSize). WithField("properties", msg.Properties). Error("unable to add message to batch") return @@ -494,33 +547,93 @@ func (p *partitionProducer) internalSend(request *sendRequest) { } } -func (p *partitionProducer) internalSingleSend(smm *pb.SingleMessageMetadata, payload []byte, request *sendRequest) { - msg := request.msg - sid := internal.GetAndAdd(p.sequenceIDGenerator, 1) - mm := &pb.MessageMetadata{ - ProducerName: &p.producerName, - SequenceId: proto.Uint64(sid), - PublishTime: proto.Uint64(internal.TimestampMillis(time.Now())), - Properties: smm.Properties, - PartitionKey: smm.PartitionKey, - ReplicateTo: msg.ReplicationClusters, +func (p *partitionProducer) genMetadata(msg *ProducerMessage, uncompressedSize int) (mm *pb.MessageMetadata) { + var sequenceID uint64 + if msg.SequenceID != nil { + sequenceID = uint64(*msg.SequenceID) + } else { + sequenceID = internal.GetAndAdd(p.sequenceIDGenerator, 1) + } + + mm = &pb.MessageMetadata{ + ProducerName: &p.producerName, + SequenceId: proto.Uint64(sequenceID), + PublishTime: proto.Uint64(internal.TimestampMillis(time.Now())), + ReplicateTo: msg.ReplicationClusters, + UncompressedSize: proto.Uint32(uint32(uncompressedSize)), + } + + if msg.Key != "" { + mm.PartitionKey = proto.String(msg.Key) + } + + if msg.Properties != nil { + mm.Properties = internal.ConvertFromStringMap(msg.Properties) + } + + deliverAt := msg.DeliverAt + if msg.DeliverAfter.Nanoseconds() > 0 { + deliverAt = time.Now().Add(msg.DeliverAfter) + } + if deliverAt.UnixNano() > 0 { + mm.DeliverAtTime = proto.Int64(int64(internal.TimestampMillis(deliverAt))) } - if msg.DeliverAt.UnixNano() > 0 { - mm.DeliverAtTime = proto.Int64(int64(internal.TimestampMillis(msg.DeliverAt))) + + return +} + +func (p *partitionProducer) genSingleMetaMessage(msg *ProducerMessage, uncompressedSize int) (smm *pb.SingleMessageMetadata) { + smm = &pb.SingleMessageMetadata{ + PayloadSize: proto.Int(uncompressedSize), } - payloadBuf := internal.NewBuffer(len(payload)) - payloadBuf.Write(payload) + if msg.EventTime.UnixNano() != 0 { + smm.EventTime = proto.Uint64(internal.TimestampMillis(msg.EventTime)) + } + + if msg.Key != "" { + smm.PartitionKey = proto.String(msg.Key) + } + + if len(msg.OrderingKey) != 0 { + smm.OrderingKey = []byte(msg.OrderingKey) + } + + if msg.Properties != nil { + smm.Properties = internal.ConvertFromStringMap(msg.Properties) + } + + var sequenceID uint64 + if msg.SequenceID != nil { + sequenceID = uint64(*msg.SequenceID) + } else { + sequenceID = internal.GetAndAdd(p.sequenceIDGenerator, 1) + } + + smm.SequenceId = proto.Uint64(sequenceID) + + return +} + +func (p *partitionProducer) internalSingleSend(mm *pb.MessageMetadata, compressedPayload []byte, request *sendRequest) { + msg := request.msg + + payloadBuf := internal.NewBuffer(len(compressedPayload)) + payloadBuf.Write(compressedPayload) buffer := p.GetBuffer() if buffer == nil { buffer = internal.NewBuffer(int(payloadBuf.ReadableBytes() * 3 / 2)) } + sid := *mm.SequenceId + if err := internal.SingleSend( - buffer, p.producerID, sid, mm, payloadBuf, - pb.CompressionType(p.options.CompressionType), - compression.Level(p.options.CompressionLevel), + buffer, + p.producerID, + sid, + mm, + payloadBuf, p.encryptor, ); err != nil { p.log.WithError(err).Errorf("Single message serialize failed %s", msg.Value) @@ -529,7 +642,7 @@ func (p *partitionProducer) internalSingleSend(smm *pb.SingleMessageMetadata, pa p.pendingQueue.Put(&pendingItem{ sentAt: time.Now(), - batchData: buffer, + buffer: buffer, sequenceID: sid, sendRequests: []interface{}{request}, }) @@ -538,7 +651,7 @@ func (p *partitionProducer) internalSingleSend(smm *pb.SingleMessageMetadata, pa type pendingItem struct { sync.Mutex - batchData internal.Buffer + buffer internal.Buffer sequenceID uint64 sentAt time.Time sendRequests []interface{} @@ -569,7 +682,7 @@ func (p *partitionProducer) internalFlushCurrentBatch() { p.pendingQueue.Put(&pendingItem{ sentAt: time.Now(), - batchData: batchData, + buffer: batchData, sequenceID: sequenceID, sendRequests: callbacks, }) @@ -653,6 +766,7 @@ func (p *partitionProducer) failTimeoutMessages() { for _, i := range pi.sendRequests { sr := i.(*sendRequest) if sr.msg != nil { + // todo: it's not correct. the size should be schemaed uncompressed payload size size := len(sr.msg.Payload) p.publishSemaphore.Release() p.metrics.MessagesPending.Dec() @@ -662,7 +776,8 @@ func (p *partitionProducer) failTimeoutMessages() { WithField("size", size). WithField("properties", sr.msg.Properties) } - if sr.callback != nil { + // the callback is only invoked by last chunk when chunked + if sr.callback != nil && (sr.totalChunks <= 1 || (sr.chunkId == sr.totalChunks-1)) { sr.callback(nil, sr.msg, errSendTimeout) } } @@ -702,7 +817,7 @@ func (p *partitionProducer) internalFlushCurrentBatches() { } p.pendingQueue.Put(&pendingItem{ sentAt: time.Now(), - batchData: batchesData[i], + buffer: batchesData[i], sequenceID: sequenceIDs[i], sendRequests: callbacks[i], }) @@ -779,33 +894,24 @@ func (p *partitionProducer) internalSendAsync(ctx context.Context, msg *Producer return } + // bc only works when DisableBlockIfQueueFull is false + bc := make(chan struct{}) sr := &sendRequest{ ctx: ctx, msg: msg, callback: callback, flushImmediately: flushImmediately, publishTime: time.Now(), + blockCh: bc, } p.options.Interceptors.BeforeSend(p, msg) - if p.options.DisableBlockIfQueueFull { - if !p.publishSemaphore.TryAcquire() { - if callback != nil { - callback(nil, msg, errSendQueueIsFull) - } - return - } - } else { - if !p.publishSemaphore.Acquire(ctx) { - callback(nil, msg, errContextExpired) - return - } - } - - p.metrics.MessagesPending.Inc() - p.metrics.BytesPending.Add(float64(len(sr.msg.Payload))) - p.eventsChan <- sr + + if !p.options.DisableBlockIfQueueFull { + // block if queue full + <-bc + } } func (p *partitionProducer) ReceivedSendReceipt(response *pb.CommandSendReceipt) { @@ -843,10 +949,10 @@ func (p *partitionProducer) ReceivedSendReceipt(response *pb.CommandSendReceipt) if sr.msg != nil { atomic.StoreInt64(&p.lastSequenceID, int64(pi.sequenceID)) p.publishSemaphore.Release() - p.metrics.PublishLatency.Observe(float64(now-sr.publishTime.UnixNano()) / 1.0e9) p.metrics.MessagesPublished.Inc() p.metrics.MessagesPending.Dec() + // todo: it's not correct. the size should be schemaed uncompressed payload size payloadSize := float64(len(sr.msg.Payload)) p.metrics.BytesPublished.Add(payloadSize) p.metrics.BytesPending.Sub(payloadSize) @@ -955,6 +1061,10 @@ type sendRequest struct { callback func(MessageID, *ProducerMessage, error) publishTime time.Time flushImmediately bool + blockCh chan struct{} + blockOnce sync.Once + totalChunks int + chunkId int } type closeProducer struct { @@ -971,7 +1081,7 @@ func (i *pendingItem) Complete() { return } i.completed = true - buffersPool.Put(i.batchData) + buffersPool.Put(i.buffer) } // _setConn sets the internal connection field of this partition producer atomically. @@ -988,3 +1098,30 @@ func (p *partitionProducer) _getConn() internal.Connection { // invariant is broken return p.conn.Load().(internal.Connection) } + +func (p *partitionProducer) canAddToQueue(sr *sendRequest) bool { + if p.options.DisableBlockIfQueueFull { + if !p.publishSemaphore.TryAcquire() { + if sr.callback != nil { + sr.callback(nil, sr.msg, errSendQueueIsFull) + } + return false + } + } else { + if !p.publishSemaphore.Acquire(sr.ctx) { + sr.callback(nil, sr.msg, errContextExpired) + sr.blockOnce.Do(func() { + sr.blockCh <- struct{}{} + }) + return false + } else { + sr.blockOnce.Do(func() { + sr.blockCh <- struct{}{} + }) + } + } + + p.metrics.MessagesPending.Inc() + p.metrics.BytesPending.Add(float64(len(sr.msg.Payload))) + return true +} diff --git a/pulsar/producer_test.go b/pulsar/producer_test.go index 6b2b5d9183..eda32fcead 100644 --- a/pulsar/producer_test.go +++ b/pulsar/producer_test.go @@ -948,6 +948,35 @@ func TestMaxMessageSize(t *testing.T) { } } +func TestProducerChunking(t *testing.T) { + serverMaxMessageSize := 1024 * 1024 + + client, err := NewClient(ClientOptions{ + URL: serviceURL, + }) + assert.NoError(t, err) + defer client.Close() + + // both EnableChunking and DisableBatching should be set true to open chunking + producer, err := client.CreateProducer(ProducerOptions{ + Topic: newTopicName(), + EnableChunking: true, + DisableBatching: true, + }) + assert.NoError(t, err) + assert.NotNil(t, producer) + defer producer.Close() + + payload := make([]byte, serverMaxMessageSize+1) + ID, err := producer.Send(context.Background(), &ProducerMessage{ + Payload: payload, + }) + + assert.NoError(t, err) + assert.NotNil(t, ID) + +} + func TestFailedSchemaEncode(t *testing.T) { client, err := NewClient(ClientOptions{ URL: lookupURL, From 3a90ce57b7707dae52926a005ac26ecb0939598c Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Mon, 11 Jul 2022 10:36:49 +0800 Subject: [PATCH 03/40] fix: fix the bug of Send() block when SingleSend() failed; fix the bug of sequenceID generate; fix the bug where batch messages are compressed twice; add internal error handle, fix inaccurate PublishErrorsMsgTooLarge metric; fix the incorrect producer queueFullBlock --- pulsar/internal/batch_builder.go | 32 ++-- pulsar/internal/commands.go | 15 +- pulsar/internal/key_based_batch_builder.go | 6 +- pulsar/producer_partition.go | 163 ++++++++++++--------- pulsar/producer_test.go | 33 +++++ 5 files changed, 165 insertions(+), 84 deletions(-) diff --git a/pulsar/internal/batch_builder.go b/pulsar/internal/batch_builder.go index 3b1e387099..ef4552b045 100644 --- a/pulsar/internal/batch_builder.go +++ b/pulsar/internal/batch_builder.go @@ -35,7 +35,7 @@ type BuffersPool interface { // BatcherBuilderProvider defines func which returns the BatchBuilder. type BatcherBuilderProvider func( - maxMessages uint, maxBatchSize uint, producerName string, producerID uint64, + maxMessages uint, maxBatchSize uint, maxMessageSize uint32, producerName string, producerID uint64, compressionType pb.CompressionType, level compression.Level, bufferPool BuffersPool, logger log.Logger, encryptor crypto.Encryptor, ) (BatchBuilder, error) @@ -85,6 +85,8 @@ type batchContainer struct { // without needing costly re-allocations. maxBatchSize uint + maxMessageSize uint32 + producerName string producerID uint64 @@ -102,18 +104,19 @@ type batchContainer struct { // newBatchContainer init a batchContainer func newBatchContainer( - maxMessages uint, maxBatchSize uint, producerName string, producerID uint64, + maxMessages uint, maxBatchSize uint, maxMessageSize uint32, producerName string, producerID uint64, compressionType pb.CompressionType, level compression.Level, bufferPool BuffersPool, logger log.Logger, encryptor crypto.Encryptor, ) batchContainer { bc := batchContainer{ - buffer: NewBuffer(4096), - numMessages: 0, - maxMessages: maxMessages, - maxBatchSize: maxBatchSize, - producerName: producerName, - producerID: producerID, + buffer: NewBuffer(4096), + numMessages: 0, + maxMessages: maxMessages, + maxBatchSize: maxBatchSize, + maxMessageSize: maxMessageSize, + producerName: producerName, + producerID: producerID, cmdSend: baseCommand( pb.BaseCommand_SEND, &pb.CommandSend{ @@ -139,13 +142,13 @@ func newBatchContainer( // NewBatchBuilder init batch builder and return BatchBuilder pointer. Build a new batch message container. func NewBatchBuilder( - maxMessages uint, maxBatchSize uint, producerName string, producerID uint64, + maxMessages uint, maxBatchSize uint, maxMessageSize uint32, producerName string, producerID uint64, compressionType pb.CompressionType, level compression.Level, bufferPool BuffersPool, logger log.Logger, encryptor crypto.Encryptor, ) (BatchBuilder, error) { bc := newBatchContainer( - maxMessages, maxBatchSize, producerName, producerID, compressionType, + maxMessages, maxBatchSize, maxMessageSize, producerName, producerID, compressionType, level, bufferPool, logger, encryptor, ) @@ -164,7 +167,11 @@ func (bc *batchContainer) hasSpace(payload []byte) bool { return true } msgSize := uint32(len(payload)) - return bc.numMessages+1 <= bc.maxMessages && bc.buffer.ReadableBytes()+msgSize <= uint32(bc.maxBatchSize) + expectedSize := bc.buffer.ReadableBytes() + msgSize + if !(expectedSize <= uint32(bc.maxBatchSize) && expectedSize <= bc.maxMessageSize) { + return false + } + return bc.numMessages+1 <= bc.maxMessages } func (bc *batchContainer) hasSameSchema(schemaVersion []byte) bool { @@ -259,7 +266,8 @@ func (bc *batchContainer) Flush() ( } if err = serializeMessage( - buffer, bc.cmdSend, bc.msgMetadata, bc.buffer, bc.compressionProvider, bc.encryptor, true, + buffer, bc.cmdSend, bc.msgMetadata, bc.buffer, bc.compressionProvider, + bc.encryptor, bc.maxMessageSize, true, ); err == nil { // no error in serializing Batch sequenceID = bc.cmdSend.Send.GetSequenceId() } diff --git a/pulsar/internal/commands.go b/pulsar/internal/commands.go index 63548371f9..2d89947bf7 100644 --- a/pulsar/internal/commands.go +++ b/pulsar/internal/commands.go @@ -21,6 +21,7 @@ import ( "encoding/binary" "errors" "fmt" + "github.com/gogo/protobuf/proto" "github.com/apache/pulsar-client-go/pulsar/internal/compression" @@ -49,6 +50,8 @@ var ErrEOM = errors.New("EOF") var ErrConnectionClosed = errors.New("connection closed") +var ErrExceedMaxMessageSize = errors.New("encryptedPayload exceeds MaxMessageSize") + func NewMessageReader(headersAndPayload Buffer) *MessageReader { return &MessageReader{ buffer: headersAndPayload, @@ -243,6 +246,7 @@ func serializeMessage(wb Buffer, payload Buffer, compressionProvider compression.Provider, encryptor crypto.Encryptor, + maxMessageSize uint32, doCompress bool) error { // Wire format // [TOTAL_SIZE] [CMD_SIZE][CMD] [MAGIC_NUMBER][CHECKSUM] [METADATA_SIZE][METADATA] [PAYLOAD] @@ -261,6 +265,11 @@ func serializeMessage(wb Buffer, // error occurred while encrypting the payload, ProducerCryptoFailureAction is set to Fail return fmt.Errorf("encryption of message failed, ProducerCryptoFailureAction is set to Fail. Error :%v", err) } + // the maxMessageSize check of batching message is in here + if len(encryptedPayload) > int(maxMessageSize) { + return fmt.Errorf("%w, size: %d, MaxMessageSize: %d", + ErrExceedMaxMessageSize, len(encryptedPayload), maxMessageSize) + } cmdSize := uint32(proto.Size(cmdSend)) msgMetadataSize := uint32(proto.Size(msgMetadata)) @@ -310,7 +319,8 @@ func SingleSend(wb Buffer, producerID, sequenceID uint64, msgMetadata *pb.MessageMetadata, compressedPayload Buffer, - encryptor crypto.Encryptor) error { + encryptor crypto.Encryptor, + maxMassageSize uint32) error { cmdSend := baseCommand( pb.BaseCommand_SEND, &pb.CommandSend{ @@ -323,7 +333,8 @@ func SingleSend(wb Buffer, cmdSend.Send.IsChunk = &isChunk } // payload has been compressed so compressionProvider can be nil - return serializeMessage(wb, cmdSend, msgMetadata, compressedPayload, nil, encryptor, false) + return serializeMessage(wb, cmdSend, msgMetadata, compressedPayload, + nil, encryptor, maxMassageSize, false) } // ConvertFromStringMap convert a string map to a KeyValue []byte diff --git a/pulsar/internal/key_based_batch_builder.go b/pulsar/internal/key_based_batch_builder.go index 77fbb8c77a..334e6746ab 100644 --- a/pulsar/internal/key_based_batch_builder.go +++ b/pulsar/internal/key_based_batch_builder.go @@ -84,7 +84,7 @@ func (h *keyBasedBatches) Val(key string) *batchContainer { // NewKeyBasedBatchBuilder init batch builder and return BatchBuilder // pointer. Build a new key based batch message container. func NewKeyBasedBatchBuilder( - maxMessages uint, maxBatchSize uint, producerName string, producerID uint64, + maxMessages uint, maxBatchSize uint, maxMessageSize uint32, producerName string, producerID uint64, compressionType pb.CompressionType, level compression.Level, bufferPool BuffersPool, logger log.Logger, encryptor crypto.Encryptor, ) (BatchBuilder, error) { @@ -92,7 +92,7 @@ func NewKeyBasedBatchBuilder( bb := &keyBasedBatchContainer{ batches: newKeyBasedBatches(), batchContainer: newBatchContainer( - maxMessages, maxBatchSize, producerName, producerID, + maxMessages, maxBatchSize, maxMessageSize, producerName, producerID, compressionType, level, bufferPool, logger, encryptor, ), compressionType: compressionType, @@ -151,7 +151,7 @@ func (bc *keyBasedBatchContainer) Add( if batchPart == nil { // create batchContainer for new key t := newBatchContainer( - bc.maxMessages, bc.maxBatchSize, bc.producerName, bc.producerID, + bc.maxMessages, bc.maxBatchSize, bc.maxMessageSize, bc.producerName, bc.producerID, bc.compressionType, bc.level, bc.buffersPool, bc.log, bc.encryptor, ) batchPart = &t diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index d7cb16d932..19c6bf0e3a 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -263,6 +263,22 @@ func (p *partitionProducer) grabCnx() error { p.encryptor = internalcrypto.NewNoopEncryptor() } + if p.sequenceIDGenerator == nil { + nextSequenceID := uint64(res.Response.ProducerSuccess.GetLastSequenceId() + 1) + p.sequenceIDGenerator = &nextSequenceID + } + + schemaVersion := res.Response.ProducerSuccess.GetSchemaVersion() + if len(schemaVersion) != 0 { + p.schemaCache.Put(p.schemaInfo, schemaVersion) + } + + p._setConn(res.Cnx) + err = p._getConn().RegisterListener(p.producerID, p) + if err != nil { + return err + } + if p.options.DisableBatching { // removed BatchBuilder creation } else if p.batchBuilder == nil { @@ -270,9 +286,9 @@ func (p *partitionProducer) grabCnx() error { if err != nil { provider, _ = GetBatcherBuilderProvider(DefaultBatchBuilder) } - + maxMessageSize := uint32(p._getConn().GetMaxMessageSize()) p.batchBuilder, err = provider(p.options.BatchingMaxMessages, p.options.BatchingMaxSize, - p.producerName, p.producerID, pb.CompressionType(p.options.CompressionType), + maxMessageSize, p.producerName, p.producerID, pb.CompressionType(p.options.CompressionType), compression.Level(p.options.CompressionLevel), p, p.log, @@ -282,21 +298,6 @@ func (p *partitionProducer) grabCnx() error { } } - if p.sequenceIDGenerator == nil { - nextSequenceID := uint64(res.Response.ProducerSuccess.GetLastSequenceId() + 1) - p.sequenceIDGenerator = &nextSequenceID - } - - schemaVersion := res.Response.ProducerSuccess.GetSchemaVersion() - if len(schemaVersion) != 0 { - p.schemaCache.Put(p.schemaInfo, schemaVersion) - } - - p._setConn(res.Cnx) - err = p._getConn().RegisterListener(p.producerID, p) - if err != nil { - return err - } p.log.WithFields(log.Fields{ "cnx": res.Cnx.ID(), "epoch": atomic.LoadUint64(&p.epoch), @@ -461,7 +462,7 @@ func (p *partitionProducer) Name() string { } func (p *partitionProducer) internalSend(request *sendRequest) { - p.log.Debug("Received send request: ", *request) + p.log.Debug("Received send request: ", *request.msg) msg := request.msg @@ -524,22 +525,6 @@ func (p *partitionProducer) internalSend(request *sendRequest) { uncompressedSize := len(uncompressedPayload) - // compress payload - compressedPayload := p.compressionProvider.Compress(nil, uncompressedPayload) - compressedSize := len(compressedPayload) - - // if msg is too large and chunking is disabled - if compressedSize > int(p._getConn().GetMaxMessageSize()) && !p.options.EnableChunking { - p.publishSemaphore.Release() - request.callback(nil, request.msg, errMessageTooLarge) - p.log.WithError(errMessageTooLarge). - WithField("size", compressedSize). - WithField("properties", msg.Properties). - Errorf("MaxMessageSize %d", int(p._getConn().GetMaxMessageSize())) - p.metrics.PublishErrorsMsgTooLarge.Inc() - return - } - mm := p.genMetadata(msg, uncompressedSize) // set default ReplicationClusters when DisableReplication @@ -557,6 +542,34 @@ func (p *partitionProducer) internalSend(request *sendRequest) { msg.ReplicationClusters == nil && deliverAt.UnixNano() < 0 + maxMessageSize := int(p._getConn().GetMaxMessageSize()) + + // compress payload if not batching + var compressedPayload []byte + var compressedSize int + var checkSize int + if !sendAsBatch { + compressedPayload = p.compressionProvider.Compress(nil, uncompressedPayload) + compressedSize = len(compressedPayload) + checkSize = compressedSize + } else { + // final check for batching message is in serializeMessage + // this is a double check + checkSize = uncompressedSize + } + + // if msg is too large and chunking is disabled + if checkSize > maxMessageSize && !p.options.EnableChunking { + p.publishSemaphore.Release() + request.callback(nil, request.msg, errMessageTooLarge) + p.log.WithError(errMessageTooLarge). + WithField("size", checkSize). + WithField("properties", msg.Properties). + Errorf("MaxMessageSize %d", maxMessageSize) + p.metrics.PublishErrorsMsgTooLarge.Inc() + return + } + var totalChunks int // max chunk payload size var payloadChunkSize int @@ -592,30 +605,35 @@ func (p *partitionProducer) internalSend(request *sendRequest) { request.totalChunks = totalChunks if !sendAsBatch { + if msg.SequenceID != nil { + mm.SequenceId = proto.Uint64(uint64(*msg.SequenceID)) + } else { + mm.SequenceId = proto.Uint64(internal.GetAndAdd(p.sequenceIDGenerator, 1)) + } if totalChunks > 1 { var lhs, rhs int uuid := fmt.Sprintf("%s-%d", p.producerName, mm.SequenceId) mm.Uuid = proto.String(uuid) mm.NumChunksFromMsg = proto.Int(totalChunks) mm.TotalChunkMsgSize = proto.Int(compressedSize) - for chunkId := 0; chunkId < totalChunks; chunkId++ { - lhs = chunkId * payloadChunkSize + for chunkID := 0; chunkID < totalChunks; chunkID++ { + lhs = chunkID * payloadChunkSize if rhs = lhs + payloadChunkSize; rhs > compressedSize { rhs = compressedSize } // update chunk id - mm.ChunkId = proto.Int(chunkId) + mm.ChunkId = proto.Int(chunkID) nsr := &sendRequest{ ctx: request.ctx, msg: request.msg, callback: request.callback, publishTime: request.publishTime, - chunkId: chunkId, + chunkID: chunkID, } - p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr) + p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr, uint32(maxMessageSize)) } } else { - p.internalSingleSend(mm, compressedPayload, request) + p.internalSingleSend(mm, compressedPayload, request, uint32(maxMessageSize)) } } else { smm := p.genSingleMetaMessage(msg, uncompressedSize) @@ -647,16 +665,8 @@ func (p *partitionProducer) internalSend(request *sendRequest) { } func (p *partitionProducer) genMetadata(msg *ProducerMessage, uncompressedSize int) (mm *pb.MessageMetadata) { - var sequenceID uint64 - if msg.SequenceID != nil { - sequenceID = uint64(*msg.SequenceID) - } else { - sequenceID = internal.GetAndAdd(p.sequenceIDGenerator, 1) - } - mm = &pb.MessageMetadata{ ProducerName: &p.producerName, - SequenceId: proto.Uint64(sequenceID), PublishTime: proto.Uint64(internal.TimestampMillis(time.Now())), ReplicateTo: msg.ReplicationClusters, UncompressedSize: proto.Uint32(uint32(uncompressedSize)), @@ -681,7 +691,8 @@ func (p *partitionProducer) genMetadata(msg *ProducerMessage, uncompressedSize i return } -func (p *partitionProducer) genSingleMetaMessage(msg *ProducerMessage, uncompressedSize int) (smm *pb.SingleMessageMetadata) { +func (p *partitionProducer) genSingleMetaMessage(msg *ProducerMessage, + uncompressedSize int) (smm *pb.SingleMessageMetadata) { smm = &pb.SingleMessageMetadata{ PayloadSize: proto.Int(uncompressedSize), } @@ -714,7 +725,10 @@ func (p *partitionProducer) genSingleMetaMessage(msg *ProducerMessage, uncompres return } -func (p *partitionProducer) internalSingleSend(mm *pb.MessageMetadata, compressedPayload []byte, request *sendRequest) { +func (p *partitionProducer) internalSingleSend(mm *pb.MessageMetadata, + compressedPayload []byte, + request *sendRequest, + maxMessageSize uint32) { msg := request.msg payloadBuf := internal.NewBuffer(len(compressedPayload)) @@ -734,7 +748,9 @@ func (p *partitionProducer) internalSingleSend(mm *pb.MessageMetadata, compresse mm, payloadBuf, p.encryptor, + maxMessageSize, ); err != nil { + request.callback(nil, request.msg, err) p.log.WithError(err).Errorf("Single message serialize failed %s", msg.Value) return } @@ -876,7 +892,7 @@ func (p *partitionProducer) failTimeoutMessages() { WithField("properties", sr.msg.Properties) } // the callback is only invoked by last chunk when chunked - if sr.callback != nil && (sr.totalChunks <= 1 || (sr.chunkId == sr.totalChunks-1)) { + if sr.callback != nil && (sr.totalChunks <= 1 || (sr.chunkID == sr.totalChunks-1)) { sr.callback(nil, sr.msg, errSendTimeout) } } @@ -895,7 +911,7 @@ func (p *partitionProducer) failTimeoutMessages() { } func (p *partitionProducer) internalFlushCurrentBatches() { - batchesData, sequenceIDs, callbacks, errors := p.batchBuilder.FlushBatches() + batchesData, sequenceIDs, callbacks, errs := p.batchBuilder.FlushBatches() if batchesData == nil { return } @@ -903,10 +919,10 @@ func (p *partitionProducer) internalFlushCurrentBatches() { for i := range batchesData { // error occurred in processing batch // report it using callback - if errors[i] != nil { + if errs[i] != nil { for _, cb := range callbacks[i] { if sr, ok := cb.(*sendRequest); ok { - sr.callback(nil, sr.msg, errors[i]) + sr.callback(nil, sr.msg, errs[i]) } } continue @@ -977,6 +993,10 @@ func (p *partitionProducer) Send(ctx context.Context, msg *ProducerMessage) (Mes // wait for send request to finish <-doneCh + + // handle internal error + p.internalErrHandle(err) + return msgID, err } @@ -1164,7 +1184,7 @@ type sendRequest struct { blockCh chan struct{} blockOnce sync.Once totalChunks int - chunkId int + chunkID int } type closeProducer struct { @@ -1207,21 +1227,30 @@ func (p *partitionProducer) canAddToQueue(sr *sendRequest) bool { } return false } + } else if !p.publishSemaphore.Acquire(sr.ctx) { + sr.callback(nil, sr.msg, errContextExpired) + sr.blockOnce.Do(func() { + sr.blockCh <- struct{}{} + }) + return false } else { - if !p.publishSemaphore.Acquire(sr.ctx) { - sr.callback(nil, sr.msg, errContextExpired) - sr.blockOnce.Do(func() { - sr.blockCh <- struct{}{} - }) - return false - } else { - sr.blockOnce.Do(func() { - sr.blockCh <- struct{}{} - }) - } + sr.blockOnce.Do(func() { + sr.blockCh <- struct{}{} + }) } - p.metrics.MessagesPending.Inc() p.metrics.BytesPending.Add(float64(len(sr.msg.Payload))) return true } + +func (p *partitionProducer) internalErrHandle(err error) { + if err == nil { + return + } else if errors.Is(err, internal.ErrExceedMaxMessageSize) { + p.log.WithError(errMessageTooLarge). + Errorf("internal err: %s", err) + p.metrics.PublishErrorsMsgTooLarge.Inc() + return + } + // other internal error can be handled here +} diff --git a/pulsar/producer_test.go b/pulsar/producer_test.go index 494f076e4b..b9653cc523 100644 --- a/pulsar/producer_test.go +++ b/pulsar/producer_test.go @@ -19,6 +19,7 @@ package pulsar import ( "context" + "errors" "fmt" "net/http" "strconv" @@ -934,11 +935,43 @@ func TestMaxMessageSize(t *testing.T) { assert.NotNil(t, producer) defer producer.Close() + // producer2 disable batching + producer2, err := client.CreateProducer(ProducerOptions{ + Topic: newTopicName(), + DisableBatching: true, + }) + assert.NoError(t, err) + assert.NotNil(t, producer2) + defer producer2.Close() + + // When serverMaxMessageSize=1024, the batch payload=1041 + // The totalSize includes: + // | singleMsgMetadataLength | singleMsgMetadata | payload | + // | ----------------------- | ----------------- | ------- | + // | 4 | 13 | 1024 | + // So when bias <= 0, the uncompressed payload will not exceed maxMessageSize, + // but encryptedPayloadSize exceeds maxMessageSize, Send() will return an internal error. + // When bias = 1, the first check of maxMessageSize (for uncompressed payload) is valid, + // Send() will return errMessageTooLarge for bias := -1; bias <= 1; bias++ { payload := make([]byte, serverMaxMessageSize+bias) ID, err := producer.Send(context.Background(), &ProducerMessage{ Payload: payload, }) + if bias <= 0 { + assert.Equal(t, true, errors.Is(err, internal.ErrExceedMaxMessageSize)) + assert.Nil(t, ID) + } else { + assert.Equal(t, errMessageTooLarge, err) + } + } + + // singleMsgMetadata is not included in the payload of a non-batching message + for bias := -1; bias <= 1; bias++ { + payload := make([]byte, serverMaxMessageSize+bias) + ID, err := producer2.Send(context.Background(), &ProducerMessage{ + Payload: payload, + }) if bias <= 0 { assert.NoError(t, err) assert.NotNil(t, ID) From ed7915d7d1d32172b3a50d294ce5e7adbfbe9bdb Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Thu, 28 Jul 2022 22:41:31 +0800 Subject: [PATCH 04/40] feat: Implemented chunking on the consumer side --- pulsar/consumer.go | 11 + pulsar/consumer_impl.go | 71 ++-- pulsar/consumer_partition.go | 556 +++++++++++++++++++++++------- pulsar/consumer_test.go | 29 +- pulsar/impl_message.go | 40 +++ pulsar/internal/timewheel.go | 174 ++++++++++ pulsar/internal/timewheel_test.go | 81 +++++ pulsar/negative_acks_tracker.go | 6 +- pulsar/test_helper.go | 2 +- 9 files changed, 817 insertions(+), 153 deletions(-) create mode 100644 pulsar/internal/timewheel.go create mode 100644 pulsar/internal/timewheel_test.go diff --git a/pulsar/consumer.go b/pulsar/consumer.go index 2df16374c4..994dd7889b 100644 --- a/pulsar/consumer.go +++ b/pulsar/consumer.go @@ -189,6 +189,17 @@ type ConsumerOptions struct { // error information of the Ack method only contains errors that may occur in the Go SDK's own processing. // Default: false AckWithResponse bool + + // MaxPendingChunkedMessage sets the maximum pending chunked messages. (default: 100) + MaxPendingChunkedMessage int + + // ExpireTimeOfIncompleteChunk sets the expiry time of discarding incomplete chunked message. + // The timing accuracy is 100ms level and the default value is 60 * 1000 millis + ExpireTimeOfIncompleteChunk time.Duration + + // AutoAckIncompleteChunk sets whether consumer auto acknowledges incomplete chunked message when it should + // be removed (e.g.the chunked message pending queue is full). (default: false) + AutoAckIncompleteChunk bool } // Consumer is an interface that abstracts behavior of Pulsar's consumer diff --git a/pulsar/consumer_impl.go b/pulsar/consumer_impl.go index 2328ca882b..9dc94628c1 100644 --- a/pulsar/consumer_impl.go +++ b/pulsar/consumer_impl.go @@ -36,8 +36,8 @@ import ( const defaultNackRedeliveryDelay = 1 * time.Minute type acker interface { - AckID(id trackingMessageID) error - NackID(id trackingMessageID) + AckID(id MessageID) error + NackID(id MessageID) NackMsg(msg Message) } @@ -91,6 +91,15 @@ func newConsumer(client *client, options ConsumerOptions) (Consumer, error) { } } + if options.MaxPendingChunkedMessage == 0 { + options.MaxPendingChunkedMessage = 100 + } + + // the minimum timer interval is 100ms + if options.ExpireTimeOfIncompleteChunk < 100*time.Millisecond { + options.ExpireTimeOfIncompleteChunk = time.Minute + } + if options.NackBackoffPolicy == nil && options.EnableDefaultNackBackoffPolicy { options.NackBackoffPolicy = new(defaultNackBackoffPolicy) } @@ -342,27 +351,30 @@ func (c *consumer) internalTopicSubscribeToPartitions() error { nackRedeliveryDelay = c.options.NackRedeliveryDelay } opts := &partitionConsumerOpts{ - topic: pt, - consumerName: c.consumerName, - subscription: c.options.SubscriptionName, - subscriptionType: c.options.Type, - subscriptionInitPos: c.options.SubscriptionInitialPosition, - partitionIdx: idx, - receiverQueueSize: receiverQueueSize, - nackRedeliveryDelay: nackRedeliveryDelay, - nackBackoffPolicy: c.options.NackBackoffPolicy, - metadata: metadata, - subProperties: subProperties, - replicateSubscriptionState: c.options.ReplicateSubscriptionState, - startMessageID: trackingMessageID{}, - subscriptionMode: durable, - readCompacted: c.options.ReadCompacted, - interceptors: c.options.Interceptors, - maxReconnectToBroker: c.options.MaxReconnectToBroker, - keySharedPolicy: c.options.KeySharedPolicy, - schema: c.options.Schema, - decryption: c.options.Decryption, - ackWithResponse: c.options.AckWithResponse, + topic: pt, + consumerName: c.consumerName, + subscription: c.options.SubscriptionName, + subscriptionType: c.options.Type, + subscriptionInitPos: c.options.SubscriptionInitialPosition, + partitionIdx: idx, + receiverQueueSize: receiverQueueSize, + nackRedeliveryDelay: nackRedeliveryDelay, + nackBackoffPolicy: c.options.NackBackoffPolicy, + metadata: metadata, + subProperties: subProperties, + replicateSubscriptionState: c.options.ReplicateSubscriptionState, + startMessageID: trackingMessageID{}, + subscriptionMode: durable, + readCompacted: c.options.ReadCompacted, + interceptors: c.options.Interceptors, + maxReconnectToBroker: c.options.MaxReconnectToBroker, + keySharedPolicy: c.options.KeySharedPolicy, + schema: c.options.Schema, + decryption: c.options.Decryption, + ackWithResponse: c.options.AckWithResponse, + maxPendingChunkedMessage: c.options.MaxPendingChunkedMessage, + expireTimeOfIncompleteChunk: c.options.ExpireTimeOfIncompleteChunk, + autoAckIncompleteChunk: c.options.AutoAckIncompleteChunk, } cons, err := newPartitionConsumer(c, c.client, opts, c.messageCh, c.dlq, c.metrics) ch <- ConsumerError{ @@ -453,16 +465,13 @@ func (c *consumer) Ack(msg Message) error { // AckID the consumption of a single message, identified by its MessageID func (c *consumer) AckID(msgID MessageID) error { - mid, ok := c.messageID(msgID) - if !ok { - return errors.New("failed to convert trackingMessageID") - } - - if mid.consumer != nil { - return mid.Ack() + if msgID.PartitionIdx() < 0 || int(msgID.PartitionIdx()) >= len(c.consumers) { + c.log.Errorf("invalid partition index %d expected a partition between [0-%d]", + msgID.PartitionIdx(), len(c.consumers)) + return errors.New("invalid partition index") } - return c.consumers[mid.partitionIdx].AckID(mid) + return c.consumers[msgID.PartitionIdx()].AckID(msgID) } // ReconsumeLater mark a message for redelivery after custom delay diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index fac9d4bdd2..ba2b108a0d 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -18,6 +18,7 @@ package pulsar import ( + "container/list" "encoding/hex" "errors" "fmt" @@ -83,29 +84,32 @@ const ( ) type partitionConsumerOpts struct { - topic string - consumerName string - subscription string - subscriptionType SubscriptionType - subscriptionInitPos SubscriptionInitialPosition - partitionIdx int - receiverQueueSize int - nackRedeliveryDelay time.Duration - nackBackoffPolicy NackBackoffPolicy - metadata map[string]string - subProperties map[string]string - replicateSubscriptionState bool - startMessageID trackingMessageID - startMessageIDInclusive bool - subscriptionMode subscriptionMode - readCompacted bool - disableForceTopicCreation bool - interceptors ConsumerInterceptors - maxReconnectToBroker *uint - keySharedPolicy *KeySharedPolicy - schema Schema - decryption *MessageDecryptionInfo - ackWithResponse bool + topic string + consumerName string + subscription string + subscriptionType SubscriptionType + subscriptionInitPos SubscriptionInitialPosition + partitionIdx int + receiverQueueSize int + nackRedeliveryDelay time.Duration + nackBackoffPolicy NackBackoffPolicy + metadata map[string]string + subProperties map[string]string + replicateSubscriptionState bool + startMessageID trackingMessageID + startMessageIDInclusive bool + subscriptionMode subscriptionMode + readCompacted bool + disableForceTopicCreation bool + interceptors ConsumerInterceptors + maxReconnectToBroker *uint + keySharedPolicy *KeySharedPolicy + schema Schema + decryption *MessageDecryptionInfo + ackWithResponse bool + maxPendingChunkedMessage int + expireTimeOfIncompleteChunk time.Duration + autoAckIncompleteChunk bool } type partitionConsumer struct { @@ -150,6 +154,9 @@ type partitionConsumer struct { metrics *internal.LeveledMetrics decryptor cryptointernal.Decryptor schemaInfoCache *schemaInfoCache + + chunkedMsgCtxMap *chunkedMsgCtxMap + unAckedChunkedIDSequences *unAckedChunkedIDSequences } type schemaInfoCache struct { @@ -202,28 +209,30 @@ func newPartitionConsumer(parent Consumer, client *client, options *partitionCon messageCh chan ConsumerMessage, dlq *dlqRouter, metrics *internal.LeveledMetrics) (*partitionConsumer, error) { pc := &partitionConsumer{ - parentConsumer: parent, - client: client, - options: options, - topic: options.topic, - name: options.consumerName, - consumerID: client.rpcClient.NewConsumerID(), - partitionIdx: int32(options.partitionIdx), - eventsCh: make(chan interface{}, 10), - queueSize: int32(options.receiverQueueSize), - queueCh: make(chan []*message, options.receiverQueueSize), - startMessageID: options.startMessageID, - connectedCh: make(chan struct{}), - messageCh: messageCh, - connectClosedCh: make(chan connectionClosed, 10), - closeCh: make(chan struct{}), - clearQueueCh: make(chan func(id trackingMessageID)), - clearMessageQueuesCh: make(chan chan struct{}), - compressionProviders: sync.Map{}, - dlq: dlq, - metrics: metrics, - schemaInfoCache: newSchemaInfoCache(client, options.topic), - } + parentConsumer: parent, + client: client, + options: options, + topic: options.topic, + name: options.consumerName, + consumerID: client.rpcClient.NewConsumerID(), + partitionIdx: int32(options.partitionIdx), + eventsCh: make(chan interface{}, 10), + queueSize: int32(options.receiverQueueSize), + queueCh: make(chan []*message, options.receiverQueueSize), + startMessageID: options.startMessageID, + connectedCh: make(chan struct{}), + messageCh: messageCh, + connectClosedCh: make(chan connectionClosed, 10), + closeCh: make(chan struct{}), + clearQueueCh: make(chan func(id trackingMessageID)), + clearMessageQueuesCh: make(chan chan struct{}), + compressionProviders: sync.Map{}, + dlq: dlq, + metrics: metrics, + schemaInfoCache: newSchemaInfoCache(client, options.topic), + unAckedChunkedIDSequences: newUnAckedChunkedIDSequences(), + } + pc.chunkedMsgCtxMap = newChunkedMsgCtxMap(options.maxPendingChunkedMessage, pc) pc.setConsumerState(consumerInit) pc.log = client.log.SubLogger(log.Fields{ "name": pc.name, @@ -366,17 +375,29 @@ func (pc *partitionConsumer) requestGetLastMessageID() (trackingMessageID, error return convertToMessageID(id), nil } -func (pc *partitionConsumer) AckID(msgID trackingMessageID) error { +func (pc *partitionConsumer) AckID(msgID MessageID) error { if state := pc.getConsumerState(); state == consumerClosed || state == consumerClosing { pc.log.WithField("state", state).Error("Failed to ack by closing or closed consumer") return errors.New("consumer state is closed") } ackReq := new(ackRequest) - if !msgID.Undefined() && msgID.ack() { + + if tmid, ok := toTrackingMessageID(msgID); ok { + if tmid.ack() { + pc.metrics.AcksCounter.Inc() + pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-tmid.receivedTime.UnixNano()) / 1.0e9) + ackReq.msgID = tmid + // send ack request to eventsCh + pc.eventsCh <- ackReq + + pc.options.interceptors.OnAcknowledge(pc.parentConsumer, msgID) + } + } else if cmid, ok := toChunkedMessageID(msgID); ok { + // todo: too redundent, maybe better implement pc.metrics.AcksCounter.Inc() - pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-msgID.receivedTime.UnixNano()) / 1.0e9) - ackReq.msgID = msgID + pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-cmid.receivedTime.UnixNano()) / 1.0e9) + ackReq.msgID = cmid // send ack request to eventsCh pc.eventsCh <- ackReq @@ -386,8 +407,8 @@ func (pc *partitionConsumer) AckID(msgID trackingMessageID) error { return ackReq.err } -func (pc *partitionConsumer) NackID(msgID trackingMessageID) { - pc.nackTracker.Add(msgID.messageID) +func (pc *partitionConsumer) NackID(msgID MessageID) { + pc.nackTracker.Add(msgID) pc.metrics.NacksCounter.Inc() } @@ -450,6 +471,9 @@ func (pc *partitionConsumer) Close() { return } + // close chunkedMsgCtxMap + pc.chunkedMsgCtxMap.close() + req := &closeRequest{doneCh: make(chan struct{})} pc.eventsCh <- req @@ -457,15 +481,23 @@ func (pc *partitionConsumer) Close() { <-req.doneCh } -func (pc *partitionConsumer) Seek(msgID trackingMessageID) error { +func (pc *partitionConsumer) Seek(msgID MessageID) error { if state := pc.getConsumerState(); state == consumerClosed || state == consumerClosing { pc.log.WithField("state", state).Error("Failed to seek by closing or closed consumer") return errors.New("failed to seek by closing or closed consumer") } req := &seekRequest{ doneCh: make(chan struct{}), - msgID: msgID, } + if tmid, ok := toTrackingMessageID(msgID); ok { + req.msgID = tmid.messageID + } else if cmid, ok := toChunkedMessageID(msgID); ok { + req.msgID = cmid.firstChunkID + } else { + // will never reach + return errors.New("unhandled messageID type") + } + pc.eventsCh <- req // wait for the request to complete @@ -475,7 +507,7 @@ func (pc *partitionConsumer) Seek(msgID trackingMessageID) error { func (pc *partitionConsumer) internalSeek(seek *seekRequest) { defer close(seek.doneCh) - seek.err = pc.requestSeek(seek.msgID.messageID) + seek.err = pc.requestSeek(seek.msgID) } func (pc *partitionConsumer) requestSeek(msgID messageID) error { if err := pc.requestSeekWithoutClear(msgID); err != nil { @@ -570,8 +602,8 @@ func (pc *partitionConsumer) internalAck(req *ackRequest) { messageIDs := make([]*pb.MessageIdData, 1) messageIDs[0] = &pb.MessageIdData{ - LedgerId: proto.Uint64(uint64(msgID.ledgerID)), - EntryId: proto.Uint64(uint64(msgID.entryID)), + LedgerId: proto.Uint64(uint64(msgID.LedgerID())), + EntryId: proto.Uint64(uint64(msgID.EntryID())), } reqID := pc.client.rpcClient.NewRequestID() @@ -660,8 +692,23 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header } } + isChunkedMsg := false + if msgMeta.GetNumChunksFromMsg() > 1 { + isChunkedMsg = true + } + + processedPayloadBuffer := internal.NewBufferWrapper(decryptedPayload) + if isChunkedMsg { + processedPayloadBuffer = pc.processMessageChunk(processedPayloadBuffer, msgMeta, pbMsgID) + if processedPayloadBuffer == nil { + return nil + } + } else { + processedPayloadBuffer = internal.NewBufferWrapper(decryptedPayload) + } + // decryption is success, decompress the payload - uncompressedHeadersAndPayload, err := pc.Decompress(msgMeta, internal.NewBufferWrapper(decryptedPayload)) + uncompressedHeadersAndPayload, err := pc.Decompress(msgMeta, processedPayloadBuffer) if err != nil { pc.discardCorruptedMessage(pbMsgID, pb.CommandAck_DecompressionError) return err @@ -676,41 +723,50 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header } messages := make([]*message, 0) - var ackTracker *ackTracker - // are there multiple messages in this batch? - if numMsgs > 1 { - ackTracker = newAckTracker(numMsgs) - } pc.metrics.MessagesReceived.Add(float64(numMsgs)) pc.metrics.PrefetchedMessages.Add(float64(numMsgs)) - for i := 0; i < numMsgs; i++ { - smm, payload, err := reader.ReadMessage() - if err != nil || payload == nil { - pc.discardCorruptedMessage(pbMsgID, pb.CommandAck_BatchDeSerializeError) - return err + var messageIndex *uint64 + var brokerPublishTime *time.Time + var msg *message + + // If no batched + if numMsgs == 1 { + var msgID MessageID + var shouldDiscard bool + if isChunkedMsg { + ctx := pc.chunkedMsgCtxMap.get(msgMeta.GetUuid()) + if ctx == nil { + // chunkedMsgCtxMap has closed because of consumer closed + return nil + } + if len(ctx.chunkedMsgIDs) > 0 { + msgID = newChunkMessageID(ctx.firstChunkID(), ctx.lastChunkID()) + // todo: maybe race + pc.unAckedChunkedIDSequences.add(string(msgID.Serialize()), ctx.chunkedMsgIDs) + // todo: is first or last? + shouldDiscard = pc.messageShouldBeDiscarded(msgID.(chunkMessageID).messageID) + } + // todo: no unackedTracker + } else { + msgID = messageID{ + ledgerID: int64(pbMsgID.GetLedgerId()), + entryID: int64(pbMsgID.GetEntryId()), + batchIdx: -1, + partitionIdx: pc.partitionIdx, + } + shouldDiscard = pc.messageShouldBeDiscarded(msgID.(messageID)) } - pc.metrics.BytesReceived.Add(float64(len(payload))) - pc.metrics.PrefetchedBytes.Add(float64(len(payload))) - - msgID := newTrackingMessageID( - int64(pbMsgID.GetLedgerId()), - int64(pbMsgID.GetEntryId()), - int32(i), - pc.partitionIdx, - ackTracker) - - if pc.messageShouldBeDiscarded(msgID) { + if shouldDiscard { pc.AckID(msgID) - continue + return nil } - var messageIndex *uint64 - var brokerPublishTime *time.Time + if brokerMetadata != nil { if brokerMetadata.Index != nil { - aux := brokerMetadata.GetIndex() - uint64(numMsgs) + uint64(i) + 1 + aux := brokerMetadata.GetIndex() - uint64(numMsgs) + 1 messageIndex = &aux } if brokerMetadata.BrokerTimestamp != nil { @@ -718,10 +774,78 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header brokerPublishTime = &aux } } - // set the consumer so we know how to ack the message id - msgID.consumer = pc - var msg *message - if smm != nil { + + _, payload, err := reader.ReadMessage() + if err != nil || payload == nil { + pc.discardCorruptedMessage(pbMsgID, pb.CommandAck_BatchDeSerializeError) + return err + } + + pc.metrics.BytesReceived.Add(float64(len(payload))) + pc.metrics.PrefetchedBytes.Add(float64(len(payload))) + + msg = &message{ + publishTime: timeFromUnixTimestampMillis(msgMeta.GetPublishTime()), + eventTime: timeFromUnixTimestampMillis(msgMeta.GetEventTime()), + key: msgMeta.GetPartitionKey(), + producerName: msgMeta.GetProducerName(), + properties: internal.ConvertToStringMap(msgMeta.GetProperties()), + topic: pc.topic, + msgID: msgID, + payLoad: payload, + schema: pc.options.schema, + replicationClusters: msgMeta.GetReplicateTo(), + replicatedFrom: msgMeta.GetReplicatedFrom(), + redeliveryCount: response.GetRedeliveryCount(), + schemaVersion: msgMeta.GetSchemaVersion(), + schemaInfoCache: pc.schemaInfoCache, + index: messageIndex, + brokerPublishTime: brokerPublishTime, + } + + pc.options.interceptors.BeforeConsume(ConsumerMessage{ + Consumer: pc.parentConsumer, + Message: msg, + }) + + messages = append(messages, msg) + } else { + for i := 0; i < numMsgs; i++ { + tracker := newAckTracker(numMsgs) + smm, payload, err := reader.ReadMessage() + if err != nil || payload == nil { + pc.discardCorruptedMessage(pbMsgID, pb.CommandAck_BatchDeSerializeError) + return err + } + + pc.metrics.BytesReceived.Add(float64(len(payload))) + pc.metrics.PrefetchedBytes.Add(float64(len(payload))) + + msgID := newTrackingMessageID( + int64(pbMsgID.GetLedgerId()), + int64(pbMsgID.GetEntryId()), + int32(i), + pc.partitionIdx, + tracker) + + if pc.messageShouldBeDiscarded(msgID.messageID) { + pc.AckID(msgID) + continue + } + + if brokerMetadata != nil { + if brokerMetadata.Index != nil { + aux := brokerMetadata.GetIndex() - uint64(numMsgs) + uint64(i) + 1 + messageIndex = &aux + } + if brokerMetadata.BrokerTimestamp != nil { + aux := timeFromUnixTimestampMillis(*brokerMetadata.BrokerTimestamp) + brokerPublishTime = &aux + } + } + // set the consumer so we know how to ack the message id + msgID.consumer = pc + msg = &message{ publishTime: timeFromUnixTimestampMillis(msgMeta.GetPublishTime()), eventTime: timeFromUnixTimestampMillis(smm.GetEventTime()), @@ -741,33 +865,14 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header index: messageIndex, brokerPublishTime: brokerPublishTime, } - } else { - msg = &message{ - publishTime: timeFromUnixTimestampMillis(msgMeta.GetPublishTime()), - eventTime: timeFromUnixTimestampMillis(msgMeta.GetEventTime()), - key: msgMeta.GetPartitionKey(), - producerName: msgMeta.GetProducerName(), - properties: internal.ConvertToStringMap(msgMeta.GetProperties()), - topic: pc.topic, - msgID: msgID, - payLoad: payload, - schema: pc.options.schema, - replicationClusters: msgMeta.GetReplicateTo(), - replicatedFrom: msgMeta.GetReplicatedFrom(), - redeliveryCount: response.GetRedeliveryCount(), - schemaVersion: msgMeta.GetSchemaVersion(), - schemaInfoCache: pc.schemaInfoCache, - index: messageIndex, - brokerPublishTime: brokerPublishTime, - } - } - pc.options.interceptors.BeforeConsume(ConsumerMessage{ - Consumer: pc.parentConsumer, - Message: msg, - }) + pc.options.interceptors.BeforeConsume(ConsumerMessage{ + Consumer: pc.parentConsumer, + Message: msg, + }) - messages = append(messages, msg) + messages = append(messages, msg) + } } // send messages to the dispatcher @@ -775,7 +880,61 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header return nil } -func (pc *partitionConsumer) messageShouldBeDiscarded(msgID trackingMessageID) bool { +func (pc *partitionConsumer) processMessageChunk(compressedPayload internal.Buffer, + msgMeta *pb.MessageMetadata, + pbMsgID *pb.MessageIdData) internal.Buffer { + // todo: expire incomplete chunk message + uuid := msgMeta.GetUuid() + numChunks := msgMeta.GetNumChunksFromMsg() + totalChunksSize := int(msgMeta.GetTotalChunkMsgSize()) + chunkID := msgMeta.GetChunkId() + msgID := messageID{ + ledgerID: int64(pbMsgID.GetLedgerId()), + entryID: int64(pbMsgID.GetEntryId()), + batchIdx: -1, + partitionIdx: pc.partitionIdx, + } + + if msgMeta.GetChunkId() == 0 { + pc.chunkedMsgCtxMap.addIfAbsent(uuid, + numChunks, + totalChunksSize, + ) + } + + ctx := pc.chunkedMsgCtxMap.get(uuid) + + if ctx == nil || ctx.chunkedMsgBuffer == nil || chunkID != ctx.lastChunkedMsgID+1 || + int32(compressedPayload.ReadableBytes()+ctx.chunkedMsgBuffer.ReadableBytes()) > msgMeta.GetTotalChunkMsgSize() { + lastChunkedMsgID := -1 + totalChunks := -1 + if ctx != nil { + lastChunkedMsgID = int(ctx.lastChunkedMsgID) + totalChunks = int(ctx.totalChunks) + // todo: how to release buffer + ctx.chunkedMsgBuffer.Clear() + } + pc.log.Info(fmt.Sprintf( + "Received unexpected chunk messageId %s, last-chunk-id %d, chunkId = %d, total-chunks %d", + msgID.Serialize(), lastChunkedMsgID, chunkID, totalChunks)) + pc.chunkedMsgCtxMap.remove(uuid) + pc.availablePermits++ + // todo: expire tracker ack + return nil + } + + ctx.refresh(chunkID, msgID, compressedPayload) + + if msgMeta.GetChunkId() != msgMeta.GetNumChunksFromMsg()-1 { + pc.availablePermits++ + // todo: how to release buffer + return nil + } + + return ctx.chunkedMsgBuffer +} + +func (pc *partitionConsumer) messageShouldBeDiscarded(msgID messageID) bool { if pc.startMessageID.Undefined() { return false } @@ -785,11 +944,11 @@ func (pc *partitionConsumer) messageShouldBeDiscarded(msgID trackingMessageID) b } if pc.options.startMessageIDInclusive { - return pc.startMessageID.greater(msgID.messageID) + return pc.startMessageID.greater(msgID) } // Non inclusive - return pc.startMessageID.greaterEqual(msgID.messageID) + return pc.startMessageID.greaterEqual(msgID) } // create EncryptionContext from message metadata @@ -986,7 +1145,7 @@ func (pc *partitionConsumer) dispatcher() { } type ackRequest struct { - msgID trackingMessageID + msgID MessageID err error } @@ -1011,7 +1170,7 @@ type getLastMsgIDRequest struct { type seekRequest struct { doneCh chan struct{} - msgID trackingMessageID + msgID messageID err error } @@ -1444,3 +1603,168 @@ func convertToMessageID(id *pb.MessageIdData) trackingMessageID { return msgID } + +type chunkedMsgCtx struct { + totalChunks int32 + chunkedMsgBuffer internal.Buffer + lastChunkedMsgID int32 + chunkedMsgIDs []messageID + receivedTime int64 +} + +func newChunkedMsgCtx(numChunksFromMsg int32, totalChunkMsgSize int) *chunkedMsgCtx { + return &chunkedMsgCtx{ + totalChunks: numChunksFromMsg, + chunkedMsgBuffer: internal.NewBuffer(totalChunkMsgSize), + lastChunkedMsgID: -1, + chunkedMsgIDs: make([]messageID, numChunksFromMsg), + receivedTime: time.Now().Unix(), + } +} + +func (c *chunkedMsgCtx) refresh(chunkID int32, msgID messageID, partPayload internal.Buffer) { + c.chunkedMsgIDs[chunkID] = msgID + c.chunkedMsgBuffer.Write(partPayload.ReadableSlice()) + c.lastChunkedMsgID = chunkID +} + +func (c *chunkedMsgCtx) firstChunkID() messageID { + if len(c.chunkedMsgIDs) == 0 { + return messageID{} + } + return c.chunkedMsgIDs[0] +} + +func (c *chunkedMsgCtx) lastChunkID() messageID { + if len(c.chunkedMsgIDs) == 0 { + return messageID{} + } + return c.chunkedMsgIDs[len(c.chunkedMsgIDs)-1] +} + +type chunkedMsgCtxMap struct { + chunkedMsgCtxs map[string]*chunkedMsgCtx + pendingQueue *list.List + maxPending int + pc *partitionConsumer + mu sync.Mutex + tw *internal.TimeWheel + closed bool +} + +func newChunkedMsgCtxMap(maxPending int, pc *partitionConsumer) *chunkedMsgCtxMap { + // create a time wheel with 100ms timing accuracy and 36000 slots + tw := internal.NewTimeWheel(time.Millisecond*100, 36000) + tw.Start() + return &chunkedMsgCtxMap{ + chunkedMsgCtxs: make(map[string]*chunkedMsgCtx, maxPending), + pendingQueue: list.New(), + maxPending: maxPending, + pc: pc, + mu: sync.Mutex{}, + tw: tw, + } +} + +func (c *chunkedMsgCtxMap) addIfAbsent(uuid string, totalChunks int32, totalChunkMsgSize int) { + c.mu.Lock() + defer c.mu.Unlock() + if c.closed { + return + } + if _, ok := c.chunkedMsgCtxs[uuid]; !ok { + c.chunkedMsgCtxs[uuid] = newChunkedMsgCtx(totalChunks, totalChunkMsgSize) + c.pendingQueue.PushBack(uuid) + c.tw.AddJob(uuid, c.pc.options.expireTimeOfIncompleteChunk, func() { + // todo: autoAck is always true in Java client, why? + c.removeChunkMessage(uuid, c.pc.options.autoAckIncompleteChunk) + }) + } + if c.maxPending > 0 && c.pendingQueue.Len() > c.maxPending { + go c.removeChunkMessage(uuid, c.pc.options.autoAckIncompleteChunk) + } +} + +func (c *chunkedMsgCtxMap) get(uuid string) *chunkedMsgCtx { + c.mu.Lock() + defer c.mu.Unlock() + if c.closed { + return nil + } + return c.chunkedMsgCtxs[uuid] +} + +func (c *chunkedMsgCtxMap) remove(uuid string) { + c.mu.Lock() + defer c.mu.Unlock() + if c.closed { + return + } + delete(c.chunkedMsgCtxs, uuid) + e := c.pendingQueue.Front() + for ; e != nil; e = e.Next() { + if e.Value.(string) == uuid { + c.pendingQueue.Remove(e) + break + } + } +} + +func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { + c.mu.Lock() + defer c.mu.Unlock() + if c.closed { + return + } + if ctx, ok := c.chunkedMsgCtxs[uuid]; !ok { + return + } else { + for _, mid := range ctx.chunkedMsgIDs { + if autoAck { + c.pc.log.Info("Removing chunk message-id", mid.String()) + c.pc.AckID(mid) + } + } + } + delete(c.chunkedMsgCtxs, uuid) + e := c.pendingQueue.Front() + for ; e != nil; e = e.Next() { + if e.Value.(string) == uuid { + c.pendingQueue.Remove(e) + break + } + } +} + +func (c *chunkedMsgCtxMap) close() { + c.mu.Lock() + defer c.mu.Unlock() + c.closed = true + + c.tw.Stop() +} + +type unAckedChunkedIDSequences struct { + mu sync.Mutex + + ids map[string][]messageID +} + +func newUnAckedChunkedIDSequences() *unAckedChunkedIDSequences { + return &unAckedChunkedIDSequences{ + mu: sync.Mutex{}, + ids: make(map[string][]messageID), + } +} + +func (u *unAckedChunkedIDSequences) add(chunkIDStr string, unAckedIDs []messageID) { + u.mu.Lock() + defer u.mu.Unlock() + u.ids[chunkIDStr] = unAckedIDs +} + +func (u *unAckedChunkedIDSequences) get(chunkIDStr string) []messageID { + u.mu.Lock() + defer u.mu.Unlock() + return u.ids[chunkIDStr] +} diff --git a/pulsar/consumer_test.go b/pulsar/consumer_test.go index 20d4290726..819bbb7068 100644 --- a/pulsar/consumer_test.go +++ b/pulsar/consumer_test.go @@ -39,8 +39,8 @@ import ( ) var ( - adminURL = "http://localhost:8080" - lookupURL = "pulsar://localhost:6650" + adminURL = "http://10.105.7.225:8080" + lookupURL = "pulsar://10.105.7.225:6650" ) func TestProducerConsumer(t *testing.T) { @@ -177,6 +177,31 @@ func TestBatchMessageReceive(t *testing.T) { assert.Equal(t, count, numOfMessages) } +func TestChunkMessageReceive(t *testing.T) { + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + + assert.Nil(t, err) + defer client.Close() + + topicName := "persistent://public/default/receive-chunk" + subName := "subscription-name" + ctx := context.Background() + + consumer, err := client.Subscribe(ConsumerOptions{ + Topic: topicName, + SubscriptionName: subName, + }) + assert.Nil(t, err) + defer consumer.Close() + + msg, err := consumer.Receive(ctx) + assert.Nil(t, err) + fmt.Println("id:", msg.ID().(chunkMessageID).String(), "len:", len(msg.Payload())) + consumer.Ack(msg) +} + func TestConsumerWithInvalidConf(t *testing.T) { client, err := NewClient(ClientOptions{ URL: lookupURL, diff --git a/pulsar/impl_message.go b/pulsar/impl_message.go index 067439f39a..fe17471f49 100644 --- a/pulsar/impl_message.go +++ b/pulsar/impl_message.go @@ -204,6 +204,15 @@ func toTrackingMessageID(msgID MessageID) (trackingMessageID, bool) { } } +func toChunkedMessageID(msgID MessageID) (chunkMessageID, bool) { + cid, ok := msgID.(chunkMessageID) + if ok { + return cid, true + } else { + return chunkMessageID{}, false + } +} + func timeFromUnixTimestampMillis(timestamp uint64) time.Time { ts := int64(timestamp) * int64(time.Millisecond) seconds := ts / int64(time.Second) @@ -361,3 +370,34 @@ func (t *ackTracker) completed() bool { defer t.Unlock() return len(t.batchIDs.Bits()) == 0 } + +type chunkMessageID struct { + messageID + + firstChunkID messageID + receivedTime time.Time +} + +func newChunkMessageID(firstChunkID messageID, lastChunkID messageID) chunkMessageID { + return chunkMessageID{ + messageID: lastChunkID, + firstChunkID: firstChunkID, + receivedTime: time.Now(), + } +} + +func (id chunkMessageID) firstChunkMessageID() MessageID { + return id.firstChunkID +} + +func (id chunkMessageID) lastChunkMessageID() MessageID { + return id +} + +func (id chunkMessageID) String() string { + return fmt.Sprintf("%s;%s", id.firstChunkID.String(), id.messageID.String()) +} + +func (id chunkMessageID) Serialize() []byte { + return id.firstChunkID.Serialize() +} diff --git a/pulsar/internal/timewheel.go b/pulsar/internal/timewheel.go new file mode 100644 index 0000000000..1f231a1ae7 --- /dev/null +++ b/pulsar/internal/timewheel.go @@ -0,0 +1,174 @@ +package internal + +import ( + "container/list" + "log" + "time" +) + +// TimeWheel can execute job after waiting given duration +type TimeWheel struct { + interval time.Duration + ticker *time.Ticker + slots []*list.List + + timer map[string]int + currentPos int + slotNum int + addTaskChannel chan task + removeTaskChannel chan string + stopChannel chan bool +} + +type task struct { + delay time.Duration + circle int + key string + job func() +} + +// NewTimeWheel creates a new time wheel +func NewTimeWheel(interval time.Duration, slotNum int) *TimeWheel { + if interval <= 0 || slotNum <= 0 { + return nil + } + tw := &TimeWheel{ + interval: interval, + slots: make([]*list.List, slotNum), + timer: make(map[string]int), + currentPos: 0, + slotNum: slotNum, + addTaskChannel: make(chan task), + removeTaskChannel: make(chan string), + stopChannel: make(chan bool), + } + tw.initSlots() + + return tw +} + +func (tw *TimeWheel) initSlots() { + for i := 0; i < tw.slotNum; i++ { + tw.slots[i] = list.New() + } +} + +// Start starts ticker for time wheel +func (tw *TimeWheel) Start() { + tw.ticker = time.NewTicker(tw.interval) + go tw.start() +} + +// Stop stops the time wheel +func (tw *TimeWheel) Stop() { + tw.stopChannel <- true +} + +// AddJob add new job into pending queue +func (tw *TimeWheel) AddJob(key string, delay time.Duration, job func()) { + if delay < 0 { + return + } + tw.addTaskChannel <- task{delay: delay, key: key, job: job} +} + +// RemoveJob add remove job from pending queue +// if job is done or not found, then nothing happened +func (tw *TimeWheel) RemoveJob(key string) { + if key == "" { + return + } + tw.removeTaskChannel <- key +} + +func (tw *TimeWheel) start() { + for { + select { + case <-tw.ticker.C: + tw.tickHandler() + case task := <-tw.addTaskChannel: + tw.addTask(&task) + case key := <-tw.removeTaskChannel: + tw.removeTask(key) + case <-tw.stopChannel: + tw.ticker.Stop() + return + } + } +} + +func (tw *TimeWheel) tickHandler() { + l := tw.slots[tw.currentPos] + tw.scanAndRunTask(l) + if tw.currentPos == tw.slotNum-1 { + tw.currentPos = 0 + } else { + tw.currentPos++ + } +} + +func (tw *TimeWheel) scanAndRunTask(l *list.List) { + for e := l.Front(); e != nil; { + task := e.Value.(*task) + if task.circle > 0 { + task.circle-- + e = e.Next() + continue + } + + go func() { + defer func() { + if err := recover(); err != nil { + log.Fatalln(err) + } + }() + job := task.job + job() + }() + next := e.Next() + l.Remove(e) + if task.key != "" { + delete(tw.timer, task.key) + } + e = next + } +} + +func (tw *TimeWheel) addTask(task *task) { + pos, circle := tw.getPositionAndCircle(task.delay) + task.circle = circle + + tw.slots[pos].PushBack(task) + + if task.key != "" { + tw.timer[task.key] = pos + } +} + +func (tw *TimeWheel) getPositionAndCircle(d time.Duration) (pos int, circle int) { + delaySeconds := int(d.Milliseconds()) + intervalSeconds := int(tw.interval.Milliseconds()) + circle = int(delaySeconds / intervalSeconds / tw.slotNum) + pos = int(tw.currentPos+delaySeconds/intervalSeconds) % tw.slotNum + if pos < 0 { + pos = 0 + } + return +} + +func (tw *TimeWheel) removeTask(key string) { + position, ok := tw.timer[key] + if !ok { + return + } + l := tw.slots[position] + for e := l.Front(); e != nil; { + task := e.Value.(*task) + if task.key == key { + delete(tw.timer, task.key) + l.Remove(e) + } + + e = e.Next() + } +} diff --git a/pulsar/internal/timewheel_test.go b/pulsar/internal/timewheel_test.go new file mode 100644 index 0000000000..3a958f3b82 --- /dev/null +++ b/pulsar/internal/timewheel_test.go @@ -0,0 +1,81 @@ +package internal + +import ( + "fmt" + "testing" + "time" +) + +func callback() { + fmt.Println("callback !!!") +} + +func checkTimeCost(t *testing.T, start, end time.Time, before int, after int) bool { + due := end.Sub(start) + if due > time.Duration(after)*time.Millisecond { + t.Error("delay run") + return false + } + + if due < time.Duration(before)*time.Millisecond { + t.Error("run ahead") + return false + } + + return true +} + +func TestCalcPos(t *testing.T) { + tw := NewTimeWheel(100*time.Millisecond, 5) + idx, round := tw.getPositionAndCircle(1 * time.Second) + if round != 2 { + t.Error("round err", round) + } + if idx != 0 { + t.Error("idx err", idx) + } +} + +func TestAddFunc(t *testing.T) { + tw := NewTimeWheel(100*time.Millisecond, 5) + tw.Start() + defer tw.Stop() + + for index := 1; index < 6; index++ { + queue := make(chan bool, 0) + start := time.Now() + tw.AddJob(fmt.Sprintf("key_%d", index), time.Duration(index)*time.Second, func() { + queue <- true + }) + + <-queue + + before := index*1000 - 200 + after := index*1000 + 200 + checkTimeCost(t, start, time.Now(), before, after) + t.Log("time since: ", time.Since(start).String()) + } +} + +func TestRemove(t *testing.T) { + tw := NewTimeWheel(100*time.Millisecond, 5) + tw.Start() + defer tw.Stop() + + queue := make(chan bool, 0) + tw.AddJob("key", time.Millisecond*500, func() { + queue <- true + }) + + // remove action after add action + time.AfterFunc(time.Millisecond*10, func() { + tw.RemoveJob("key") + }) + + exitTimer := time.NewTimer(1 * time.Second) + select { + case <-exitTimer.C: + case <-queue: + t.Error("must not run") + } +} diff --git a/pulsar/negative_acks_tracker.go b/pulsar/negative_acks_tracker.go index 3485e1b02d..cde85195d6 100644 --- a/pulsar/negative_acks_tracker.go +++ b/pulsar/negative_acks_tracker.go @@ -65,12 +65,12 @@ func newNegativeAcksTracker(rc redeliveryConsumer, delay time.Duration, return t } -func (t *negativeAcksTracker) Add(msgID messageID) { +func (t *negativeAcksTracker) Add(msgID MessageID) { // Always clear up the batch index since we want to track the nack // for the entire batch batchMsgID := messageID{ - ledgerID: msgID.ledgerID, - entryID: msgID.entryID, + ledgerID: msgID.LedgerID(), + entryID: msgID.EntryID(), batchIdx: 0, } diff --git a/pulsar/test_helper.go b/pulsar/test_helper.go index d6a4f0042d..114144c65d 100644 --- a/pulsar/test_helper.go +++ b/pulsar/test_helper.go @@ -36,7 +36,7 @@ import ( ) const ( - serviceURL = "pulsar://localhost:6650" + serviceURL = "pulsar://10.105.7.225:6650" serviceURLTLS = "pulsar+ssl://localhost:6651" webServiceURL = "http://localhost:8080" From 04566306c01f277a14431af43bc77f708e92733a Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sun, 31 Jul 2022 20:35:03 +0800 Subject: [PATCH 05/40] test: add message_chunking_test.go for chunk consume/produce test --- pulsar/message_chunking_test.go | 312 ++++++++++++++++++++++++++++++++ 1 file changed, 312 insertions(+) create mode 100644 pulsar/message_chunking_test.go diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go new file mode 100644 index 0000000000..1cdd3577ed --- /dev/null +++ b/pulsar/message_chunking_test.go @@ -0,0 +1,312 @@ +package pulsar + +import ( + "context" + "errors" + "math/rand" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +var _brokerMaxMessageSize = 1024 * 1024 + +func TestInvalidChunkingConfig(t *testing.T) { + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + + assert.Nil(t, err) + defer client.Close() + + // create producer + producer, err := client.CreateProducer(ProducerOptions{ + Topic: newTopicName(), + DisableBatching: false, + EnableChunking: true, + }) + + assert.Error(t, err, "producer creation should have fail") + assert.Nil(t, producer) +} + +func TestLargeMessage(t *testing.T) { + rand.Seed(time.Now().Unix()) + + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + + assert.Nil(t, err) + defer client.Close() + + topic := newTopicName() + + // create producer without ChunkMaxMessageSize + producer1, err := client.CreateProducer(ProducerOptions{ + Topic: topic, + DisableBatching: true, + EnableChunking: true, + }) + assert.NoError(t, err) + assert.NotNil(t, producer1) + defer producer1.Close() + + // create producer with ChunkMaxMessageSize + producer2, err := client.CreateProducer(ProducerOptions{ + Topic: topic, + DisableBatching: true, + EnableChunking: true, + ChunkMaxMessageSize: 5, + }) + assert.NoError(t, err) + assert.NotNil(t, producer2) + defer producer2.Close() + + consumer, err := client.Subscribe(ConsumerOptions{ + Topic: topic, + Type: Exclusive, + SubscriptionName: "chunk-subscriber", + }) + assert.NoError(t, err) + assert.NotNil(t, consumer) + defer consumer.Close() + + expectMsgs := make([][]byte, 0, 10) + + // test send chunk with serverMaxMessageSize limit + for i := 0; i < 5; i++ { + msg := createTestMessagePayload(_brokerMaxMessageSize + 1) + expectMsgs = append(expectMsgs, msg) + ID, err := producer1.Send(context.Background(), &ProducerMessage{ + Payload: msg, + }) + assert.NoError(t, err) + assert.NotNil(t, ID) + } + + // test receive chunk with serverMaxMessageSize limit + for i := 0; i < 5; i++ { + msg, err := consumer.Receive(context.Background()) + assert.NoError(t, err) + + expectMsg := expectMsgs[i] + + assert.Equal(t, expectMsg, msg.Payload()) + // ack message + err = consumer.Ack(msg) + assert.NoError(t, err) + } + + // test send chunk with ChunkMaxMessageSize limit + for i := 0; i < 5; i++ { + msg := createTestMessagePayload(50) + expectMsgs = append(expectMsgs, msg) + ID, err := producer2.Send(context.Background(), &ProducerMessage{ + Payload: msg, + }) + assert.NoError(t, err) + assert.NotNil(t, ID) + } + + // test receive chunk with ChunkMaxMessageSize limit + for i := 5; i < 10; i++ { + msg, err := consumer.Receive(context.Background()) + assert.NoError(t, err) + + expectMsg := expectMsgs[i] + + assert.Equal(t, expectMsg, msg.Payload()) + // ack message + err = consumer.Ack(msg) + assert.NoError(t, err) + } +} + +func TestPublishChunkWithFailure(t *testing.T) { + rand.Seed(time.Now().Unix()) + + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + + assert.Nil(t, err) + defer client.Close() + + topic := newTopicName() + + // create producer without ChunkMaxMessageSize + producer, err := client.CreateProducer(ProducerOptions{ + Topic: topic, + }) + assert.NoError(t, err) + assert.NotNil(t, producer) + defer producer.Close() + + ID, err := producer.Send(context.Background(), &ProducerMessage{ + Payload: createTestMessagePayload(_brokerMaxMessageSize + 1), + }) + assert.Error(t, err) + assert.Nil(t, ID) +} + +func TestMaxPendingChunkMessages(t *testing.T) { + rand.Seed(time.Now().Unix()) + + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + assert.Nil(t, err) + defer client.Close() + + topic := newTopicName() + + totalProducers := 5 + producers := make([]Producer, 0, 20) + defer func() { + for _, p := range producers { + p.Close() + } + }() + + clients := make([]Client, 0, 20) + defer func() { + for _, c := range clients { + c.Close() + } + }() + + for j := 0; j < totalProducers; j++ { + pc, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + assert.Nil(t, err) + clients = append(clients, pc) + producer, err := pc.CreateProducer(ProducerOptions{ + Topic: topic, + DisableBatching: true, + EnableChunking: true, + ChunkMaxMessageSize: 10, + }) + assert.NoError(t, err) + assert.NotNil(t, producer) + producers = append(producers, producer) + } + + consumer, err := client.Subscribe(ConsumerOptions{ + Topic: topic, + Type: Exclusive, + SubscriptionName: "chunk-subscriber", + MaxPendingChunkedMessage: 1, + }) + assert.NoError(t, err) + assert.NotNil(t, consumer) + defer consumer.Close() + + totalMsgs := 40 + wg := sync.WaitGroup{} + wg.Add(totalMsgs * totalProducers) + for i := 0; i < totalMsgs; i++ { + for j := 0; j < totalProducers; j++ { + p := producers[j] + go func() { + ID, err := p.Send(context.Background(), &ProducerMessage{ + Payload: createTestMessagePayload(50), + }) + assert.NoError(t, err) + assert.NotNil(t, ID) + wg.Done() + }() + } + } + wg.Wait() + + received := 0 + for i := 0; i < totalMsgs*totalProducers; i++ { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) + msg, err := consumer.Receive(ctx) + cancel() + if msg == nil || (err != nil && errors.Is(err, context.DeadlineExceeded)) { + break + } + + received++ + + err = consumer.Ack(msg) + assert.NoError(t, err) + } + + assert.NotEqual(t, totalMsgs*totalProducers, received) +} + +func TestExpireIncompleteChunks(t *testing.T) { + rand.Seed(time.Now().Unix()) + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + + assert.Nil(t, err) + defer client.Close() + + topic := newTopicName() + + c, err := client.Subscribe(ConsumerOptions{ + Topic: topic, + Type: Exclusive, + SubscriptionName: "chunk-subscriber", + ExpireTimeOfIncompleteChunk: time.Millisecond * 300, + }) + defer c.Close() + + uuid := "test-uuid" + chunkCtxMap := c.(*consumer).consumers[0].chunkedMsgCtxMap + chunkCtxMap.addIfAbsent(uuid, 2, 100) + ctx := chunkCtxMap.get(uuid) + assert.NotNil(t, ctx) + + time.Sleep(400 * time.Millisecond) + + ctx = chunkCtxMap.get(uuid) + assert.Nil(t, ctx) +} + +func TestChunksEnqueueFailed(t *testing.T) { + rand.Seed(time.Now().Unix()) + + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + + assert.Nil(t, err) + defer client.Close() + + topic := newTopicName() + + producer, err := client.CreateProducer(ProducerOptions{ + Topic: topic, + EnableChunking: true, + DisableBatching: true, + MaxPendingMessages: 10, + ChunkMaxMessageSize: 50, + DisableBlockIfQueueFull: true, + }) + assert.NoError(t, err) + assert.NotNil(t, producer) + defer producer.Close() + + ID, err := producer.Send(context.Background(), &ProducerMessage{ + Payload: createTestMessagePayload(1000), + }) + assert.Error(t, err) + assert.Nil(t, ID) +} + +func createTestMessagePayload(size int) []byte { + payload := make([]byte, size) + for i := range payload { + payload[i] = byte(rand.Intn(100)) + } + return payload +} From a4769e872d1fa5c71a7eb6ce1b762b3332294d89 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sat, 6 Aug 2022 16:17:07 +0800 Subject: [PATCH 06/40] refector: do some clean, removed unAckedChunkedMessageIdSequenceMap because it's no need --- pulsar/consumer_partition.go | 91 ++++++++++--------------------- pulsar/consumer_test.go | 4 +- pulsar/impl_message.go | 11 +--- pulsar/internal/timewheel.go | 4 +- pulsar/internal/timewheel_test.go | 8 +-- pulsar/message_chunking_test.go | 1 + pulsar/test_helper.go | 2 +- 7 files changed, 39 insertions(+), 82 deletions(-) diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index ba2b108a0d..e067446587 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -155,8 +155,7 @@ type partitionConsumer struct { decryptor cryptointernal.Decryptor schemaInfoCache *schemaInfoCache - chunkedMsgCtxMap *chunkedMsgCtxMap - unAckedChunkedIDSequences *unAckedChunkedIDSequences + chunkedMsgCtxMap *chunkedMsgCtxMap } type schemaInfoCache struct { @@ -209,28 +208,27 @@ func newPartitionConsumer(parent Consumer, client *client, options *partitionCon messageCh chan ConsumerMessage, dlq *dlqRouter, metrics *internal.LeveledMetrics) (*partitionConsumer, error) { pc := &partitionConsumer{ - parentConsumer: parent, - client: client, - options: options, - topic: options.topic, - name: options.consumerName, - consumerID: client.rpcClient.NewConsumerID(), - partitionIdx: int32(options.partitionIdx), - eventsCh: make(chan interface{}, 10), - queueSize: int32(options.receiverQueueSize), - queueCh: make(chan []*message, options.receiverQueueSize), - startMessageID: options.startMessageID, - connectedCh: make(chan struct{}), - messageCh: messageCh, - connectClosedCh: make(chan connectionClosed, 10), - closeCh: make(chan struct{}), - clearQueueCh: make(chan func(id trackingMessageID)), - clearMessageQueuesCh: make(chan chan struct{}), - compressionProviders: sync.Map{}, - dlq: dlq, - metrics: metrics, - schemaInfoCache: newSchemaInfoCache(client, options.topic), - unAckedChunkedIDSequences: newUnAckedChunkedIDSequences(), + parentConsumer: parent, + client: client, + options: options, + topic: options.topic, + name: options.consumerName, + consumerID: client.rpcClient.NewConsumerID(), + partitionIdx: int32(options.partitionIdx), + eventsCh: make(chan interface{}, 10), + queueSize: int32(options.receiverQueueSize), + queueCh: make(chan []*message, options.receiverQueueSize), + startMessageID: options.startMessageID, + connectedCh: make(chan struct{}), + messageCh: messageCh, + connectClosedCh: make(chan connectionClosed, 10), + closeCh: make(chan struct{}), + clearQueueCh: make(chan func(id trackingMessageID)), + clearMessageQueuesCh: make(chan chan struct{}), + compressionProviders: sync.Map{}, + dlq: dlq, + metrics: metrics, + schemaInfoCache: newSchemaInfoCache(client, options.topic), } pc.chunkedMsgCtxMap = newChunkedMsgCtxMap(options.maxPendingChunkedMessage, pc) pc.setConsumerState(consumerInit) @@ -394,7 +392,7 @@ func (pc *partitionConsumer) AckID(msgID MessageID) error { pc.options.interceptors.OnAcknowledge(pc.parentConsumer, msgID) } } else if cmid, ok := toChunkedMessageID(msgID); ok { - // todo: too redundent, maybe better implement + // todo: too redundant, maybe better implement pc.metrics.AcksCounter.Inc() pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-cmid.receivedTime.UnixNano()) / 1.0e9) ackReq.msgID = cmid @@ -743,12 +741,8 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header } if len(ctx.chunkedMsgIDs) > 0 { msgID = newChunkMessageID(ctx.firstChunkID(), ctx.lastChunkID()) - // todo: maybe race - pc.unAckedChunkedIDSequences.add(string(msgID.Serialize()), ctx.chunkedMsgIDs) - // todo: is first or last? shouldDiscard = pc.messageShouldBeDiscarded(msgID.(chunkMessageID).messageID) } - // todo: no unackedTracker } else { msgID = messageID{ ledgerID: int64(pbMsgID.GetLedgerId()), @@ -1716,14 +1710,14 @@ func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { if c.closed { return } - if ctx, ok := c.chunkedMsgCtxs[uuid]; !ok { + ctx, ok := c.chunkedMsgCtxs[uuid] + if !ok { return - } else { - for _, mid := range ctx.chunkedMsgIDs { - if autoAck { - c.pc.log.Info("Removing chunk message-id", mid.String()) - c.pc.AckID(mid) - } + } + for _, mid := range ctx.chunkedMsgIDs { + if autoAck { + c.pc.log.Info("Removing chunk message-id", mid.String()) + c.pc.AckID(mid) } } delete(c.chunkedMsgCtxs, uuid) @@ -1743,28 +1737,3 @@ func (c *chunkedMsgCtxMap) close() { c.tw.Stop() } - -type unAckedChunkedIDSequences struct { - mu sync.Mutex - - ids map[string][]messageID -} - -func newUnAckedChunkedIDSequences() *unAckedChunkedIDSequences { - return &unAckedChunkedIDSequences{ - mu: sync.Mutex{}, - ids: make(map[string][]messageID), - } -} - -func (u *unAckedChunkedIDSequences) add(chunkIDStr string, unAckedIDs []messageID) { - u.mu.Lock() - defer u.mu.Unlock() - u.ids[chunkIDStr] = unAckedIDs -} - -func (u *unAckedChunkedIDSequences) get(chunkIDStr string) []messageID { - u.mu.Lock() - defer u.mu.Unlock() - return u.ids[chunkIDStr] -} diff --git a/pulsar/consumer_test.go b/pulsar/consumer_test.go index 819bbb7068..c2d151e160 100644 --- a/pulsar/consumer_test.go +++ b/pulsar/consumer_test.go @@ -39,8 +39,8 @@ import ( ) var ( - adminURL = "http://10.105.7.225:8080" - lookupURL = "pulsar://10.105.7.225:6650" + adminURL = "http://localhost:8080" + lookupURL = "pulsar://localhost:6650" ) func TestProducerConsumer(t *testing.T) { diff --git a/pulsar/impl_message.go b/pulsar/impl_message.go index fe17471f49..38139c187d 100644 --- a/pulsar/impl_message.go +++ b/pulsar/impl_message.go @@ -208,9 +208,8 @@ func toChunkedMessageID(msgID MessageID) (chunkMessageID, bool) { cid, ok := msgID.(chunkMessageID) if ok { return cid, true - } else { - return chunkMessageID{}, false } + return chunkMessageID{}, false } func timeFromUnixTimestampMillis(timestamp uint64) time.Time { @@ -386,14 +385,6 @@ func newChunkMessageID(firstChunkID messageID, lastChunkID messageID) chunkMessa } } -func (id chunkMessageID) firstChunkMessageID() MessageID { - return id.firstChunkID -} - -func (id chunkMessageID) lastChunkMessageID() MessageID { - return id -} - func (id chunkMessageID) String() string { return fmt.Sprintf("%s;%s", id.firstChunkID.String(), id.messageID.String()) } diff --git a/pulsar/internal/timewheel.go b/pulsar/internal/timewheel.go index 1f231a1ae7..490bf2285b 100644 --- a/pulsar/internal/timewheel.go +++ b/pulsar/internal/timewheel.go @@ -148,8 +148,8 @@ func (tw *TimeWheel) addTask(task *task) { func (tw *TimeWheel) getPositionAndCircle(d time.Duration) (pos int, circle int) { delaySeconds := int(d.Milliseconds()) intervalSeconds := int(tw.interval.Milliseconds()) - circle = int(delaySeconds / intervalSeconds / tw.slotNum) - pos = int(tw.currentPos+delaySeconds/intervalSeconds) % tw.slotNum + circle = delaySeconds / intervalSeconds / tw.slotNum + pos = tw.currentPos + delaySeconds/intervalSeconds%tw.slotNum if pos < 0 { pos = 0 } diff --git a/pulsar/internal/timewheel_test.go b/pulsar/internal/timewheel_test.go index 3a958f3b82..d85bbfe69d 100644 --- a/pulsar/internal/timewheel_test.go +++ b/pulsar/internal/timewheel_test.go @@ -6,10 +6,6 @@ import ( "time" ) -func callback() { - fmt.Println("callback !!!") -} - func checkTimeCost(t *testing.T, start, end time.Time, before int, after int) bool { due := end.Sub(start) if due > time.Duration(after)*time.Millisecond { @@ -42,7 +38,7 @@ func TestAddFunc(t *testing.T) { defer tw.Stop() for index := 1; index < 6; index++ { - queue := make(chan bool, 0) + queue := make(chan bool) start := time.Now() tw.AddJob(fmt.Sprintf("key_%d", index), time.Duration(index)*time.Second, func() { queue <- true @@ -62,7 +58,7 @@ func TestRemove(t *testing.T) { tw.Start() defer tw.Stop() - queue := make(chan bool, 0) + queue := make(chan bool) tw.AddJob("key", time.Millisecond*500, func() { queue <- true }) diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 1cdd3577ed..6c9f6ae2e1 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -258,6 +258,7 @@ func TestExpireIncompleteChunks(t *testing.T) { SubscriptionName: "chunk-subscriber", ExpireTimeOfIncompleteChunk: time.Millisecond * 300, }) + assert.NoError(t, err) defer c.Close() uuid := "test-uuid" diff --git a/pulsar/test_helper.go b/pulsar/test_helper.go index 114144c65d..d6a4f0042d 100644 --- a/pulsar/test_helper.go +++ b/pulsar/test_helper.go @@ -36,7 +36,7 @@ import ( ) const ( - serviceURL = "pulsar://10.105.7.225:6650" + serviceURL = "pulsar://localhost:6650" serviceURLTLS = "pulsar+ssl://localhost:6651" webServiceURL = "http://localhost:8080" From 64b9a20f0b9143eb28f90d45fb0dcb66b7f192b4 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Wed, 10 Aug 2022 15:15:43 +0800 Subject: [PATCH 07/40] fix: fix the bug of uuid generation --- pulsar/producer_partition.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 19c6bf0e3a..219750b596 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -22,6 +22,7 @@ import ( "errors" "fmt" "math" + "strconv" "strings" "sync" "sync/atomic" @@ -612,7 +613,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { } if totalChunks > 1 { var lhs, rhs int - uuid := fmt.Sprintf("%s-%d", p.producerName, mm.SequenceId) + uuid := fmt.Sprintf("%s-%s", p.producerName, strconv.FormatUint(*mm.SequenceId, 10)) mm.Uuid = proto.String(uuid) mm.NumChunksFromMsg = proto.Int(totalChunks) mm.TotalChunkMsgSize = proto.Int(compressedSize) @@ -1087,7 +1088,11 @@ func (p *partitionProducer) ReceivedSendReceipt(response *pb.CommandSendReceipt) ) if sr.callback != nil { - sr.callback(msgID, sr.msg, nil) + if sr.chunkMsgID != nil && sr.chunkMsgID.complete() { + sr.callback(sr.chunkMsgID.firstChunkID, sr.msg, nil) + } else { + sr.callback(msgID, sr.msg, nil) + } } p.options.Interceptors.OnSendAcknowledgement(p, sr.msg, msgID) @@ -1185,6 +1190,7 @@ type sendRequest struct { blockOnce sync.Once totalChunks int chunkID int + chunkMsgID *chunkMessageID } type closeProducer struct { From c8e083acf4801bcfbc0ce03c8c455c021814d37c Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Thu, 11 Aug 2022 13:53:54 +0800 Subject: [PATCH 08/40] fix: fix the Seek() for chunk message id --- pulsar/consumer_impl.go | 34 ++++----- pulsar/consumer_partition.go | 17 +++-- pulsar/message_chunking_test.go | 126 +++++++++++++++++++++++++++++++- pulsar/producer_partition.go | 77 +++++++++++++++++-- 4 files changed, 219 insertions(+), 35 deletions(-) diff --git a/pulsar/consumer_impl.go b/pulsar/consumer_impl.go index 9dc94628c1..9c7f7fc00a 100644 --- a/pulsar/consumer_impl.go +++ b/pulsar/consumer_impl.go @@ -525,16 +525,15 @@ func (c *consumer) ReconsumeLater(msg Message, delay time.Duration) { func (c *consumer) Nack(msg Message) { if c.options.EnableDefaultNackBackoffPolicy || c.options.NackBackoffPolicy != nil { - mid, ok := c.messageID(msg.ID()) - if !ok { - return - } + msgID := msg.ID() - if mid.consumer != nil { - mid.Nack() + if msgID.PartitionIdx() < 0 || int(msgID.PartitionIdx()) >= len(c.consumers) { + c.log.Warnf("invalid partition index %d expected a partition between [0-%d]", + msgID.PartitionIdx(), len(c.consumers)) return } - c.consumers[mid.partitionIdx].NackMsg(msg) + + c.consumers[msgID.PartitionIdx()].NackMsg(msg) return } @@ -542,17 +541,13 @@ func (c *consumer) Nack(msg Message) { } func (c *consumer) NackID(msgID MessageID) { - mid, ok := c.messageID(msgID) - if !ok { - return - } - - if mid.consumer != nil { - mid.Nack() + if msgID.PartitionIdx() < 0 || int(msgID.PartitionIdx()) >= len(c.consumers) { + c.log.Warnf("invalid partition index %d expected a partition between [0-%d]", + msgID.PartitionIdx(), len(c.consumers)) return } - c.consumers[mid.partitionIdx].NackID(mid) + c.consumers[msgID.PartitionIdx()].NackID(msgID) } func (c *consumer) Close() { @@ -588,12 +583,13 @@ func (c *consumer) Seek(msgID MessageID) error { return newError(SeekFailed, "for partition topic, seek command should perform on the individual partitions") } - mid, ok := c.messageID(msgID) - if !ok { - return nil + if msgID.PartitionIdx() < 0 || int(msgID.PartitionIdx()) >= len(c.consumers) { + c.log.Errorf("invalid partition index %d expected a partition between [0-%d]", + msgID.PartitionIdx(), len(c.consumers)) + return errors.New("invalid partition index") } - return c.consumers[mid.partitionIdx].Seek(mid) + return c.consumers[msgID.PartitionIdx()].Seek(msgID) } func (c *consumer) SeekByTime(time time.Time) error { diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index e067446587..ffbe490714 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -470,7 +470,7 @@ func (pc *partitionConsumer) Close() { } // close chunkedMsgCtxMap - pc.chunkedMsgCtxMap.close() + pc.chunkedMsgCtxMap.Close() req := &closeRequest{doneCh: make(chan struct{})} pc.eventsCh <- req @@ -775,6 +775,9 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header return err } + // clean chunkedMsgCtxMap + pc.chunkedMsgCtxMap.remove(msgMeta.GetUuid()) + pc.metrics.BytesReceived.Add(float64(len(payload))) pc.metrics.PrefetchedBytes.Add(float64(len(payload))) @@ -898,8 +901,7 @@ func (pc *partitionConsumer) processMessageChunk(compressedPayload internal.Buff ctx := pc.chunkedMsgCtxMap.get(uuid) - if ctx == nil || ctx.chunkedMsgBuffer == nil || chunkID != ctx.lastChunkedMsgID+1 || - int32(compressedPayload.ReadableBytes()+ctx.chunkedMsgBuffer.ReadableBytes()) > msgMeta.GetTotalChunkMsgSize() { + if ctx == nil || ctx.chunkedMsgBuffer == nil || chunkID != ctx.lastChunkedMsgID+1 { lastChunkedMsgID := -1 totalChunks := -1 if ctx != nil { @@ -908,9 +910,9 @@ func (pc *partitionConsumer) processMessageChunk(compressedPayload internal.Buff // todo: how to release buffer ctx.chunkedMsgBuffer.Clear() } - pc.log.Info(fmt.Sprintf( + pc.log.Warnf(fmt.Sprintf( "Received unexpected chunk messageId %s, last-chunk-id %d, chunkId = %d, total-chunks %d", - msgID.Serialize(), lastChunkedMsgID, chunkID, totalChunks)) + msgID.String(), lastChunkedMsgID, chunkID, totalChunks)) pc.chunkedMsgCtxMap.remove(uuid) pc.availablePermits++ // todo: expire tracker ack @@ -1670,8 +1672,7 @@ func (c *chunkedMsgCtxMap) addIfAbsent(uuid string, totalChunks int32, totalChun c.chunkedMsgCtxs[uuid] = newChunkedMsgCtx(totalChunks, totalChunkMsgSize) c.pendingQueue.PushBack(uuid) c.tw.AddJob(uuid, c.pc.options.expireTimeOfIncompleteChunk, func() { - // todo: autoAck is always true in Java client, why? - c.removeChunkMessage(uuid, c.pc.options.autoAckIncompleteChunk) + c.removeChunkMessage(uuid, true) }) } if c.maxPending > 0 && c.pendingQueue.Len() > c.maxPending { @@ -1730,7 +1731,7 @@ func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { } } -func (c *chunkedMsgCtxMap) close() { +func (c *chunkedMsgCtxMap) Close() { c.mu.Lock() defer c.mu.Unlock() c.closed = true diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 6c9f6ae2e1..c95fffe941 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -89,7 +89,9 @@ func TestLargeMessage(t *testing.T) { // test receive chunk with serverMaxMessageSize limit for i := 0; i < 5; i++ { - msg, err := consumer.Receive(context.Background()) + ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) + msg, err := consumer.Receive(ctx) + cancel() assert.NoError(t, err) expectMsg := expectMsgs[i] @@ -304,6 +306,128 @@ func TestChunksEnqueueFailed(t *testing.T) { assert.Nil(t, ID) } +func TestSeekChunkMessages(t *testing.T) { + rand.Seed(time.Now().Unix()) + + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + + assert.Nil(t, err) + defer client.Close() + + topic := newTopicName() + totalMessages := 5 + + producer, err := client.CreateProducer(ProducerOptions{ + Topic: topic, + EnableChunking: true, + DisableBatching: true, + ChunkMaxMessageSize: 50, + }) + assert.NoError(t, err) + assert.NotNil(t, producer) + defer producer.Close() + + consumer, err := client.Subscribe(ConsumerOptions{ + Topic: topic, + Type: Exclusive, + SubscriptionName: "default-seek", + }) + assert.NoError(t, err) + assert.NotNil(t, consumer) + defer consumer.Close() + + msgIDs := make([]MessageID, 0) + for i := 0; i < totalMessages; i++ { + ID, err := producer.Send(context.Background(), &ProducerMessage{ + Payload: createTestMessagePayload(100), + }) + assert.NoError(t, err) + msgIDs = append(msgIDs, ID) + } + + for i := 0; i < totalMessages; i++ { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) + msg, err := consumer.Receive(ctx) + cancel() + assert.NoError(t, err) + assert.NotNil(t, msg) + assert.Equal(t, msgIDs[i].Serialize(), msg.ID().Serialize()) + } + + err = consumer.Seek(msgIDs[1]) + assert.NoError(t, err) + + for i := 1; i < totalMessages; i++ { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) + msg, err := consumer.Receive(ctx) + cancel() + assert.NoError(t, err) + assert.NotNil(t, msg) + assert.Equal(t, msgIDs[i].Serialize(), msg.ID().Serialize()) + } + + // todo: add reader seek test when support reader read chunk message +} + +func TestChunkAckAndNAck(t *testing.T) { + rand.Seed(time.Now().Unix()) + + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + + assert.Nil(t, err) + defer client.Close() + + topic := newTopicName() + + producer, err := client.CreateProducer(ProducerOptions{ + Topic: topic, + EnableChunking: true, + DisableBatching: true, + ChunkMaxMessageSize: 50, + }) + assert.NoError(t, err) + assert.NotNil(t, producer) + defer producer.Close() + + consumer, err := client.Subscribe(ConsumerOptions{ + Topic: topic, + Type: Exclusive, + SubscriptionName: "default-seek", + NackRedeliveryDelay: time.Second, + }) + assert.NoError(t, err) + assert.NotNil(t, consumer) + defer consumer.Close() + + content := createTestMessagePayload(100) + + _, err = producer.Send(context.Background(), &ProducerMessage{ + Payload: content, + }) + assert.NoError(t, err) + + ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) + msg, err := consumer.Receive(ctx) + cancel() + assert.NoError(t, err) + assert.NotNil(t, msg) + assert.Equal(t, msg.Payload(), content) + + consumer.Nack(msg) + time.Sleep(time.Second * 2) + + ctx, cancel = context.WithTimeout(context.Background(), time.Second*5) + msg, err = consumer.Receive(ctx) + cancel() + assert.NoError(t, err) + assert.NotNil(t, msg) + assert.Equal(t, msg.Payload(), content) +} + func createTestMessagePayload(size int) []byte { payload := make([]byte, size) for i := range payload { diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 219750b596..6a750ea4fc 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -81,6 +81,7 @@ type partitionProducer struct { batchFlushTicker *time.Ticker encryptor internalcrypto.Encryptor compressionProvider compression.Provider + chunkRecorder *chunkRecorder // Channel where app is posting messages to be published eventsChan chan interface{} @@ -154,6 +155,7 @@ func newPartitionProducer(client *client, topic string, options *ProducerOptions batchFlushTicker: time.NewTicker(batchingMaxPublishDelay), compressionProvider: internal.GetCompressionProvider(pb.CompressionType(options.CompressionType), compression.Level(options.CompressionLevel)), + chunkRecorder: newChunkRecorder(), publishSemaphore: internal.NewSemaphore(int32(maxPendingMessages)), pendingQueue: internal.NewBlockingQueue(maxPendingMessages), lastSequenceID: -1, @@ -629,7 +631,9 @@ func (p *partitionProducer) internalSend(request *sendRequest) { msg: request.msg, callback: request.callback, publishTime: request.publishTime, + totalChunks: totalChunks, chunkID: chunkID, + uuid: uuid, } p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr, uint32(maxMessageSize)) } @@ -1087,15 +1091,41 @@ func (p *partitionProducer) ReceivedSendReceipt(response *pb.CommandSendReceipt) p.partitionIdx, ) - if sr.callback != nil { - if sr.chunkMsgID != nil && sr.chunkMsgID.complete() { - sr.callback(sr.chunkMsgID.firstChunkID, sr.msg, nil) - } else { - sr.callback(msgID, sr.msg, nil) + if sr.totalChunks > 1 { + if sr.chunkID == 0 { + cmid := p.chunkRecorder.addIfAbsent(sr.uuid) + cmid.firstChunkID = messageID{ + int64(response.MessageId.GetLedgerId()), + int64(response.MessageId.GetEntryId()), + -1, + p.partitionIdx, + } + } else if sr.chunkID == sr.totalChunks-1 { + cmid, ok := p.chunkRecorder.get(sr.uuid) + if ok { + cmid.messageID = messageID{ + int64(response.MessageId.GetLedgerId()), + int64(response.MessageId.GetEntryId()), + -1, + p.partitionIdx, + } + // use chunkMsgID to set msgID + msgID = *cmid + p.chunkRecorder.remove(sr.uuid) + } else { + p.log.Warnf("Recorder has lost the chunk message id, uuid: %v", + sr.uuid) + continue + } } } - p.options.Interceptors.OnSendAcknowledgement(p, sr.msg, msgID) + if sr.totalChunks <= 1 || sr.chunkID == sr.totalChunks-1 { + if sr.callback != nil { + sr.callback(msgID, sr.msg, nil) + } + p.options.Interceptors.OnSendAcknowledgement(p, sr.msg, msgID) + } } } @@ -1190,7 +1220,7 @@ type sendRequest struct { blockOnce sync.Once totalChunks int chunkID int - chunkMsgID *chunkMessageID + uuid string } type closeProducer struct { @@ -1260,3 +1290,36 @@ func (p *partitionProducer) internalErrHandle(err error) { } // other internal error can be handled here } + +type chunkRecorder struct { + chunkedMsgCtxs map[string]*chunkMessageID + mu sync.Mutex +} + +func newChunkRecorder() *chunkRecorder { + return &chunkRecorder{ + chunkedMsgCtxs: make(map[string]*chunkMessageID), + } +} + +func (c *chunkRecorder) addIfAbsent(uuid string) *chunkMessageID { + c.mu.Lock() + defer c.mu.Unlock() + if _, ok := c.chunkedMsgCtxs[uuid]; !ok { + c.chunkedMsgCtxs[uuid] = &chunkMessageID{} + } + return c.chunkedMsgCtxs[uuid] +} + +func (c *chunkRecorder) get(uuid string) (*chunkMessageID, bool) { + c.mu.Lock() + defer c.mu.Unlock() + cmid, ok := c.chunkedMsgCtxs[uuid] + return cmid, ok +} + +func (c *chunkRecorder) remove(uuid string) { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.chunkedMsgCtxs, uuid) +} From 7cc168f593b78cd1ce82b51e52b1d42dae6f7e58 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Thu, 11 Aug 2022 14:28:58 +0800 Subject: [PATCH 09/40] test: move chunk tests to message_chunking_test --- pulsar/consumer_test.go | 25 ------------------------- pulsar/producer_test.go | 29 ----------------------------- 2 files changed, 54 deletions(-) diff --git a/pulsar/consumer_test.go b/pulsar/consumer_test.go index c2d151e160..20d4290726 100644 --- a/pulsar/consumer_test.go +++ b/pulsar/consumer_test.go @@ -177,31 +177,6 @@ func TestBatchMessageReceive(t *testing.T) { assert.Equal(t, count, numOfMessages) } -func TestChunkMessageReceive(t *testing.T) { - client, err := NewClient(ClientOptions{ - URL: lookupURL, - }) - - assert.Nil(t, err) - defer client.Close() - - topicName := "persistent://public/default/receive-chunk" - subName := "subscription-name" - ctx := context.Background() - - consumer, err := client.Subscribe(ConsumerOptions{ - Topic: topicName, - SubscriptionName: subName, - }) - assert.Nil(t, err) - defer consumer.Close() - - msg, err := consumer.Receive(ctx) - assert.Nil(t, err) - fmt.Println("id:", msg.ID().(chunkMessageID).String(), "len:", len(msg.Payload())) - consumer.Ack(msg) -} - func TestConsumerWithInvalidConf(t *testing.T) { client, err := NewClient(ClientOptions{ URL: lookupURL, diff --git a/pulsar/producer_test.go b/pulsar/producer_test.go index b9653cc523..061cdcfd45 100644 --- a/pulsar/producer_test.go +++ b/pulsar/producer_test.go @@ -981,35 +981,6 @@ func TestMaxMessageSize(t *testing.T) { } } -func TestProducerChunking(t *testing.T) { - serverMaxMessageSize := 1024 * 1024 - - client, err := NewClient(ClientOptions{ - URL: serviceURL, - }) - assert.NoError(t, err) - defer client.Close() - - // both EnableChunking and DisableBatching should be set true to open chunking - producer, err := client.CreateProducer(ProducerOptions{ - Topic: newTopicName(), - EnableChunking: true, - DisableBatching: true, - }) - assert.NoError(t, err) - assert.NotNil(t, producer) - defer producer.Close() - - payload := make([]byte, serverMaxMessageSize+1) - ID, err := producer.Send(context.Background(), &ProducerMessage{ - Payload: payload, - }) - - assert.NoError(t, err) - assert.NotNil(t, ID) - -} - func TestFailedSchemaEncode(t *testing.T) { client, err := NewClient(ClientOptions{ URL: lookupURL, From db7537f1dc8a30895000d8e160145c82609acdc3 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Mon, 15 Aug 2022 15:52:31 +0800 Subject: [PATCH 10/40] refactor: add licence header --- pulsar/internal/timewheel.go | 16 ++++++++++++++++ pulsar/internal/timewheel_test.go | 17 +++++++++++++++++ pulsar/message_chunking_test.go | 17 +++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/pulsar/internal/timewheel.go b/pulsar/internal/timewheel.go index 490bf2285b..d15e3e0f1b 100644 --- a/pulsar/internal/timewheel.go +++ b/pulsar/internal/timewheel.go @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations + package internal import ( diff --git a/pulsar/internal/timewheel_test.go b/pulsar/internal/timewheel_test.go index d85bbfe69d..a680ba35a4 100644 --- a/pulsar/internal/timewheel_test.go +++ b/pulsar/internal/timewheel_test.go @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package internal import ( diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index c95fffe941..330397637f 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package pulsar import ( From 2537cf42220b638c8496803683b087bf821a9bf9 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Mon, 15 Aug 2022 17:14:12 +0800 Subject: [PATCH 11/40] fix: Revert the change about trackingMessageID. Fix the conflict of batchIDX to pass test. Add the lisence headers to timewheel.go/timewheel_test.go/message_chunking_test.go --- pulsar/consumer_impl.go | 42 ++++--- pulsar/consumer_partition.go | 212 ++++++++++++++--------------------- pulsar/impl_message.go | 5 + 3 files changed, 113 insertions(+), 146 deletions(-) diff --git a/pulsar/consumer_impl.go b/pulsar/consumer_impl.go index 9c7f7fc00a..28a1b5e4dd 100644 --- a/pulsar/consumer_impl.go +++ b/pulsar/consumer_impl.go @@ -36,8 +36,8 @@ import ( const defaultNackRedeliveryDelay = 1 * time.Minute type acker interface { - AckID(id MessageID) error - NackID(id MessageID) + AckID(id trackingMessageID) error + NackID(id trackingMessageID) NackMsg(msg Message) } @@ -465,13 +465,16 @@ func (c *consumer) Ack(msg Message) error { // AckID the consumption of a single message, identified by its MessageID func (c *consumer) AckID(msgID MessageID) error { - if msgID.PartitionIdx() < 0 || int(msgID.PartitionIdx()) >= len(c.consumers) { - c.log.Errorf("invalid partition index %d expected a partition between [0-%d]", - msgID.PartitionIdx(), len(c.consumers)) - return errors.New("invalid partition index") + mid, ok := c.messageID(msgID) + if !ok { + return errors.New("failed to convert trackingMessageID") } - return c.consumers[msgID.PartitionIdx()].AckID(msgID) + if mid.consumer != nil { + return mid.Ack() + } + + return c.consumers[mid.partitionIdx].AckID(mid) } // ReconsumeLater mark a message for redelivery after custom delay @@ -525,15 +528,16 @@ func (c *consumer) ReconsumeLater(msg Message, delay time.Duration) { func (c *consumer) Nack(msg Message) { if c.options.EnableDefaultNackBackoffPolicy || c.options.NackBackoffPolicy != nil { - msgID := msg.ID() - - if msgID.PartitionIdx() < 0 || int(msgID.PartitionIdx()) >= len(c.consumers) { - c.log.Warnf("invalid partition index %d expected a partition between [0-%d]", - msgID.PartitionIdx(), len(c.consumers)) + mid, ok := c.messageID(msg.ID()) + if !ok { return } - c.consumers[msgID.PartitionIdx()].NackMsg(msg) + if mid.consumer != nil { + mid.Nack() + return + } + c.consumers[mid.partitionIdx].NackMsg(msg) return } @@ -541,13 +545,17 @@ func (c *consumer) Nack(msg Message) { } func (c *consumer) NackID(msgID MessageID) { - if msgID.PartitionIdx() < 0 || int(msgID.PartitionIdx()) >= len(c.consumers) { - c.log.Warnf("invalid partition index %d expected a partition between [0-%d]", - msgID.PartitionIdx(), len(c.consumers)) + mid, ok := c.messageID(msgID) + if !ok { + return + } + + if mid.consumer != nil { + mid.Nack() return } - c.consumers[msgID.PartitionIdx()].NackID(msgID) + c.consumers[mid.partitionIdx].NackID(mid) } func (c *consumer) Close() { diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index ffbe490714..9f0deee224 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -373,29 +373,17 @@ func (pc *partitionConsumer) requestGetLastMessageID() (trackingMessageID, error return convertToMessageID(id), nil } -func (pc *partitionConsumer) AckID(msgID MessageID) error { +func (pc *partitionConsumer) AckID(msgID trackingMessageID) error { if state := pc.getConsumerState(); state == consumerClosed || state == consumerClosing { pc.log.WithField("state", state).Error("Failed to ack by closing or closed consumer") return errors.New("consumer state is closed") } ackReq := new(ackRequest) - - if tmid, ok := toTrackingMessageID(msgID); ok { - if tmid.ack() { - pc.metrics.AcksCounter.Inc() - pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-tmid.receivedTime.UnixNano()) / 1.0e9) - ackReq.msgID = tmid - // send ack request to eventsCh - pc.eventsCh <- ackReq - - pc.options.interceptors.OnAcknowledge(pc.parentConsumer, msgID) - } - } else if cmid, ok := toChunkedMessageID(msgID); ok { - // todo: too redundant, maybe better implement + if !msgID.Undefined() && msgID.ack() { pc.metrics.AcksCounter.Inc() - pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-cmid.receivedTime.UnixNano()) / 1.0e9) - ackReq.msgID = cmid + pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-msgID.receivedTime.UnixNano()) / 1.0e9) + ackReq.msgID = msgID // send ack request to eventsCh pc.eventsCh <- ackReq @@ -405,8 +393,8 @@ func (pc *partitionConsumer) AckID(msgID MessageID) error { return ackReq.err } -func (pc *partitionConsumer) NackID(msgID MessageID) { - pc.nackTracker.Add(msgID) +func (pc *partitionConsumer) NackID(msgID trackingMessageID) { + pc.nackTracker.Add(msgID.messageID) pc.metrics.NacksCounter.Inc() } @@ -487,10 +475,10 @@ func (pc *partitionConsumer) Seek(msgID MessageID) error { req := &seekRequest{ doneCh: make(chan struct{}), } - if tmid, ok := toTrackingMessageID(msgID); ok { - req.msgID = tmid.messageID - } else if cmid, ok := toChunkedMessageID(msgID); ok { + if cmid, ok := toChunkedMessageID(msgID); ok { req.msgID = cmid.firstChunkID + } else if tmid, ok := toTrackingMessageID(msgID); ok { + req.msgID = tmid.messageID } else { // will never reach return errors.New("unhandled messageID type") @@ -600,8 +588,8 @@ func (pc *partitionConsumer) internalAck(req *ackRequest) { messageIDs := make([]*pb.MessageIdData, 1) messageIDs[0] = &pb.MessageIdData{ - LedgerId: proto.Uint64(uint64(msgID.LedgerID())), - EntryId: proto.Uint64(uint64(msgID.EntryID())), + LedgerId: proto.Uint64(uint64(msgID.ledgerID)), + EntryId: proto.Uint64(uint64(msgID.entryID)), } reqID := pc.client.rpcClient.NewRequestID() @@ -721,46 +709,63 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header } messages := make([]*message, 0) + var ackTracker *ackTracker + // are there multiple messages in this batch? + if numMsgs > 1 { + ackTracker = newAckTracker(numMsgs) + } pc.metrics.MessagesReceived.Add(float64(numMsgs)) pc.metrics.PrefetchedMessages.Add(float64(numMsgs)) - var messageIndex *uint64 - var brokerPublishTime *time.Time - var msg *message + for i := 0; i < numMsgs; i++ { + smm, payload, err := reader.ReadMessage() + if err != nil || payload == nil { + pc.discardCorruptedMessage(pbMsgID, pb.CommandAck_BatchDeSerializeError) + return err + } + + pc.metrics.BytesReceived.Add(float64(len(payload))) + pc.metrics.PrefetchedBytes.Add(float64(len(payload))) - // If no batched - if numMsgs == 1 { var msgID MessageID - var shouldDiscard bool + var trackingMsgID trackingMessageID + if isChunkedMsg { ctx := pc.chunkedMsgCtxMap.get(msgMeta.GetUuid()) if ctx == nil { // chunkedMsgCtxMap has closed because of consumer closed return nil } - if len(ctx.chunkedMsgIDs) > 0 { - msgID = newChunkMessageID(ctx.firstChunkID(), ctx.lastChunkID()) - shouldDiscard = pc.messageShouldBeDiscarded(msgID.(chunkMessageID).messageID) - } + msgID = newChunkMessageID(ctx.firstChunkID(), ctx.lastChunkID()) + trackingMsgID, _ = toTrackingMessageID(msgID) } else { - msgID = messageID{ - ledgerID: int64(pbMsgID.GetLedgerId()), - entryID: int64(pbMsgID.GetEntryId()), - batchIdx: -1, - partitionIdx: pc.partitionIdx, - } - shouldDiscard = pc.messageShouldBeDiscarded(msgID.(messageID)) + trackingMsgID = newTrackingMessageID( + int64(pbMsgID.GetLedgerId()), + int64(pbMsgID.GetEntryId()), + int32(i), + pc.partitionIdx, + ackTracker) + // set the consumer so we know how to ack the message id + trackingMsgID.consumer = pc + msgID = trackingMsgID } - if shouldDiscard { - pc.AckID(msgID) - return nil + if pc.messageShouldBeDiscarded(trackingMsgID) { + pc.AckID(trackingMsgID) + continue + } + + if isChunkedMsg { + // clean chunkedMsgCtxMap + pc.chunkedMsgCtxMap.remove(msgMeta.GetUuid()) } + var messageIndex *uint64 + var brokerPublishTime *time.Time if brokerMetadata != nil { if brokerMetadata.Index != nil { - aux := brokerMetadata.GetIndex() - uint64(numMsgs) + 1 + aux := brokerMetadata.GetIndex() - uint64(numMsgs) + uint64(i) + 1 messageIndex = &aux } if brokerMetadata.BrokerTimestamp != nil { @@ -769,80 +774,8 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header } } - _, payload, err := reader.ReadMessage() - if err != nil || payload == nil { - pc.discardCorruptedMessage(pbMsgID, pb.CommandAck_BatchDeSerializeError) - return err - } - - // clean chunkedMsgCtxMap - pc.chunkedMsgCtxMap.remove(msgMeta.GetUuid()) - - pc.metrics.BytesReceived.Add(float64(len(payload))) - pc.metrics.PrefetchedBytes.Add(float64(len(payload))) - - msg = &message{ - publishTime: timeFromUnixTimestampMillis(msgMeta.GetPublishTime()), - eventTime: timeFromUnixTimestampMillis(msgMeta.GetEventTime()), - key: msgMeta.GetPartitionKey(), - producerName: msgMeta.GetProducerName(), - properties: internal.ConvertToStringMap(msgMeta.GetProperties()), - topic: pc.topic, - msgID: msgID, - payLoad: payload, - schema: pc.options.schema, - replicationClusters: msgMeta.GetReplicateTo(), - replicatedFrom: msgMeta.GetReplicatedFrom(), - redeliveryCount: response.GetRedeliveryCount(), - schemaVersion: msgMeta.GetSchemaVersion(), - schemaInfoCache: pc.schemaInfoCache, - index: messageIndex, - brokerPublishTime: brokerPublishTime, - } - - pc.options.interceptors.BeforeConsume(ConsumerMessage{ - Consumer: pc.parentConsumer, - Message: msg, - }) - - messages = append(messages, msg) - } else { - for i := 0; i < numMsgs; i++ { - tracker := newAckTracker(numMsgs) - smm, payload, err := reader.ReadMessage() - if err != nil || payload == nil { - pc.discardCorruptedMessage(pbMsgID, pb.CommandAck_BatchDeSerializeError) - return err - } - - pc.metrics.BytesReceived.Add(float64(len(payload))) - pc.metrics.PrefetchedBytes.Add(float64(len(payload))) - - msgID := newTrackingMessageID( - int64(pbMsgID.GetLedgerId()), - int64(pbMsgID.GetEntryId()), - int32(i), - pc.partitionIdx, - tracker) - - if pc.messageShouldBeDiscarded(msgID.messageID) { - pc.AckID(msgID) - continue - } - - if brokerMetadata != nil { - if brokerMetadata.Index != nil { - aux := brokerMetadata.GetIndex() - uint64(numMsgs) + uint64(i) + 1 - messageIndex = &aux - } - if brokerMetadata.BrokerTimestamp != nil { - aux := timeFromUnixTimestampMillis(*brokerMetadata.BrokerTimestamp) - brokerPublishTime = &aux - } - } - // set the consumer so we know how to ack the message id - msgID.consumer = pc - + var msg *message + if smm != nil { msg = &message{ publishTime: timeFromUnixTimestampMillis(msgMeta.GetPublishTime()), eventTime: timeFromUnixTimestampMillis(smm.GetEventTime()), @@ -862,14 +795,33 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header index: messageIndex, brokerPublishTime: brokerPublishTime, } + } else { + msg = &message{ + publishTime: timeFromUnixTimestampMillis(msgMeta.GetPublishTime()), + eventTime: timeFromUnixTimestampMillis(msgMeta.GetEventTime()), + key: msgMeta.GetPartitionKey(), + producerName: msgMeta.GetProducerName(), + properties: internal.ConvertToStringMap(msgMeta.GetProperties()), + topic: pc.topic, + msgID: msgID, + payLoad: payload, + schema: pc.options.schema, + replicationClusters: msgMeta.GetReplicateTo(), + replicatedFrom: msgMeta.GetReplicatedFrom(), + redeliveryCount: response.GetRedeliveryCount(), + schemaVersion: msgMeta.GetSchemaVersion(), + schemaInfoCache: pc.schemaInfoCache, + index: messageIndex, + brokerPublishTime: brokerPublishTime, + } + } - pc.options.interceptors.BeforeConsume(ConsumerMessage{ - Consumer: pc.parentConsumer, - Message: msg, - }) + pc.options.interceptors.BeforeConsume(ConsumerMessage{ + Consumer: pc.parentConsumer, + Message: msg, + }) - messages = append(messages, msg) - } + messages = append(messages, msg) } // send messages to the dispatcher @@ -930,7 +882,7 @@ func (pc *partitionConsumer) processMessageChunk(compressedPayload internal.Buff return ctx.chunkedMsgBuffer } -func (pc *partitionConsumer) messageShouldBeDiscarded(msgID messageID) bool { +func (pc *partitionConsumer) messageShouldBeDiscarded(msgID trackingMessageID) bool { if pc.startMessageID.Undefined() { return false } @@ -940,11 +892,11 @@ func (pc *partitionConsumer) messageShouldBeDiscarded(msgID messageID) bool { } if pc.options.startMessageIDInclusive { - return pc.startMessageID.greater(msgID) + return pc.startMessageID.greater(msgID.messageID) } // Non inclusive - return pc.startMessageID.greaterEqual(msgID) + return pc.startMessageID.greaterEqual(msgID.messageID) } // create EncryptionContext from message metadata @@ -1141,7 +1093,7 @@ func (pc *partitionConsumer) dispatcher() { } type ackRequest struct { - msgID MessageID + msgID trackingMessageID err error } @@ -1718,7 +1670,8 @@ func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { for _, mid := range ctx.chunkedMsgIDs { if autoAck { c.pc.log.Info("Removing chunk message-id", mid.String()) - c.pc.AckID(mid) + tmid, _ := toTrackingMessageID(mid) + c.pc.AckID(tmid) } } delete(c.chunkedMsgCtxs, uuid) @@ -1729,6 +1682,7 @@ func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { break } } + c.pc.log.Infof("Chunked message [%s] has been removed from chunkedMsgCtxMap", uuid) } func (c *chunkedMsgCtxMap) Close() { diff --git a/pulsar/impl_message.go b/pulsar/impl_message.go index 38139c187d..e477b81d4a 100644 --- a/pulsar/impl_message.go +++ b/pulsar/impl_message.go @@ -199,6 +199,11 @@ func toTrackingMessageID(msgID MessageID) (trackingMessageID, bool) { }, true } else if mid, ok := msgID.(trackingMessageID); ok { return mid, true + } else if cmid, ok := msgID.(chunkMessageID); ok { + return trackingMessageID{ + messageID: cmid.messageID, + receivedTime: cmid.receivedTime, + }, true } else { return trackingMessageID{}, false } From 4dc9bb7590caa9cfe3ca642fd33a89ffd15344bd Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Tue, 16 Aug 2022 10:35:50 +0800 Subject: [PATCH 12/40] fix: fix the license header of timewheel.go. --- pulsar/internal/timewheel.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pulsar/internal/timewheel.go b/pulsar/internal/timewheel.go index d15e3e0f1b..e996d8a4e9 100644 --- a/pulsar/internal/timewheel.go +++ b/pulsar/internal/timewheel.go @@ -13,6 +13,7 @@ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations +// under the License. package internal From 0db443ed7b0af88b7f6c2001385ff8a2a2c5f492 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Tue, 16 Aug 2022 15:01:26 +0800 Subject: [PATCH 13/40] fix: fix the bug of data race for availablePermits --- pulsar/consumer_partition.go | 48 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index 9f0deee224..acd1d7f51e 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -25,6 +25,7 @@ import ( "math" "strings" "sync" + "sync/atomic" "time" "github.com/gogo/protobuf/proto" @@ -832,7 +833,6 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header func (pc *partitionConsumer) processMessageChunk(compressedPayload internal.Buffer, msgMeta *pb.MessageMetadata, pbMsgID *pb.MessageIdData) internal.Buffer { - // todo: expire incomplete chunk message uuid := msgMeta.GetUuid() numChunks := msgMeta.GetNumChunksFromMsg() totalChunksSize := int(msgMeta.GetTotalChunkMsgSize()) @@ -859,23 +859,20 @@ func (pc *partitionConsumer) processMessageChunk(compressedPayload internal.Buff if ctx != nil { lastChunkedMsgID = int(ctx.lastChunkedMsgID) totalChunks = int(ctx.totalChunks) - // todo: how to release buffer ctx.chunkedMsgBuffer.Clear() } pc.log.Warnf(fmt.Sprintf( "Received unexpected chunk messageId %s, last-chunk-id %d, chunkId = %d, total-chunks %d", msgID.String(), lastChunkedMsgID, chunkID, totalChunks)) pc.chunkedMsgCtxMap.remove(uuid) - pc.availablePermits++ - // todo: expire tracker ack + atomic.AddInt32(&pc.availablePermits, 1) return nil } ctx.refresh(chunkID, msgID, compressedPayload) if msgMeta.GetChunkId() != msgMeta.GetNumChunksFromMsg()-1 { - pc.availablePermits++ - // todo: how to release buffer + atomic.AddInt32(&pc.availablePermits, 1) return nil } @@ -1037,12 +1034,13 @@ func (pc *partitionConsumer) dispatcher() { // TODO implement a better flow controller // send more permits if needed - pc.availablePermits++ + atomic.AddInt32(&pc.availablePermits, 1) + permits := atomic.LoadInt32(&pc.availablePermits) flowThreshold := int32(math.Max(float64(pc.queueSize/2), 1)) - if pc.availablePermits >= flowThreshold { - availablePermits := pc.availablePermits + if permits >= flowThreshold { + availablePermits := permits requestedPermits := availablePermits - pc.availablePermits = 0 + atomic.StoreInt32(&pc.availablePermits, 0) pc.log.Debugf("requesting more permits=%d available=%d", requestedPermits, availablePermits) if err := pc.internalFlow(uint32(requestedPermits)); err != nil { @@ -1558,6 +1556,8 @@ type chunkedMsgCtx struct { lastChunkedMsgID int32 chunkedMsgIDs []messageID receivedTime int64 + + mu sync.Mutex } func newChunkedMsgCtx(numChunksFromMsg int32, totalChunkMsgSize int) *chunkedMsgCtx { @@ -1571,25 +1571,39 @@ func newChunkedMsgCtx(numChunksFromMsg int32, totalChunkMsgSize int) *chunkedMsg } func (c *chunkedMsgCtx) refresh(chunkID int32, msgID messageID, partPayload internal.Buffer) { + c.mu.Lock() + defer c.mu.Unlock() c.chunkedMsgIDs[chunkID] = msgID c.chunkedMsgBuffer.Write(partPayload.ReadableSlice()) c.lastChunkedMsgID = chunkID } func (c *chunkedMsgCtx) firstChunkID() messageID { - if len(c.chunkedMsgIDs) == 0 { - return messageID{} - } + c.mu.Lock() + defer c.mu.Unlock() return c.chunkedMsgIDs[0] } func (c *chunkedMsgCtx) lastChunkID() messageID { + c.mu.Lock() + defer c.mu.Unlock() if len(c.chunkedMsgIDs) == 0 { return messageID{} } return c.chunkedMsgIDs[len(c.chunkedMsgIDs)-1] } +func (c *chunkedMsgCtx) discard(pc *partitionConsumer) { + c.mu.Lock() + defer c.mu.Unlock() + + for _, mid := range c.chunkedMsgIDs { + pc.log.Info("Removing chunk message-id", mid.String()) + tmid, _ := toTrackingMessageID(mid) + pc.AckID(tmid) + } +} + type chunkedMsgCtxMap struct { chunkedMsgCtxs map[string]*chunkedMsgCtx pendingQueue *list.List @@ -1667,12 +1681,8 @@ func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { if !ok { return } - for _, mid := range ctx.chunkedMsgIDs { - if autoAck { - c.pc.log.Info("Removing chunk message-id", mid.String()) - tmid, _ := toTrackingMessageID(mid) - c.pc.AckID(tmid) - } + if autoAck { + ctx.discard(c.pc) } delete(c.chunkedMsgCtxs, uuid) e := c.pendingQueue.Front() From 92fbff265c46e876b78a9b6a736ef79d43e12eba Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Tue, 6 Sep 2022 21:01:17 +0800 Subject: [PATCH 14/40] fix: 1. fix the comments of EnableChunking and rename the MaxChunkSize. 2. Add the unit test for chunk size. 3. fix some other bugs. --- pulsar/internal/commands.go | 12 ++- pulsar/message_chunking_test.go | 83 +++++++++++----- pulsar/producer.go | 6 +- pulsar/producer_partition.go | 170 +++++++++++++++----------------- 4 files changed, 148 insertions(+), 123 deletions(-) diff --git a/pulsar/internal/commands.go b/pulsar/internal/commands.go index 2d89947bf7..2207248142 100644 --- a/pulsar/internal/commands.go +++ b/pulsar/internal/commands.go @@ -265,14 +265,16 @@ func serializeMessage(wb Buffer, // error occurred while encrypting the payload, ProducerCryptoFailureAction is set to Fail return fmt.Errorf("encryption of message failed, ProducerCryptoFailureAction is set to Fail. Error :%v", err) } - // the maxMessageSize check of batching message is in here - if len(encryptedPayload) > int(maxMessageSize) { - return fmt.Errorf("%w, size: %d, MaxMessageSize: %d", - ErrExceedMaxMessageSize, len(encryptedPayload), maxMessageSize) - } cmdSize := uint32(proto.Size(cmdSend)) msgMetadataSize := uint32(proto.Size(msgMetadata)) + msgSize := len(encryptedPayload) + int(msgMetadataSize) + + // the maxMessageSize check of batching message is in here + if !(msgMetadata.GetTotalChunkMsgSize() != 0) && msgSize > int(maxMessageSize) { + return fmt.Errorf("%w, size: %d, MaxMessageSize: %d", + ErrExceedMaxMessageSize, msgSize, maxMessageSize) + } frameSizeIdx := wb.WriterIndex() wb.WriteUint32(0) // Skip frame size until we now the size diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 330397637f..23388aa579 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -61,7 +61,7 @@ func TestLargeMessage(t *testing.T) { topic := newTopicName() - // create producer without ChunkMaxMessageSize + // create producer without MaxChunkSize producer1, err := client.CreateProducer(ProducerOptions{ Topic: topic, DisableBatching: true, @@ -71,12 +71,12 @@ func TestLargeMessage(t *testing.T) { assert.NotNil(t, producer1) defer producer1.Close() - // create producer with ChunkMaxMessageSize + // create producer with MaxChunkSize producer2, err := client.CreateProducer(ProducerOptions{ - Topic: topic, - DisableBatching: true, - EnableChunking: true, - ChunkMaxMessageSize: 5, + Topic: topic, + DisableBatching: true, + EnableChunking: true, + MaxChunkSize: 5, }) assert.NoError(t, err) assert.NotNil(t, producer2) @@ -119,7 +119,7 @@ func TestLargeMessage(t *testing.T) { assert.NoError(t, err) } - // test send chunk with ChunkMaxMessageSize limit + // test send chunk with MaxChunkSize limit for i := 0; i < 5; i++ { msg := createTestMessagePayload(50) expectMsgs = append(expectMsgs, msg) @@ -130,7 +130,7 @@ func TestLargeMessage(t *testing.T) { assert.NotNil(t, ID) } - // test receive chunk with ChunkMaxMessageSize limit + // test receive chunk with MaxChunkSize limit for i := 5; i < 10; i++ { msg, err := consumer.Receive(context.Background()) assert.NoError(t, err) @@ -156,7 +156,7 @@ func TestPublishChunkWithFailure(t *testing.T) { topic := newTopicName() - // create producer without ChunkMaxMessageSize + // create producer without MaxChunkSize producer, err := client.CreateProducer(ProducerOptions{ Topic: topic, }) @@ -204,10 +204,10 @@ func TestMaxPendingChunkMessages(t *testing.T) { assert.Nil(t, err) clients = append(clients, pc) producer, err := pc.CreateProducer(ProducerOptions{ - Topic: topic, - DisableBatching: true, - EnableChunking: true, - ChunkMaxMessageSize: 10, + Topic: topic, + DisableBatching: true, + EnableChunking: true, + MaxChunkSize: 10, }) assert.NoError(t, err) assert.NotNil(t, producer) @@ -309,7 +309,7 @@ func TestChunksEnqueueFailed(t *testing.T) { EnableChunking: true, DisableBatching: true, MaxPendingMessages: 10, - ChunkMaxMessageSize: 50, + MaxChunkSize: 50, DisableBlockIfQueueFull: true, }) assert.NoError(t, err) @@ -337,10 +337,10 @@ func TestSeekChunkMessages(t *testing.T) { totalMessages := 5 producer, err := client.CreateProducer(ProducerOptions{ - Topic: topic, - EnableChunking: true, - DisableBatching: true, - ChunkMaxMessageSize: 50, + Topic: topic, + EnableChunking: true, + DisableBatching: true, + MaxChunkSize: 50, }) assert.NoError(t, err) assert.NotNil(t, producer) @@ -401,10 +401,10 @@ func TestChunkAckAndNAck(t *testing.T) { topic := newTopicName() producer, err := client.CreateProducer(ProducerOptions{ - Topic: topic, - EnableChunking: true, - DisableBatching: true, - ChunkMaxMessageSize: 50, + Topic: topic, + EnableChunking: true, + DisableBatching: true, + MaxChunkSize: 50, }) assert.NoError(t, err) assert.NotNil(t, producer) @@ -445,6 +445,45 @@ func TestChunkAckAndNAck(t *testing.T) { assert.Equal(t, msg.Payload(), content) } +func TestChunkSize(t *testing.T) { + rand.Seed(time.Now().Unix()) + + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + assert.Nil(t, err) + defer client.Close() + + //the default message metadata size for string schema + payloadChunkSize := _brokerMaxMessageSize - 18 + + topic := newTopicName() + + producer, err := client.CreateProducer(ProducerOptions{ + Name: "test", + Topic: topic, + EnableChunking: true, + DisableBatching: true, + }) + assert.NoError(t, err) + assert.NotNil(t, producer) + defer producer.Close() + + for size := payloadChunkSize; size <= _brokerMaxMessageSize; size++ { + msgID, err := producer.Send(context.Background(), &ProducerMessage{ + Payload: createTestMessagePayload(size), + }) + assert.NoError(t, err) + if size <= payloadChunkSize { + _, ok := msgID.(messageID) + assert.Equal(t, true, ok) + } else { + _, ok := msgID.(chunkMessageID) + assert.Equal(t, true, ok) + } + } +} + func createTestMessagePayload(size int) []byte { payload := make([]byte, size) for i := range payload { diff --git a/pulsar/producer.go b/pulsar/producer.go index e4e84f77bc..dfcd30499a 100644 --- a/pulsar/producer.go +++ b/pulsar/producer.go @@ -175,12 +175,12 @@ type ProducerOptions struct { // EnableChunking controls whether automatic chunking of messages is enabled for the producer. By default, chunking // is disabled. - // Chunking can not be enabled when batching is not closed. + // Chunking can not be enabled when batching is enabled. EnableChunking bool - // ChunkMaxMessageSize is the max size of single chunk payload. + // MaxChunkSize is the max size of single chunk payload. // It will actually only take effect if it is smaller than broker.MaxMessageSize - ChunkMaxMessageSize uint + MaxChunkSize uint } // Producer is used to publish messages on a topic diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 6a750ea4fc..3ba20d9f4c 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -81,7 +81,6 @@ type partitionProducer struct { batchFlushTicker *time.Ticker encryptor internalcrypto.Encryptor compressionProvider compression.Provider - chunkRecorder *chunkRecorder // Channel where app is posting messages to be published eventsChan chan interface{} @@ -155,7 +154,6 @@ func newPartitionProducer(client *client, topic string, options *ProducerOptions batchFlushTicker: time.NewTicker(batchingMaxPublishDelay), compressionProvider: internal.GetCompressionProvider(pb.CompressionType(options.CompressionType), compression.Level(options.CompressionLevel)), - chunkRecorder: newChunkRecorder(), publishSemaphore: internal.NewSemaphore(int32(maxPendingMessages)), pendingQueue: internal.NewBlockingQueue(maxPendingMessages), lastSequenceID: -1, @@ -256,7 +254,6 @@ func (p *partitionProducer) grabCnx() error { p.producerName = res.Response.ProducerSuccess.GetProducerName() - //var encryptor internalcrypto.Encryptor if p.options.Encryption != nil { p.encryptor = internalcrypto.NewProducerEncryptor(p.options.Encryption.Keys, p.options.Encryption.KeyReader, @@ -465,7 +462,7 @@ func (p *partitionProducer) Name() string { } func (p *partitionProducer) internalSend(request *sendRequest) { - p.log.Debug("Received send request: ", *request.msg) + p.log.Debug("Received send request: ", *request) msg := request.msg @@ -528,19 +525,18 @@ func (p *partitionProducer) internalSend(request *sendRequest) { uncompressedSize := len(uncompressedPayload) - mm := p.genMetadata(msg, uncompressedSize) + deliverAt := msg.DeliverAt + if msg.DeliverAfter.Nanoseconds() > 0 { + deliverAt = time.Now().Add(msg.DeliverAfter) + } + + mm := p.genMetadata(msg, uncompressedSize, deliverAt) // set default ReplicationClusters when DisableReplication if msg.DisableReplication { msg.ReplicationClusters = []string{"__local__"} } - // todo: deliverAt has calculated in genMetadata but it's not a good idea to make genMetadata() return it. - deliverAt := msg.DeliverAt - if msg.DeliverAfter.Nanoseconds() > 0 { - deliverAt = time.Now().Add(msg.DeliverAfter) - } - sendAsBatch := !p.options.DisableBatching && msg.ReplicationClusters == nil && deliverAt.UnixNano() < 0 @@ -590,9 +586,9 @@ func (p *partitionProducer) internalSend(request *sendRequest) { p.metrics.PublishErrorsMsgTooLarge.Inc() return } - // set ChunkMaxMessageSize - if p.options.ChunkMaxMessageSize != 0 { - payloadChunkSize = int(math.Min(float64(payloadChunkSize), float64(p.options.ChunkMaxMessageSize))) + // set MaxChunkSize + if p.options.MaxChunkSize != 0 { + payloadChunkSize = int(math.Min(float64(payloadChunkSize), float64(p.options.MaxChunkSize))) } totalChunks = int(math.Max(1, math.Ceil(float64(compressedSize)/float64(payloadChunkSize)))) } @@ -600,6 +596,10 @@ func (p *partitionProducer) internalSend(request *sendRequest) { // correct limit queue when chunked for i := 0; i < totalChunks-1; i++ { if !p.canAddToQueue(request) { + // release all permits including the first + for j := i; j >= 0; j-- { + p.publishSemaphore.Release() + } return } } @@ -608,17 +608,13 @@ func (p *partitionProducer) internalSend(request *sendRequest) { request.totalChunks = totalChunks if !sendAsBatch { - if msg.SequenceID != nil { - mm.SequenceId = proto.Uint64(uint64(*msg.SequenceID)) - } else { - mm.SequenceId = proto.Uint64(internal.GetAndAdd(p.sequenceIDGenerator, 1)) - } if totalChunks > 1 { var lhs, rhs int uuid := fmt.Sprintf("%s-%s", p.producerName, strconv.FormatUint(*mm.SequenceId, 10)) mm.Uuid = proto.String(uuid) mm.NumChunksFromMsg = proto.Int(totalChunks) mm.TotalChunkMsgSize = proto.Int(compressedSize) + cr := newChunkRecorder() for chunkID := 0; chunkID < totalChunks; chunkID++ { lhs = chunkID * payloadChunkSize if rhs = lhs + payloadChunkSize; rhs > compressedSize { @@ -627,13 +623,15 @@ func (p *partitionProducer) internalSend(request *sendRequest) { // update chunk id mm.ChunkId = proto.Int(chunkID) nsr := &sendRequest{ - ctx: request.ctx, - msg: request.msg, - callback: request.callback, - publishTime: request.publishTime, - totalChunks: totalChunks, - chunkID: chunkID, - uuid: uuid, + ctx: request.ctx, + msg: request.msg, + callback: request.callback, + callbackOnce: request.callbackOnce, + publishTime: request.publishTime, + totalChunks: totalChunks, + chunkID: chunkID, + uuid: uuid, + chunkRecorder: cr, } p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr, uint32(maxMessageSize)) } @@ -641,7 +639,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { p.internalSingleSend(mm, compressedPayload, request, uint32(maxMessageSize)) } } else { - smm := p.genSingleMetaMessage(msg, uncompressedSize) + smm := p.genSingleMessageMetadataInBatch(msg, uncompressedSize) multiSchemaEnabled := !p.options.DisableMultiSchema added := p.batchBuilder.Add(smm, p.sequenceIDGenerator, uncompressedPayload, request, msg.ReplicationClusters, deliverAt, schemaVersion, multiSchemaEnabled) @@ -669,7 +667,9 @@ func (p *partitionProducer) internalSend(request *sendRequest) { } } -func (p *partitionProducer) genMetadata(msg *ProducerMessage, uncompressedSize int) (mm *pb.MessageMetadata) { +func (p *partitionProducer) genMetadata(msg *ProducerMessage, + uncompressedSize int, + deliverAt time.Time) (mm *pb.MessageMetadata) { mm = &pb.MessageMetadata{ ProducerName: &p.producerName, PublishTime: proto.Uint64(internal.TimestampMillis(time.Now())), @@ -677,6 +677,12 @@ func (p *partitionProducer) genMetadata(msg *ProducerMessage, uncompressedSize i UncompressedSize: proto.Uint32(uint32(uncompressedSize)), } + if msg.SequenceID != nil { + mm.SequenceId = proto.Uint64(uint64(*msg.SequenceID)) + } else { + mm.SequenceId = proto.Uint64(internal.GetAndAdd(p.sequenceIDGenerator, 1)) + } + if msg.Key != "" { mm.PartitionKey = proto.String(msg.Key) } @@ -685,10 +691,6 @@ func (p *partitionProducer) genMetadata(msg *ProducerMessage, uncompressedSize i mm.Properties = internal.ConvertFromStringMap(msg.Properties) } - deliverAt := msg.DeliverAt - if msg.DeliverAfter.Nanoseconds() > 0 { - deliverAt = time.Now().Add(msg.DeliverAfter) - } if deliverAt.UnixNano() > 0 { mm.DeliverAtTime = proto.Int64(int64(internal.TimestampMillis(deliverAt))) } @@ -696,7 +698,7 @@ func (p *partitionProducer) genMetadata(msg *ProducerMessage, uncompressedSize i return } -func (p *partitionProducer) genSingleMetaMessage(msg *ProducerMessage, +func (p *partitionProducer) genSingleMessageMetadataInBatch(msg *ProducerMessage, uncompressedSize int) (smm *pb.SingleMessageMetadata) { smm = &pb.SingleMessageMetadata{ PayloadSize: proto.Int(uncompressedSize), @@ -797,6 +799,12 @@ func (p *partitionProducer) internalFlushCurrentBatch() { sr.callback(nil, sr.msg, err) } } + if errors.Is(err, internal.ErrExceedMaxMessageSize) { + p.log.WithError(errMessageTooLarge). + Errorf("internal err: %s", err) + p.metrics.PublishErrorsMsgTooLarge.Inc() + return + } return } @@ -886,7 +894,6 @@ func (p *partitionProducer) failTimeoutMessages() { for _, i := range pi.sendRequests { sr := i.(*sendRequest) if sr.msg != nil { - // todo: it's not correct. the size should be schemaed uncompressed payload size size := len(sr.msg.Payload) p.publishSemaphore.Release() p.metrics.MessagesPending.Dec() @@ -896,9 +903,11 @@ func (p *partitionProducer) failTimeoutMessages() { WithField("size", size). WithField("properties", sr.msg.Properties) } - // the callback is only invoked by last chunk when chunked - if sr.callback != nil && (sr.totalChunks <= 1 || (sr.chunkID == sr.totalChunks-1)) { - sr.callback(nil, sr.msg, errSendTimeout) + + if sr.callback != nil { + sr.callbackOnce.Do(func() { + sr.callback(nil, sr.msg, errSendTimeout) + }) } } @@ -930,6 +939,12 @@ func (p *partitionProducer) internalFlushCurrentBatches() { sr.callback(nil, sr.msg, errs[i]) } } + if errors.Is(errs[i], internal.ErrExceedMaxMessageSize) { + p.log.WithError(errMessageTooLarge). + Errorf("internal err: %s", errs[i]) + p.metrics.PublishErrorsMsgTooLarge.Inc() + return + } continue } if batchesData[i] == nil { @@ -999,9 +1014,6 @@ func (p *partitionProducer) Send(ctx context.Context, msg *ProducerMessage) (Mes // wait for send request to finish <-doneCh - // handle internal error - p.internalErrHandle(err) - return msgID, err } @@ -1020,10 +1032,15 @@ func (p *partitionProducer) internalSendAsync(ctx context.Context, msg *Producer // bc only works when DisableBlockIfQueueFull is false bc := make(chan struct{}) + + // callbackOnce make sure the callback is only invoked once in chunking + callbackOnce := &sync.Once{} + sr := &sendRequest{ ctx: ctx, msg: msg, callback: callback, + callbackOnce: callbackOnce, flushImmediately: flushImmediately, publishTime: time.Now(), blockCh: bc, @@ -1077,7 +1094,6 @@ func (p *partitionProducer) ReceivedSendReceipt(response *pb.CommandSendReceipt) p.metrics.PublishLatency.Observe(float64(now-sr.publishTime.UnixNano()) / 1.0e9) p.metrics.MessagesPublished.Inc() p.metrics.MessagesPending.Dec() - // todo: it's not correct. the size should be schemaed uncompressed payload size payloadSize := float64(len(sr.msg.Payload)) p.metrics.BytesPublished.Add(payloadSize) p.metrics.BytesPending.Sub(payloadSize) @@ -1093,30 +1109,23 @@ func (p *partitionProducer) ReceivedSendReceipt(response *pb.CommandSendReceipt) if sr.totalChunks > 1 { if sr.chunkID == 0 { - cmid := p.chunkRecorder.addIfAbsent(sr.uuid) - cmid.firstChunkID = messageID{ - int64(response.MessageId.GetLedgerId()), - int64(response.MessageId.GetEntryId()), - -1, - p.partitionIdx, - } + sr.chunkRecorder.setFirstChunkID( + messageID{ + int64(response.MessageId.GetLedgerId()), + int64(response.MessageId.GetEntryId()), + -1, + p.partitionIdx, + }) } else if sr.chunkID == sr.totalChunks-1 { - cmid, ok := p.chunkRecorder.get(sr.uuid) - if ok { - cmid.messageID = messageID{ + sr.chunkRecorder.setLastChunkID( + messageID{ int64(response.MessageId.GetLedgerId()), int64(response.MessageId.GetEntryId()), -1, p.partitionIdx, - } - // use chunkMsgID to set msgID - msgID = *cmid - p.chunkRecorder.remove(sr.uuid) - } else { - p.log.Warnf("Recorder has lost the chunk message id, uuid: %v", - sr.uuid) - continue - } + }) + // use chunkMsgID to set msgID + msgID = sr.chunkRecorder.chunkedMsgID } } @@ -1214,6 +1223,7 @@ type sendRequest struct { ctx context.Context msg *ProducerMessage callback func(MessageID, *ProducerMessage, error) + callbackOnce *sync.Once publishTime time.Time flushImmediately bool blockCh chan struct{} @@ -1221,6 +1231,7 @@ type sendRequest struct { totalChunks int chunkID int uuid string + chunkRecorder *chunkRecorder } type closeProducer struct { @@ -1279,47 +1290,20 @@ func (p *partitionProducer) canAddToQueue(sr *sendRequest) bool { return true } -func (p *partitionProducer) internalErrHandle(err error) { - if err == nil { - return - } else if errors.Is(err, internal.ErrExceedMaxMessageSize) { - p.log.WithError(errMessageTooLarge). - Errorf("internal err: %s", err) - p.metrics.PublishErrorsMsgTooLarge.Inc() - return - } - // other internal error can be handled here -} - type chunkRecorder struct { - chunkedMsgCtxs map[string]*chunkMessageID - mu sync.Mutex + chunkedMsgID chunkMessageID } func newChunkRecorder() *chunkRecorder { return &chunkRecorder{ - chunkedMsgCtxs: make(map[string]*chunkMessageID), - } -} - -func (c *chunkRecorder) addIfAbsent(uuid string) *chunkMessageID { - c.mu.Lock() - defer c.mu.Unlock() - if _, ok := c.chunkedMsgCtxs[uuid]; !ok { - c.chunkedMsgCtxs[uuid] = &chunkMessageID{} + chunkedMsgID: chunkMessageID{}, } - return c.chunkedMsgCtxs[uuid] } -func (c *chunkRecorder) get(uuid string) (*chunkMessageID, bool) { - c.mu.Lock() - defer c.mu.Unlock() - cmid, ok := c.chunkedMsgCtxs[uuid] - return cmid, ok +func (c *chunkRecorder) setFirstChunkID(msgID messageID) { + c.chunkedMsgID.firstChunkID = msgID } -func (c *chunkRecorder) remove(uuid string) { - c.mu.Lock() - defer c.mu.Unlock() - delete(c.chunkedMsgCtxs, uuid) +func (c *chunkRecorder) setLastChunkID(msgID messageID) { + c.chunkedMsgID.messageID = msgID } From d23d1cd916d015602085fdb85018107b46ac723e Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Wed, 7 Sep 2022 17:31:46 +0800 Subject: [PATCH 15/40] fix: 1. fix the bug of seq ID. 2. make code pass golangci --- pulsar/producer_partition.go | 22 +++++++++++++++------- pulsar/producer_test.go | 5 ++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 3ba20d9f4c..3c5008debb 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -462,7 +462,7 @@ func (p *partitionProducer) Name() string { } func (p *partitionProducer) internalSend(request *sendRequest) { - p.log.Debug("Received send request: ", *request) + p.log.Debug("Received send request: ", *request.msg) msg := request.msg @@ -541,6 +541,12 @@ func (p *partitionProducer) internalSend(request *sendRequest) { msg.ReplicationClusters == nil && deliverAt.UnixNano() < 0 + if !sendAsBatch { + // update sequence id for metadata, make the size of msgMetadata more accurate + // batch sending will update sequence ID in the BatchBuilder + p.updateMetadataSeqID(mm, msg) + } + maxMessageSize := int(p._getConn().GetMaxMessageSize()) // compress payload if not batching @@ -677,12 +683,6 @@ func (p *partitionProducer) genMetadata(msg *ProducerMessage, UncompressedSize: proto.Uint32(uint32(uncompressedSize)), } - if msg.SequenceID != nil { - mm.SequenceId = proto.Uint64(uint64(*msg.SequenceID)) - } else { - mm.SequenceId = proto.Uint64(internal.GetAndAdd(p.sequenceIDGenerator, 1)) - } - if msg.Key != "" { mm.PartitionKey = proto.String(msg.Key) } @@ -698,6 +698,14 @@ func (p *partitionProducer) genMetadata(msg *ProducerMessage, return } +func (p *partitionProducer) updateMetadataSeqID(mm *pb.MessageMetadata, msg *ProducerMessage) { + if msg.SequenceID != nil { + mm.SequenceId = proto.Uint64(uint64(*msg.SequenceID)) + } else { + mm.SequenceId = proto.Uint64(internal.GetAndAdd(p.sequenceIDGenerator, 1)) + } +} + func (p *partitionProducer) genSingleMessageMetadataInBatch(msg *ProducerMessage, uncompressedSize int) (smm *pb.SingleMessageMetadata) { smm = &pb.SingleMessageMetadata{ diff --git a/pulsar/producer_test.go b/pulsar/producer_test.go index 061cdcfd45..f193ffd4ec 100644 --- a/pulsar/producer_test.go +++ b/pulsar/producer_test.go @@ -966,15 +966,14 @@ func TestMaxMessageSize(t *testing.T) { } } - // singleMsgMetadata is not included in the payload of a non-batching message for bias := -1; bias <= 1; bias++ { payload := make([]byte, serverMaxMessageSize+bias) ID, err := producer2.Send(context.Background(), &ProducerMessage{ Payload: payload, }) if bias <= 0 { - assert.NoError(t, err) - assert.NotNil(t, ID) + assert.Equal(t, true, errors.Is(err, internal.ErrExceedMaxMessageSize)) + assert.Nil(t, ID) } else { assert.Equal(t, errMessageTooLarge, err) } From 5e098d23dbd2804bbd8cbbee9513d7ce420403b7 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Thu, 8 Sep 2022 21:32:27 +0800 Subject: [PATCH 16/40] fix: fix the test function TestChunkSize() --- pulsar/message_chunking_test.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 23388aa579..3326e56f79 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -454,8 +454,12 @@ func TestChunkSize(t *testing.T) { assert.Nil(t, err) defer client.Close() - //the default message metadata size for string schema - payloadChunkSize := _brokerMaxMessageSize - 18 + // the default message metadata size for string schema + // The proto messageMetaData size as following. (all with tag) + // | producerName | sequenceID | publishTime | uncompressedSize | + // | ------------ | ---------- | ----------- | ---------------- | + // | 6 | 2 | 7 | 4 | + payloadChunkSize := _brokerMaxMessageSize - 19 topic := newTopicName() From 203a1fc7f70f26ae03b6d4c780021c39b92aff22 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Fri, 9 Sep 2022 15:45:13 +0800 Subject: [PATCH 17/40] fix: revert negativeAcksTracker.Add() --- pulsar/negative_acks_tracker.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pulsar/negative_acks_tracker.go b/pulsar/negative_acks_tracker.go index cde85195d6..3485e1b02d 100644 --- a/pulsar/negative_acks_tracker.go +++ b/pulsar/negative_acks_tracker.go @@ -65,12 +65,12 @@ func newNegativeAcksTracker(rc redeliveryConsumer, delay time.Duration, return t } -func (t *negativeAcksTracker) Add(msgID MessageID) { +func (t *negativeAcksTracker) Add(msgID messageID) { // Always clear up the batch index since we want to track the nack // for the entire batch batchMsgID := messageID{ - ledgerID: msgID.LedgerID(), - entryID: msgID.EntryID(), + ledgerID: msgID.ledgerID, + entryID: msgID.entryID, batchIdx: 0, } From 2a1342d92779551823c7df38a7d3bd9db192d7b0 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Fri, 9 Sep 2022 15:48:48 +0800 Subject: [PATCH 18/40] fix: modify the comment of ExpireTimeOfIncompleteChunk --- pulsar/consumer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pulsar/consumer.go b/pulsar/consumer.go index 994dd7889b..b762727f7f 100644 --- a/pulsar/consumer.go +++ b/pulsar/consumer.go @@ -194,7 +194,7 @@ type ConsumerOptions struct { MaxPendingChunkedMessage int // ExpireTimeOfIncompleteChunk sets the expiry time of discarding incomplete chunked message. - // The timing accuracy is 100ms level and the default value is 60 * 1000 millis + // The minimum value is 100ms. (default: 60 * 1000 millis) ExpireTimeOfIncompleteChunk time.Duration // AutoAckIncompleteChunk sets whether consumer auto acknowledges incomplete chunked message when it should From fda69974f0b39b5db4f15c21de4102efae4897be Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Tue, 13 Sep 2022 16:21:03 +0800 Subject: [PATCH 19/40] fix: fix the ack/nack for chunking --- pulsar/consumer_impl.go | 34 +++++----- pulsar/consumer_partition.go | 125 +++++++++++++++++++++++++++-------- 2 files changed, 113 insertions(+), 46 deletions(-) diff --git a/pulsar/consumer_impl.go b/pulsar/consumer_impl.go index 28a1b5e4dd..5c4d518029 100644 --- a/pulsar/consumer_impl.go +++ b/pulsar/consumer_impl.go @@ -36,8 +36,8 @@ import ( const defaultNackRedeliveryDelay = 1 * time.Minute type acker interface { - AckID(id trackingMessageID) error - NackID(id trackingMessageID) + AckID(id MessageID) error + NackID(id MessageID) NackMsg(msg Message) } @@ -465,16 +465,16 @@ func (c *consumer) Ack(msg Message) error { // AckID the consumption of a single message, identified by its MessageID func (c *consumer) AckID(msgID MessageID) error { - mid, ok := c.messageID(msgID) - if !ok { - return errors.New("failed to convert trackingMessageID") - } - - if mid.consumer != nil { - return mid.Ack() + partition := int(msgID.PartitionIdx()) + // did we receive a valid partition index? + if partition < 0 || partition >= len(c.consumers) { + c.log.Warnf("invalid partition index %d expected a partition between [0-%d]", + partition, len(c.consumers)) + return fmt.Errorf("invalid partition index %d expected a partition between [0-%d]", + partition, len(c.consumers)) } - return c.consumers[mid.partitionIdx].AckID(mid) + return c.consumers[partition].AckID(msgID) } // ReconsumeLater mark a message for redelivery after custom delay @@ -545,17 +545,15 @@ func (c *consumer) Nack(msg Message) { } func (c *consumer) NackID(msgID MessageID) { - mid, ok := c.messageID(msgID) - if !ok { - return - } - - if mid.consumer != nil { - mid.Nack() + partition := int(msgID.PartitionIdx()) + // did we receive a valid partition index? + if partition < 0 || partition >= len(c.consumers) { + c.log.Warnf("invalid partition index %d expected a partition between [0-%d]", + partition, len(c.consumers)) return } - c.consumers[mid.partitionIdx].NackID(mid) + c.consumers[partition].NackID(msgID) } func (c *consumer) Close() { diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index acd1d7f51e..ce1dceaed7 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -156,7 +156,8 @@ type partitionConsumer struct { decryptor cryptointernal.Decryptor schemaInfoCache *schemaInfoCache - chunkedMsgCtxMap *chunkedMsgCtxMap + chunkedMsgCtxMap *chunkedMsgCtxMap + unAckChunksTracker *unAckChunksTracker } type schemaInfoCache struct { @@ -232,6 +233,7 @@ func newPartitionConsumer(parent Consumer, client *client, options *partitionCon schemaInfoCache: newSchemaInfoCache(client, options.topic), } pc.chunkedMsgCtxMap = newChunkedMsgCtxMap(options.maxPendingChunkedMessage, pc) + pc.unAckChunksTracker = newUnAckChunksTracker(pc) pc.setConsumerState(consumerInit) pc.log = client.log.SubLogger(log.Fields{ "name": pc.name, @@ -374,17 +376,26 @@ func (pc *partitionConsumer) requestGetLastMessageID() (trackingMessageID, error return convertToMessageID(id), nil } -func (pc *partitionConsumer) AckID(msgID trackingMessageID) error { +func (pc *partitionConsumer) AckID(msgID MessageID) error { if state := pc.getConsumerState(); state == consumerClosed || state == consumerClosing { pc.log.WithField("state", state).Error("Failed to ack by closing or closed consumer") return errors.New("consumer state is closed") } + if cmid, ok := toChunkedMessageID(msgID); ok { + return pc.unAckChunksTracker.ack(cmid) + } + + trackingID, ok := toTrackingMessageID(msgID) + if !ok { + return errors.New("failed to convert trackingMessageID") + } + ackReq := new(ackRequest) - if !msgID.Undefined() && msgID.ack() { + if !trackingID.Undefined() && trackingID.ack() { pc.metrics.AcksCounter.Inc() - pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-msgID.receivedTime.UnixNano()) / 1.0e9) - ackReq.msgID = msgID + pc.metrics.ProcessingTime.Observe(float64(time.Now().UnixNano()-trackingID.receivedTime.UnixNano()) / 1.0e9) + ackReq.msgID = trackingID // send ack request to eventsCh pc.eventsCh <- ackReq @@ -394,8 +405,18 @@ func (pc *partitionConsumer) AckID(msgID trackingMessageID) error { return ackReq.err } -func (pc *partitionConsumer) NackID(msgID trackingMessageID) { - pc.nackTracker.Add(msgID.messageID) +func (pc *partitionConsumer) NackID(msgID MessageID) { + if cmid, ok := toChunkedMessageID(msgID); ok { + pc.unAckChunksTracker.nack(cmid) + return + } + + trackingID, ok := toTrackingMessageID(msgID) + if !ok { + return + } + + pc.nackTracker.Add(trackingID.messageID) pc.metrics.NacksCounter.Inc() } @@ -729,9 +750,19 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header pc.metrics.BytesReceived.Add(float64(len(payload))) pc.metrics.PrefetchedBytes.Add(float64(len(payload))) - var msgID MessageID - var trackingMsgID trackingMessageID + trackingMsgID := newTrackingMessageID( + int64(pbMsgID.GetLedgerId()), + int64(pbMsgID.GetEntryId()), + int32(i), + pc.partitionIdx, + ackTracker) + if pc.messageShouldBeDiscarded(trackingMsgID) { + pc.AckID(trackingMsgID) + continue + } + + var msgID MessageID if isChunkedMsg { ctx := pc.chunkedMsgCtxMap.get(msgMeta.GetUuid()) if ctx == nil { @@ -739,27 +770,11 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header return nil } msgID = newChunkMessageID(ctx.firstChunkID(), ctx.lastChunkID()) - trackingMsgID, _ = toTrackingMessageID(msgID) - } else { - trackingMsgID = newTrackingMessageID( - int64(pbMsgID.GetLedgerId()), - int64(pbMsgID.GetEntryId()), - int32(i), - pc.partitionIdx, - ackTracker) - // set the consumer so we know how to ack the message id - trackingMsgID.consumer = pc - msgID = trackingMsgID - } - - if pc.messageShouldBeDiscarded(trackingMsgID) { - pc.AckID(trackingMsgID) - continue - } - - if isChunkedMsg { // clean chunkedMsgCtxMap pc.chunkedMsgCtxMap.remove(msgMeta.GetUuid()) + pc.unAckChunksTracker.add(msgID.(chunkMessageID), ctx.chunkedMsgIDs) + } else { + msgID = trackingMsgID } var messageIndex *uint64 @@ -1702,3 +1717,57 @@ func (c *chunkedMsgCtxMap) Close() { c.tw.Stop() } + +type unAckChunksTracker struct { + chunkIDs map[chunkMessageID][]messageID + pc *partitionConsumer + mu sync.Mutex +} + +func newUnAckChunksTracker(pc *partitionConsumer) *unAckChunksTracker { + return &unAckChunksTracker{ + chunkIDs: make(map[chunkMessageID][]messageID), + pc: pc, + } +} + +func (u *unAckChunksTracker) add(cmid chunkMessageID, ids []messageID) { + u.mu.Lock() + defer u.mu.Unlock() + + u.chunkIDs[cmid] = ids +} + +func (u *unAckChunksTracker) get(cmid chunkMessageID) []messageID { + u.mu.Lock() + defer u.mu.Unlock() + + return u.chunkIDs[cmid] +} + +func (u *unAckChunksTracker) remove(cmid chunkMessageID) { + u.mu.Lock() + defer u.mu.Unlock() + + delete(u.chunkIDs, cmid) +} + +func (u *unAckChunksTracker) ack(cmid chunkMessageID) error { + ids := u.get(cmid) + for _, id := range ids { + if err := u.pc.AckID(id); err != nil { + return err + } + } + u.remove(cmid) + return nil +} + +func (u *unAckChunksTracker) nack(cmid chunkMessageID) { + ids := u.get(cmid) + for _, id := range ids { + u.pc.nackTracker.Add(id) + u.pc.metrics.NacksCounter.Inc() + } + u.remove(cmid) +} From 7fd7ffad192a4d4f6e0b00f8fc83d970153411b9 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Tue, 13 Sep 2022 22:00:22 +0800 Subject: [PATCH 20/40] fix: fix the chunking message ack/nack for multiTopicConsumer and RegexConsumer --- pulsar/consumer_multitopic.go | 4 +- pulsar/consumer_partition.go | 9 ++++- pulsar/consumer_regex.go | 4 +- pulsar/impl_message.go | 3 ++ pulsar/message_chunking_test.go | 66 +++++++++++++++++++++++++++++++++ 5 files changed, 80 insertions(+), 6 deletions(-) diff --git a/pulsar/consumer_multitopic.go b/pulsar/consumer_multitopic.go index 1d75a2477b..069d7d4795 100644 --- a/pulsar/consumer_multitopic.go +++ b/pulsar/consumer_multitopic.go @@ -136,7 +136,7 @@ func (c *multiTopicConsumer) AckID(msgID MessageID) error { return errors.New("unable to ack message because consumer is nil") } - return mid.Ack() + return mid.consumer.AckID(msgID) } func (c *multiTopicConsumer) ReconsumeLater(msg Message, delay time.Duration) { @@ -196,7 +196,7 @@ func (c *multiTopicConsumer) NackID(msgID MessageID) { return } - mid.Nack() + mid.consumer.NackID(msgID) } func (c *multiTopicConsumer) Close() { diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index ce1dceaed7..059843ed32 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -756,6 +756,8 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header int32(i), pc.partitionIdx, ackTracker) + // set the consumer so we know how to ack the message id + trackingMsgID.consumer = pc if pc.messageShouldBeDiscarded(trackingMsgID) { pc.AckID(trackingMsgID) @@ -769,10 +771,13 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header // chunkedMsgCtxMap has closed because of consumer closed return nil } - msgID = newChunkMessageID(ctx.firstChunkID(), ctx.lastChunkID()) + cmid := newChunkMessageID(ctx.firstChunkID(), ctx.lastChunkID()) + // set the consumer so we know how to ack the message id + cmid.consumer = pc // clean chunkedMsgCtxMap pc.chunkedMsgCtxMap.remove(msgMeta.GetUuid()) - pc.unAckChunksTracker.add(msgID.(chunkMessageID), ctx.chunkedMsgIDs) + pc.unAckChunksTracker.add(cmid, ctx.chunkedMsgIDs) + msgID = cmid } else { msgID = trackingMsgID } diff --git a/pulsar/consumer_regex.go b/pulsar/consumer_regex.go index e4d2077ac7..fdf2344c3e 100644 --- a/pulsar/consumer_regex.go +++ b/pulsar/consumer_regex.go @@ -180,7 +180,7 @@ func (c *regexConsumer) AckID(msgID MessageID) error { return errors.New("consumer is nil in consumer_regex") } - return mid.Ack() + return mid.consumer.AckID(msgID) } func (c *regexConsumer) Nack(msg Message) { @@ -215,7 +215,7 @@ func (c *regexConsumer) NackID(msgID MessageID) { return } - mid.Nack() + mid.consumer.NackID(msgID) } func (c *regexConsumer) Close() { diff --git a/pulsar/impl_message.go b/pulsar/impl_message.go index e477b81d4a..441b9d788e 100644 --- a/pulsar/impl_message.go +++ b/pulsar/impl_message.go @@ -203,6 +203,7 @@ func toTrackingMessageID(msgID MessageID) (trackingMessageID, bool) { return trackingMessageID{ messageID: cmid.messageID, receivedTime: cmid.receivedTime, + consumer: cmid.consumer, }, true } else { return trackingMessageID{}, false @@ -380,6 +381,8 @@ type chunkMessageID struct { firstChunkID messageID receivedTime time.Time + + consumer acker } func newChunkMessageID(firstChunkID messageID, lastChunkID messageID) chunkMessageID { diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 3326e56f79..9d28e3b2ec 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -20,7 +20,9 @@ package pulsar import ( "context" "errors" + "fmt" "math/rand" + "strings" "sync" "testing" "time" @@ -488,6 +490,70 @@ func TestChunkSize(t *testing.T) { } } +func TestChunkMultiTopicConsumerReceive(t *testing.T) { + topic1 := newTopicName() + topic2 := newTopicName() + + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + if err != nil { + t.Fatal(err) + } + topics := []string{topic1, topic2} + consumer, err := client.Subscribe(ConsumerOptions{ + Topics: topics, + SubscriptionName: "multi-topic-sub", + }) + if err != nil { + t.Fatal(err) + } + defer consumer.Close() + + maxSize := 50 + + // produce messages + for i, topic := range topics { + p, err := client.CreateProducer(ProducerOptions{ + Topic: topic, + DisableBatching: true, + EnableChunking: true, + MaxChunkSize: uint(maxSize), + }) + if err != nil { + t.Fatal(err) + } + err = genMessages(p, 10, func(idx int) string { + return fmt.Sprintf("topic-%d-hello-%d-%s", i+1, idx, string(createTestMessagePayload(100))) + }) + p.Close() + if err != nil { + t.Fatal(err) + } + } + + receivedTopic1 := 0 + receivedTopic2 := 0 + // nolint + for receivedTopic1+receivedTopic2 < 20 { + select { + case cm, ok := <-consumer.Chan(): + if ok { + msg := string(cm.Payload()) + if strings.HasPrefix(msg, "topic-1") { + receivedTopic1++ + } else if strings.HasPrefix(msg, "topic-2") { + receivedTopic2++ + } + consumer.Ack(cm.Message) + } else { + t.Fail() + } + } + } + assert.Equal(t, receivedTopic1, receivedTopic2) +} + func createTestMessagePayload(size int) []byte { payload := make([]byte, size) for i := range payload { From 8904e3c3adb9bc81e1679e0661837b0212608aaf Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Wed, 14 Sep 2022 11:03:34 +0800 Subject: [PATCH 21/40] refactor: remove the timewheel and do some refactor. --- pulsar/consumer_impl.go | 39 +++--- pulsar/consumer_partition.go | 19 ++- pulsar/internal/timewheel.go | 191 ------------------------------ pulsar/internal/timewheel_test.go | 94 --------------- 4 files changed, 28 insertions(+), 315 deletions(-) delete mode 100644 pulsar/internal/timewheel.go delete mode 100644 pulsar/internal/timewheel_test.go diff --git a/pulsar/consumer_impl.go b/pulsar/consumer_impl.go index 5c4d518029..3f178f98bc 100644 --- a/pulsar/consumer_impl.go +++ b/pulsar/consumer_impl.go @@ -19,7 +19,6 @@ package pulsar import ( "context" - "errors" "fmt" "math/rand" "strconv" @@ -465,16 +464,11 @@ func (c *consumer) Ack(msg Message) error { // AckID the consumption of a single message, identified by its MessageID func (c *consumer) AckID(msgID MessageID) error { - partition := int(msgID.PartitionIdx()) - // did we receive a valid partition index? - if partition < 0 || partition >= len(c.consumers) { - c.log.Warnf("invalid partition index %d expected a partition between [0-%d]", - partition, len(c.consumers)) - return fmt.Errorf("invalid partition index %d expected a partition between [0-%d]", - partition, len(c.consumers)) + if err := c.checkMsgIDPartition(msgID); err != nil { + return err } - return c.consumers[partition].AckID(msgID) + return c.consumers[msgID.PartitionIdx()].AckID(msgID) } // ReconsumeLater mark a message for redelivery after custom delay @@ -534,7 +528,7 @@ func (c *consumer) Nack(msg Message) { } if mid.consumer != nil { - mid.Nack() + mid.consumer.NackID(msg.ID()) return } c.consumers[mid.partitionIdx].NackMsg(msg) @@ -545,15 +539,11 @@ func (c *consumer) Nack(msg Message) { } func (c *consumer) NackID(msgID MessageID) { - partition := int(msgID.PartitionIdx()) - // did we receive a valid partition index? - if partition < 0 || partition >= len(c.consumers) { - c.log.Warnf("invalid partition index %d expected a partition between [0-%d]", - partition, len(c.consumers)) + if err := c.checkMsgIDPartition(msgID); err != nil { return } - c.consumers[partition].NackID(msgID) + c.consumers[msgID.PartitionIdx()].NackID(msgID) } func (c *consumer) Close() { @@ -589,10 +579,8 @@ func (c *consumer) Seek(msgID MessageID) error { return newError(SeekFailed, "for partition topic, seek command should perform on the individual partitions") } - if msgID.PartitionIdx() < 0 || int(msgID.PartitionIdx()) >= len(c.consumers) { - c.log.Errorf("invalid partition index %d expected a partition between [0-%d]", - msgID.PartitionIdx(), len(c.consumers)) - return errors.New("invalid partition index") + if err := c.checkMsgIDPartition(msgID); err != nil { + return err } return c.consumers[msgID.PartitionIdx()].Seek(msgID) @@ -612,6 +600,17 @@ func (c *consumer) SeekByTime(time time.Time) error { return errs } +func (c *consumer) checkMsgIDPartition(msgID MessageID) error { + partition := msgID.PartitionIdx() + if partition < 0 || int(partition) >= len(c.consumers) { + c.log.Errorf("invalid partition index %d expected a partition between [0-%d]", + partition, len(c.consumers)) + return fmt.Errorf("invalid partition index %d expected a partition between [0-%d]", + partition, len(c.consumers)) + } + return nil +} + var r = &random{ R: rand.New(rand.NewSource(time.Now().UnixNano())), } diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index 059843ed32..8df4d6971b 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -1630,21 +1630,16 @@ type chunkedMsgCtxMap struct { maxPending int pc *partitionConsumer mu sync.Mutex - tw *internal.TimeWheel closed bool } func newChunkedMsgCtxMap(maxPending int, pc *partitionConsumer) *chunkedMsgCtxMap { - // create a time wheel with 100ms timing accuracy and 36000 slots - tw := internal.NewTimeWheel(time.Millisecond*100, 36000) - tw.Start() return &chunkedMsgCtxMap{ chunkedMsgCtxs: make(map[string]*chunkedMsgCtx, maxPending), pendingQueue: list.New(), maxPending: maxPending, pc: pc, mu: sync.Mutex{}, - tw: tw, } } @@ -1657,9 +1652,7 @@ func (c *chunkedMsgCtxMap) addIfAbsent(uuid string, totalChunks int32, totalChun if _, ok := c.chunkedMsgCtxs[uuid]; !ok { c.chunkedMsgCtxs[uuid] = newChunkedMsgCtx(totalChunks, totalChunkMsgSize) c.pendingQueue.PushBack(uuid) - c.tw.AddJob(uuid, c.pc.options.expireTimeOfIncompleteChunk, func() { - c.removeChunkMessage(uuid, true) - }) + go c.removeChunkIfExpire(uuid, true, c.pc.options.expireTimeOfIncompleteChunk) } if c.maxPending > 0 && c.pendingQueue.Len() > c.maxPending { go c.removeChunkMessage(uuid, c.pc.options.autoAckIncompleteChunk) @@ -1715,12 +1708,18 @@ func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { c.pc.log.Infof("Chunked message [%s] has been removed from chunkedMsgCtxMap", uuid) } +func (c *chunkedMsgCtxMap) removeChunkIfExpire(uuid string, autoAck bool, expire time.Duration) { + timer := time.NewTimer(expire) + select { + case <-timer.C: + c.removeChunkMessage(uuid, autoAck) + } +} + func (c *chunkedMsgCtxMap) Close() { c.mu.Lock() defer c.mu.Unlock() c.closed = true - - c.tw.Stop() } type unAckChunksTracker struct { diff --git a/pulsar/internal/timewheel.go b/pulsar/internal/timewheel.go deleted file mode 100644 index e996d8a4e9..0000000000 --- a/pulsar/internal/timewheel.go +++ /dev/null @@ -1,191 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package internal - -import ( - "container/list" - "log" - "time" -) - -// TimeWheel can execute job after waiting given duration -type TimeWheel struct { - interval time.Duration - ticker *time.Ticker - slots []*list.List - - timer map[string]int - currentPos int - slotNum int - addTaskChannel chan task - removeTaskChannel chan string - stopChannel chan bool -} - -type task struct { - delay time.Duration - circle int - key string - job func() -} - -// NewTimeWheel creates a new time wheel -func NewTimeWheel(interval time.Duration, slotNum int) *TimeWheel { - if interval <= 0 || slotNum <= 0 { - return nil - } - tw := &TimeWheel{ - interval: interval, - slots: make([]*list.List, slotNum), - timer: make(map[string]int), - currentPos: 0, - slotNum: slotNum, - addTaskChannel: make(chan task), - removeTaskChannel: make(chan string), - stopChannel: make(chan bool), - } - tw.initSlots() - - return tw -} - -func (tw *TimeWheel) initSlots() { - for i := 0; i < tw.slotNum; i++ { - tw.slots[i] = list.New() - } -} - -// Start starts ticker for time wheel -func (tw *TimeWheel) Start() { - tw.ticker = time.NewTicker(tw.interval) - go tw.start() -} - -// Stop stops the time wheel -func (tw *TimeWheel) Stop() { - tw.stopChannel <- true -} - -// AddJob add new job into pending queue -func (tw *TimeWheel) AddJob(key string, delay time.Duration, job func()) { - if delay < 0 { - return - } - tw.addTaskChannel <- task{delay: delay, key: key, job: job} -} - -// RemoveJob add remove job from pending queue -// if job is done or not found, then nothing happened -func (tw *TimeWheel) RemoveJob(key string) { - if key == "" { - return - } - tw.removeTaskChannel <- key -} - -func (tw *TimeWheel) start() { - for { - select { - case <-tw.ticker.C: - tw.tickHandler() - case task := <-tw.addTaskChannel: - tw.addTask(&task) - case key := <-tw.removeTaskChannel: - tw.removeTask(key) - case <-tw.stopChannel: - tw.ticker.Stop() - return - } - } -} - -func (tw *TimeWheel) tickHandler() { - l := tw.slots[tw.currentPos] - tw.scanAndRunTask(l) - if tw.currentPos == tw.slotNum-1 { - tw.currentPos = 0 - } else { - tw.currentPos++ - } -} - -func (tw *TimeWheel) scanAndRunTask(l *list.List) { - for e := l.Front(); e != nil; { - task := e.Value.(*task) - if task.circle > 0 { - task.circle-- - e = e.Next() - continue - } - - go func() { - defer func() { - if err := recover(); err != nil { - log.Fatalln(err) - } - }() - job := task.job - job() - }() - next := e.Next() - l.Remove(e) - if task.key != "" { - delete(tw.timer, task.key) - } - e = next - } -} - -func (tw *TimeWheel) addTask(task *task) { - pos, circle := tw.getPositionAndCircle(task.delay) - task.circle = circle - - tw.slots[pos].PushBack(task) - - if task.key != "" { - tw.timer[task.key] = pos - } -} - -func (tw *TimeWheel) getPositionAndCircle(d time.Duration) (pos int, circle int) { - delaySeconds := int(d.Milliseconds()) - intervalSeconds := int(tw.interval.Milliseconds()) - circle = delaySeconds / intervalSeconds / tw.slotNum - pos = tw.currentPos + delaySeconds/intervalSeconds%tw.slotNum - if pos < 0 { - pos = 0 - } - return -} - -func (tw *TimeWheel) removeTask(key string) { - position, ok := tw.timer[key] - if !ok { - return - } - l := tw.slots[position] - for e := l.Front(); e != nil; { - task := e.Value.(*task) - if task.key == key { - delete(tw.timer, task.key) - l.Remove(e) - } - - e = e.Next() - } -} diff --git a/pulsar/internal/timewheel_test.go b/pulsar/internal/timewheel_test.go deleted file mode 100644 index a680ba35a4..0000000000 --- a/pulsar/internal/timewheel_test.go +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package internal - -import ( - "fmt" - "testing" - "time" -) - -func checkTimeCost(t *testing.T, start, end time.Time, before int, after int) bool { - due := end.Sub(start) - if due > time.Duration(after)*time.Millisecond { - t.Error("delay run") - return false - } - - if due < time.Duration(before)*time.Millisecond { - t.Error("run ahead") - return false - } - - return true -} - -func TestCalcPos(t *testing.T) { - tw := NewTimeWheel(100*time.Millisecond, 5) - idx, round := tw.getPositionAndCircle(1 * time.Second) - if round != 2 { - t.Error("round err", round) - } - if idx != 0 { - t.Error("idx err", idx) - } -} - -func TestAddFunc(t *testing.T) { - tw := NewTimeWheel(100*time.Millisecond, 5) - tw.Start() - defer tw.Stop() - - for index := 1; index < 6; index++ { - queue := make(chan bool) - start := time.Now() - tw.AddJob(fmt.Sprintf("key_%d", index), time.Duration(index)*time.Second, func() { - queue <- true - }) - - <-queue - - before := index*1000 - 200 - after := index*1000 + 200 - checkTimeCost(t, start, time.Now(), before, after) - t.Log("time since: ", time.Since(start).String()) - } -} - -func TestRemove(t *testing.T) { - tw := NewTimeWheel(100*time.Millisecond, 5) - tw.Start() - defer tw.Stop() - - queue := make(chan bool) - tw.AddJob("key", time.Millisecond*500, func() { - queue <- true - }) - - // remove action after add action - time.AfterFunc(time.Millisecond*10, func() { - tw.RemoveJob("key") - }) - - exitTimer := time.NewTimer(1 * time.Second) - select { - case <-exitTimer.C: - case <-queue: - t.Error("must not run") - } -} From 94a7f6f48875c17a5c220a907fc71a1c562c33f3 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Wed, 14 Sep 2022 11:11:37 +0800 Subject: [PATCH 22/40] fix: fix the bug that may cause index out of range. --- pulsar/consumer_partition.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index 8df4d6971b..ecd61653b1 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -1601,6 +1601,9 @@ func (c *chunkedMsgCtx) refresh(chunkID int32, msgID messageID, partPayload inte func (c *chunkedMsgCtx) firstChunkID() messageID { c.mu.Lock() defer c.mu.Unlock() + if len(c.chunkedMsgIDs) == 0 { + return messageID{} + } return c.chunkedMsgIDs[0] } From 05754c6de8a733cbeebee82932094c18abf2979e Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Wed, 14 Sep 2022 16:20:13 +0800 Subject: [PATCH 23/40] fix: fix the lint problem --- pulsar/consumer_partition.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index ecd61653b1..c25a0441fa 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -1713,10 +1713,8 @@ func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { func (c *chunkedMsgCtxMap) removeChunkIfExpire(uuid string, autoAck bool, expire time.Duration) { timer := time.NewTimer(expire) - select { - case <-timer.C: - c.removeChunkMessage(uuid, autoAck) - } + <-timer.C + c.removeChunkMessage(uuid, autoAck) } func (c *chunkedMsgCtxMap) Close() { From 8f89741d62c6228bc402c0f6325fbed63a5dd965 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sat, 17 Sep 2022 15:56:18 +0800 Subject: [PATCH 24/40] fix: fix the publishSemaphore leak and modify some comments. --- pulsar/consumer.go | 3 +- pulsar/message_chunking_test.go | 52 ++++++++++++++++----------------- pulsar/producer.go | 6 ++-- pulsar/producer_partition.go | 13 +++++---- 4 files changed, 38 insertions(+), 36 deletions(-) diff --git a/pulsar/consumer.go b/pulsar/consumer.go index b762727f7f..99721df106 100644 --- a/pulsar/consumer.go +++ b/pulsar/consumer.go @@ -193,8 +193,7 @@ type ConsumerOptions struct { // MaxPendingChunkedMessage sets the maximum pending chunked messages. (default: 100) MaxPendingChunkedMessage int - // ExpireTimeOfIncompleteChunk sets the expiry time of discarding incomplete chunked message. - // The minimum value is 100ms. (default: 60 * 1000 millis) + // ExpireTimeOfIncompleteChunk sets the expiry time of discarding incomplete chunked message. (default: 60 seconds) ExpireTimeOfIncompleteChunk time.Duration // AutoAckIncompleteChunk sets whether consumer auto acknowledges incomplete chunked message when it should diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 9d28e3b2ec..9b09aaebee 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -63,7 +63,7 @@ func TestLargeMessage(t *testing.T) { topic := newTopicName() - // create producer without MaxChunkSize + // create producer without ChunkMaxMessageSize producer1, err := client.CreateProducer(ProducerOptions{ Topic: topic, DisableBatching: true, @@ -73,12 +73,12 @@ func TestLargeMessage(t *testing.T) { assert.NotNil(t, producer1) defer producer1.Close() - // create producer with MaxChunkSize + // create producer with ChunkMaxMessageSize producer2, err := client.CreateProducer(ProducerOptions{ - Topic: topic, - DisableBatching: true, - EnableChunking: true, - MaxChunkSize: 5, + Topic: topic, + DisableBatching: true, + EnableChunking: true, + ChunkMaxMessageSize: 5, }) assert.NoError(t, err) assert.NotNil(t, producer2) @@ -121,7 +121,7 @@ func TestLargeMessage(t *testing.T) { assert.NoError(t, err) } - // test send chunk with MaxChunkSize limit + // test send chunk with ChunkMaxMessageSize limit for i := 0; i < 5; i++ { msg := createTestMessagePayload(50) expectMsgs = append(expectMsgs, msg) @@ -132,7 +132,7 @@ func TestLargeMessage(t *testing.T) { assert.NotNil(t, ID) } - // test receive chunk with MaxChunkSize limit + // test receive chunk with ChunkMaxMessageSize limit for i := 5; i < 10; i++ { msg, err := consumer.Receive(context.Background()) assert.NoError(t, err) @@ -158,7 +158,7 @@ func TestPublishChunkWithFailure(t *testing.T) { topic := newTopicName() - // create producer without MaxChunkSize + // create producer without ChunkMaxMessageSize producer, err := client.CreateProducer(ProducerOptions{ Topic: topic, }) @@ -206,10 +206,10 @@ func TestMaxPendingChunkMessages(t *testing.T) { assert.Nil(t, err) clients = append(clients, pc) producer, err := pc.CreateProducer(ProducerOptions{ - Topic: topic, - DisableBatching: true, - EnableChunking: true, - MaxChunkSize: 10, + Topic: topic, + DisableBatching: true, + EnableChunking: true, + ChunkMaxMessageSize: 10, }) assert.NoError(t, err) assert.NotNil(t, producer) @@ -311,7 +311,7 @@ func TestChunksEnqueueFailed(t *testing.T) { EnableChunking: true, DisableBatching: true, MaxPendingMessages: 10, - MaxChunkSize: 50, + ChunkMaxMessageSize: 50, DisableBlockIfQueueFull: true, }) assert.NoError(t, err) @@ -339,10 +339,10 @@ func TestSeekChunkMessages(t *testing.T) { totalMessages := 5 producer, err := client.CreateProducer(ProducerOptions{ - Topic: topic, - EnableChunking: true, - DisableBatching: true, - MaxChunkSize: 50, + Topic: topic, + EnableChunking: true, + DisableBatching: true, + ChunkMaxMessageSize: 50, }) assert.NoError(t, err) assert.NotNil(t, producer) @@ -403,10 +403,10 @@ func TestChunkAckAndNAck(t *testing.T) { topic := newTopicName() producer, err := client.CreateProducer(ProducerOptions{ - Topic: topic, - EnableChunking: true, - DisableBatching: true, - MaxChunkSize: 50, + Topic: topic, + EnableChunking: true, + DisableBatching: true, + ChunkMaxMessageSize: 50, }) assert.NoError(t, err) assert.NotNil(t, producer) @@ -515,10 +515,10 @@ func TestChunkMultiTopicConsumerReceive(t *testing.T) { // produce messages for i, topic := range topics { p, err := client.CreateProducer(ProducerOptions{ - Topic: topic, - DisableBatching: true, - EnableChunking: true, - MaxChunkSize: uint(maxSize), + Topic: topic, + DisableBatching: true, + EnableChunking: true, + ChunkMaxMessageSize: uint(maxSize), }) if err != nil { t.Fatal(err) diff --git a/pulsar/producer.go b/pulsar/producer.go index dfcd30499a..463e1f3f86 100644 --- a/pulsar/producer.go +++ b/pulsar/producer.go @@ -178,9 +178,9 @@ type ProducerOptions struct { // Chunking can not be enabled when batching is enabled. EnableChunking bool - // MaxChunkSize is the max size of single chunk payload. - // It will actually only take effect if it is smaller than broker.MaxMessageSize - MaxChunkSize uint + // ChunkMaxMessageSize is the max size of single chunk payload. + // It will actually only take effect if it is smaller than the maxMessageSize from the broker. + ChunkMaxMessageSize uint } // Producer is used to publish messages on a topic diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 3c5008debb..c0bcf50ff1 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -483,6 +483,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { if p.options.DisableMultiSchema { if msg.Schema != nil && p.options.Schema != nil && msg.Schema.GetSchemaInfo().hash() != p.options.Schema.GetSchemaInfo().hash() { + p.publishSemaphore.Release() p.log.WithError(err).Errorf("The producer %s of the topic %s is disabled the `MultiSchema`", p.producerName, p.topic) return } @@ -516,6 +517,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { if schemaVersion == nil { schemaVersion, err = p.getOrCreateSchema(schema.GetSchemaInfo()) if err != nil { + p.publishSemaphore.Release() p.log.WithError(err).Error("get schema version fail") return } @@ -584,6 +586,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { } else { payloadChunkSize = int(p._getConn().GetMaxMessageSize()) - mm.Size() if payloadChunkSize <= 0 { + p.publishSemaphore.Release() request.callback(nil, msg, errMetaTooLarge) p.log.WithError(errMetaTooLarge). WithField("metadata size", mm.Size()). @@ -592,18 +595,18 @@ func (p *partitionProducer) internalSend(request *sendRequest) { p.metrics.PublishErrorsMsgTooLarge.Inc() return } - // set MaxChunkSize - if p.options.MaxChunkSize != 0 { - payloadChunkSize = int(math.Min(float64(payloadChunkSize), float64(p.options.MaxChunkSize))) + // set ChunkMaxMessageSize + if p.options.ChunkMaxMessageSize != 0 { + payloadChunkSize = int(math.Min(float64(payloadChunkSize), float64(p.options.ChunkMaxMessageSize))) } totalChunks = int(math.Max(1, math.Ceil(float64(compressedSize)/float64(payloadChunkSize)))) } // correct limit queue when chunked - for i := 0; i < totalChunks-1; i++ { + for i := 1; i < totalChunks; i++ { if !p.canAddToQueue(request) { // release all permits including the first - for j := i; j >= 0; j-- { + for j := i; j > 0; j-- { p.publishSemaphore.Release() } return From 95ad70b3d88469796ff66b615a5bee8996dc9a94 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sun, 25 Sep 2022 10:33:40 +0800 Subject: [PATCH 25/40] fix: fix the chunking messages may block when the BlockIfQueueFull enabled --- pulsar/message_chunking_test.go | 30 +++++++++++++++++++++++++ pulsar/producer_partition.go | 40 ++++++++++++++------------------- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 9b09aaebee..4a723a1bd6 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -554,6 +554,36 @@ func TestChunkMultiTopicConsumerReceive(t *testing.T) { assert.Equal(t, receivedTopic1, receivedTopic2) } +func TestChunkBlockIfQueueFull(t *testing.T) { + client, err := NewClient(ClientOptions{ + URL: lookupURL, + }) + if err != nil { + t.Fatal(err) + } + + topic := newTopicName() + + producer, err := client.CreateProducer(ProducerOptions{ + Name: "test", + Topic: topic, + EnableChunking: true, + DisableBatching: true, + MaxPendingMessages: 1, + ChunkMaxMessageSize: 10, + }) + assert.NoError(t, err) + assert.NotNil(t, producer) + defer producer.Close() + + // Large messages will be split into 11 chunks, exceeding the length of pending queue + ID, err := producer.Send(context.Background(), &ProducerMessage{ + Payload: createTestMessagePayload(100), + }) + assert.NoError(t, err) + assert.NotNil(t, ID) +} + func createTestMessagePayload(size int) []byte { payload := make([]byte, size) for i := range payload { diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index c0bcf50ff1..fa989293d3 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -476,10 +476,6 @@ func (p *partitionProducer) internalSend(request *sendRequest) { return } - if !p.canAddToQueue(request) { - return - } - if p.options.DisableMultiSchema { if msg.Schema != nil && p.options.Schema != nil && msg.Schema.GetSchemaInfo().hash() != p.options.Schema.GetSchemaInfo().hash() { @@ -549,6 +545,10 @@ func (p *partitionProducer) internalSend(request *sendRequest) { p.updateMetadataSeqID(mm, msg) } + if sendAsBatch && !p.canAddToQueue(request) { + return + } + maxMessageSize := int(p._getConn().GetMaxMessageSize()) // compress payload if not batching @@ -602,17 +602,6 @@ func (p *partitionProducer) internalSend(request *sendRequest) { totalChunks = int(math.Max(1, math.Ceil(float64(compressedSize)/float64(payloadChunkSize)))) } - // correct limit queue when chunked - for i := 1; i < totalChunks; i++ { - if !p.canAddToQueue(request) { - // release all permits including the first - for j := i; j > 0; j-- { - p.publishSemaphore.Release() - } - return - } - } - // set total chunks to send request request.totalChunks = totalChunks @@ -637,14 +626,23 @@ func (p *partitionProducer) internalSend(request *sendRequest) { callback: request.callback, callbackOnce: request.callbackOnce, publishTime: request.publishTime, + blockCh: request.blockCh, totalChunks: totalChunks, chunkID: chunkID, uuid: uuid, chunkRecorder: cr, } + // acquire the permits + if !p.canAddToQueue(nsr) { + return + } p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr, uint32(maxMessageSize)) } } else { + // acquire the permits + if !p.canAddToQueue(request) { + return + } p.internalSingleSend(mm, compressedPayload, request, uint32(maxMessageSize)) } } else { @@ -769,6 +767,7 @@ func (p *partitionProducer) internalSingleSend(mm *pb.MessageMetadata, maxMessageSize, ); err != nil { request.callback(nil, request.msg, err) + p.publishSemaphore.Release() p.log.WithError(err).Errorf("Single message serialize failed %s", msg.Value) return } @@ -1238,7 +1237,6 @@ type sendRequest struct { publishTime time.Time flushImmediately bool blockCh chan struct{} - blockOnce sync.Once totalChunks int chunkID int uuid string @@ -1287,14 +1285,10 @@ func (p *partitionProducer) canAddToQueue(sr *sendRequest) bool { } } else if !p.publishSemaphore.Acquire(sr.ctx) { sr.callback(nil, sr.msg, errContextExpired) - sr.blockOnce.Do(func() { - sr.blockCh <- struct{}{} - }) + sr.blockCh <- struct{}{} return false - } else { - sr.blockOnce.Do(func() { - sr.blockCh <- struct{}{} - }) + } else if sr.totalChunks == 1 || (sr.totalChunks > 1 && sr.chunkID == sr.totalChunks-1) { + sr.blockCh <- struct{}{} } p.metrics.MessagesPending.Inc() p.metrics.BytesPending.Add(float64(len(sr.msg.Payload))) From 4719d0b307999db179116a27cf08d7097fce2151 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Mon, 26 Sep 2022 23:07:28 +0800 Subject: [PATCH 26/40] fix: fix bug of totalchunks --- pulsar/producer_partition.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index fa989293d3..6eecc5ed34 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -1287,7 +1287,7 @@ func (p *partitionProducer) canAddToQueue(sr *sendRequest) bool { sr.callback(nil, sr.msg, errContextExpired) sr.blockCh <- struct{}{} return false - } else if sr.totalChunks == 1 || (sr.totalChunks > 1 && sr.chunkID == sr.totalChunks-1) { + } else if sr.totalChunks == 0 || sr.totalChunks == 1 || (sr.totalChunks > 1 && sr.chunkID == sr.totalChunks-1) { sr.blockCh <- struct{}{} } p.metrics.MessagesPending.Inc() From 72fe920d157e25f0b0c34aaff800c907b28a692a Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Tue, 27 Sep 2022 15:50:34 +0800 Subject: [PATCH 27/40] fix: fix the bug of single sending stuck by permits acquire --- pulsar/message_chunking_test.go | 2 +- pulsar/producer_partition.go | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 4a723a1bd6..0835bd625d 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -457,7 +457,7 @@ func TestChunkSize(t *testing.T) { defer client.Close() // the default message metadata size for string schema - // The proto messageMetaData size as following. (all with tag) + // The proto messageMetaData size as following. (all with tag) (maxMessageSize = 1024 * 1024) // | producerName | sequenceID | publishTime | uncompressedSize | // | ------------ | ---------- | ----------- | ---------------- | // | 6 | 2 | 7 | 4 | diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 6eecc5ed34..0b50c6c9d5 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -476,6 +476,10 @@ func (p *partitionProducer) internalSend(request *sendRequest) { return } + if !p.canAddToQueue(request) { + return + } + if p.options.DisableMultiSchema { if msg.Schema != nil && p.options.Schema != nil && msg.Schema.GetSchemaInfo().hash() != p.options.Schema.GetSchemaInfo().hash() { @@ -545,10 +549,6 @@ func (p *partitionProducer) internalSend(request *sendRequest) { p.updateMetadataSeqID(mm, msg) } - if sendAsBatch && !p.canAddToQueue(request) { - return - } - maxMessageSize := int(p._getConn().GetMaxMessageSize()) // compress payload if not batching @@ -632,17 +632,13 @@ func (p *partitionProducer) internalSend(request *sendRequest) { uuid: uuid, chunkRecorder: cr, } - // acquire the permits - if !p.canAddToQueue(nsr) { + // the permit of first chunk has acquired + if chunkID != 0 && !p.canAddToQueue(nsr) { return } p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr, uint32(maxMessageSize)) } } else { - // acquire the permits - if !p.canAddToQueue(request) { - return - } p.internalSingleSend(mm, compressedPayload, request, uint32(maxMessageSize)) } } else { From a962eee7d2f6ac19bf025fc5e41a8f3981e9e729 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Tue, 27 Sep 2022 17:22:48 +0800 Subject: [PATCH 28/40] fix: fix the bug of single sending stuck by permits acquire --- pulsar/producer_partition.go | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 0b50c6c9d5..b319d96c60 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -476,6 +476,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { return } + defer request.stopBlock() if !p.canAddToQueue(request) { return } @@ -543,7 +544,10 @@ func (p *partitionProducer) internalSend(request *sendRequest) { msg.ReplicationClusters == nil && deliverAt.UnixNano() < 0 - if !sendAsBatch { + // Once the batching can add to queue, it can finish internalSendAsync immediately + if sendAsBatch { + request.stopBlock() + } else { // update sequence id for metadata, make the size of msgMetadata more accurate // batch sending will update sequence ID in the BatchBuilder p.updateMetadataSeqID(mm, msg) @@ -627,6 +631,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { callbackOnce: request.callbackOnce, publishTime: request.publishTime, blockCh: request.blockCh, + blockOnce: request.blockOnce, totalChunks: totalChunks, chunkID: chunkID, uuid: uuid, @@ -638,7 +643,9 @@ func (p *partitionProducer) internalSend(request *sendRequest) { } p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr, uint32(maxMessageSize)) } + request.stopBlock() } else { + request.stopBlock() p.internalSingleSend(mm, compressedPayload, request, uint32(maxMessageSize)) } } else { @@ -1050,6 +1057,7 @@ func (p *partitionProducer) internalSendAsync(ctx context.Context, msg *Producer flushImmediately: flushImmediately, publishTime: time.Now(), blockCh: bc, + blockOnce: &sync.Once{}, } p.options.Interceptors.BeforeSend(p, msg) @@ -1233,12 +1241,20 @@ type sendRequest struct { publishTime time.Time flushImmediately bool blockCh chan struct{} + blockOnce *sync.Once totalChunks int chunkID int uuid string chunkRecorder *chunkRecorder } +// stopBlock can be invoked multiple times safety +func (sr *sendRequest) stopBlock() { + sr.blockOnce.Do(func() { + sr.blockCh <- struct{}{} + }) +} + type closeProducer struct { doneCh chan struct{} } @@ -1281,10 +1297,7 @@ func (p *partitionProducer) canAddToQueue(sr *sendRequest) bool { } } else if !p.publishSemaphore.Acquire(sr.ctx) { sr.callback(nil, sr.msg, errContextExpired) - sr.blockCh <- struct{}{} return false - } else if sr.totalChunks == 0 || sr.totalChunks == 1 || (sr.totalChunks > 1 && sr.chunkID == sr.totalChunks-1) { - sr.blockCh <- struct{}{} } p.metrics.MessagesPending.Inc() p.metrics.BytesPending.Add(float64(len(sr.msg.Payload))) From fef09d9e87263f924e71ac0f8b1b39ae4f14b712 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Tue, 27 Sep 2022 21:23:05 +0800 Subject: [PATCH 29/40] refactor: do some refactor --- pulsar/producer_partition.go | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index bab48df3d6..0e9ba8db00 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -480,6 +480,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { return } + // The block chan must be closed when returned with exception defer request.stopBlock() if !p.canAddToQueue(request) { return @@ -548,7 +549,7 @@ func (p *partitionProducer) internalSend(request *sendRequest) { msg.ReplicationClusters == nil && deliverAt.UnixNano() < 0 - // Once the batching can add to queue, it can finish internalSendAsync immediately + // Once the batching is enabled, it can close blockCh early to make block finish if sendAsBatch { request.stopBlock() } else { @@ -629,17 +630,17 @@ func (p *partitionProducer) internalSend(request *sendRequest) { // update chunk id mm.ChunkId = proto.Int(chunkID) nsr := &sendRequest{ - ctx: request.ctx, - msg: request.msg, - callback: request.callback, - callbackOnce: request.callbackOnce, - publishTime: request.publishTime, - blockCh: request.blockCh, - blockOnce: request.blockOnce, - totalChunks: totalChunks, - chunkID: chunkID, - uuid: uuid, - chunkRecorder: cr, + ctx: request.ctx, + msg: request.msg, + callback: request.callback, + callbackOnce: request.callbackOnce, + publishTime: request.publishTime, + blockCh: request.blockCh, + closeBlockChOnce: request.closeBlockChOnce, + totalChunks: totalChunks, + chunkID: chunkID, + uuid: uuid, + chunkRecorder: cr, } // the permit of first chunk has acquired if chunkID != 0 && !p.canAddToQueue(nsr) { @@ -647,8 +648,10 @@ func (p *partitionProducer) internalSend(request *sendRequest) { } p.internalSingleSend(mm, compressedPayload[lhs:rhs], nsr, uint32(maxMessageSize)) } + // close the blockCh when all the chunks acquired permits request.stopBlock() } else { + // close the blockCh when totalChunks is 1 (it has acquired permits) request.stopBlock() p.internalSingleSend(mm, compressedPayload, request, uint32(maxMessageSize)) } @@ -1061,7 +1064,7 @@ func (p *partitionProducer) internalSendAsync(ctx context.Context, msg *Producer flushImmediately: flushImmediately, publishTime: time.Now(), blockCh: bc, - blockOnce: &sync.Once{}, + closeBlockChOnce: &sync.Once{}, } p.options.Interceptors.BeforeSend(p, msg) @@ -1245,7 +1248,7 @@ type sendRequest struct { publishTime time.Time flushImmediately bool blockCh chan struct{} - blockOnce *sync.Once + closeBlockChOnce *sync.Once totalChunks int chunkID int uuid string @@ -1254,8 +1257,8 @@ type sendRequest struct { // stopBlock can be invoked multiple times safety func (sr *sendRequest) stopBlock() { - sr.blockOnce.Do(func() { - sr.blockCh <- struct{}{} + sr.closeBlockChOnce.Do(func() { + close(sr.blockCh) }) } From 979456050d5cb70a8e816fba04aa7508e281732f Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Thu, 29 Sep 2022 15:52:29 +0800 Subject: [PATCH 30/40] fix: fix for resolve conflict --- pulsar/consumer_impl.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pulsar/consumer_impl.go b/pulsar/consumer_impl.go index 9d860279fe..116ffa4471 100644 --- a/pulsar/consumer_impl.go +++ b/pulsar/consumer_impl.go @@ -19,7 +19,6 @@ package pulsar import ( "context" - "errors" "fmt" "math/rand" "strconv" From 3aeb764f1ce7e96bba0ad6224fd82d1790b7331f Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Fri, 30 Sep 2022 15:14:43 +0800 Subject: [PATCH 31/40] fix: remove the ExpireTimeOfIncompleteChunk limit and do some refactor --- pulsar/consumer_impl.go | 3 +-- pulsar/consumer_partition.go | 35 ++++++++++++++++++++++++--------- pulsar/message_chunking_test.go | 31 ++++------------------------- 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/pulsar/consumer_impl.go b/pulsar/consumer_impl.go index 116ffa4471..a402a4d9f9 100644 --- a/pulsar/consumer_impl.go +++ b/pulsar/consumer_impl.go @@ -96,8 +96,7 @@ func newConsumer(client *client, options ConsumerOptions) (Consumer, error) { options.MaxPendingChunkedMessage = 100 } - // the minimum timer interval is 100ms - if options.ExpireTimeOfIncompleteChunk < 100*time.Millisecond { + if options.ExpireTimeOfIncompleteChunk == 0 { options.ExpireTimeOfIncompleteChunk = time.Minute } diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index a4ac96f859..b9d2bb3dd4 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -925,7 +925,7 @@ func (pc *partitionConsumer) processMessageChunk(compressedPayload internal.Buff return nil } - ctx.refresh(chunkID, msgID, compressedPayload) + ctx.append(chunkID, msgID, compressedPayload) if msgMeta.GetChunkId() != msgMeta.GetNumChunksFromMsg()-1 { atomic.AddInt32(&pc.availablePermits, 1) @@ -1638,7 +1638,7 @@ func newChunkedMsgCtx(numChunksFromMsg int32, totalChunkMsgSize int) *chunkedMsg } } -func (c *chunkedMsgCtx) refresh(chunkID int32, msgID messageID, partPayload internal.Buffer) { +func (c *chunkedMsgCtx) append(chunkID int32, msgID messageID, partPayload internal.Buffer) { c.mu.Lock() defer c.mu.Unlock() c.chunkedMsgIDs[chunkID] = msgID @@ -1703,10 +1703,10 @@ func (c *chunkedMsgCtxMap) addIfAbsent(uuid string, totalChunks int32, totalChun if _, ok := c.chunkedMsgCtxs[uuid]; !ok { c.chunkedMsgCtxs[uuid] = newChunkedMsgCtx(totalChunks, totalChunkMsgSize) c.pendingQueue.PushBack(uuid) - go c.removeChunkIfExpire(uuid, true, c.pc.options.expireTimeOfIncompleteChunk) + go c.discardChunkIfExpire(uuid, true, c.pc.options.expireTimeOfIncompleteChunk) } if c.maxPending > 0 && c.pendingQueue.Len() > c.maxPending { - go c.removeChunkMessage(uuid, c.pc.options.autoAckIncompleteChunk) + go c.discardOldestChunkMessage(c.pc.options.autoAckIncompleteChunk) } } @@ -1735,7 +1735,25 @@ func (c *chunkedMsgCtxMap) remove(uuid string) { } } -func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { +func (c *chunkedMsgCtxMap) discardOldestChunkMessage(autoAck bool) { + c.mu.Lock() + defer c.mu.Unlock() + if c.closed { + return + } + oldest := c.pendingQueue.Front().Value.(string) + ctx, ok := c.chunkedMsgCtxs[oldest] + if !ok { + return + } + if autoAck { + ctx.discard(c.pc) + } + delete(c.chunkedMsgCtxs, oldest) + c.pc.log.Infof("Chunked message [%s] has been removed from chunkedMsgCtxMap", oldest) +} + +func (c *chunkedMsgCtxMap) discardChunkMessage(uuid string, autoAck bool) { c.mu.Lock() defer c.mu.Unlock() if c.closed { @@ -1759,10 +1777,10 @@ func (c *chunkedMsgCtxMap) removeChunkMessage(uuid string, autoAck bool) { c.pc.log.Infof("Chunked message [%s] has been removed from chunkedMsgCtxMap", uuid) } -func (c *chunkedMsgCtxMap) removeChunkIfExpire(uuid string, autoAck bool, expire time.Duration) { +func (c *chunkedMsgCtxMap) discardChunkIfExpire(uuid string, autoAck bool, expire time.Duration) { timer := time.NewTimer(expire) <-timer.C - c.removeChunkMessage(uuid, autoAck) + c.discardChunkMessage(uuid, autoAck) } func (c *chunkedMsgCtxMap) Close() { @@ -1819,8 +1837,7 @@ func (u *unAckChunksTracker) ack(cmid chunkMessageID) error { func (u *unAckChunksTracker) nack(cmid chunkMessageID) { ids := u.get(cmid) for _, id := range ids { - u.pc.nackTracker.Add(id) - u.pc.metrics.NacksCounter.Inc() + u.pc.NackID(id) } u.remove(cmid) } diff --git a/pulsar/message_chunking_test.go b/pulsar/message_chunking_test.go index 0835bd625d..b3d64afaec 100644 --- a/pulsar/message_chunking_test.go +++ b/pulsar/message_chunking_test.go @@ -146,33 +146,6 @@ func TestLargeMessage(t *testing.T) { } } -func TestPublishChunkWithFailure(t *testing.T) { - rand.Seed(time.Now().Unix()) - - client, err := NewClient(ClientOptions{ - URL: lookupURL, - }) - - assert.Nil(t, err) - defer client.Close() - - topic := newTopicName() - - // create producer without ChunkMaxMessageSize - producer, err := client.CreateProducer(ProducerOptions{ - Topic: topic, - }) - assert.NoError(t, err) - assert.NotNil(t, producer) - defer producer.Close() - - ID, err := producer.Send(context.Background(), &ProducerMessage{ - Payload: createTestMessagePayload(_brokerMaxMessageSize + 1), - }) - assert.Error(t, err) - assert.Nil(t, ID) -} - func TestMaxPendingChunkMessages(t *testing.T) { rand.Seed(time.Now().Unix()) @@ -536,6 +509,7 @@ func TestChunkMultiTopicConsumerReceive(t *testing.T) { receivedTopic2 := 0 // nolint for receivedTopic1+receivedTopic2 < 20 { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) select { case cm, ok := <-consumer.Chan(): if ok { @@ -549,7 +523,10 @@ func TestChunkMultiTopicConsumerReceive(t *testing.T) { } else { t.Fail() } + case <-ctx.Done(): + t.Error(ctx.Err()) } + cancel() } assert.Equal(t, receivedTopic1, receivedTopic2) } From 4b53e336413815e243d0234d69b9b2894144b858 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Mon, 3 Oct 2022 12:00:53 +0800 Subject: [PATCH 32/40] fix: fix the nil pointer error of discardOldestChunkMessage and fix the chunkMessageID.Serialize --- pulsar/consumer_partition.go | 2 +- pulsar/impl_message.go | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index b9d2bb3dd4..812da44738 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -1738,7 +1738,7 @@ func (c *chunkedMsgCtxMap) remove(uuid string) { func (c *chunkedMsgCtxMap) discardOldestChunkMessage(autoAck bool) { c.mu.Lock() defer c.mu.Unlock() - if c.closed { + if c.closed || (c.maxPending > 0 && c.pendingQueue.Len() <= c.maxPending) { return } oldest := c.pendingQueue.Front().Value.(string) diff --git a/pulsar/impl_message.go b/pulsar/impl_message.go index 4e4e8f6a74..11ecb40f21 100644 --- a/pulsar/impl_message.go +++ b/pulsar/impl_message.go @@ -409,5 +409,18 @@ func (id chunkMessageID) String() string { } func (id chunkMessageID) Serialize() []byte { - return id.firstChunkID.Serialize() + msgID := &pb.MessageIdData{ + LedgerId: proto.Uint64(uint64(id.ledgerID)), + EntryId: proto.Uint64(uint64(id.entryID)), + BatchIndex: proto.Int32(id.batchIdx), + Partition: proto.Int32(id.partitionIdx), + FirstChunkMessageId: &pb.MessageIdData{ + LedgerId: proto.Uint64(uint64(id.firstChunkID.ledgerID)), + EntryId: proto.Uint64(uint64(id.firstChunkID.entryID)), + BatchIndex: proto.Int32(id.firstChunkID.batchIdx), + Partition: proto.Int32(id.firstChunkID.partitionIdx), + }, + } + data, _ := proto.Marshal(msgID) + return data } From e80fe90de7f4e0d73c0675fbdf6399d7657e4a4f Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sun, 16 Oct 2022 11:49:57 +0800 Subject: [PATCH 33/40] test: restart workflow --- pulsar/producer_partition.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index f4b5d0763d..d5d1e0178c 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -134,6 +134,7 @@ func newPartitionProducer(client *client, topic string, options *ProducerOptions } var maxPendingMessages int + if options.MaxPendingMessages == 0 { maxPendingMessages = 1000 } else { From 84c02fe58d6b9ca9d4df5bc424f144d6de4accd2 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sun, 16 Oct 2022 16:02:59 +0800 Subject: [PATCH 34/40] test: restart workflow --- pulsar/producer_partition.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index d5d1e0178c..f4b5d0763d 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -134,7 +134,6 @@ func newPartitionProducer(client *client, topic string, options *ProducerOptions } var maxPendingMessages int - if options.MaxPendingMessages == 0 { maxPendingMessages = 1000 } else { From 23570cf6d732ad879c64f7c6891274408f287500 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sun, 16 Oct 2022 17:10:13 +0800 Subject: [PATCH 35/40] test: restart workflow --- pulsar/producer_partition.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index f4b5d0763d..d5d1e0178c 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -134,6 +134,7 @@ func newPartitionProducer(client *client, topic string, options *ProducerOptions } var maxPendingMessages int + if options.MaxPendingMessages == 0 { maxPendingMessages = 1000 } else { From 56ef453b3f87ea475cb3586e23db608b713c89a6 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sun, 16 Oct 2022 17:45:25 +0800 Subject: [PATCH 36/40] test: restart workflow --- pulsar/producer_partition.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index d5d1e0178c..f4b5d0763d 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -134,7 +134,6 @@ func newPartitionProducer(client *client, topic string, options *ProducerOptions } var maxPendingMessages int - if options.MaxPendingMessages == 0 { maxPendingMessages = 1000 } else { From a42de107d47cca5f48ba561fdcfa5dd2cf1b1c8e Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Sat, 22 Oct 2022 22:16:30 +0800 Subject: [PATCH 37/40] fix: add logs when chunkMsgCtxMap has closed --- pulsar/consumer_partition.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index 31c4c0f53c..578eee2215 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -815,6 +815,8 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header ctx := pc.chunkedMsgCtxMap.get(msgMeta.GetUuid()) if ctx == nil { // chunkedMsgCtxMap has closed because of consumer closed + pc.log.Warnf("get chunkedMsgCtx for chunk with uuid %s failed because consumer has closed", + msgMeta.Uuid) return nil } cmid := newChunkMessageID(ctx.firstChunkID(), ctx.lastChunkID()) From b3b304f681fdac195531922d40e8406eca40743a Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Mon, 24 Oct 2022 15:45:17 +0800 Subject: [PATCH 38/40] fix: modify by review comment --- pulsar/consumer_partition.go | 2 +- pulsar/producer_partition.go | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index 578eee2215..880f2230ba 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -751,7 +751,7 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header isChunkedMsg = true } - processedPayloadBuffer := internal.NewBufferWrapper(decryptedPayload) + var processedPayloadBuffer internal.Buffer if isChunkedMsg { processedPayloadBuffer = pc.processMessageChunk(processedPayloadBuffer, msgMeta, pbMsgID) if processedPayloadBuffer == nil { diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index f4b5d0763d..8740fdd508 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -279,12 +279,10 @@ func (p *partitionProducer) grabCnx() error { return err } - if p.options.DisableBatching { - // removed BatchBuilder creation - } else if p.batchBuilder == nil { + if !p.options.DisableBatching { provider, err := GetBatcherBuilderProvider(p.options.BatcherBuilderType) if err != nil { - provider, _ = GetBatcherBuilderProvider(DefaultBatchBuilder) + return err } maxMessageSize := uint32(p._getConn().GetMaxMessageSize()) p.batchBuilder, err = provider(p.options.BatchingMaxMessages, p.options.BatchingMaxSize, From 6a801cb146524fea63dcf496fd1a9740abf1102f Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Mon, 24 Oct 2022 16:26:07 +0800 Subject: [PATCH 39/40] fix: inline compare and fix a data race error --- pulsar/internal/batch_builder.go | 6 ++---- pulsar/producer_partition.go | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pulsar/internal/batch_builder.go b/pulsar/internal/batch_builder.go index ef4552b045..ca19a3ed71 100644 --- a/pulsar/internal/batch_builder.go +++ b/pulsar/internal/batch_builder.go @@ -168,10 +168,8 @@ func (bc *batchContainer) hasSpace(payload []byte) bool { } msgSize := uint32(len(payload)) expectedSize := bc.buffer.ReadableBytes() + msgSize - if !(expectedSize <= uint32(bc.maxBatchSize) && expectedSize <= bc.maxMessageSize) { - return false - } - return bc.numMessages+1 <= bc.maxMessages + return bc.numMessages+1 <= bc.maxMessages && + expectedSize <= uint32(bc.maxBatchSize) && expectedSize <= bc.maxMessageSize } func (bc *batchContainer) hasSameSchema(schemaVersion []byte) bool { diff --git a/pulsar/producer_partition.go b/pulsar/producer_partition.go index 8740fdd508..881451c654 100644 --- a/pulsar/producer_partition.go +++ b/pulsar/producer_partition.go @@ -279,7 +279,7 @@ func (p *partitionProducer) grabCnx() error { return err } - if !p.options.DisableBatching { + if !p.options.DisableBatching && p.batchBuilder == nil { provider, err := GetBatcherBuilderProvider(p.options.BatcherBuilderType) if err != nil { return err From 00e9e8350fcaac1efbfa546e17da0a6ccac4c641 Mon Sep 17 00:00:00 2001 From: Jiaqi Shen <18863662628@163.com> Date: Mon, 24 Oct 2022 16:54:02 +0800 Subject: [PATCH 40/40] fix: fix nil pointer error --- pulsar/consumer_partition.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pulsar/consumer_partition.go b/pulsar/consumer_partition.go index 880f2230ba..ebaa48b989 100644 --- a/pulsar/consumer_partition.go +++ b/pulsar/consumer_partition.go @@ -751,14 +751,12 @@ func (pc *partitionConsumer) MessageReceived(response *pb.CommandMessage, header isChunkedMsg = true } - var processedPayloadBuffer internal.Buffer + processedPayloadBuffer := internal.NewBufferWrapper(decryptedPayload) if isChunkedMsg { processedPayloadBuffer = pc.processMessageChunk(processedPayloadBuffer, msgMeta, pbMsgID) if processedPayloadBuffer == nil { return nil } - } else { - processedPayloadBuffer = internal.NewBufferWrapper(decryptedPayload) } // decryption is success, decompress the payload