sendProducerData 把实际要发送的消息封装好,放入 Kafka 的 NetworkClient 中。
private long sendProducerData(long now) { // 1. 计算需要以及可以向哪些节点发送请求 Cluster cluster = metadata.fetch(); // get the list of partitions with data ready to send // 计算需要向哪些节点发送请求 RecordAccumulator.ReadyCheckResult result = this.accumulator.ready(cluster, now); // if there are any partitions whose leaders are not known yet, force metadata update // 2. 如果存在未知的 leader 副本对应的节点(对应的 topic 分区正在执行 leader 选举,或者对应的 topic 已经失效), // 标记需要更新缓存的集群元数据信息 if (!result.unknownLeaderTopics.isEmpty()) { // The set of topics with unknown leader contains topics with leader election pending as well as // topics which may have expired. Add the topic again to metadata to ensure it is included // and request metadata update, since there are messages to send to the topic. for (String topic : result.unknownLeaderTopics) this.metadata.add(topic); log.debug("Requesting metadata update due to unknown leader topics from the batched records: {}", result.unknownLeaderTopics); this.metadata.requestUpdate(); } // remove any nodes we aren't ready to send to // 3. 遍历处理待发送请求的目标节点,基于网络 IO 检查对应节点是否可用,对于不可用的节点则剔除 Iterator<Node> iter = result.readyNodes.iterator(); long notReadyTimeout = Long.MAX_VALUE; while (iter.hasNext()) { Node node = iter.next(); // 检查目标节点是否准备好接收请求,如果未准备好但目标节点允许创建连接,则创建到目标节点的连接 if (!this.client.ready(node, now)) { // 对于未准备好的节点,则从 ready 集合中删除 iter.remove(); notReadyTimeout = Math.min(notReadyTimeout, this.client.pollDelayMs(node, now)); } } // create produce requests // 4. 获取每个节点待发送消息集合,其中 key 是目标 leader 副本所在节点 ID Map<Integer, List<ProducerBatch>> batches = this.accumulator.drain(cluster, result.readyNodes, this.maxRequestSize, now); addToInflightBatches(batches); if (guaranteeMessageOrder) { // 5. 
如果需要保证消息的强顺序性,则缓存对应 topic 分区对象,防止同一时间往同一个 topic 分区发送多条处于未完成状态的消息 // Mute all the partitions drained // 将所有 RecordBatch 的 topic 分区对象加入到 muted 集合中 // 防止同一时间往同一个 topic 分区发送多条处于未完成状态的消息 for (List<ProducerBatch> batchList : batches.values()) { for (ProducerBatch batch : batchList) this.accumulator.mutePartition(batch.topicPartition); } } // 6. 处理本地过期的消息,返回 TimeoutException,并释放空间 accumulator.resetNextBatchExpiryTime(); List<ProducerBatch> expiredInflightBatches = getExpiredInflightBatches(now); List<ProducerBatch> expiredBatches = this.accumulator.expiredBatches(now); expiredBatches.addAll(expiredInflightBatches); // Reset the producer id if an expired batch has previously been sent to the broker. Also update the metrics // for expired batches. see the documentation of @TransactionState.resetProducerId to understand why // we need to reset the producer id here. if (!expiredBatches.isEmpty()) log.trace("Expired {} batches in accumulator", expiredBatches.size()); for (ProducerBatch expiredBatch : expiredBatches) { String errorMessage = "Expiring " + expiredBatch.recordCount + " record(s) for " + expiredBatch.topicPartition + ":" + (now - expiredBatch.createdMs) + " ms has passed since batch creation"; failBatch(expiredBatch, -1, NO_TIMESTAMP, new TimeoutException(errorMessage), false); if (transactionManager != null && expiredBatch.inRetry()) { // This ensures that no new batches are drained until the current in flight batches are fully resolved. transactionManager.markSequenceUnresolved(expiredBatch.topicPartition); } } sensors.updateProduceRequestMetrics(batches); // If we have any nodes that are ready to send + have sendable data, poll with 0 timeout so this can immediately // loop and try sending more data. Otherwise, the timeout will be the smaller value between next batch expiry // time, and the delay time for checking data availability. Note that the nodes may have data that isn't yet // sendable due to lingering, backing off, etc. 
This specifically does not include nodes with sendable data // that aren't ready to send since they would cause busy looping. // 如果存在待发送的消息,则设置 pollTimeout 等于 0,这样可以立即发送请求,从而能够缩短剩余消息的缓存时间,避免堆积 long pollTimeout = Math.min(result.nextReadyCheckDelayMs, notReadyTimeout); pollTimeout = Math.min(pollTimeout, this.accumulator.nextExpiryTimeMs() - now); pollTimeout = Math.max(pollTimeout, 0); if (!result.readyNodes.isEmpty()) { log.trace("Nodes with data ready to send: {}", result.readyNodes); // if some partitions are already ready to be sent, the select time would be 0; // otherwise if some partition already has some data accumulated but not ready yet, // the select time will be the time difference between now and its linger expiry time; // otherwise the select time will be the time difference between now and the metadata expiry time; pollTimeout = 0; } // 7. 发送请求到服务端,并处理服务端响应 sendProduceRequests(batches, now); return pollTimeout; }RecordAccumulator#ready
步骤 1 用于计算需要向哪些节点投递消息,其实现位于 RecordAccumulator#ready 方法中:
public ReadyCheckResult ready(Cluster cluster, long nowMs) { // 用于记录接收请求的节点 Set<Node> readyNodes = new HashSet<>(); // 记录下次执行 ready 判断的时间间隔 long nextReadyCheckDelayMs = Long.MAX_VALUE; // 记录找不到 leader 副本的分区对应的 topic 集合 Set<String> unknownLeaderTopics = new HashSet<>(); // 是否有线程在等待 BufferPool 分配空间 boolean exhausted = this.free.queued() > 0; // 遍历每个 topic 分区及其 RecordBatch 队列,对每个分区的 leader 副本所在的节点执行判定 for (Map.Entry<TopicPartition, Deque<ProducerBatch>> entry : this.batches.entrySet()) { Deque<ProducerBatch> deque = entry.getValue(); synchronized (deque) { // When producing to a large number of partitions, this path is hot and deques are often empty. // We check whether a batch exists first to avoid the more expensive checks whenever possible. ProducerBatch batch = deque.peekFirst(); if (batch != null) { TopicPartition part = entry.getKey(); // 获取当前 topic 分区 leader 副本所在的节点 Node leader = cluster.leaderFor(part); // 当前分区 leader 副本未知,但存在发往该分区的消息 if (leader == null) { // This is a partition for which leader is not known, but messages are available to send. // Note that entries are currently not removed from batches when deque is empty. unknownLeaderTopics.add(part.topic()); } else if (!readyNodes.contains(leader) && !isMuted(part, nowMs)) { // 如果需要保证消息顺序性,则不应该存在多个发往该 leader 副本节点且未完成的消息 long waitedTimeMs = batch.waitedTimeMs(nowMs); // 当前为重试操作,且重试时间间隔未达到阈值时间 boolean backingOff = batch.attempts() > 0 && waitedTimeMs < retryBackoffMs; long timeToWaitMs = backingOff ? retryBackoffMs : lingerMs; boolean full = deque.size() > 1 || batch.isFull(); boolean expired = waitedTimeMs >= timeToWaitMs; // 标记当前节点是否可以接收请求,如果满足其中一个则认为需要往目标节点投递消息: boolean sendable = full // 1. 队列中有多个 RecordBatch,或第一个 RecordBatch 已满 || expired // 2. 当前等待重试的时间过长 || exhausted // 3. 有其他线程在等待 BufferPool 分配空间,即本地消息缓存已满 || closed // 4. producer 已经关闭 || flushInProgress(); // 5. 
有线程正在等待 flush 操作完成 if (sendable && !backingOff) { // 允许发送消息,且当前为首次发送,或者重试等待时间已经较长,则记录目标 leader 副本所在节点 readyNodes.add(leader); } else { long timeLeftMs = Math.max(timeToWaitMs - waitedTimeMs, 0); // Note that this results in a conservative estimate since an un-sendable partition may have // a leader that will later be found to have sendable data. However, this is good enough // since we'll just wake up and then sleep again for the remaining time. // 更新下次执行 ready 判定的时间间隔 nextReadyCheckDelayMs = Math.min(timeLeftMs, nextReadyCheckDelayMs); } } } } } return new ReadyCheckResult(readyNodes, nextReadyCheckDelayMs, unknownLeaderTopics); }RecordAccumulator#drain
知道了需要向哪些节点投递消息,接下来自然就需要获取发往每个节点的数据。步骤 4 的实现位于 RecordAccumulator#drain 方法中:
public Map<Integer, List<ProducerBatch>> drain(Cluster cluster, Set<Node> nodes, int maxSize, long now) { if (nodes.isEmpty()) return Collections.emptyMap(); // 记录转换后的结果,key 是目标节点 ID Map<Integer, List<ProducerBatch>> batches = new HashMap<>(); for (Node node : nodes) { List<ProducerBatch> ready = drainBatchesForOneNode(cluster, node, maxSize, now); batches.put(node.id(), ready); } return batches; } private List<ProducerBatch> drainBatchesForOneNode(Cluster cluster, Node node, int maxSize, long now) { int size = 0; // 获取当前节点上的分区信息 List<PartitionInfo> parts = cluster.partitionsForNode(node.id()); // 记录待发往当前节点的 RecordBatch 集合 List<ProducerBatch> ready = new ArrayList<>(); /* to make starvation less likely this loop doesn't start at 0 */ /* * drainIndex 用于记录上次发送停止的位置,本次继续从当前位置开始发送, * 如果每次都是从 0 位置开始,可能会导致排在后面的分区饿死,可以看做是一个简单的负载均衡策略 */ int start = drainIndex = drainIndex % parts.size(); do { PartitionInfo part = parts.get(drainIndex); TopicPartition tp = new TopicPartition(part.topic(), part.partition()); this.drainIndex = (this.drainIndex + 1) % parts.size(); // Only proceed if the partition has no in-flight batches. // 如果需要保证消息强顺序性,则不应该同时存在多个发往目标分区的消息 if (isMuted(tp, now)) continue; Deque<ProducerBatch> deque = getDeque(tp); if (deque == null) continue; synchronized (deque) { // invariant: !isMuted(tp,now) && deque != null // 获取当前分区对应的 RecordBatch 集合 ProducerBatch first = deque.peekFirst(); if (first == null) continue; // first != null // 重试 && 重试时间间隔未达到阈值时间 boolean backoff = first.attempts() > 0 && first.waitedTimeMs(now) < retryBackoffMs; // Only drain the batch if it is not during backoff period. 
if (backoff) continue; // 仅发送第一次发送,或重试等待时间较长的消息 if (size + first.estimatedSizeInBytes() > maxSize && !ready.isEmpty()) { // there is a rare case that a single batch size is larger than the request size due to // compression; in this case we will still eventually send this batch in a single request // 单次消息数据量已达到上限,结束循环,一般对应一个请求的大小,防止请求消息过大 break; } else { if (shouldStopDrainBatchesForPartition(first, tp)) break; boolean isTransactional = transactionManager != null && transactionManager.isTransactional(); ProducerIdAndEpoch producerIdAndEpoch = transactionManager != null ? transactionManager.producerIdAndEpoch() : null; // 每次仅获取第一个 RecordBatch,并放入 read 列表中,这样给每个分区一个机会,保证公平,防止饥饿 ProducerBatch batch = deque.pollFirst(); if (producerIdAndEpoch != null && !batch.hasSequence()) { // If the batch already has an assigned sequence, then we should not change the producer id and // sequence number, since this may introduce duplicates. In particular, the previous attempt // may actually have been accepted, and if we change the producer id and sequence here, this // attempt will also be accepted, causing a duplicate. // // Additionally, we update the next sequence number bound for the partition, and also have // the transaction manager track the batch so as to ensure that sequence ordering is maintained // even if we receive out of order responses. 
batch.setProducerState(producerIdAndEpoch, transactionManager.sequenceNumber(batch.topicPartition), isTransactional); transactionManager.incrementSequenceNumber(batch.topicPartition, batch.recordCount); log.debug("Assigned producerId {} and producerEpoch {} to batch with base sequence " + "{} being sent to partition {}", producerIdAndEpoch.producerId, producerIdAndEpoch.epoch, batch.baseSequence(), tp); transactionManager.addInFlightBatch(batch); } // 将当前 RecordBatch 设置为只读 batch.close(); size += batch.records().sizeInBytes(); ready.add(batch); // 更新 drainedMs batch.drained(now); } } } while (start != drainIndex); return ready; }Sender#sendProduceRequests
消息发送的过程(步骤 7)位于 Sender#sendProduceRequests 方法中。这一步的主要逻辑是创建客户端请求 ClientRequest 对象,并通过 NetworkClient#send 方法将请求加入到网络 I/O 通道(KafkaChannel)中,同时将该对象缓存到 InFlightRequests 中,等接收到服务端响应时会通过缓存的 ClientRequest 对象调用对应的 callback 方法。最后调用 NetworkClient#poll 方法执行具体的网络请求和响应。
private void sendProduceRequests(Map<Integer, List<ProducerBatch>> collated, long now) { // 遍历处理待发送消息集合,key 是目标节点 ID for (Map.Entry<Integer, List<ProducerBatch>> entry : collated.entrySet()) sendProduceRequest(now, entry.getKey(), acks, requestTimeoutMs, entry.getValue()); } private void sendProduceRequest(long now, int destination, short acks, int timeout, List<ProducerBatch> batches) { if (batches.isEmpty()) return; Map<TopicPartition, MemoryRecords> produceRecordsByPartition = new HashMap<>(batches.size()); final Map<TopicPartition, ProducerBatch> recordsByPartition = new HashMap<>(batches.size()); // find the minimum magic version used when creating the record sets byte minUsedMagic = apiVersions.maxUsableProduceMagic(); for (ProducerBatch batch : batches) { if (batch.magic() < minUsedMagic) minUsedMagic = batch.magic(); } // 遍历 RecordBatch 集合,整理成 produceRecordsByPartition 和 recordsByPartition for (ProducerBatch batch : batches) { TopicPartition tp = batch.topicPartition; MemoryRecords records = batch.records(); // down convert if necessary to the minimum magic used. In general, there can be a delay between the time // that the producer starts building the batch and the time that we send the request, and we may have // chosen the message format based on out-dated metadata. In the worst case, we optimistically chose to use // the new message format, but found that the broker didn't support it, so we need to down-convert on the // client before sending. This is intended to handle edge cases around cluster upgrades where brokers may // not all support the same message format version. For example, if a partition migrates from a broker // which is supporting the new magic version to one which doesn't, then we will need to convert. 
if (!records.hasMatchingMagic(minUsedMagic)) records = batch.records().downConvert(minUsedMagic, 0, time).records(); produceRecordsByPartition.put(tp, records); recordsByPartition.put(tp, batch); } String transactionalId = null; if (transactionManager != null && transactionManager.isTransactional()) { transactionalId = transactionManager.transactionalId(); } // 创建 ProduceRequest 请求构造器 ProduceRequest.Builder requestBuilder = ProduceRequest.Builder.forMagic(minUsedMagic, acks, timeout, produceRecordsByPartition, transactionalId); // 创建回调对象,用于处理响应 RequestCompletionHandler callback = new RequestCompletionHandler() { public void onComplete(ClientResponse response) { handleProduceResponse(response, recordsByPartition, time.milliseconds()); } }; String nodeId = Integer.toString(destination); // 创建 ClientRequest 请求对象,如果 acks 不等于 0 则表示期望获取服务端响应 ClientRequest clientRequest = client.newClientRequest(nodeId, requestBuilder, now, acks != 0, requestTimeoutMs, callback); // 将请求加入到网络 I/O 通道(KafkaChannel)中。同时将该对象缓存到 InFlightRequests 中 client.send(clientRequest, now); log.trace("Sent produce request to {}: {}", nodeId, requestBuilder); }