文章归档

tcp nagle算法

默认socket是开启nagle特性的,该特性主要用于提升网卡的吞吐量,但会增加小包的传输时延。对于一些时延敏感的通信,需要关闭nagle算法:

/* A non-zero option value switches TCP_NODELAY on for this socket, which
 * is how the Nagle behaviour described above is turned off.
 * NOTE(review): rc is not checked in this snippet; real code should
 * handle a failing setsockopt().
 */
int flags = 1;
rc = setsockopt (rs->ss_fd, IPPROTO_TCP, TCP_NODELAY, &flags, sizeof (flags));

内核会通过以下调用栈来设置socket的nagle特性:

setsockopt()    // net/socket.c
tcp_setsockopt() // net/ipv4/tcp.c

/*
 *	Socket option code for TCP.
 *
 *	Excerpt only: just the TCP_NODELAY case is quoted here and the
 *	function is not shown to its end.
 */
static int do_tcp_setsockopt(struct sock *sk, int level,       // net/ipv4/tcp.c
		int optname, char __user *optval, unsigned int optlen)
{
	// ... (elided in this excerpt: the code that sets up `tp` and `val`,
	//      both of which are used below -- presumably `val` is the
	//      integer read from `optval`; confirm against the full source)
	switch (optname) {
	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		} else {
			/* Only the TCP_NAGLE_OFF bit is cleared here; any
			 * other bit in tp->nonagle is left as it was.
			 */
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;
	}

tcp协议发送数据包的主要动作在 tcp_write_xmit() 这个函数里

/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
 * account rare use of URG, this is not a big flaw.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 *
 * Each loop iteration handles the skb at the send head: it runs the
 * congestion-window, send-window and Nagle/TSO-defer checks, fragments
 * the skb when it is longer than the computed limit, transmits it and
 * then advances the send head. Any failed check ends the loop.
 */
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
              int push_one, gfp_t gfp)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    unsigned int tso_segs, sent_pkts;
    int cwnd_quota;
    int result;

    sent_pkts = 0;

    if (!push_one) {
        /* Do MTU probing. */
        result = tcp_mtu_probe(sk);
        if (!result) {
            /* A zero result ends this call without sending anything. */
            return 0;
        } else if (result > 0) {
            /* A positive result is counted as one packet already sent. */
            sent_pkts = 1;
        }
    }

    while ((skb = tcp_send_head(sk))) {
        unsigned int limit;

        /* Initialise the TSO state of the skb. One skb may be made up of
         * several packets; tso_segs is the number of packets in this skb.
         */
        tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
        BUG_ON(!tso_segs);

        /* Congestion-window check: stop when no quota is left. */
        cwnd_quota = tcp_cwnd_test(tp, skb);
        if (!cwnd_quota)
            break;

        /* Send-window check. */
        if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
            break;

        /* Nagle check (discussed in detail later in the article).
         * A single-segment skb goes through tcp_nagle_test(); the caller's
         * nonagle value is only used for the last skb in the queue, every
         * other skb is tested with TCP_NAGLE_PUSH. A multi-segment skb is
         * instead subject to the TSO defer decision, which push_one skips.
         */
        if (tso_segs == 1) {
            if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
                             (tcp_skb_is_last(sk, skb) ?
                              nonagle : TCP_NAGLE_PUSH))))
                break;
        } else {
            if (!push_one && tcp_tso_should_defer(sk, skb))
                break;
        }
        /* TCP fragmentation: work out how many bytes may go out in this
         * skb and split it when it is longer than that limit.
         */
        limit = mss_now;
        if (tso_segs > 1 && !tcp_urg_mode(tp))
            limit = tcp_mss_split_point(sk, skb, mss_now,
                            cwnd_quota);

        if (skb->len > limit &&
            unlikely(tso_fragment(sk, skb, limit, mss_now)))
            break;

        TCP_SKB_CB(skb)->when = tcp_time_stamp;

        /* Transmit the packet; a non-zero return stops the loop. */
        if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
            break;

        /* Advance the send_head.  This one is sent out.
         * This call will increment packets_out.
         */
        tcp_event_new_data_sent(sk, skb);

        tcp_minshall_update(tp, mss_now, skb);
        sent_pkts += tcp_skb_pcount(skb);

        /* In push_one mode at most one skb is sent per call. */
        if (push_one)
            break;
    }
    /* While in the Recovery state, add what was sent to prr_out. */
    if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
        tp->prr_out += sent_pkts;

    if (likely(sent_pkts)) {
        tcp_cwnd_validate(sk);
        return 0;
    }
    /* Nothing was sent: report 1 only when nothing is in flight and data
     * is still queued at the send head.
     */
    return !tp->packets_out && tcp_send_head(sk);
}

从 tcp_write_xmit() 函数可以看出,发送数据包主要包括以下三个过程:

  1. 状态检查,包括tcp的一些特性状态判断以决定数据包该不该发送,如何发送,例如一些nagle/拥塞控制/缓冲区等特性
  2. 数据包分片
  3. 数据包发送

这里主要来看一下 tcp_nagle_test() 和 tcp_tso_should_defer() 这两个函数:tcp_nagle_test() 失败(返回0)或者 tcp_tso_should_defer() 成功(返回非0)时,发送循环会直接 break,数据包被推迟发送;反之,数据包立即发送。
什么情况下,tcp_nagle_test() 会通过(返回1,即允许数据包立即发送)?

/* Nagle gate for a single skb: the result is 1 when the skb may be
 * transmitted right away and 0 when it has to be held back.
 */
static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
                 unsigned int cur_mss, int nonagle)
{
    int exempt;

    /* An skb in the middle of the write queue can never receive more
     * data, so Nagle is pointless for it. The callers express this by
     * passing TCP_NAGLE_PUSH in 'nonagle' depending on where the skb
     * sits in the send queue.
     */
    if (nonagle & TCP_NAGLE_PUSH)
        return 1;

    /* Urgent data and the final FIN are never delayed, and Nagle may
     * also be ignored during F-RTO (see RFC4138).
     */
    exempt = tcp_urg_mode(tp) ||
             tp->frto_counter == 2 ||
             (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN);
    if (exempt)
        return 1;

    /* Otherwise the skb may go out only if tcp_nagle_check() reports 0. */
    return tcp_nagle_check(tp, skb, cur_mss, nonagle) ? 0 : 1;
}

有几种情况:

  1. 当调用者显式地使用TCP_NAGLE_PUSH标志时;sendmsg() 系统调用默认取的是 tp->nonagle,参见 tcp_push_pending_frames() 函数
  2. 带外数据,优先级高,理应不被defer
  3. frto_counter == 2,按内核注释的说法,F-RTO 期间可以忽略nagle(参见 RFC4138,细节暂不清楚)
  4. FIN包,应立即发送
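  5. 大包,数据包大小不小于mss(tcp_nagle_check() 只会拦截小于mss的小包)
  6. 没有设置TCP_NAGLE_CORK标志,并且关闭了nagle(TCP_NAGLE_OFF,即TCP_NODELAY),或者当前没有已发出但尚未确认的小包;注意设置了TCP_NAGLE_CORK时,小包反而会被推迟发送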
  7. 等等

而 tcp_tso_should_defer() 函数也根据 John Heffner 的tso算法来做一些判断。

Leave a Reply

You can use these HTML tags

<a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong>