tcp nagle算法
八月 3, 2014
tcp/ip internals
No Comments »
默认情况下,TCP socket是开启nagle特性的。该特性通过把小包合并后再发送来减少网络中小包的数量、提升网络的吞吐量,但会增加小包的传输时延。对于一些时延敏感的通信,需要关闭nagle算法:
/* Turn Nagle's algorithm off for this socket by enabling TCP_NODELAY
 * (a non-zero option value switches the option on).
 * NOTE(review): rc and rs are declared outside this excerpt; the caller
 * is expected to check rc for failure. */
int flags = 1;
rc = setsockopt (rs->ss_fd, IPPROTO_TCP, TCP_NODELAY, &flags, sizeof (flags));
内核会通过以下调用栈来设置socket的nagle特性:
setsockopt() // net/socket.c
tcp_setsockopt() // net/ipv4/tcp.c
/*
 * Socket option code for TCP.
 *
 * Excerpt from net/ipv4/tcp.c: only the TCP_NODELAY case is shown, and the
 * remainder of the function (including its closing brace) is elided.
 */
static int do_tcp_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
	/* ... elided in this excerpt; val and tp used below are set up here ... */

	switch (optname) {
	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		} else {
			/* Clearing the option only removes TCP_NAGLE_OFF;
			 * the other nonagle bits are left untouched.
			 */
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;
	}
tcp协议发送数据包的主要动作在 tcp_write_xmit() 这个函数里
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
 * account rare use of URG, this is not a big flaw.
 *
 * When push_one is non-zero, MTU probing is skipped and the loop stops
 * after the first skb that is transmitted.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			  int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;

	sent_pkts = 0;

	if (!push_one) {
		/* Do MTU probing. */
		result = tcp_mtu_probe(sk);
		if (!result) {
			return 0;
		} else if (result > 0) {
			sent_pkts = 1;
		}
	}

	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		/* Initialise the TSO state of the skb. One skb may consist of
		 * several packets; tso_segs is the number of packets in it.
		 */
		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
		BUG_ON(!tso_segs);

		/* Congestion window check: stop when there is no quota left. */
		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota)
			break;

		/* Send window check. */
		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
			break;

		/* Nagle check for single-segment skbs. Only the last skb in
		 * the queue is tested with the caller's nonagle value; any
		 * earlier skb is tested with TCP_NAGLE_PUSH instead.
		 * Multi-segment skbs go through the TSO deferral test,
		 * which is bypassed when push_one is set.
		 */
		if (tso_segs == 1) {
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			if (!push_one && tcp_tso_should_defer(sk, skb))
				break;
		}

		/* TCP segmentation: split the skb if it is longer than limit. */
		limit = mss_now;
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    cwnd_quota);

		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			break;

		TCP_SKB_CB(skb)->when = tcp_time_stamp;

		/* Transmit the packet; a non-zero result ends the loop. */
		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

		/* Advance the send_head.  This one is sent out.
		 * This call will increment packets_out.
		 */
		tcp_event_new_data_sent(sk, skb);

		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts += tcp_skb_pcount(skb);

		if (push_one)
			break;
	}

	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
		tp->prr_out += sent_pkts;

	if (likely(sent_pkts)) {
		tcp_cwnd_validate(sk);
		return 0;
	}
	/* Nothing was sent: report 1 only if nothing is in flight while
	 * data is still queued.
	 */
	return !tp->packets_out && tcp_send_head(sk);
}
从 tcp_write_xmit() 函数可以看出,发送数据包主要包括以下三个过程:
- 状态检查,包括tcp的一些特性状态判断以决定数据包该不该发送,如何发送,例如一些nagle/拥塞控制/缓冲区等特性
- 数据包分片
- 数据包发送
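这里主要来看一下 tcp_nagle_test() 和 tcp_tso_should_defer() 这两个函数。从上面的代码可以看出:对于只含一个分段的skb,只有 tcp_nagle_test() 返回非零(检查通过)时才会继续发送;对于含多个分段的skb,只有 tcp_tso_should_defer() 返回0(无需延迟)或者设置了push_one时才会继续发送。否则就跳出循环,数据包暂不发送。
什么情况下,tcp_nagle_test() 会返回非零、允许数据包立即发送?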
/* Return non-zero if the Nagle test allows this packet to be * sent now. */ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). * * This is implemented in the callers, where they modify the 'nonagle' * argument based upon the location of SKB in the send queue. */ if (nonagle & TCP_NAGLE_PUSH) return 1; /* Don't use the nagle rule for urgent data (or for the final FIN). * Nagle can be ignored during F-RTO too (see RFC4138). */ if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) return 1; return 0; }
有几种情况:
- 当调用者显式地传入TCP_NAGLE_PUSH标志时(sendmsg() 系统调用默认传入的是 tp->nonagle,参见 tcp_push_pending_frames() 函数);另外,对于不在发送队列末尾的skb,tcp_write_xmit() 会强制传入TCP_NAGLE_PUSH
- 带外(紧急)数据,优先级高,不应被延迟(defer)
- tp->frto_counter == 2,即处于F-RTO过程中,此时可以忽略nagle(参见RFC4138,具体细节此处不展开)
- FIN包,应立即发送
- 大包,数据包大小不小于mss(tcp_nagle_check() 只会拦截小于mss的小包)
- 设置了TCP_NAGLE_OFF标志(即TCP_NODELAY)且未设置TCP_NAGLE_CORK。注意TCP_NAGLE_CORK的作用正好相反:设置后,小于mss的小包会被扣住而不是立即发送
- 等等
而 tcp_tso_should_defer() 函数也根据 John Heffner 的tso算法来做一些判断。