Calling close() on a socket shared by multiple processes

September 8, 2013
filesystem, tcp/ip internals

When a parent and a child process share the same socket descriptor, will a close() call in one of them send a FIN and thereby interfere with the other process's use of the socket? No: close() only drops the reference count on the underlying file; nothing else happens.
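
To see this from user space, here is a minimal sketch (my own demo, assuming a Linux host): after fork() the child closes its copies of a shared socket pair and exits, yet the parent can still use the sockets, because the child's close() only dropped one reference on each struct file.

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int sv[2];
    char buf[16];

    /* A connected pair of sockets, shared with the child via fork(). */
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
        return 1;

    pid_t pid = fork();
    if (pid == 0) {          /* child: close its copies and exit */
        close(sv[0]);
        close(sv[1]);
        _exit(0);
    }

    waitpid(pid, NULL, 0);   /* child has closed; only refcounts dropped */

    /* The parent's descriptors still work: the underlying struct file
     * is only released when the last reference is gone. */
    write(sv[0], "ping", 4);
    ssize_t n = read(sv[1], buf, sizeof(buf));
    printf("still readable after child close: %zd bytes\n", n);
    return 0;
}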

Reading the source, the call chain to follow is:

close       ------------ fs/open.c
__close_fd  ------------ fs/file.c
filp_close  ------------ fs/open.c
fput        ------------ fs/file_table.c
__fput      ------------ fs/file_table.c (the last op)

First of all, a socket descriptor is itself a kind of Unix file; its file_operations are implemented in net/socket.c:

/*
 *	Socket files have a set of 'special' operations as well as the generic file ones.
 *      These don't appear in the operation structures but are done directly via the socketcall()
 *      multiplexor.
 */

static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.aio_read =	sock_aio_read,
	.aio_write =	sock_aio_write,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.open =		sock_no_open,	/* special open code to disallow open via /proc */
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};

The close() system call

The source lives in fs/open.c. The close system call is essentially carried out by __close_fd(); the current macro refers to the task descriptor of the calling process:

/*
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
	int retval = __close_fd(current->files, fd);

	/* can't restart close syscall because file table entry was cleared */
	if (unlikely(retval == -ERESTARTSYS ||
		     retval == -ERESTARTNOINTR ||
		     retval == -ERESTARTNOHAND ||
		     retval == -ERESTART_RESTARTBLOCK))
		retval = -EINTR;

	return retval;
}
EXPORT_SYMBOL(sys_close);

The __close_fd() function

__close_fd() first looks up the file structure for fd in the descriptor table fdt (each process has one, recording all the files it has open), __clear_close_on_exec() clears the descriptor's close-on-exec bit in the fdt, and __put_unused_fd() returns the descriptor number to the free pool. The remaining work is handed to filp_close().

/*
 * The same warnings as for __alloc_fd()/__fd_install() apply here...
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
	struct file *file;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (fd >= fdt->max_fds)
		goto out_unlock;
	file = fdt->fd[fd];
	if (!file)
		goto out_unlock;
	rcu_assign_pointer(fdt->fd[fd], NULL);
	__clear_close_on_exec(fd, fdt);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
	return filp_close(file, files);

out_unlock:
	spin_unlock(&files->file_lock);
	return -EBADF;
}

Note that when several processes share a socket, what they share is the socket structure (more precisely, the file structure), not the fd table entry itself.

The filp_close() function

filp_close() first checks whether the file's file_operations provide a flush callback; as we saw above, socket_file_ops does not define one. It then performs dnotify and POSIX-lock cleanup unless FMODE_PATH is set (not the interesting part here). The key step is the call to fput():

/*
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 */
int filp_close(struct file *filp, fl_owner_t id)
{
	int retval = 0;

	if (!file_count(filp)) {
		printk(KERN_ERR "VFS: Close: file count is 0\n");
		return 0;
	}

	if (filp->f_op && filp->f_op->flush)
		retval = filp->f_op->flush(filp, id);

	if (likely(!(filp->f_mode & FMODE_PATH))) {
		dnotify_flush(filp, id);
		locks_remove_posix(filp, id);
	}
	fput(filp);
	return retval;
}

EXPORT_SYMBOL(filp_close);

The fput() function

atomic_long_dec_and_test() atomically decrements file->f_count and tests whether it reached zero. If it did, the current process is the last user of the file and has to perform the actual resource release and cleanup; otherwise fput() simply returns. So with a socket shared by several processes, close() on the descriptor only drops the file's reference count by one and does nothing else.

void fput(struct file *file)
{
	if (atomic_long_dec_and_test(&file->f_count)) {
		struct task_struct *task = current;

		file_sb_list_del(file);
		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
			init_task_work(&file->f_u.fu_rcuhead, ____fput);
			if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
				return;
			/*
			 * After this task has run exit_task_work(),
			 * task_work_add() will fail.  free_ipc_ns()->
			 * shm_destroy() can do this.  Fall through to delayed
			 * fput to avoid leaking *file.
			 */
		}

		if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
			schedule_work(&delayed_fput_work);
	}
}

If the current process is the last holder of the file structure, the real cleanup is deferred. In ordinary process context the work is queued with task_work_add() and ____fput() runs when the task returns to user space; otherwise (interrupt context, kernel threads, or when task_work_add() fails) llist_add() puts the file on delayed_fput_list and a workqueue item is scheduled to process it. That work item, delayed_fput_work, is declared as:

static DECLARE_WORK(delayed_fput_work, delayed_fput);

So its work function is delayed_fput(), which, for every file on delayed_fput_list, calls __fput() to release and clean up the associated resources:

static LLIST_HEAD(delayed_fput_list);
static void delayed_fput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_fput_list);
	struct llist_node *next;

	for (; node; node = next) {
		next = llist_next(node);
		__fput(llist_entry(node, struct file, f_u.fu_llist));
	}
}

The source of __fput() is as follows:

static void __fput(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;

	might_sleep();

	fsnotify_close(file);
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
	locks_remove_flock(file);

	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op && file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	ima_file_free(file);
	if (file->f_op && file->f_op->release)
		file->f_op->release(inode, file);
	security_file_free(file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(file->f_mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	put_pid(file->f_owner.pid);
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_dec(inode);
	if (file->f_mode & FMODE_WRITE)
		drop_file_write_access(file);
	file->f_path.dentry = NULL;
	file->f_path.mnt = NULL;
	file->f_inode = NULL;
	file_free(file);
	dput(dentry);
	mntput(mnt);
}

Here file->f_op->release(inode, file) is the release callback of the socket's file_operations, namely sock_close(), which performs the actual teardown of the socket.

TCP congestion control

August 5, 2013
tcp/ip internals

Transmission of packets by a TCP sender is restricted by the congestion window. On reception of an ACK the sender may increase the congestion window. At the start of a TCP connection the sender is in slow start, where the congestion window grows exponentially from a small value (typically 2) until a threshold is reached. After reaching the threshold TCP enters the congestion avoidance phase and increases the congestion window linearly.

Congestion avoidance state machine

Events such as the arrival of a dupack, a SACK, or Explicit Congestion Notification indicate possible congestion caused by the sender transmitting too many packets. These events are processed through a state machine with the following states:

  1. TCP_CA_Open
  2. TCP_CA_Disorder
  3. TCP_CA_CWR
  4. TCP_CA_Recovery
  5. TCP_CA_Loss

Some of the flags used along this path:

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

TCP_CA_Open

This is the default start state of a TCP connection. In this state the connection grows the congestion window according to slow start and congestion avoidance by calling tcp_cong_avoid. Every ack is checked for being dubious by calling tcp_ack_is_dubious. If the ack is declared dubious, tcp_fastretrans_alert changes the CA state according to the event; a dubious ack still raises cwnd via tcp_cong_avoid if tcp_may_raise_cwnd permits it.

TCP_CA_Disorder

This state indicates that TCP is experiencing some disorder in the form of Sacks and dupacks. The heuristics of connection are observed for some time to understand whether this is a genuine loss.

TCP_CA_CWR

This state indicates some sign of congestion, such as an ICMP source quench or congestion on the local device, because of which the TCP sender has slowed packet transmission.

TCP_CA_Recovery

The state indicates that sender is fast retransmitting the packets on detection of packet loss.

TCP_CA_Loss

This state is entered when retransmission timer times out for some packet or the ack received indicates that the Sack information remembered by the sender is not in sync with the tcp receiver.

Detailed functionality:

Raising the cwnd (tcp_cong_avoid)

During the slow start phase:

  1. The congestion window, snd_cwnd, is incremented by one for every ack received; effectively the congestion window doubles every round trip.
  2. The slow start threshold, snd_ssthresh, marks the maximum size to which snd_cwnd grows during slow start.
  3. snd_cwnd is never allowed to grow beyond snd_cwnd_clamp.

During the congestion avoidance phase: once snd_cwnd reaches snd_ssthresh, snd_cwnd is incremented at a slower pace, one per round trip, and again is never allowed to grow beyond snd_cwnd_clamp. (A toy simulation of the two phases is sketched below.)
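
As a rough illustration of the two growth laws (my own toy model, not kernel code; real TCP counts acks rather than whole round trips):

#include <stdio.h>

/* Toy per-RTT model of slow start and congestion avoidance:
 * cwnd doubles each RTT until it reaches ssthresh, then grows by
 * one segment per RTT, capped at cwnd_clamp. */
int main(void)
{
    unsigned cwnd = 2, ssthresh = 64, cwnd_clamp = 256;

    for (int rtt = 0; rtt < 16; rtt++) {
        printf("rtt %2d: cwnd = %u (%s)\n", rtt, cwnd,
               cwnd < ssthresh ? "slow start" : "congestion avoidance");
        if (cwnd < ssthresh)
            cwnd *= 2;              /* exponential growth */
        else
            cwnd += 1;              /* linear growth */
        if (cwnd > cwnd_clamp)
            cwnd = cwnd_clamp;
    }
    return 0;
}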

Checking for Dubious Ack (tcp_ack_is_dubious)

An ack is considered dubious if ANY of the following three conditions holds:

static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
  return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 
    inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
} 
  1. The CA state is not Open
  2. ALL of the following are true: the received packet carries no data && the ack does not update the receive window in the TCP header && the packet does not ack new data
  3. The packet indicates congestion, i.e. ANY of the following is true: the packet SACKs data || the ECE bit is set

Permission for raising cwnd (tcp_may_raise_cwnd)

On getting a dubious ack, the congestion window is still allowed to grow if ALL of the following conditions hold:

static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
  const struct tcp_sock *tp = tcp_sk(sk);
  return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
    !tcp_in_cwnd_reduction(sk);
}

  1. CA state is not Recovery or CWR
  2. Either ECE flag is not set or the cwnd is smaller than slow start threshold

Processing dubious ack event (tcp_fastretrans_alert)

1. SACK reneging (tcp_check_sack_reneging): When a SACK is received, the SACKed packets are marked. A clearing ack would normally cover the sacked packets; if instead the received ack points at data the sender remembers as SACKed, the SACK knowledge is probably erroneous. The following actions are taken:

static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
  if (flag & FLAG_SACK_RENEGING) {
    struct inet_connection_sock *icsk = inet_csk(sk);
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);

    tcp_enter_loss(sk, 1);
    icsk->icsk_retransmits++;
    tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
    inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, 
        TCP_RTO_MAX);
    return true;
  }
  return false;
}

Enter loss (tcp_enter_loss):

void tcp_enter_loss(struct sock *sk, int how)
{
  const struct inet_connection_sock *icsk = inet_csk(sk);
  struct tcp_sock *tp = tcp_sk(sk);
  struct sk_buff *skb;
  bool new_recovery = false;

  /* Reduce ssthresh if it has not yet been made inside this window. */
  if (icsk->icsk_ca_state <= TCP_CA_Disorder || !after(tp->high_seq, tp->snd_una)
    || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
    new_recovery = true;
    tp->prior_ssthresh = tcp_current_ssthresh(sk);
    tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
    tcp_ca_event(sk, CA_EVENT_LOSS);
  }
  tp->snd_cwnd       = 1;
  tp->snd_cwnd_cnt   = 0;
  tp->snd_cwnd_stamp = tcp_time_stamp;

  tcp_clear_retrans_partial(tp);

  if (tcp_is_reno(tp))
    tcp_reset_reno_sack(tp);

  tp->undo_marker = tp->snd_una;
  if (how) {
    tp->sacked_out = 0;
    tp->fackets_out = 0;
  }
  tcp_clear_all_retrans_hints(tp);

  tcp_for_write_queue(skb, sk) {
    if (skb == tcp_send_head(sk))
      break;

    if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
      tp->undo_marker = 0;
    TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
    if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
      TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
      TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
      tp->lost_out += tcp_skb_pcount(skb);
      tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
    }
  }
  tcp_verify_left_out(tp);

  tp->reordering = min_t(unsigned int, tp->reordering,
             sysctl_tcp_reordering);
  tcp_set_ca_state(sk, TCP_CA_Loss);
  tp->high_seq = tp->snd_nxt;
  TCP_ECN_queue_cwr(tp);

  /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
   * loss recovery is underway except recurring timeout(s) on
   * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
   */
  tp->frto = sysctl_tcp_frto &&
    (new_recovery || icsk->icsk_retransmits) &&
    !inet_csk(sk)->icsk_mtup.probe_size;
}
  1. Remember the sequence number of the next packet to be sent (snd_nxt) at the onset of congestion, as high_seq.
  2. Reduce cwnd to 1.
  3. Set undo_marker = snd_una.
  4. Queue CWR to indicate the congestion situation to the peer (see TCP_ECN_queue_cwr).
  5. If the state is CWR or Recovery (rate halving will happen), prior_ssthresh is set to the value of ssthresh.
  6. Change the CA state to Loss.

Retransmit the packet at the top of write queue (which was erroneously marked as sacked earlier)

2. Time to recover (tcp_time_to_recover): This function examines various parameters (such as the number of packets lost) to decide whether it is the right time for the connection to move to the Recovery state. It is time to recover when the TCP heuristics suggest a strong possibility of packet loss in the network; the following checks are made:

static bool tcp_time_to_recover(struct sock *sk, int flag)
{
  struct tcp_sock *tp = tcp_sk(sk);
  __u32 packets_out;

  /* Trick#1: The loss is proven. */
  if (tp->lost_out)
    return true;
  /* Not-A-Trick#2 : Classic rule... */
  if (tcp_dupack_heuristics(tp) > tp->reordering)
    return true;

  /* Trick#4: It is still not OK... But will it be useful to delay
   * recovery more?
   */
  packets_out = tp->packets_out;
  if (packets_out <= tp->reordering &&
      tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
      !tcp_may_send_now(sk)) {
    /* We have nothing to send. This connection is limited
     * either by receiver window or by application.
     */
    return true;
  }

  /* If a thin stream is detected, retransmit after first
   * received dupack. Employ only if SACK is supported in order
   * to avoid possible corner-case series of spurious retransmissions
   * Use only if there are no unsent data.
   */
  if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
      tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
      tcp_is_sack(tp) && !tcp_send_head(sk))
    return true;

  /* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
   * retransmissions due to small network reorderings, we implement
   * Mitigation A.3 in the RFC and delay the retransmission for a short
   * interval if appropriate.
   */
  if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
      (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) 
         && !tcp_may_send_now(sk))
    return !tcp_pause_early_retransmit(sk, flag);
  return false;
} 
  1. Some packets are lost (lost_out is non-zero).
  2. A SACK is an acknowledgement of out-of-order packets. If the number of packets SACKed is greater than the reordering metric of the network, loss is assumed to have happened.
  3. If the following three conditions all hold, the TCP sender can transmit no more data and the number of packets acked is large enough to assume the rest were lost in the network:
    1. the packets in flight are fewer than the reordering metric (so condition 2 can never become true),
    2. more than half of the packets in flight have been sacked by the receiver, or the number of packets sacked exceeds the fast retransmit threshold (the number of dupacks the sender waits for before starting fast retransmission),
    3. the sender cannot send any more packets, either because it is bound by the sliding window or because the application has not delivered more data while waiting for acks of the data already provided.

If it is declared to be the time to recover, the CA state switches to Recovery.

3. Try to open (tcp_try_to_open): If it is not yet time to move to the Recovery state, tcp_try_to_open checks the following:

static void tcp_try_keep_open(struct sock *sk)
{
  struct tcp_sock *tp = tcp_sk(sk);
  int state = TCP_CA_Open;

  if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
    state = TCP_CA_Disorder;

  if (inet_csk(sk)->icsk_ca_state != state) {
    tcp_set_ca_state(sk, state);
    tp->high_seq = tp->snd_nxt;
  }
}

static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
{
  struct tcp_sock *tp = tcp_sk(sk);

  tcp_verify_left_out(tp);

  if (!tcp_any_retrans_done(sk))
    tp->retrans_stamp = 0;

  if (flag & FLAG_ECE)
    tcp_enter_cwr(sk, 1);

  if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
    tcp_try_keep_open(sk);
    if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
      tcp_moderate_cwnd(tp);
  } else {
    tcp_cwnd_reduction(sk, prior_unsacked, 0);
  }
}
  1. If the packet indicates ECE, the state switches to CWR; cwnd is reduced and the reduction is signalled to the peer by queuing a CWR notification.
  2. If the state is not CWR: if there are any sacked or retransmitted packets, the state is set to Disorder and tcp_moderate_cwnd is called.
  3. tcp_moderate_cwnd: in the Disorder state TCP is still unsure whether the loss is genuine. After acks carrying SACKs there may be a clearing ack that non-dubiously acks many packets in one go; such an ack could cause a packet burst in the network, so cwnd is reduced to allow no more than max_burst (usually 3) additional packets. See the sketch below.
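
A rough sketch of that clamp (my own simplification of the tcp_moderate_cwnd logic, not verbatim kernel code):

#include <stdio.h>

/* Simplified sketch of what tcp_moderate_cwnd() does: after a suspected
 * but unconfirmed loss, clamp cwnd so that a clearing ack cannot trigger
 * a burst of more than max_burst segments beyond what is already in flight. */
static unsigned moderate_cwnd(unsigned snd_cwnd, unsigned in_flight,
                              unsigned max_burst /* usually 3 */)
{
    unsigned limit = in_flight + max_burst;
    return snd_cwnd > limit ? limit : snd_cwnd;
}

int main(void)
{
    /* 20 segments in flight, cwnd 40: a clearing ack could otherwise
     * release a burst of 20 new segments at once. */
    printf("moderated cwnd = %u\n", moderate_cwnd(40, 20, 3)); /* -> 23 */
    return 0;
}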

4. Update scoreboard (tcp_update_scoreboard): This function marks all packets that were not sacked (up to the highest sequence number sacked) as lost. Packets that have waited for an ack for as long as the retransmission timeout are also marked as lost. The accounting of lost, sacked and left-out packets is done in this function as well.

5. Transmit packets (tcp_xmit_retransmit_queue)

  1. Packets marked as lost and not yet retransmitted are retransmitted until all lost packets are sent or the cwnd limit is reached.
  2. If the CA state is Recovery (which means packets were actually lost on the network):
    1. if permitted by cwnd, new packets are transmitted;
    2. if no new packets can be transmitted even though cwnd allows more, already-transmitted packets are retransmitted (since there is evidence of a lossy network).

6. Processing subsequent dubious acks: The processing of subsequent dubious acks depends on the current CA state:
1 If the current state is Recovery
If a partial ack arrives, acking some of the data in transit, tcp_try_undo_partial is called:

  1. tcp_may_undo: check whether any retransmissions were done. If timestamps are enabled we can further check whether the ack was generated for the retransmitted packet or for the original one; this tells us whether TCP switched CA state unnecessarily and should undo the actions taken.
  2. If tcp_may_undo permits (tcp_undo_cwr):
    1. Undo the cwnd reduction. If prior_ssthresh is zero, bring cwnd up to the ssthresh level; if prior_ssthresh is non-zero, restore cwnd to its value at the time of the reduction and restore ssthresh from prior_ssthresh. (prior_ssthresh may be set to zero when a cwnd undo is not intended, e.g. on ECE reception.)
    2. Moderate cwnd to prevent a packet burst in the network (tcp_moderate_cwnd).
    3. Call TCP_ECN_withdraw_cwr.

set is_dupack to false in order to prevent retransmission of further packets.
2 If the current state is Loss

  1. tcp_try_undo_loss: If a partial ack arrives acking some of the data in transit; call tcp_undo_cwr for cwnd restoration if allowed by tcp_may_undo.
  2. If undo is not allowed, re-moderate cwnd (tcp_moderate_cwnd) and retransmit packets by calling tcp_xmit_retransmit_queue.

3 If the current state is not Recovery or Loss

  1. If state is Disorder call tcp_try_undo_dsack: [Whenever a D-sack arrives , it indicates that packet was received twice by the receiver which means retransmission of packet was unnecessary. Undo_retrans variable is decremented for each dsack.] If undo_retrans is set to zero call tcp_undo_cwr.
  2. Call tcp_time_to_recover to check if it is the time to recover, if not then call tcp_try_to_open and wait for subsequent acks.
  3. If it is the time to recover switch to Recovery state and retransmit more packets by calling tcp_xmit_retransmit_queue.

7. Clearing ack received: When the sender receives an ack for the highest sequence number that was in flight at the time of the CA state switch away from Open (high_seq), the CA state machine exits towards the Open state.
1 If the current state is Loss
Attempt recovery (tcp_try_undo_recovery):

  1. tcp_may_undo is called to see if ack was for original packet and the changes may be undone. If permitted by tcp_may_undo, tcp_undo_cwr is called to recover cwnd and ssthresh values.
  2. After undoing the cwnd reduction: if the ack is received for packets beyond high_seq then switch to Open state, else wait for processing of subsequent acks before announcing a state switch.

2 If the current state is CWR

  1. If seq number greater than high_seq is acked, it indicates that the CWR indication has reached the peer TCP, call tcp_complete_cwr to bring down the cwnd to ssthresh value.
  2. switch to Open state.

3 If the current state is disorder

  1. Call tcp_try_undo_dsack for possibility of undoing cwnd changes.
  2. When an ack is received for packet higher than high_seq, switch to Open state. This is a safe point for switching the state as dupacks are no more expected.

4 If the current state is recovery

  1. Call tcp_try_undo_recovery for restoring the cwnd and switching to Open state.
  2. Call tcp_complete_cwr to bring down the cwnd to ssthresh value.

Maintainer: seema.garg@hsc.com

奔腾年代 (Seabiscuit)

July 27, 2013
Life

We can't write off his talent just because he has a few flaws.

A great film; it really got my blood pumping. A few things stood out:

  1. Holding on to a dream, with a purity that is genuinely moving
  2. Constantly pushing past his own limits: no matter how many times he fails, he keeps striving for the top.
  3. And the friendship between him and Woolf

The feeling is much like The Pursuit of Happyness: in both films we seem to see something of ourselves.

 

The new hlist api for linux kernel

July 26, 2013
Kernel

While writing a list today I looked at the Linux kernel's list implementation for reference, and noticed that the kernel's hlist still has room for improvement (the kernel here is Red Hat's 2.6.32-358.14.1.el6.x86_64).
The two key APIs are list_for_each_entry and hlist_for_each_entry:

/**
 * list_for_each_entry	-	iterate over list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry(pos, head, member)				\
	for (pos = list_entry((head)->next, typeof(*pos), member);	\
	     &pos->member != (head); 	\
	     pos = list_entry(pos->member.next, typeof(*pos), member))

/**
 * hlist_for_each_entry	- iterate over list of given type
 * @tpos:	the type * to use as a loop cursor.
 * @pos:	the &struct hlist_node to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 */
#define hlist_for_each_entry(tpos, pos, head, member)			 \
	for (pos = (head)->first;					 \
	     pos &&							 \
		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
	     pos = pos->next)

Why does hlist need an extra parameter, tpos, that the plain list does not? Mainly because the loop condition in hlist_for_each_entry() is hard to express with a single cursor. So why can list_for_each_entry() manage with one?

  1. list is a doubly linked list: every object is chained through struct list_head { struct list_head *next; struct list_head *prev; }. When the list head is initialized, both next and prev point to the head itself, so next can never be NULL. In list_for_each_entry, &pos->member therefore always yields a valid address, and the loop can simply use &pos->member != (head) as its condition without risking a segfault.
  2. hlist is different: an hlist_node's next field and the hlist_head's first field may be NULL.

Now suppose hlist were implemented the same way as list; what would the problem be?

#define hlist_for_each_entry(pos, head, member)				\
	for (pos = hlist_entry((head)->first, typeof(*pos), member);	\
	     &pos->member != ???; 	\
	     pos = hlist_entry(pos->member.next, typeof(*pos), member))

The loop condition is genuinely hard to write here: when first is NULL, or some pos->member.next is NULL, the computed pos is an invalid pointer (hlist_entry subtracts the member offset from NULL), so we would first have to test whether first/next is NULL before dereferencing anything. Because of that extra test, a single pos no longer suffices, so an extra parameter tpos was introduced and pos became a struct hlist_node pointer.

But hlist_for_each_entry can still be improved. Today I came up with a way to drop tpos: a single pos is enough, giving the same interface as the list version.

Looking closely at the hlist implementation, pos mainly exists to drive the loop condition, and that job can be handed to the entry pointer through a cast. tpos and pos are both pointers, and at the machine level all pointers look the same; the pointed-to type only matters when the compiler computes member offsets. My implementation:

//
// hlist_for_each_entry	- iterate over list of given type
// @param pos	the type * to use as a loop cursor.
// @param head	the head for your list.
// @param member        the name of the hlist_node within the struct.

#define hlist_for_each_entry(pos, head, member)  \
	for (pos = (typeof(pos)) (head)->first; pos && \
		     ({pos = hlist_entry((struct hlist_node *)pos, typeof(*pos), member); 1;}); \
	     pos = (typeof(pos))pos->member.next)

I temporarily borrow pos to hold the hlist_node pointer, test it against NULL, and only then convert it back to its real type.

The upstream kernel already has this, but done differently and a bit more elegantly, using a ({ ... }) statement expression (a GCC extension) with a local variable:

#define hlist_entry_safe(ptr, type, member) \
	({ typeof(ptr) ____ptr = (ptr); \
	   ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
	})

/**
 * hlist_for_each_entry	- iterate over list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 */
#define hlist_for_each_entry(pos, head, member)				\
	for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
	     pos;							\
	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
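
As a usage sketch (my own example, relying on GCC extensions such as typeof and statement expressions; the kernel's real definitions live in <linux/list.h>), the single-cursor form is used like this:

#include <stdio.h>
#include <stddef.h>

/* Minimal userspace re-creation of the hlist types and the new-style
 * hlist_for_each_entry, just to show how the single-cursor form reads. */
struct hlist_head { struct hlist_node *first; };
struct hlist_node { struct hlist_node *next, **pprev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define hlist_entry(ptr, type, member) container_of(ptr, type, member)

#define hlist_entry_safe(ptr, type, member) \
	({ typeof(ptr) ____ptr = (ptr); \
	   ____ptr ? hlist_entry(____ptr, type, member) : NULL; })

#define hlist_for_each_entry(pos, head, member) \
	for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); \
	     pos; \
	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

static void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
	n->next = h->first;
	if (h->first)
		h->first->pprev = &n->next;
	h->first = n;
	n->pprev = &h->first;
}

struct item {
	int val;
	struct hlist_node node;
};

int main(void)
{
	struct hlist_head bucket = { NULL };
	struct item a = { 1 }, b = { 2 }, c = { 3 };
	struct item *pos;

	hlist_add_head(&a.node, &bucket);
	hlist_add_head(&b.node, &bucket);
	hlist_add_head(&c.node, &bucket);

	/* A single cursor of the entry type, just like list_for_each_entry. */
	hlist_for_each_entry(pos, &bucket, node)
		printf("val = %d\n", pos->val);   /* prints 3, 2, 1 */
	return 0;
}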

The Design of a High-Performance File Server

July 20, 2013
storage

An old paper, on the Bullet file server. Original text: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.38.5481&rep=rep1&type=pdf

Key points:

  1. to do away with the traditional file server's block model. In fact, we have chosen the extreme, which is to maintain files contiguously throughout the system. That is, files are contiguously stored on disk, contiguously cached in RAM, and kept contiguously in processes’ memories. This dictates the choice for whole file transfer.
  2. Another design choice, which is closely linked to keeping files contiguous, is to make all files immutable. That is, the only operations on files are creation, retrieval, and deletion; there are no update-in-place operations. Instead, if we want to update a data structure that is stored in a file, we do so by creating a new file holding the updated data structure.

Today I took another look at the distributed storage systems I've come across over the years.

The design of a distributed key-value store can be classified by the data it holds, most usefully by object size, since size usually implies a particular scenario and application:

  1. Record-style data such as user profiles, logs and messages: sizes are generally in the 100 bytes to 20 KB range, and the vast majority stay under 1 KB.
  2. Small-file data such as crawled web pages and images: generally 4 KB to 1 MB, mostly under 100 KB.
  3. Large-file data such as archives, software packages and high-resolution images: generally 1 MB to 100 MB and beyond.

The storage engine and the distributed layer are usually decoupled, so here I only consider how to design the storage engine.

For type-1 data, the characteristics are:

  1. The volume of data is huge, so the system must sustain a very high read/write load
  2. Objects are small and the IO rate is high

A design along the lines of levelDB/nessDB fits here. The key questions are how to reduce random IO and how to raise the cache hit ratio, so the cache design is the focus, and it is considerably more complex than in the other two categories.

For type-3 data, the characteristic is:

  1. Objects are large and network bandwidth is the constraint: how do we raise transfer speed and lower network latency?

The data has to be split into chunks, with a metadata service that maps each key to its list of chunks. Chunking exists to make full use of the available network bandwidth and raise throughput; GFS is the classic example.

Type-2 data is the special case: neither the volume nor the object size is extreme. Systems such as fastDFS/weed-fs/haystack fit: aggregate many small files into large volume files plus an index, cut down the number of file handles, lean on mmap and the OS page cache for caching, and keep the IO cost per access as close to O(1) as possible. A toy sketch of that idea follows.
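
A minimal userspace sketch of the aggregation idea (my own illustration, not the actual fastDFS/haystack on-disk format): append each small object to one big volume file and remember its (offset, size) in an in-memory index, so every read is a single pread at a known offset.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>

#define MAX_OBJS 1024

struct needle { off_t off; size_t len; };      /* in-memory index entry */
static struct needle index_tab[MAX_OBJS];      /* keyed by a small integer id */

/* Append one object to the volume file and record where it landed. */
static int put(int vol_fd, int id, const void *buf, size_t len)
{
    off_t off = lseek(vol_fd, 0, SEEK_END);
    if (write(vol_fd, buf, len) != (ssize_t)len)
        return -1;
    index_tab[id].off = off;
    index_tab[id].len = len;
    return 0;
}

/* One positioned read per lookup: O(1) IO, no per-object file handle. */
static ssize_t get(int vol_fd, int id, void *buf, size_t cap)
{
    struct needle *n = &index_tab[id];
    size_t len = n->len < cap ? n->len : cap;
    return pread(vol_fd, buf, len, n->off);
}

int main(void)
{
    const char *msg = "hello small file";
    char out[64] = {0};
    int fd = open("volume.dat", O_RDWR | O_CREAT | O_TRUNC, 0644);

    put(fd, 7, msg, strlen(msg));
    get(fd, 7, out, sizeof(out));
    printf("read back: %s\n", out);
    close(fd);
    return 0;
}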

NAPI: the new device driver packet processing framework

July 19, 2013
tcp/ip internals

Before NAPI existed, packet reception went roughly like this:

  1. A packet arrives at the NIC and raises a hardware interrupt; the interrupt handler processes the packet and calls netif_rx() to place it on a per-CPU queue
  2. A softirq is then raised, and in net_rx_action() the kernel moves the packets from the CPU queue up to the network layer

The problems NAPI sets out to solve are:

  • Interrupt mitigation: High-speed networking can create thousands of interrupts per second, all of which tell the system something it already knew: it has lots of packets to process. NAPI allows drivers to run with (some) interrupts disabled during times of high traffic, with a corresponding decrease in system load.
  • Packet throttling :When the system is overwhelmed and must drop packets, it's better if those packets are disposed of before much effort goes into processing them. NAPI-compliant drivers can often cause packets to be dropped in the network adaptor itself, before the kernel sees them at all.

So one goal is to raise as few softirqs as possible, and the other is to drop packets early. With NAPI, the NIC driver keeps its own receive queue; when packets arrive, the device attaches itself to the CPU's polling list poll_list and raises NET_RX_SOFTIRQ. The core data structures are softnet_data and napi_struct. napi_struct is the interface between the kernel and the driver, and each NAPI driver must provide its own napi_struct (for drivers that do not yet receive packets the NAPI way, the kernel has a default mechanism, described below).

(The figure here is taken from the article "NAPI 技术在 Linux 网络驱动上的应用和完善".)

/*
 * Incoming packets are placed on per-cpu queues
 */
struct softnet_data {
	struct Qdisc		*output_queue;
	struct Qdisc		**output_queue_tailp;
	// devices waiting to be polled on this CPU
	struct list_head	poll_list;
	struct sk_buff		*completion_queue;
	struct sk_buff_head	process_queue;

	/* stats */
	unsigned int		processed;
	unsigned int		time_squeeze;
	unsigned int		cpu_collision;
	unsigned int		received_rps;

#ifdef CONFIG_RPS
	struct softnet_data	*rps_ipi_list;

	/* Elements below can be accessed between CPUs for RPS */
	struct call_single_data	csd ____cacheline_aligned_in_smp;
	struct softnet_data	*rps_ipi_next;
	unsigned int		cpu;
	unsigned int		input_queue_head;
	unsigned int		input_queue_tail;
#endif
	unsigned int		dropped;
	struct sk_buff_head	input_pkt_queue;
	struct napi_struct	backlog;
};

/*
 * Structure for NAPI scheduling similar to tasklet but with weighting
 */
struct napi_struct {
	/* The poll_list must only be managed by the entity which
	 * changes the state of the NAPI_STATE_SCHED bit.  This means
	 * whoever atomically sets that bit can add this napi_struct
	 * to the per-cpu poll_list, and whoever clears that bit
	 * can remove from the list right before clearing the bit.
	 */
	struct list_head	poll_list;

	unsigned long		state;
	int			weight;
	unsigned int		gro_count;
	// the driver-supplied poll method that the kernel calls back into
	int			(*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
	spinlock_t		poll_lock;
	int			poll_owner;
#endif
	struct net_device	*dev;
	struct sk_buff		*gro_list;
	struct sk_buff		*skb;
	struct list_head	dev_list;
};
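
As a rough sketch of what a driver's poll implementation looks like (illustrative only, not from a real driver; my_dev, my_hw_fetch_skb and my_enable_rx_irq are made-up stand-ins for driver specifics such as e1000e's own poll routine):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct my_dev {
	struct napi_struct napi;
	/* ... hardware RX ring state ... */
};

static struct sk_buff *my_hw_fetch_skb(struct my_dev *priv); /* hypothetical */
static void my_enable_rx_irq(struct my_dev *priv);           /* hypothetical */

/* Called from net_rx_action() with a budget: pull at most 'budget'
 * packets out of the RX ring and hand them to the stack.  When the
 * ring is drained, leave the poll list and re-enable the RX interrupt. */
static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_dev *priv = container_of(napi, struct my_dev, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = my_hw_fetch_skb(priv);
		if (!skb)
			break;
		napi_gro_receive(napi, skb);   /* push the packet up the stack */
		work_done++;
	}

	if (work_done < budget) {
		napi_complete(napi);           /* done: remove from the poll list */
		my_enable_rx_irq(priv);
	}
	return work_done;
}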

After entering the NET_RX_SOFTIRQ handler, net_rx_action(), the kernel walks the current CPU's list of devices waiting to be polled and invokes each one's poll method, whose main job is to move packets from the driver's queue up to the network layer. For a concrete example, see the handling in drivers/net/ethernet/intel/e1000e/netdev.c.

static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhuasted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* flush too old packets
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}

To stay compatible with the new NAPI path, for older drivers that do not yet handle packets the NAPI way, the kernel adds a simulated napi_struct named backlog to softnet_data. netif_rx() checks whether input_pkt_queue in the current CPU's softnet_data is empty: if it is, the backlog is not yet on the CPU's poll list and has to be scheduled first; in either case the packet is appended to the tail of input_pkt_queue and NET_RX_SOFTIRQ is raised.

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
		if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}

/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}

Structure alignment

July 16, 2013
Compiler and vm

Yesterday I interviewed at Xunlei; one of the written questions was about structure member alignment and the value of sizeof. I wasn't entirely sure of my answer afterwards, so I went back to the lcc compiler source and read how structure fields are handled.
A few of the basic data structures involved:

  • Metrics holds a type's size, its alignment, and the outofline flag (I'm not yet sure what outofline is for)
  • A structure type is represented through u.sym inside struct type, which in turn holds a linked list of field records

typedef struct metrics {
    unsigned char size, align, outofline;
} Metrics;

struct type {
    int op;
    Type type;
    int align;
    int size;
    union {
	Symbol sym;
	struct {
	    unsigned oldstyle:1;
	    Type *proto;
	} f;
    } u;
    Xtype x;
};

struct field {
    char *name;
    Type type;
    int offset;
    short bitsize;
    short lsb;
    Field link;
};

On x86, the Metrics values of the basic types (size, align, outofline respectively) are:

Interface x86linuxIR = {
        1, 1, 0,  /* char */
        2, 2, 0,  /* short */
        4, 4, 0,  /* int */
        4, 4, 0,  /* long */
        4, 4, 0,  /* long long */
        4, 4, 1,  /* float */
        8, 4, 1,  /* double */
        8, 4, 1,  /* long double */
        4, 4, 0,  /* T * */
        0, 1, 0,  /* struct */
...
};
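
A quick worked example under these metrics (my own, not from the lcc source; it assumes a 32-bit x86 Linux target such as gcc -m32, where double is 4-byte aligned, so the numbers change on x86-64):

#include <stdio.h>
#include <stddef.h>

/* Layout under the x86linuxIR metrics above:
 *   c at offset 0    (char,   align 1)
 *   i at offset 4    (int,    align 4; 3 bytes of padding after c)
 *   s at offset 8    (short,  align 2)
 *   d at offset 12   (double, align 4; 2 bytes of padding after s)
 *   sizeof == 20     (end at 20, already a multiple of the struct align 4)
 */
struct example {
    char   c;
    int    i;
    short  s;
    double d;
};

int main(void)
{
    printf("c=%zu i=%zu s=%zu d=%zu sizeof=%zu\n",
           offsetof(struct example, c), offsetof(struct example, i),
           offsetof(struct example, s), offsetof(struct example, d),
           sizeof(struct example));
    return 0;
}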

Now let's look at how the compiler computes each field's offset and the structure size. The relevant code is the fields() function in src/decl.c; the call stack looks like this:

#0  fields (ty=0x6b39d0) at src/decl.c:709
#1  0x000000000040413d in structdcl (op=9) at src/decl.c:600
#2  0x000000000040263b in specifier (sclass=0x7fffffffd7f4) at src/decl.c:119
#3  0x00000000004029dc in decl (dcl=0x402de9 ) at src/decl.c:184
#4  0x00000000004023fb in program () at src/decl.c:40
#5  0x00000000004014a5 in main (argc=4, argv=0x7fffffffd998) at src/main.c:85

The code of the fields() function:

static void fields(Type ty)
{
  // lexing and parsing of the member declarations
  {
    int n = 0;
    while (istypename(t, tsym)) {
      static char stop[] = { IF, CHAR, '}', 0 };
      Type ty1 = specifier(NULL);
      for (;;) {
        Field p;
        char *id = NULL;
        Type fty = dclr(ty1, &id, NULL, 0);
        p = newfield(id, ty, fty);
        if (Aflag >= 1 && !hasproto(p->type))
          warning("missing prototype\n");
        if (t == ':') {
          if (unqual(p->type) != inttype
            &&  unqual(p->type) != unsignedtype) {
            error("`%t' is an illegal bit-field type\n",
                p->type);
            p->type = inttype;
          }
          t = gettok();
          p->bitsize = intexpr(0, 0);
          if (p->bitsize > 8*inttype->size || p->bitsize < 0) { 
            error("`%d' is an illegal bit-field size\n", p->bitsize);
            p->bitsize = 8*inttype->size;
          } else if (p->bitsize == 0 && id) {
            warning("extraneous 0-width bit field `%t %s' ignored\n", 
                    p->type, id);
            p->name = stringd(genlabel(1));
          }
          p->lsb = 1;
        } else {
          if (id == NULL)
            error("field name missing\n");
          else if (isfunc(p->type))
            error("`%t' is an illegal field type\n", p->type);
          else if (p->type->size == 0)
            error("undefined size for field `%t %s'\n",
                p->type, id);
        }
        if (isconst(p->type))
          ty->u.sym->u.s.cfields = 1;
        if (isvolatile(p->type))
          ty->u.sym->u.s.vfields = 1;
        n++;
        if (Aflag >= 2 && n == 128)
          warning("more than 127 fields in `%t'\n", ty);
        if (t != ',')
          break;
        t = gettok();
      }
      test(';', stop);
    }
  }

  // computing each field's offset and the structure size
  {
    int bits = 0, off = 0, overflow = 0;
    Field p, *q = &ty->u.sym->u.s.flist;
    ty->align = IR->structmetric.align;
    for (p = *q; p; p = p->link) {
      int a = p->type->align ? p->type->align : 1;
      if (p->lsb)
        a = unsignedtype->align;
      if (ty->op == UNION)
        off = bits = 0;
      else if (p->bitsize == 0 || bits == 0
           || bits - 1 + p->bitsize > 8*unsignedtype->size) {
        off = add(off, bits2bytes(bits-1));
        bits = 0;
        chkoverflow(off, a - 1);
        off = roundup(off, a);
      }
      if (a > ty->align)
        ty->align = a;
      p->offset = off;

      if (p->lsb) {
        if (bits == 0)
          bits = 1;
        if (IR->little_endian)
          p->lsb = bits;
        else
          p->lsb = 8*unsignedtype->size - bits + 1
            - p->bitsize + 1;
        bits += p->bitsize;
      } else
        off = add(off, p->type->size);
      if (off + bits2bytes(bits-1) > ty->size)
        ty->size = off + bits2bytes(bits-1);
      if (p->name == NULL
        || !('1' <= *p->name && *p->name <= '9')) { *q = p; q = &p->link;
      }
    }
    *q = NULL;
    chkoverflow(ty->size, ty->align - 1);
    ty->size = roundup(ty->size, ty->align);
    if (overflow) {
      error("size of `%t' exceeds %d bytes\n", ty, inttype->u.sym->u.limits.max.i);
      ty->size = inttype->u.sym->u.limits.max.i&(~(ty->align - 1));
    }
  }
}

The code is clear enough; I won't spell it out any further.

some developer tools

July 12, 2013
Compiler and vm

Performance tools: ifstat(1), iftop(8), iostat(1), mpstat(1), netstat(1), nfsstat(1), nstat, vmstat(1), xosview(1)
Debugging tools: htop(1), lslk(1), lsof(8), top(1)
Process tracing: ltrace(1), pmap(1), ps(1), pstack(1), strace(1)
Binary debugging: ldd(1), file(1), nm(1), objdump(1), readelf(1)
Memory usage tools: free(1), memusage, memusagestat, slabtop(1)
Accounting tools: dump-acct, dump-utmp, sa(8)
Hardware debugging tools: dmidecode, ifinfo(1), lsdev(1), lshal(1), lshw(1), lsmod(8), lspci(8), lsusb(8), smartctl(8), x86info(1)
Application debugging: mailstats(8), qshape(1)
Xorg related tools: xdpyinfo(1), xrestop(1)
Other useful info: collectl(1), proc(5), procinfo(8)

李鸿章遗折 (Li Hongzhang's last memorial)

June 14, 2013
Life

Unreliable Guide To Locking

June 10, 2013
Kernel

- Rusty Russell, Unreliable Guide To Locking

In short, there are two key questions:

  1. When data is shared across contexts (process context vs. softirq vs. hardirq), the lock must also keep the other context from preempting the holder
  2. Whether the code may sleep while the lock is held

A minimal sketch of how these two questions pick the locking primitive follows.
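
A kernel-style sketch (my own illustration, assuming a hypothetical driver with data shared between an interrupt handler and process context; not a complete module):

#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>

/* Rule 1: data touched from interrupt context needs a lock that also shuts
 * that context out (spin_lock_irqsave in process context), so the holder
 * cannot be preempted by the irq path on the same CPU.
 * Rule 2: code that may sleep while holding the lock must use a mutex,
 * never a spinlock. */
static DEFINE_SPINLOCK(stats_lock);     /* shared with hardirq context */
static DEFINE_MUTEX(cfg_lock);          /* process context only, may sleep */

static unsigned long rx_count;
static int cfg_value;

static irqreturn_t demo_irq(int irq, void *dev)
{
	spin_lock(&stats_lock);          /* already in hardirq, no irqsave needed */
	rx_count++;
	spin_unlock(&stats_lock);
	return IRQ_HANDLED;
}

static unsigned long demo_read_stats(void)
{
	unsigned long flags, v;

	spin_lock_irqsave(&stats_lock, flags);  /* process context: block the irq */
	v = rx_count;
	spin_unlock_irqrestore(&stats_lock, flags);
	return v;
}

static void demo_update_cfg(int v)
{
	mutex_lock(&cfg_lock);           /* may sleep here, e.g. GFP_KERNEL allocs */
	cfg_value = v;
	mutex_unlock(&cfg_lock);
}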

http://blog.pipul.org/2012/11/锁/