扩展net_cls子系统实现端口隔离

Time: 二月 28, 2015
Category: cgroup

端口隔离的目的是要严格隔离不同容器所使用的端口段。我们通过扩展内核net_cls模块来达到此目的

  1. bind端口隔离:限制bind使用的端口范围,通过net_cls.bind_port_range配置文件设置
  2. connect端口隔离:限制进程connect时可使用的本地(源)端口范围,通过net_cls.connect_port_range配置文件设置

其中bind_port_range和connect_port_range配置文件的接口定义如下:

/*
 * Control files exposed by the extended net_cls subsystem.  Each entry
 * maps a cgroupfs file to its read/write handlers; .private tells the
 * shared handlers which range (bind vs. connect) is being accessed.
 *
 * Fix: the array initializer was missing its terminating ';'.
 */
static struct cftype ss_files[] = {
  {
    .name = "bind_port_range",          /* ports a task may bind() to */
    .read_seq_string = port_range_read,
    .write_string = port_range_write,
    .max_write_len = 20U,               /* "low high" comfortably fits */
    .private = FILE_BIND,
  },
  {
    .name = "connect_port_range",       /* local ports usable at connect() */
    .read_seq_string = port_range_read,
    .write_string = port_range_write,
    .max_write_len = 20U,
    .private = FILE_CONNECT,
  },
  /* ... remaining entries, including the terminating empty slot ... */
};

通过port_range_read/port_range_write函数来实现配置参数的读写。我们重点关注cgrp_local_port_range()函数,这个函数取得当前进程所在的cgroup能使用的端口范围段,区分于是否是bind/connect:

/**
 * cgrp_local_port_range  -  get available local port range for the
 * current task's cgroup
 *
 * @low:     out: lowest usable port
 * @high:    out: highest usable port
 * @connect: non-zero selects the connect range, zero the bind range
 *
 * Returns 0.
 *
 * Fix: the original dropped the RCU read lock *before* dereferencing
 * @pr.  task_cls_state() returns RCU-protected per-cgroup state, so the
 * backing object could be freed between rcu_read_unlock() and the
 * loads, yielding a use-after-free.  Copy the values out while still
 * inside the read-side critical section.
 */
int cgrp_local_port_range(int *low, int *high, int connect)
{
  struct port_range_st *pr;

  rcu_read_lock();
  pr = &task_cls_state(current)->port_range;
  if (connect) {
    *low = pr->conn_low_port;
    *high = pr->conn_high_port;
  } else {
    *low = pr->bind_low_port;
    *high = pr->bind_high_port;
  }
  rcu_read_unlock();

  return 0;
}

1. bind端口隔离

bind端口隔离时,cgrp_local_port_range()函数主要被 inet_csk_get_port()(bind/listen 选取本地端口的路径,以 EXPORT_SYMBOL_GPL 导出)使用

/net/ipv4/inet_connection_sock.c

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 *
 * net_cls extension: the candidate range [low, high] is additionally
 * narrowed to the port segment configured for the caller's cgroup via
 * cgrp_limit_local_port_range().
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
  struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
  struct inet_bind_hashbucket *head;
  struct hlist_node *node;
  struct inet_bind_bucket *tb;
  int ret, attempts = 5;
  struct net *net = sock_net(sk);
  int smallest_size = -1, smallest_rover;
  int low = 1, high = 65535;

  local_bh_disable();
  if (!snum) {
    /* Caller asked for any port: search the hash for a free one. */
    int remaining, rover;

again:
    inet_get_local_port_range(&low, &high);
    /* Narrow [low, high] to this cgroup's bind_port_range.
     * NOTE(review): argument 0 selects the *bind* range in
     * cgrp_local_port_range(); the previous comment said
     * connect_port_range — labels appear swapped, confirm against
     * cgrp_limit_local_port_range().  A negative return means the
     * cgroup range does not intersect the system range: fail. */
    if (cgrp_limit_local_port_range(&low, &high, 0) < 0) {
      ret = 1;
      goto fail;
    }
    remaining = (high - low) + 1;
    /* Start scanning from a random offset within the range. */
    smallest_rover = rover = net_random() % remaining + low;

    smallest_size = -1;
    do {
      if (inet_is_reserved_local_port(rover))
        goto next_nolock;
      head = &hashinfo->bhash[inet_bhashfn(net, rover,
          hashinfo->bhash_size)];
      spin_lock(&head->lock);
      inet_bind_bucket_for_each(tb, node, &head->chain)
        if (ib_net(tb) == net && tb->port == rover) {
          /* Port already has a bucket; track the reusable bucket
           * with the fewest owners as a fallback candidate. */
          if (tb->fastreuse > 0 &&
              sk->sk_reuse &&
              sk->sk_state != TCP_LISTEN &&
              (tb->num_owners < smallest_size || smallest_size == -1)) {
            smallest_size = tb->num_owners;
            smallest_rover = rover;
            if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
              snum = smallest_rover;
              goto tb_found;
            }
          }
          if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
            snum = rover;
            goto tb_found;
          }
          goto next;
        }
      /* No bucket for this port: it is free; exit with lock held. */
      break;
    next:
      spin_unlock(&head->lock);
    next_nolock:
      if (++rover > high)
        rover = low;
    } while (--remaining > 0);

    /* Exhausted local port range during search?  It is not
     * possible for us to be holding one of the bind hash
     * locks if this test triggers, because if 'remaining'
     * drops to zero, we broke out of the do/while loop at
     * the top level, not from the 'break;' statement.
     */
    ret = 1;
    if (remaining <= 0) {
      if (smallest_size != -1) {
        snum = smallest_rover;
        goto have_snum;
      }
      goto fail;
    }
    /* OK, here is the one we will use.  HEAD is
     * non-NULL and we hold it's mutex.
     */
    snum = rover;
  } else {
have_snum:
    /* Caller requested an explicit port: it must lie inside the
     * cgroup's bind range.  NOTE(review): previous comment said
     * connect_port_range (see above); also the return value is
     * ignored here, unlike the !snum path — confirm intended. */
    cgrp_limit_local_port_range(&low, &high, 0);
    if (snum < low || snum > high) {
      ret = 1;
      goto fail;
    }

    head = &hashinfo->bhash[inet_bhashfn(net, snum,
        hashinfo->bhash_size)];
    spin_lock(&head->lock);
    inet_bind_bucket_for_each(tb, node, &head->chain)
      if (ib_net(tb) == net && tb->port == snum)
        goto tb_found;
  }
  tb = NULL;
  goto tb_not_found;
tb_found:
  if (!hlist_empty(&tb->owners)) {
    if (tb->fastreuse > 0 &&
        sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
        smallest_size == -1) {
      goto success;
    } else {
      ret = 1;
      if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
        /* Conflict on the fallback candidate: retry the whole
         * scan a bounded number of times. */
        if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
            smallest_size != -1 && --attempts >= 0) {
          spin_unlock(&head->lock);
          goto again;
        }
        goto fail_unlock;
      }
    }
  }
tb_not_found:
  ret = 1;
  if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
          net, head, snum)) == NULL)
    goto fail_unlock;
  if (hlist_empty(&tb->owners)) {
    if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
      tb->fastreuse = 1;
    else
      tb->fastreuse = 0;
  } else if (tb->fastreuse &&
       (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
    tb->fastreuse = 0;
success:
  if (!inet_csk(sk)->icsk_bind_hash)
    inet_bind_hash(sk, tb, snum);
  WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
  ret = 0;

fail_unlock:
  spin_unlock(&head->lock);
fail:
  local_bh_enable();
  return ret;
} 

cgrp_limit_local_port_range() 是 cgrp_local_port_range() 的一个包裹函数:先由 inet_get_local_port_range() 取得系统的本地端口范围,再将其收窄到当前 cgroup 配置的端口段内。

2. connect端口隔离

同理,connect时本地临时端口的限制是在 /net/ipv4/inet_connection_sock.c 中的 __inet_hash_connect() 函数(connect路径选择临时端口处)实现的,只是增加了很小一段代码

/* Bind the socket to a local ephemeral port for an outgoing connection
 * and hash it into the established table.
 *
 * net_cls extension: when no port is pre-bound (snum == 0), the
 * ephemeral range is narrowed to the caller cgroup's configured
 * segment via cgrp_limit_local_port_range().
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
    struct sock *sk, u32 port_offset,
    int (*check_established)(struct inet_timewait_death_row *,
      struct sock *, __u16, struct inet_timewait_sock **),
    void (*hash)(struct sock *sk))
{
  struct inet_hashinfo *hinfo = death_row->hashinfo;
  const unsigned short snum = inet_sk(sk)->num;
  struct inet_bind_hashbucket *head;
  struct inet_bind_bucket *tb;
  int ret;
  struct net *net = sock_net(sk);

  if (!snum) {
    int i, remaining, low, high, port;
    static u32 hint;  /* rotates the start point across calls */
    u32 offset = hint + port_offset;
    struct hlist_node *node;
    struct inet_timewait_sock *tw = NULL;

    inet_get_local_port_range(&low, &high);
    /* Narrow [low, high] to this cgroup's connect_port_range.
     * NOTE(review): argument 1 selects the *connect* range in
     * cgrp_local_port_range(); the previous comment said
     * bind_port_range — labels appear swapped, confirm against
     * cgrp_limit_local_port_range().  Negative return: no usable
     * intersection with the system range. */
    if (cgrp_limit_local_port_range(&low, &high, 1) < 0)
      return -EADDRNOTAVAIL;
    remaining = (high - low) + 1;

    local_bh_disable();
    for (i = 1; i <= remaining; i++) {
      port = low + (i + offset) % remaining;
      if (inet_is_reserved_local_port(port))
        continue;
      head = &hinfo->bhash[inet_bhashfn(net, port,
          hinfo->bhash_size)];
      spin_lock(&head->lock);

      /* Does not bother with rcv_saddr checks,
       * because the established check is already
       * unique enough.
       */
      inet_bind_bucket_for_each(tb, node, &head->chain) {
        if (ib_net(tb) == net && tb->port == port) {
          /* fastreuse >= 0 means bound sockets own it: skip. */
          if (tb->fastreuse >= 0)
            goto next_port;
          WARN_ON(hlist_empty(&tb->owners));
          if (!check_established(death_row, sk,
                port, &tw))
            goto ok;
          goto next_port;
        }
      }

      /* Port has no bucket yet: claim it for connect use. */
      tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
          net, head, port);
      if (!tb) {
        spin_unlock(&head->lock);
        break;
      }
      tb->fastreuse = -1;
      goto ok;

    next_port:
      spin_unlock(&head->lock);
    }
    local_bh_enable();

    return -EADDRNOTAVAIL;

ok:
    hint += i;

    /* Head lock still held and bh's disabled */
    inet_bind_hash(sk, tb, port);
    if (sk_unhashed(sk)) {
      inet_sk(sk)->sport = htons(port);
      hash(sk);
    }
    spin_unlock(&head->lock);

    if (tw) {
      /* Reusing a TIME_WAIT port: retire the old timewait sock. */
      inet_twsk_deschedule(tw, death_row);
      inet_twsk_put(tw);
    }

    ret = 0;
    goto out;
  }

  /* Socket was already bound to an explicit port. */
  head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
  tb  = inet_csk(sk)->icsk_bind_hash;
  spin_lock_bh(&head->lock);
  if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
    hash(sk);
    spin_unlock_bh(&head->lock);
    return 0;
  } else {
    spin_unlock(&head->lock);
    /* No definite answer... Walk to established hash table */
    ret = check_established(death_row, sk, snum, NULL);
out:
    local_bh_enable();
    return ret;
  }
}

Leave a Comment