File Descriptor Table

// include/linux/sched.h

struct task_struct {
  // ...
  /* open file information */
        struct files_struct *files;
  // ...
};

// include/linux/fdtable.h

/*
 * Descriptor table. struct files_struct both embeds one of these (fdtab)
 * and holds an RCU-annotated pointer to one (fdt).
 */
struct fdtable {
        unsigned int max_fds;        /* presumably the capacity of fd[] and of the bitmaps below -- confirm */
        struct file __rcu **fd;      /* current fd array */
        unsigned long *close_on_exec;  /* bitmap words; files_struct has a matching close_on_exec_init[] */
        unsigned long *open_fds;       /* bitmap words; files_struct has a matching open_fds_init[] */
        unsigned long *full_fds_bits;  /* bitmap words; files_struct has a matching full_fds_bits_init[] */
        struct rcu_head rcu;           /* presumably used to defer freeing via RCU -- confirm */
};

/*
 * Open file table structure
 *
 * This is what task_struct.files points to. It carries an embedded fdtable
 * (fdtab) plus small inline arrays (the *_init words and fd_array) whose
 * names mirror the pointer members of struct fdtable.
 */
struct files_struct {
  /*
   * read mostly part
   */
        atomic_t count;                   /* presumably a reference count on this table -- confirm */
        bool resize_in_progress;
        wait_queue_head_t resize_wait;    /* presumably waited on while resize_in_progress is set -- confirm */

        struct fdtable __rcu *fdt;        /* RCU-annotated pointer to a struct fdtable */
        struct fdtable fdtab;             /* fdtable instance embedded in this structure */
  /*
   * written part on a separate cache line in SMP
   */
        spinlock_t file_lock ____cacheline_aligned_in_smp;  /* the alignment attribute starts the "written part" */
        unsigned int next_fd;
        /*
         * One unsigned long each: these are fixed one-word arrays, not
         * trailing variable-length storage.
         * NOTE(review): presumably the initial backing store for the
         * fdtable bitmaps of the same names -- confirm.
         */
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
        struct file __rcu * fd_array[NR_OPEN_DEFAULT];  /* inline array of NR_OPEN_DEFAULT file pointers */
};

// include/linux/fs.h

/*
 * Excerpt of struct file. In the socket(2) call trace in these notes,
 * sock_alloc_file() obtains one via alloc_file(..., &socket_file_ops) and
 * then sets file->private_data = sock and sock->file = file.
 */
struct file {
        /* Both members share storage (union); presumably only one is in use at a time -- confirm. */
        union {
                struct llist_node       fu_llist;
                struct rcu_head         fu_rcuhead;
        } f_u;
        struct path             f_path;
        struct inode            *f_inode;       /* cached value */
        const struct file_operations    *f_op;  /* &socket_file_ops for sockets, per the socket(2) trace */

        // ...

        atomic_long_t           f_count;        /* presumably the reference count -- confirm */

        // ...

        /* needed for tty driver, and maybe others */
        /* For sockets this holds the struct socket pointer (file->private_data = sock in the trace). */
        void                    *private_data;

#ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
        struct list_head        f_ep_links;
        struct list_head        f_tfile_llink;
#endif
};

struct socket{}

// include/linux/net.h

/*
 * Wait-queue state referenced from struct socket through its wq pointer.
 * In the socket(2) trace in these notes it is kmalloc'ed separately inside
 * sock_alloc_inode() and stored in ei->socket.wq.
 * The whole structure is aligned with ____cacheline_aligned_in_smp.
 */
struct socket_wq {
        /* Note: wait MUST be first field of socket_wq */
        wait_queue_head_t       wait;
        struct fasync_struct    *fasync_list;
        unsigned long           flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */
        struct rcu_head         rcu;   /* presumably for RCU-deferred freeing (socket.wq is __rcu) -- confirm */
} ____cacheline_aligned_in_smp;

/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @type: socket type (%SOCK_STREAM, etc)
 *  @flags: socket flags (%SOCK_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wq: wait queue for several uses
 */
struct socket {
        socket_state            state;  /* set to SS_UNCONNECTED in sock_alloc_inode() per the socket(2) trace */

        kmemcheck_bitfield_begin(type);
        short                   type;
        kmemcheck_bitfield_end(type);

        unsigned long           flags;

        struct socket_wq __rcu  *wq;    /* assigned in sock_alloc_inode() (ei->socket.wq = wq) */

        struct file             *file;  /* assigned in sock_alloc_file() (sock->file = file) */
        struct sock             *sk;    /* assigned during inet_create() (sock->sk = sk); sk->sk_socket points back */
        const struct proto_ops  *ops;   /* presumably taken from the matching inet_protosw.ops -- confirm */
};

// include/net/sock.h

/*
 * A struct socket and its struct inode laid out in one object. In the
 * socket(2) trace, sock_alloc_inode() allocates this from sock_inode_cachep,
 * and sock_alloc() later obtains the socket from the inode via SOCKET_I(inode).
 */
struct socket_alloc {
        struct socket socket;
        struct inode vfs_inode;
};

// include/linux/fs.h
/* Excerpt of struct inode; the trailing notes on members describe the socket (sockfs) case. */
struct inode {
        umode_t                 i_mode;     // S_IFSOCK | S_IRWXUGO;
        unsigned short          i_opflags;
        kuid_t                  i_uid;
        kgid_t                  i_gid;
        unsigned int            i_flags;
// ...
        const struct inode_operations   *i_op;  // sock_alloc() sets this to &sockfs_inode_ops (see socket(2) trace)
        struct super_block      *i_sb;
// ...
        unsigned long           i_ino;      // sockfs_dname: "socket:[%lu]"
// ...
        /* Anonymous union: i_dentry and i_rcu occupy the same storage. */
        union {
                struct hlist_head       i_dentry;
                struct rcu_head         i_rcu;
        };
// ...
};

socket(2)

int sys_socket(int family, int type, int protocol)
  -> struct socket* sock_create(family, type, protocol)
    -> __sock_create(family, type, protocol)
      -> struct socket* sock = sock_alloc()
        -> inode = new_inode_pseudo(super_block of sockfs)      // fs/inode.c
          -> alloc_inode
            -> sock_alloc_inode (sb->s_op->alloc_inode)
              -> socket_alloc *ei = kmem_cache_alloc(sock_inode_cachep)  // 306B
              -> socket_wq *wq = kmalloc(sizeof(*wq))  // 24B
              ei->socket.wq = wq;
              ei->socket.state = SS_UNCONNECTED;
            -> inode_init_always
        sock = SOCKET_I(inode);
        inode->i_op = &sockfs_inode_ops;  // .listxattr = sockfs_listxattr,
      -> net_families[family]->create(sock, protocol)
        -> inet_create(sock, protocol)                  // net/ipv4/af_inet.c
  -> sock_map_fd()
    -> get_unused_fd_flags
    -> sock_alloc_file
      -> d_alloc_pseudo  // sizeof(struct dentry) == 128
      -> file = alloc_file(..., &socket_file_ops);  // sizeof(struct file) == 140
        -> get_empty_filp
        file.f_op = &socket_file_ops
      sock->file = file;
      file->private_data = sock;
    -> fd_install

inet_create(sock, protocol)
  -> find inet_protosw for sock->type & protocol
  -> struct sock* sk = sk_alloc()
    -> sk_prot_alloc -> kmem_cache_alloc (tcp_prot is 1360B)
  -> sock_init_data
    sk->sk_socket = sock;
    sock->sk = sk;
  -> sk->sk_prot->init(sk)
    -> tcp_v4_init_sock
      -> tcp_init_sock
        -> tcp_init_xmit_timers
        -> tcp_prequeue_init
        -> tcp_enable_early_retrans
        -> tcp_assign_congestion_control
        sk->sk_state = TCP_CLOSE;
      icsk->icsk_af_ops = &ipv4_specific;

After socket() inode

Memory usage

On x86-64 Debian 8, Linux 3.16, /proc/slabinfo

struct size slab cache name
file 256 "filp"
dentry 192 "dentry"
socket_alloc 640 "sock_inode_cache"
tcp_sock 1792 "TCP"
socket_wq 64 "kmalloc-64"
inet_bind_bucket 64 "tcp_bind_bucket"
tcp_request_sock 256 "request_sock_TCP"
inet_timewait_sock 192 "tw_sock_TCP"
epitem 128 "eventpoll_epi"

Total: 2944 bytes/socket (file 256 + dentry 192 + socket_alloc 640 + tcp_sock 1792 + socket_wq 64), not including send/receive buffers. For client sockets, +64B for inet_bind_bucket.

net_proto_family

family type protocol
AF_UNIX=1 SOCK_STREAM=1 0
AF_UNIX SOCK_DGRAM=2 0
AF_INET=2 SOCK_STREAM IPPROTO_TCP=6
AF_INET SOCK_DGRAM IPPROTO_UDP=17
AF_INET SOCK_DGRAM IPPROTO_ICMP=1
AF_INET SOCK_RAW=3 IPPROTO_IP=0
NPROTO=AF_MAX=41 SOCK_MAX=11 no max?
// include/linux/net.h
// How to create a socket for an address family
// One entry per family is kept in net_families[]; the socket(2) trace in
// these notes reaches the hook as net_families[family]->create(...).
struct net_proto_family {
  int family;  // AF_INET, AF_UNIX, AF_INET6, etc. must: 0 <= family < NPROTO
  int (*create)(struct net *net, struct socket *sock, int protocol, int kern);  // family-specific constructor, e.g. inet_create
  struct module *owner;  // e.g. THIS_MODULE; presumably used for module reference counting -- confirm
};

// net/socket.c
// Table of registered address families with NPROTO slots; slots are looked up
// by family number (e.g. net_families[AF_INET]).
static const struct net_proto_family *net_families[NPROTO];

// net/ipv4/af_inet.c
/* Forward declaration so the ops table below can name the create hook. */
static int inet_create(struct net *net, struct socket *sock, int protocol, int kern);

/*
 * net_proto_family entry for IPv4: its create hook is inet_create.
 * NOTE(review): .family uses PF_INET while the table slot below is indexed
 * with AF_INET -- presumably the two constants are equal; confirm.
 */
static const struct net_proto_family inet_family_ops = {
  .family = PF_INET,
  .create = inet_create,
  .owner  = THIS_MODULE
};

// inet_init() calls sock_register():
// the assignment below summarizes the resulting state of the table.
net_families[AF_INET] = &inet_family_ops;

inet_protosw

// include/net/protocol.h
/* This is used to register socket interfaces for IP protocols.  */
/* inet_create() looks up the entry matching sock->type and protocol (see the inet_create trace). */
struct inet_protosw {
  struct list_head list;     /* list linkage; entries are inserted into the inetsw lists at startup */

  /* These two fields form the lookup key.  */
  unsigned short   type;     /* This is the 2nd argument to socket(2).  eg. SOCK_STREAM, SOCK_DGRAM */
  unsigned short   protocol; /* This is the L4 protocol number.  eg. IPPROTO_TCP, IPPROTO_UDP */

  struct proto           *prot;  /* e.g. &tcp_prot, &udp_prot */
  const struct proto_ops *ops;   /* e.g. &inet_stream_ops, &inet_dgram_ops */

  unsigned char    flags;      /* See INET_PROTOSW_* below.  */
};
/* Bit flags for inet_protosw.flags; they may be OR-ed together. */
#define INET_PROTOSW_REUSE      0x01  /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT  0x02  /* Permanent protocols are unremovable. */
#define INET_PROTOSW_ICSK       0x04  /* Is this an inet_connection_sock? */
// net/ipv4/af_inet.c
/* The inetsw table contains everything that inet_create needs to
 * build a new socket.
 */
/* SOCK_MAX list heads; presumably one list per socket type, indexed by type -- confirm. */
static struct list_head inetsw[SOCK_MAX];
/* NOTE(review): presumably serializes updates to inetsw -- confirm against inet_register_protosw(). */
static DEFINE_SPINLOCK(inetsw_lock);

/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 */
/* Each entry pairs a (type, protocol) key with the struct proto and proto_ops to use. */
static struct inet_protosw inetsw_array[] =
{
  /* TCP: stream sockets; the only entry flagged INET_PROTOSW_ICSK. */
  {
    .type =       SOCK_STREAM,
    .protocol =   IPPROTO_TCP,
    .prot =       &tcp_prot,
    .ops =        &inet_stream_ops,
    .flags =      INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
  },
  /* UDP: datagram sockets. */
  {
    .type =       SOCK_DGRAM,
    .protocol =   IPPROTO_UDP,
    .prot =       &udp_prot,
    .ops =        &inet_dgram_ops,
    .flags =      INET_PROTOSW_PERMANENT,
  },
  /* ICMP over SOCK_DGRAM, served by ping_prot; shares inet_dgram_ops with UDP. */
  {
    .type =       SOCK_DGRAM,
    .protocol =   IPPROTO_ICMP,
    .prot =       &ping_prot,
    .ops =        &inet_dgram_ops,
    .flags =      INET_PROTOSW_REUSE,
  },
  /* Raw sockets. */
  {
    .type =       SOCK_RAW,
    .protocol =   IPPROTO_IP,        /* wild card */
    .prot =       &raw_prot,
    .ops =        &inet_sockraw_ops,
    .flags =      INET_PROTOSW_REUSE,
  }
};

/* Number of entries in inetsw_array[]. */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

// inet_init()
  sock_register(&inet_family_ops);

  /* Register the socket-side information for inet_create:
   * first make every inetsw[] list head an empty list ...
   */
  for (unsigned int type_idx = 0; type_idx < SOCK_MAX; ++type_idx) {
    INIT_LIST_HEAD(&inetsw[type_idx]);
  }

  /* ... then register each built-in inetsw_array[] entry, in array order. */
  for (unsigned int entry_idx = 0; entry_idx < INETSW_ARRAY_LEN; ++entry_idx) {
    inet_register_protosw(&inetsw_array[entry_idx]);
  }

After Initialization: inetsw

skbuff

A sample of receive skb for TCP SYN segment: skbuff