The code in this article is based on Linux kernel 5.13. This article draws on socket-interface-and-network-protocol and Send/Recv UDP.

1. Socket

From man 2 socket

1
2
3
4
#include <sys/types.h>          /* See NOTES */
#include <sys/socket.h>

int socket(int domain, int type, int protocol);
  • domain: like AF_INET, AF_INET6, AF_NETLINK, AF_PACKET, see more with man 7 address_families, or see all domains defined in kernel file include/linux/socket.h
  • type: like SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, see all sock_type in include/linux/net.h
  • protocol: usually 0, but may also have many protocols for one domain, e.g. AF_NETLINK(man 7 netlink)

From the syscall you can see that a socket is not bound to a specific protocol; it only creates an endpoint for communication.

Note: for every AF_XXX there is a PF_XXX, and the values are the same. AF_XXX is the address family, while PF_XXX is the protocol family. So try to use AF_XXX in your struct sockaddr_in and PF_XXX in your call to socket().

2. struct socket

From include/linux/net.h

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @type: socket type (%SOCK_STREAM, etc)
 *  @flags: socket flags (%SOCK_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wq: wait queue for several uses
 */
struct socket {
        socket_state            state;

        short                   type;

        unsigned long           flags;

        struct file             *file;
        struct sock             *sk;
        const struct proto_ops  *ops;

        struct socket_wq        wq;
};
  • file: fd used in userspace

2.1 struct net_proto_family

1
2
3
4
5
6
struct net_proto_family {
        int             family;
        int             (*create)(struct net *net, struct socket *sock,
                                  int protocol, int kern);
        struct module   *owner;
};
  • family: PF_XXX
  • create: callback invoked in the kernel by the socket() syscall to create a socket of this family

e.g.

1
2
3
4
5
static const struct net_proto_family inet_family_ops = {
        .family = PF_INET,
        .create = inet_create,
        .owner  = THIS_MODULE,
};

Every protocol family needs to call sock_register() to register its own net_proto_family.

1
2
3
4
5
6
  * net/ipv4/af_inet.c <<inet_init>>
    - sock_register(&inet_family_ops);
  * net/ipv6/af_inet6.c <<inet6_init>>
    - err = sock_register(&inet6_family_ops);
  * net/packet/af_packet.c <<packet_init>>
    - rc = sock_register(&packet_family_ops);

sock_register() adds each protocol family to the following array.

1
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;

2.2 struct proto_ops

Defines all protocol handlers

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
From include/linux/net.h

struct proto_ops {
        int             family;
        struct module   *owner;
        int             (*release)   (struct socket *sock);
        int             (*bind)      (struct socket *sock,
                                      struct sockaddr *myaddr,
                                      int sockaddr_len);
        int             (*connect)   (struct socket *sock,
                                      struct sockaddr *vaddr,
                                      int sockaddr_len, int flags);
        [...]
        int             (*sendmsg)   (struct socket *sock, struct msghdr *m,
                              size_t total_len);
        int             (*recvmsg)   (struct socket *sock, struct msghdr *m,
                              size_t total_len, int flags);
        [...]

2.4 struct proto

Defines all protocol specific handlers

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
From include/net/sock.h

struct proto {
        void                    (*close)(struct sock *sk,
                                        long timeout);
        int                     (*pre_connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                     (*connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        [...]

        int                     (*sendmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len);
        int                     (*recvmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len, int noblock, int flags,
                                           int *addr_len);
        [...]

Even within the same address family, there may be many protocols, so Linux uses struct proto to define protocol-specific handlers. If one is not defined, the common handler in struct proto_ops is used. If the handlers are the same for different protocols, they can share the same struct proto.

Most address families only declare a struct proto without defining the details, but AF_INET and AF_INET6 have many proto structs. e.g.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
net/ipv4/af_inet.c:

static struct inet_protosw inetsw_array[] =
{
        {
                .type =       SOCK_STREAM,
                .protocol =   IPPROTO_TCP,
                .prot =       &tcp_prot,
                .ops =        &inet_stream_ops,
                .flags =      INET_PROTOSW_PERMANENT |
                              INET_PROTOSW_ICSK,
        },
        ...
};

In net_proto_family.create(), we need to allocate sk with sk_alloc(). There, sk_prot is assigned the suitable proto struct, e.g. inet_create() -> sk_alloc(), where the proto comes from the inetsw_array.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
/**
 *      sk_alloc - All socket objects are allocated here
 *      @net: the applicable net namespace
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 *      @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern)
{
        struct sock *sk;
        [...]
        sk->sk_prot = sk->sk_prot_creator = prot;
        [...]

If a handler in sk->sk_prot is implemented, it can be called from proto_ops. e.g. IPPROTO_ICMP has prot = &ping_prot and ops = &inet_sockraw_ops. When recvmsg is called, it calls inet_recvmsg(), which is defined in inet_sockraw_ops, and then sk->sk_prot->recvmsg is called.

For every struct proto, we need to call proto_register() to register it in proto_list. You can read /proc/net/protocols to see them. e.g.

1
2
3
4
5
6
7
8
9
static int __init inet_init(void)
{
  [...]
  rc = proto_register(&tcp_prot, 1);
  rc = proto_register(&udp_prot, 1);
  rc = proto_register(&raw_prot, 1);
  rc = proto_register(&ping_prot, 1);
  [...]
}

3. Create socket

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/* userspace */
socket()

/* kernel */ net/socket.c
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
  - __sys_socket()
    - sock_create()
      - __sock_create()
        - pf->create()  /* net_proto_family->create() for AF_INET */
    - sock_map_fd()

int __sock_create(struct net *net, int family, int type, int protocol,
                  struct socket **res, int kern)
{
       [...]
       sock = sock_alloc()
       [...]
       sock->type = type;
       [...]
       /* get correspond net_proto_family*/
       pf = rcu_dereference(net_families[family]);
       [...]
       /* Call net_proto_family create() function. e.g. inet_family_ops -> inet_create() */
       err = pf->create(net, sock, protocol, kern);
       [...]
}

As userspace uses an fd to describe the socket, we need to bind the socket to a file descriptor in the kernel.

1
2
3
4
5
6
7
- sock_map_fd()
  - sock_alloc_file()
    - file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
                              O_RDWR | (flags & O_NONBLOCK),
                              &socket_file_ops)
      - file = alloc_file(&path, flags, fops);
        - file->f_op = fop;

So, socket_file_ops will handle the operations from user space socket fd.

4. Socket handlers

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
- read()
  - SYSCALL_DEFINE3(read, ...)  /* fs/read_write.c */
    - ksys_read()
      - vfs_read()
        - file->f_op->read_iter(file, buf, count, pos) /* For socket, it's socket_file_ops->read_iter() */
          - sock_read_iter()
            - sock_recvmsg()
              - sock_recvmsg_nosec()
                - sock->ops->recvmsg()
                  - tcp_prot: tcp_recvmsg()
                    udp_prot: udp_recvmsg()
- write()
  - SYSCALL_DEFINE3(write, ...) /* fs/read_write.c */
    - ksys_write()
      - vfs_write()
        - file->f_op->write_iter()
          - sock_write_iter()
            - sock_sendmsg()
- ioctl()
  - SYSCALL_DEFINE3(ioctl, ...) /* fs/ioctl.c */
    - do_vfs_ioctl()
      - vfs_ioctl()
        - filp->f_op->unlocked_ioctl()
          - sock_ioctl()
            - dev_ioctl()

Note: some of the ioctls on a socket fd are related to the network interface, and some are related to the network protocol. Therefore, starting from vfs_ioctl(), the ioctl cmds are handled by the function at each layer.

5. AF_INET

For AF_INET, struct net_proto_family inet_family_ops is declared.

5.1 protocols in AF_INET

For different socket types, there are different proto_ops. e.g.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
const struct proto_ops inet_stream_ops = {
        .family            = PF_INET,
        .owner             = THIS_MODULE,
        .release           = inet_release,
        .bind              = inet_bind,
        .connect           = inet_stream_connect,
        .socketpair        = sock_no_socketpair,
        .accept            = inet_accept,
        ...
}
const struct proto_ops inet_dgram_ops = {
        .family            = PF_INET,
        .owner             = THIS_MODULE,
        .release           = inet_release,
        .bind              = inet_bind,
        ...
}
static const struct proto_ops inet_sockraw_ops = {
        .family            = PF_INET,
        .owner             = THIS_MODULE,
        .release           = inet_release,
        .bind              = inet_bind,
        ...
}

As you can see these proto_ops are defined for SOCK_STREAM, SOCK_DGRAM and SOCK_RAW respectively.

An address family can support multiple network protocols: the common methods are stored in struct proto_ops, and the methods of the network protocol itself are stored in struct proto. i.e.

socket.ops: stores struct proto_ops; socket.sk.sk_prot: stores struct proto.

There are some proto structs defined in AF_INET. e.g. tcp, udp, ping, raw.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
net/ipv4/af_inet.c:

static struct inet_protosw inetsw_array[] =
{
        {
                .type =       SOCK_STREAM,
                .protocol =   IPPROTO_TCP,
                .prot =       &tcp_prot,
                .ops =        &inet_stream_ops,
                .flags =      INET_PROTOSW_PERMANENT |
                              INET_PROTOSW_ICSK,
        },

        {
                .type =       SOCK_DGRAM,
                .protocol =   IPPROTO_UDP,
                .prot =       &udp_prot,
                .ops =        &inet_dgram_ops,
                .flags =      INET_PROTOSW_PERMANENT,
       },

       {
                .type =       SOCK_DGRAM,
                .protocol =   IPPROTO_ICMP,
                .prot =       &ping_prot,
                .ops =        &inet_sockraw_ops,
                .flags =      INET_PROTOSW_REUSE,
       },

       {
               .type =       SOCK_RAW,
               .protocol =   IPPROTO_IP,        /* wild card */
               .prot =       &raw_prot,
               .ops =        &inet_sockraw_ops,
               .flags =      INET_PROTOSW_REUSE,
       }
};

5.2 struct inet_protosw

In AF_INET, struct inet_protosw is used to show the protocol related info.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {
        struct list_head list;

        /* These two fields form the lookup key.  */
        unsigned short   type;     /* This is the 2nd argument to socket(2). */
        unsigned short   protocol; /* This is the L4 protocol number.  */

        struct proto     *prot;
        const struct proto_ops *ops;

        unsigned char    flags;      /* See INET_PROTOSW_* below.  */
};

All the protocol info is defined in static struct inet_protosw inetsw_array[]. With struct inet_protosw, the socket type, the protocol, and the related struct proto and struct proto_ops are combined together.

inet_register_protosw() is called to register these inet_protosw structs. All of them are registered in the static list array static struct list_head inetsw[SOCK_MAX].

inetsw is an array that stores a linked list head for each socket type; that is, the inet_protosw structures are classified by socket type and placed in different linked lists.

5.3 AF_INET socket create.

This follows on from section 3, Create socket.

In net_proto_family.create(), it will assign proto_ops to socket.ops. e.g.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
  ...
  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
    ...
    /* Check the non-wild match. */
    if (protocol == answer->protocol) {
            if (protocol != IPPROTO_IP)
                    break;
    } else {
            /* Check for the two wild cases. */
            if (IPPROTO_IP == protocol) {
                    protocol = answer->protocol;
                    break;
            }
            if (IPPROTO_IP == answer->protocol)
                    break;
    }
    ...
  }
  [...]
  sock->ops = answer->ops;
  answer_prot = answer->prot;
  [...]
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
  [...]
  sock_init_data(sock, sk);
  [...]
  if (sk->sk_prot->init)
    err = sk->sk_prot->init(sk);
  ...
}
  1. Find the corresponding struct inet_protosw based on protocol in inetsw[sock->type].
  2. Assign inet_protosw.ops and inet_protosw.prot to sock->ops and sock->sk->sk_prot respectively.
  3. Call socket->sk->sk_prot->init()

5.4 socket handlers in AF_INET

From section 4, Socket handlers, we know that each read/write eventually calls sock->ops->xxx, which in turn calls socket->sk->sk_prot->xxx. e.g.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
/* userspace */
connect()

/* Kernel */
- connect()
  - SYSCALL_DEFINE3(connect, ...)
    ...
    - socket->ops->connect()  /* It's inet_stream_ops.connect() for tcp */
      - inet_stream_connect()
        - __inet_stream_connect()
          - sk->sk_prot->connect()
            - tcp_v4_connect()

6. Kernel UDP send/recvmsg

The sending of UDP packets, starting from the system call at the socket interface and running all the way until the completed packet is added to the output queue of the network interface, is handled in just one pass.

But the receiving of UDP packets requires two separate steps: Once a packet has been received, udp_rcv() first allocates it to a socket in bottom-half context and places it into that socket’s receive queue. From there, the packet is fetched via system call of a user process, which is mapped to udp_recvmsg().

6.1 UDP sendmsg

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* user space */
- sendmsg(int sockfd, const struct msghdr *msg, int flags)

/* kernel part */
- SYSCALL_DEFINE3(sendmsg, ...)
  - __sys_sendmsg()
    - sock_sendmsg(sock, msg_sys)
      - sock_sendmsg_nosec()
        - sock->ops->sendmsg()  /* For UDP, it's inet_dgram_ops->sendmsg() */
          - inet_sendmsg()
            - sk->sk_prot->sendmsg()  /* udp_prot->sendmsg() */
              - udp_sendmsg()
                - if (msg->msg_controllen): ip_cmsg_send()
                - if (connected): rt = (struct rtable *)sk_dst_check();
                  else: rt = ip_route_output_flow(net, fl4, sk);
                - udp_send_skb()
                  - ip_send_skb()
                    - ip_local_out()
                      - skb_dst(skb)->output()  /* rt_dst_alloc(): rt->dst.output = ip_output; */
                        - ip_output()
                          - ip_finish_output()
                            - __ip_finish_output()
                              - ip_finish_output2()
                                - neigh_hh_output()
                                  - dev_queue_xmit()

6.2 UDP recvmsg

As we said, receiving has 2 steps. One is the kernel part: after udp_rcv(), all the messages are queued in the socket's receive buffer. The other is from user space, which calls recvmsg() to get the data from the kernel buffer.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/* Kernel part */
- NET_RX_SOFTIRQ
  - net_rx_action()
    - napi_poll()
      - n->poll() /* sd->backlog.poll = process_backlog; */
        - process_backlog()
          - __netif_receive_skb()
            - __netif_receive_skb_one_core()
              - pt_prev->func() /* see following ip_packet_type */
                - ip_rcv()
                  - ip_rcv_finish()
                    - dst_input()
                      - skb_dst(skb)->input /* rt->dst.input = ip_local_deliver; */
   ------------------------^
   |
- ip_local_deliver()
  - ip_local_deliver_finish()
    - ip_protocol_deliver_rcu()
      - ipprot = rcu_dereference(inet_protos[protocol]);
        - ipprot->handler() /* see the following udp_protocol struct */
          - udp_rcv()
            - __udp4_lib_rcv()
              - sk = __udp4_lib_lookup_skb()
              - udp_unicast_rcv_skb(sk, skb, uh)
                - udp_queue_rcv_skb()
                  - udp_queue_rcv_one_skb()


/* User part */
- recvmsg()
  - SYSCALL_DEFINE3(recvmsg, ...)
    - __sys_recvmsg()
      ...
      - socket->ops->recvmsg()  /* inet_dgram_ops->recvmsg() */
        - inet_recvmsg()
          - sk->sk_prot->recvmsg()  /* udp_prot->recvmsg() */
            - udp_recvmsg()
              - skb_copy_datagram_msg()

Callbacks for struct packet_type and struct net_protocol.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
static struct packet_type ip_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .func = ip_rcv,
        .list_func = ip_list_rcv,
};

static struct net_protocol udp_protocol = {
        .early_demux =  udp_v4_early_demux,
        .early_demux_handler =  udp_v4_early_demux,
        .handler =      udp_rcv,
        .err_handler =  udp_err,
        .no_policy =    1,
};