https://sites.google.com/a/cnsrl.cycu.edu.tw/da-shu-bi-ji/dpdk-ovs
https://feisky.gitbooks.io/sdn/dpdk/ovs-dpdk.html
https://software.intel.com/en-us/articles/set-up-open-vswitch-with-dpdk-on-ubuntu-server
DPDK加速的OVS數據流轉發的大致流程如下:
1)OVS的ovs-vswitchd接收到從OVS連接的某個Port發來的封包,從封包中取出Src / Dst IP、Src / Dst MAC、Port 等訊息。
2)OVS在用戶態查看精確流表(Exact Match)和模糊流表,如果命中,則直接轉發。
3)如果還不命中,在SDN控制器接入的情況下,經過OpenFlow協議,通告給控制器,由控制器處理。
4)控制器下發新的Flow,該封包重新發起選路,匹配;封包轉發,結束。
A tap device is used by dpif-netdev to create internal devices.
Without this patch, adding any bridge backed by the userspace datapath
would fail.
This doesn't mean that we can run Open vSwitch with DPDK under SELinux
yet, but at least we can use the userspace datapath.
Example of Packet Forwarding to a Physical Port
http://zhaozhanxu.com/2016/09/08/SDN/OVS/2016-09-08-ovs-dpdk-pkts-flow/
netdev.c
純 kernel 路徑?(netdev.c 是通用抽象層,kernel datapath 與 DPDK netdev class 應該都會經過這裡 — 待確認)
/* Retrieves 'netdev''s MAC address.  If successful, returns 0 and copies
 * the MAC address into 'mac'.  On failure, returns a positive errno value
 * and clears 'mac' to all-zeros. */
int
netdev_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
    /* Dispatches through the per-class vtable: each netdev provider
     * (kernel, DPDK, ...) supplies its own get_etheraddr() implementation. */
    return netdev->netdev_class->get_etheraddr(netdev, mac);
}
/* dpif-netdev.c -- reached from the PMD thread; this is the DPDK path. */
#4205行
/* Core input processing of the userspace (netdev) datapath, run by a PMD
 * thread for one batch of packets.  First pass: exact-match cache (EMC)
 * lookup via emc_processing(); EMC misses are compacted to the front of
 * 'packets' and sent through fast_path_processing() (dpcls lookup).
 * Finally the accumulated per-flow batches are executed.
 * (Return type is missing in this pasted excerpt.) */
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                  struct dp_packet_batch *packets,  /* <-- input batch */
                  bool md_is_valid, odp_port_t port_no)
{
    int cnt = packets->count;  /* <-- number of packets in this batch */
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct netdev_flow_key keys[PKT_ARRAY_SIZE];
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
    long long now = time_msec();
    size_t newcnt, n_batches, i;
    odp_port_t in_port;

    n_batches = 0;
    /* EMC pass: hits are queued into per-flow 'batches'; misses are
     * grouped at the beginning of 'packets' and their count returned. */
    newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
                            md_is_valid, port_no);
    if (OVS_UNLIKELY(newcnt)) {
        packets->count = newcnt;
        /* Get ingress port from first packet's metadata. */
        in_port = packets->packets[0]->md.in_port.odp_port;
        /* Slower path for the EMC misses: dpcls (megaflow) lookup. */
        fast_path_processing(pmd, packets, keys, batches, &n_batches,
                             in_port, now);
    }

    /* Detach flows from their batches before executing, so later
     * processing cannot re-use a stale batch pointer. */
    for (i = 0; i < n_batches; i++) {
        batches[i].flow->batch = NULL;
    }

    /* Apply the datapath actions for each per-flow batch. */
    for (i = 0; i < n_batches; i++) {
        packet_batch_per_flow_execute(&batches[i], pmd, now);
    }
}
/* dp-packet.h */
#597
/* A burst of packets processed together by the datapath. */
struct dp_packet_batch {
    int count;                  /* Number of valid entries in 'packets'. */
    bool trunc;                 /* true if the batch needs truncate. */
    struct dp_packet *packets[NETDEV_MAX_BURST];  /* The packets themselves. */
};
/* dp-packet.h */
#41
/* Buffer for holding packet data.  A dp_packet is automatically reallocated
 * as necessary if it grows too large for the available memory. */
struct dp_packet {
#ifdef DPDK_NETDEV
    struct rte_mbuf mbuf;       /* DPDK mbuf; replaces the buffer fields
                                 * below when built with DPDK. */
#else
    void *base_;                /* First byte of allocated space. */
    uint16_t allocated_;        /* Number of bytes allocated. */
    uint16_t data_ofs;          /* First byte actually in use. */
    uint32_t size_;             /* Number of bytes in use. */
    uint32_t rss_hash;          /* Packet hash. */
    bool rss_hash_valid;        /* Is the 'rss_hash' valid? */
#endif
    enum dp_packet_source source;  /* Source of memory allocated as 'base'. */
    uint8_t l2_pad_size;        /* Detected l2 padding size.
                                 * Padding is non-pullable. */
    uint16_t l2_5_ofs;          /* MPLS label stack offset, or UINT16_MAX */
    uint16_t l3_ofs;            /* Network-level header offset,
                                 * or UINT16_MAX. */
    uint16_t l4_ofs;            /* Transport-level header offset,
                                 * or UINT16_MAX. */
    uint32_t cutlen;            /* length in bytes to cut from the end. */
    union {
        struct pkt_metadata md;
        uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
    };
};
struct rte_mbuf
http://dpdk.org/doc/api/structrte__mbuf.html
struct rte_mbuf { 426 MARKER cacheline0; 427 428 void *buf_addr; 435 phys_addr_t buf_physaddr __rte_aligned(sizeof(phys_addr_t)); 436 437 /* next 8 bytes are initialised on RX descriptor rearm */ 438 MARKER64 rearm_data; 439 uint16_t data_off; 440 450 RTE_STD_C11 451 union { 452 rte_atomic16_t refcnt_atomic; 453 uint16_t refcnt; 454 }; 455 uint16_t nb_segs; 458 uint16_t port; 459 460 uint64_t ol_flags; 462 /* remaining bytes are set on RX when pulling packet from descriptor */ 463 MARKER rx_descriptor_fields1; 464 465 /* 466 * The packet type, which is the combination of outer/inner L2, L3, L4 467 * and tunnel types. The packet_type is about data really present in the 468 * mbuf. Example: if vlan stripping is enabled, a received vlan packet 469 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the 470 * vlan is stripped from the data. 471 */ 472 RTE_STD_C11 473 union { 474 uint32_t packet_type; 475 struct { 476 uint32_t l2_type:4; 477 uint32_t l3_type:4; 478 uint32_t l4_type:4; 479 uint32_t tun_type:4; 480 RTE_STD_C11 481 union { 482 uint8_t inner_esp_next_proto; 487 __extension__ 488 struct { 489 uint8_t inner_l2_type:4; 491 uint8_t inner_l3_type:4; 493 }; 494 }; 495 uint32_t inner_l4_type:4; 496 }; 497 }; 498 499 uint32_t pkt_len; 500 uint16_t data_len; 502 uint16_t vlan_tci; 503 504 union { 505 uint32_t rss; 506 struct { 507 RTE_STD_C11 508 union { 509 struct { 510 uint16_t hash; 511 uint16_t id; 512 }; 513 uint32_t lo; 515 }; 516 uint32_t hi; 519 } fdir; 520 struct { 521 uint32_t lo; 522 uint32_t hi; 523 } sched; 524 uint32_t usr; 525 } hash; 528 uint16_t vlan_tci_outer; 529 530 uint16_t buf_len; 535 uint64_t timestamp; 536 537 /* second cache line - fields only used in slow path or on TX */ 538 MARKER cacheline1 __rte_cache_min_aligned; 539 540 RTE_STD_C11 541 union { 542 void *userdata; 543 uint64_t udata64; 544 }; 545 546 struct rte_mempool *pool; 547 struct rte_mbuf *next; 549 /* fields to support TX offloads */ 550 RTE_STD_C11 551 union { 
552 uint64_t tx_offload; 553 __extension__ 554 struct { 555 uint64_t l2_len:7; 559 uint64_t l3_len:9; 560 uint64_t l4_len:8; 561 uint64_t tso_segsz:16; 563 /* fields for TX offloading of tunnels */ 564 uint64_t outer_l3_len:9; 565 uint64_t outer_l2_len:7; 567 /* uint64_t unused:8; */ 568 }; 569 }; 570 573 uint16_t priv_size; 574 576 uint16_t timesync; 577 579 uint32_t seqn; 580 581 } __rte_cache_aligned;packets.h#93
struct pkt_metadata (packets.h)
/* Datapath packet metadata */
struct pkt_metadata {
    uint32_t recirc_id;         /* Recirculation id carried with the
                                 * recirculating packets.  0 for packets
                                 * received from the wire. */
    uint32_t dp_hash;           /* hash value computed by the recirculation
                                 * action. */
    uint32_t skb_priority;      /* Packet priority for QoS. */
    uint32_t pkt_mark;          /* Packet mark. */
    uint16_t ct_state;          /* Connection state. */
    uint16_t ct_zone;           /* Connection zone. */
    uint32_t ct_mark;           /* Connection mark. */
    ovs_u128 ct_label;          /* Connection label. */
    union flow_in_port in_port; /* Input port. */
    struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note that
                                 * if 'ip_dst' == 0, the rest of the fields may
                                 * be uninitialized. */
};
/* (Note: the path explored above turned out to be a dead end.) */
dpif-netdev.c
emc_processing
emc_processing主要是將收到的幾個封包解析key值,並且從cache中查找表格,match到的封包放入對應flow的batch;返回「未命中 (miss)」的封包個數(見函數結尾的 return n_missed)。
使用miniflow_extract 將封包解析到key值。使用emc_lookup從hash表中找尋,並且進行key值的比較。
如果match,使用dp_netdev_queue_batches將封包加在flow->batches中。不match的封包則被集中排到packets陣列的最前面,留給後面的fast path處理。
使用dp_netdev_count_packet統計match的封包數目。
/* First-pass packet classification against the PMD's exact-match cache.
 * EMC hits are queued into per-flow 'batches'; misses are compacted to
 * the front of the packet array (their keys kept in 'keys') for the
 * caller's fast-path processing.  Returns the number of EMC misses. */
static inline size_t
emc_processing(struct dp_netdev_pmd_thread *pmd,
               struct dp_packet_batch *packets_,
               struct netdev_flow_key *keys,
               struct packet_batch_per_flow batches[], size_t *n_batches,
               bool md_is_valid, odp_port_t port_no)
{
    struct emc_cache *flow_cache = &pmd->flow_cache;
    struct netdev_flow_key *key = &keys[0];
    size_t i, n_missed = 0, n_dropped = 0;
    struct dp_packet **packets = packets_->packets;
    int cnt = packets_->count;

    for (i = 0; i < cnt; i++) {
        struct dp_netdev_flow *flow;
        struct dp_packet *packet = packets[i];

        /* Drop runt packets too short to hold an Ethernet header. */
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
            dp_packet_delete(packet);
            n_dropped++;
            continue;
        }

        /* If this is not the last packet of the batch, prefetch the next
         * packet's data and metadata so they are warm next iteration. */
        if (i != cnt - 1) {
            /* Prefetch next packet data and metadata. */
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
            pkt_metadata_prefetch_init(&packets[i+1]->md);
        }

        if (!md_is_valid) {
            pkt_metadata_init(&packet->md, port_no);
        }

        /* Parse the headers into a compressed miniflow key.  See:
         * https://software.intel.com/en-us/articles/the-open-vswitch-exact-match-cache
         * In OVS-DPDK, packets handed over by DPDK are parsed here for the
         * subsequent table matching. */
        miniflow_extract(packet, &key->mf);
        key->len = 0; /* Not computed yet. */
        /* NOTE(review): the RSS hash is possibly computed inside this
         * function when the NIC did not provide one -- verify. */
        key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);

        flow = emc_lookup(flow_cache, key);
        if (OVS_LIKELY(flow)) {
            /* EMC hit: append the packet to this flow's output batch. */
            dp_netdev_queue_batches(packet, flow, &key->mf, batches,
                                    n_batches);
        } else {
            /* EMC miss: leave for the fast path that follows. */
            /* Exact match cache missed. Group missed packets together at
             * the beginning of the 'packets' array. */
            packets[n_missed] = packet;
            /* 'key[n_missed]' contains the key of the current packet and it
             * must be returned to the caller. The next key should be extracted
             * to 'keys[n_missed + 1]'. */
            key = &keys[++n_missed];
        }
    }

    /* Packets that hit the EMC = total - dropped - missed. */
    dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT,
                           cnt - n_dropped - n_missed);

    return n_missed;
}
miniflow_extract()
// https://lists.onap.org/pipermail/ovs-dev/2014-April/282612.html
miniflow_extract() extracts packet headers directly to a miniflow, which is a compressed form of the struct flow.
This does not require a large struct to be cleared to begin with, and accesses less memory.
These performance benefits should allow this to be used in the DPDK datapath. miniflow_extract() takes a miniflow as an input/output parameter.
On input the buffer for values to be extracted must be properly initialized.
On output the map contains ones for all the fields that have been extracted.
Some struct flow fields are reordered to make miniflow_extract to progress in the logical order.
Some explicit "inline" keywords are necessary for GCC to optimize this properly.
Also, macros are used for same reason instead of inline functions for pushing data to the miniflow.
/* Extracts packet headers into 'dst', a compressed "struct flow".
 * The 'dst' must follow with buffer space for FLOW_U64S 64-bit units.
 * 'dst->map' is ignored on input and set on output to indicate which fields
 * were extracted. */
void miniflow_extract(struct dp_packet *packet, struct miniflow *dst);
/* Caller is responsible for initializing 'dst' with enough storage for
 * FLOW_U64S * 8 bytes. */
void
miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
{
    /* NOTE(review): this is a partial paste -- the '....' lines below
     * mark code elided from the original OVS source. */
    const struct pkt_metadata *md = &packet->md;
    const void *data = dp_packet_data(packet);
    size_t size = dp_packet_size(packet);
    ovs_be32 packet_type = packet->packet_type;
    uint64_t *values = miniflow_values(dst);
    /* 'mf' tracks the output write position and the map of set fields. */
    struct mf_ctx mf = { FLOWMAP_EMPTY_INITIALIZER, values,
                         values + FLOW_U64S };
    const char *frame;
    ovs_be16 dl_type = OVS_BE16_MAX;
    uint8_t nw_frag, nw_tos, nw_ttl, nw_proto;
    uint8_t *ct_nw_proto_p = NULL;
    ovs_be16 ct_tp_src = 0, ct_tp_dst = 0;

    ..............................

    /* Network layer. */
    packet->l3_ofs = (char *)data - frame;

    nw_frag = 0;
    if (OVS_LIKELY(dl_type == htons(ETH_TYPE_IP))) {
        const struct ip_header *nh = data;
        int ip_len;
        uint16_t tot_len;

        /* Validate the IPv4 header: enough bytes for the fixed header,
         * a sane IHL, and a total length consistent with the buffer. */
        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
            goto out;
        }
        ip_len = IP_IHL(nh->ip_ihl_ver) * 4;

        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
            goto out;
        }
        if (OVS_UNLIKELY(size < ip_len)) {
            goto out;
        }
        tot_len = ntohs(nh->ip_tot_len);
        if (OVS_UNLIKELY(tot_len > size || ip_len > tot_len)) {
            goto out;
        }
        if (OVS_UNLIKELY(size - tot_len > UINT8_MAX)) {
            goto out;
        }
        /* Anything past 'tot_len' is L2 padding, not packet data. */
        dp_packet_set_l2_pad_size(packet, size - tot_len);
        size = tot_len;   /* Never pull padding. */

        /* Push both source and destination address at once. */
        miniflow_push_words(mf, nw_src, &nh->ip_src, 1);

        /* Conntrack original-direction tuple, if metadata carries one. */
        if (ct_nw_proto_p && !md->ct_orig_tuple_ipv6) {
            *ct_nw_proto_p = md->ct_orig_tuple.ipv4.ipv4_proto;
            if (*ct_nw_proto_p) {
                miniflow_push_words(mf, ct_nw_src,
                                    &md->ct_orig_tuple.ipv4.ipv4_src, 1);
                ct_tp_src = md->ct_orig_tuple.ipv4.src_port;
                ct_tp_dst = md->ct_orig_tuple.ipv4.dst_port;
            }
        }

        miniflow_push_be32(mf, ipv6_label, 0); /* Padding for IPv4. */

        nw_tos = nh->ip_tos;
        nw_ttl = nh->ip_ttl;
        nw_proto = nh->ip_proto;
        if (OVS_UNLIKELY(IP_IS_FRAGMENT(nh->ip_frag_off))) {
            nw_frag = FLOW_NW_FRAG_ANY;
            if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
                nw_frag |= FLOW_NW_FRAG_LATER;
            }
        }
        /* Advance past the IP header to the transport header. */
        data_pull(&data, &size, ip_len);
    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
        const struct ovs_16aligned_ip6_hdr *nh;
        ovs_be32 tc_flow;
        uint16_t plen;

    ..............................

out:
    /* Publish the bitmap of fields that were actually extracted. */
    dst->map = mf.map;
}
emc_lookup
/* Looks up 'key' in the exact-match cache 'cache'.
 *
 * Probes each candidate EMC position for 'key->hash'.  An entry matches
 * when its stored hash equals 'key->hash', the entry is still alive, and
 * its stored miniflow equals 'key->mf'.  Returns the cached flow on a
 * hit, or NULL on a miss. */
static inline struct dp_netdev_flow *
emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
{
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
        if (current_entry->key.hash == key->hash
            && emc_entry_alive(current_entry)
            /* BUGFIX: the pasted text read '¤t_entry' here -- the HTML
             * entity '&curren;' had corrupted '&current_entry'.  Restored. */
            && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
            /* We found the entry with the 'key->mf' miniflow */
            return current_entry->flow;
        }
    }

    return NULL;
}
struct emc_entry
/* One slot of the exact-match cache. */
struct emc_entry {
    struct dp_netdev_flow *flow;  /* Cached flow for this entry. */
    struct netdev_flow_key key;   /* key.hash used for emc hash value. */
};
struct dp_netdev_flow
/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
 *
 *
 * Thread-safety
 * =============
 *
 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
 * its pmd thread's classifier.  The text below calls this classifier 'cls'.
 *
 * Motivation
 * ----------
 *
 * The thread safety rules described here for "struct dp_netdev_flow" are
 * motivated by two goals:
 *
 *    - Prevent threads that read members of "struct dp_netdev_flow" from
 *      reading bad data due to changes by some thread concurrently modifying
 *      those members.
 *
 *    - Prevent two threads making changes to members of a given "struct
 *      dp_netdev_flow" from interfering with each other.
 *
 *
 * Rules
 * -----
 *
 * A flow 'flow' may be accessed without a risk of being freed during an RCU
 * grace period.  Code that needs to hold onto a flow for a while
 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
 *
 * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 * from modification.
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below. */
struct dp_netdev_flow {
    const struct flow flow;      /* Unmasked flow that created this entry. */
    /* Hash table index by unmasked flow. */
    const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
                                 /* 'flow_table'. */
    const ovs_u128 ufid;         /* Unique flow identifier. */
    const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
                                 /* flow. */

    /* Number of references.
     * The classifier owns one reference.
     * Any thread trying to keep a rule from being freed should hold its own
     * reference. */
    struct ovs_refcount ref_cnt;

    bool dead;

    /* Statistics. */
    struct dp_netdev_flow_stats stats;

    /* Actions. */
    OVSRCU_TYPE(struct dp_netdev_actions *) actions;

    /* While processing a group of input packets, the datapath uses the next
     * member to store a pointer to the output batch for the flow.  It is
     * reset after the batch has been sent out (See dp_netdev_queue_batches(),
     * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
    struct packet_batch_per_flow *batch;

    /* Packet classification. */
    struct dpcls_rule cr;        /* In owning dp_netdev's 'cls'. */
    /* 'cr' must be the last member. */
};
struct miniflow
/* A sparse representation of a "struct flow".
 *
 * A "struct flow" is fairly large and tends to be mostly zeros.  Sparse
 * representation has two advantages.  First, it saves memory and, more
 * importantly, minimizes the number of accessed cache lines.  Second, it saves
 * time when the goal is to iterate over only the nonzero parts of the struct.
 *
 * The map member hold one bit for each uint64_t in a "struct flow".  Each
 * 0-bit indicates that the corresponding uint64_t is zero, each 1-bit that it
 * *may* be nonzero (see below how this applies to minimasks).
 *
 * The values indicated by 'map' always follow the miniflow in memory.  The
 * user of the miniflow is responsible for always having enough storage after
 * the struct miniflow corresponding to the number of 1-bits in maps.
 *
 * Elements in values array are allowed to be zero.  This is useful for "struct
 * minimatch", for which ensuring that the miniflow and minimask members have
 * same maps allows optimization.  This allowance applies only to a miniflow
 * that is not a mask.  That is, a minimask may NOT have zero elements in its
 * values.
 *
 * A miniflow is always dynamically allocated so that the maps are followed by
 * at least as many elements as there are 1-bits in maps. */
struct miniflow {
    struct flowmap map;
    /* Followed by:
     *     uint64_t values[n];
     * where 'n' is miniflow_n_values(miniflow). */
};
struct flowmap
/* Bitmap with one bit per uint64_t of a "struct flow" (see the
 * "struct miniflow" comment above). */
struct flowmap {
    map_t bits[FLOWMAP_UNITS];
};
struct netdev_flow_key
/* Stores a miniflow with inline values */
struct netdev_flow_key {
    uint32_t hash;      /* Hash function differs for different users. */
    uint32_t len;       /* Length of the following miniflow (incl. map). */
    struct miniflow mf;
    uint64_t buf[FLOW_MAX_PACKET_U64S];  /* Inline storage for mf's values. */
};