https://sites.google.com/a/cnsrl.cycu.edu.tw/da-shu-bi-ji/dpdk-ovs
https://feisky.gitbooks.io/sdn/dpdk/ovs-dpdk.html
https://software.intel.com/en-us/articles/set-up-open-vswitch-with-dpdk-on-ubuntu-server
DPDK加速的OVS數據流轉發的大致流程如下:
1)OVS的ovs-vswitchd接收到從OVS連接的某個Port發來的封包,從封包中取出Src / Dst IP、Src / Dst MAC、Port 等訊息。
2)OVS在用戶態查看精確流表(Exact Match)和模糊流表,如果命中,則直接轉發。
3)如果還不命中,在SDN控制器接入的情況下,經過OpenFlow協議,通告給控制器,由控制器處理。
4)控制器下發新的Flow,該封包重新發起選路,匹配;封包轉發,結束。
A tap device is used by dpif-netdev to create internal devices.
Without this patch, adding any bridge backed by the userspace datapath
would fail.
This doesn't mean that we can run Open vSwitch with DPDK under SELinux
yet, but at least we can use the userspace datapath.
Example of Packet Forwarding to a Physical Port
http://zhaozhanxu.com/2016/09/08/SDN/OVS/2016-09-08-ovs-dpdk-pkts-flow/
netdev.c
純 kernel 路徑?(netdev.c 是通用抽象層,kernel datapath 與 DPDK netdev class 應該都會經過這裡 — 待確認)
/* Retrieves 'netdev''s MAC address.  If successful, returns 0 and copies
 * the MAC address into 'mac'.  On failure, returns a positive errno value
 * and clears 'mac' to all-zeros. */
int
netdev_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
    /* Dispatches through the per-class vtable: each netdev provider
     * (kernel, DPDK, ...) supplies its own get_etheraddr() implementation. */
    return netdev->netdev_class->get_etheraddr(netdev, mac);
}
/* dpif-netdev.c -- reached from the PMD thread; this is the DPDK path. */
#4205行
/* Core input processing of the userspace (netdev) datapath, run by a PMD
 * thread for one batch of packets.  First pass: exact-match cache (EMC)
 * lookup via emc_processing(); EMC misses are compacted to the front of
 * 'packets' and sent through fast_path_processing() (dpcls lookup).
 * Finally the accumulated per-flow batches are executed.
 * (Return type is missing in this pasted excerpt.) */
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                  struct dp_packet_batch *packets,  /* <-- input batch */
                  bool md_is_valid, odp_port_t port_no)
{
    int cnt = packets->count;  /* <-- number of packets in this batch */
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct netdev_flow_key keys[PKT_ARRAY_SIZE];
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
    long long now = time_msec();
    size_t newcnt, n_batches, i;
    odp_port_t in_port;

    n_batches = 0;
    /* EMC pass: hits are queued into per-flow 'batches'; misses are
     * grouped at the beginning of 'packets' and their count returned. */
    newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
                            md_is_valid, port_no);
    if (OVS_UNLIKELY(newcnt)) {
        packets->count = newcnt;
        /* Get ingress port from first packet's metadata. */
        in_port = packets->packets[0]->md.in_port.odp_port;
        /* Slower path for the EMC misses: dpcls (megaflow) lookup. */
        fast_path_processing(pmd, packets, keys, batches, &n_batches,
                             in_port, now);
    }

    /* Detach flows from their batches before executing, so later
     * processing cannot re-use a stale batch pointer. */
    for (i = 0; i < n_batches; i++) {
        batches[i].flow->batch = NULL;
    }

    /* Apply the datapath actions for each per-flow batch. */
    for (i = 0; i < n_batches; i++) {
        packet_batch_per_flow_execute(&batches[i], pmd, now);
    }
}
/* dp-packet.h */
#597
/* A burst of packets processed together by the datapath. */
struct dp_packet_batch {
    int count;                  /* Number of valid entries in 'packets'. */
    bool trunc;                 /* true if the batch needs truncate. */
    struct dp_packet *packets[NETDEV_MAX_BURST];  /* The packets themselves. */
};
/* dp-packet.h */
#41
/* Buffer for holding packet data.  A dp_packet is automatically reallocated
 * as necessary if it grows too large for the available memory. */
struct dp_packet {
#ifdef DPDK_NETDEV
    struct rte_mbuf mbuf;       /* DPDK mbuf; replaces the buffer fields
                                 * below when built with DPDK. */
#else
    void *base_;                /* First byte of allocated space. */
    uint16_t allocated_;        /* Number of bytes allocated. */
    uint16_t data_ofs;          /* First byte actually in use. */
    uint32_t size_;             /* Number of bytes in use. */
    uint32_t rss_hash;          /* Packet hash. */
    bool rss_hash_valid;        /* Is the 'rss_hash' valid? */
#endif
    enum dp_packet_source source;  /* Source of memory allocated as 'base'. */
    uint8_t l2_pad_size;        /* Detected l2 padding size.
                                 * Padding is non-pullable. */
    uint16_t l2_5_ofs;          /* MPLS label stack offset, or UINT16_MAX */
    uint16_t l3_ofs;            /* Network-level header offset,
                                 * or UINT16_MAX. */
    uint16_t l4_ofs;            /* Transport-level header offset,
                                 * or UINT16_MAX. */
    uint32_t cutlen;            /* length in bytes to cut from the end. */
    union {
        struct pkt_metadata md;
        uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
    };
};
struct rte_mbuf
http://dpdk.org/doc/api/structrte__mbuf.html
struct rte_mbuf { 426 MARKER cacheline0; 427 428 void *buf_addr; 435 phys_addr_t buf_physaddr __rte_aligned(sizeof(phys_addr_t)); 436 437 /* next 8 bytes are initialised on RX descriptor rearm */ 438 MARKER64 rearm_data; 439 uint16_t data_off; 440 450 RTE_STD_C11 451 union { 452 rte_atomic16_t refcnt_atomic; 453 uint16_t refcnt; 454 }; 455 uint16_t nb_segs; 458 uint16_t port; 459 460 uint64_t ol_flags; 462 /* remaining bytes are set on RX when pulling packet from descriptor */ 463 MARKER rx_descriptor_fields1; 464 465 /* 466 * The packet type, which is the combination of outer/inner L2, L3, L4 467 * and tunnel types. The packet_type is about data really present in the 468 * mbuf. Example: if vlan stripping is enabled, a received vlan packet 469 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the 470 * vlan is stripped from the data. 471 */ 472 RTE_STD_C11 473 union { 474 uint32_t packet_type; 475 struct { 476 uint32_t l2_type:4; 477 uint32_t l3_type:4; 478 uint32_t l4_type:4; 479 uint32_t tun_type:4; 480 RTE_STD_C11 481 union { 482 uint8_t inner_esp_next_proto; 487 __extension__ 488 struct { 489 uint8_t inner_l2_type:4; 491 uint8_t inner_l3_type:4; 493 }; 494 }; 495 uint32_t inner_l4_type:4; 496 }; 497 }; 498 499 uint32_t pkt_len; 500 uint16_t data_len; 502 uint16_t vlan_tci; 503 504 union { 505 uint32_t rss; 506 struct { 507 RTE_STD_C11 508 union { 509 struct { 510 uint16_t hash; 511 uint16_t id; 512 }; 513 uint32_t lo; 515 }; 516 uint32_t hi; 519 } fdir; 520 struct { 521 uint32_t lo; 522 uint32_t hi; 523 } sched; 524 uint32_t usr; 525 } hash; 528 uint16_t vlan_tci_outer; 529 530 uint16_t buf_len; 535 uint64_t timestamp; 536 537 /* second cache line - fields only used in slow path or on TX */ 538 MARKER cacheline1 __rte_cache_min_aligned; 539 540 RTE_STD_C11 541 union { 542 void *userdata; 543 uint64_t udata64; 544 }; 545 546 struct rte_mempool *pool; 547 struct rte_mbuf *next; 549 /* fields to support TX offloads */ 550 RTE_STD_C11 551 union { 
552 uint64_t tx_offload; 553 __extension__ 554 struct { 555 uint64_t l2_len:7; 559 uint64_t l3_len:9; 560 uint64_t l4_len:8; 561 uint64_t tso_segsz:16; 563 /* fields for TX offloading of tunnels */ 564 uint64_t outer_l3_len:9; 565 uint64_t outer_l2_len:7; 567 /* uint64_t unused:8; */ 568 }; 569 }; 570 573 uint16_t priv_size; 574 576 uint16_t timesync; 577 579 uint32_t seqn; 580 581 } __rte_cache_aligned;packets.h#93
struct pkt_metadata (packets.h)
/* Datapath packet metadata */
struct pkt_metadata {
    uint32_t recirc_id;         /* Recirculation id carried with the
                                 * recirculating packets.  0 for packets
                                 * received from the wire. */
    uint32_t dp_hash;           /* hash value computed by the recirculation
                                 * action. */
    uint32_t skb_priority;      /* Packet priority for QoS. */
    uint32_t pkt_mark;          /* Packet mark. */
    uint16_t ct_state;          /* Connection state. */
    uint16_t ct_zone;           /* Connection zone. */
    uint32_t ct_mark;           /* Connection mark. */
    ovs_u128 ct_label;          /* Connection label. */
    union flow_in_port in_port; /* Input port. */
    struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note that
                                 * if 'ip_dst' == 0, the rest of the fields may
                                 * be uninitialized. */
};
/* (Note: the path explored above turned out to be a dead end.) */
dpif-netdev.c
emc_processing
emc_processing主要是將收到的幾個封包解析key值,並且從cache中查找表格,match到的封包放入對應flow的batch;返回「未命中 (miss)」的封包個數(見函數結尾的 return n_missed)。
使用miniflow_extract 將封包解析到key值。使用emc_lookup從hash表中找尋,並且進行key值的比較。
如果match,使用dp_netdev_queue_batches將封包加在flow->batches中。不match的封包則被集中排到packets陣列的最前面,留給後面的fast path處理。
使用dp_netdev_count_packet統計match的封包數目。
/* First-pass packet classification against the PMD's exact-match cache.
 * EMC hits are queued into per-flow 'batches'; misses are compacted to
 * the front of the packet array (their keys kept in 'keys') for the
 * caller's fast-path processing.  Returns the number of EMC misses. */
static inline size_t
emc_processing(struct dp_netdev_pmd_thread *pmd,
               struct dp_packet_batch *packets_,
               struct netdev_flow_key *keys,
               struct packet_batch_per_flow batches[], size_t *n_batches,
               bool md_is_valid, odp_port_t port_no)
{
    struct emc_cache *flow_cache = &pmd->flow_cache;
    struct netdev_flow_key *key = &keys[0];
    size_t i, n_missed = 0, n_dropped = 0;
    struct dp_packet **packets = packets_->packets;
    int cnt = packets_->count;

    for (i = 0; i < cnt; i++) {
        struct dp_netdev_flow *flow;
        struct dp_packet *packet = packets[i];

        /* Drop runt packets too short to hold an Ethernet header. */
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
            dp_packet_delete(packet);
            n_dropped++;
            continue;
        }

        /* If this is not the last packet of the batch, prefetch the next
         * packet's data and metadata so they are warm next iteration. */
        if (i != cnt - 1) {
            /* Prefetch next packet data and metadata. */
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
            pkt_metadata_prefetch_init(&packets[i+1]->md);
        }

        if (!md_is_valid) {
            pkt_metadata_init(&packet->md, port_no);
        }

        /* Parse the headers into a compressed miniflow key.  See:
         * https://software.intel.com/en-us/articles/the-open-vswitch-exact-match-cache
         * In OVS-DPDK, packets handed over by DPDK are parsed here for the
         * subsequent table matching. */
        miniflow_extract(packet, &key->mf);
        key->len = 0; /* Not computed yet. */
        /* NOTE(review): the RSS hash is possibly computed inside this
         * function when the NIC did not provide one -- verify. */
        key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);

        flow = emc_lookup(flow_cache, key);
        if (OVS_LIKELY(flow)) {
            /* EMC hit: append the packet to this flow's output batch. */
            dp_netdev_queue_batches(packet, flow, &key->mf, batches,
                                    n_batches);
        } else {
            /* EMC miss: leave for the fast path that follows. */
            /* Exact match cache missed. Group missed packets together at
             * the beginning of the 'packets' array. */
            packets[n_missed] = packet;
            /* 'key[n_missed]' contains the key of the current packet and it
             * must be returned to the caller. The next key should be extracted
             * to 'keys[n_missed + 1]'. */
            key = &keys[++n_missed];
        }
    }

    /* Packets that hit the EMC = total - dropped - missed. */
    dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT,
                           cnt - n_dropped - n_missed);

    return n_missed;
}
miniflow_extract()
// https://lists.onap.org/pipermail/ovs-dev/2014-April/282612.html
miniflow_extract() extracts packet headers directly to a miniflow, which is a compressed form of the struct flow.
This does not require a large struct to be cleared to begin with, and accesses less memory.
These performance benefits should allow this to be used in the DPDK datapath. miniflow_extract() takes a miniflow as an input/output parameter.
On input the buffer for values to be extracted must be properly initialized.
On output the map contains ones for all the fields that have been extracted.
Some struct flow fields are reordered to make miniflow_extract to progress in the logical order.
Some explicit "inline" keywords are necessary for GCC to optimize this properly.
Also, macros are used for same reason instead of inline functions for pushing data to the miniflow.
/* Extracts packet headers into 'dst', a compressed "struct flow".
 * The 'dst' must follow with buffer space for FLOW_U64S 64-bit units.
 * 'dst->map' is ignored on input and set on output to indicate which fields
 * were extracted. */
void miniflow_extract(struct dp_packet *packet, struct miniflow *dst);
/* Caller is responsible for initializing 'dst' with enough storage for
 * FLOW_U64S * 8 bytes. */
void
miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
{
    /* NOTE(review): this is a partial paste -- the '....' lines below
     * mark code elided from the original OVS source. */
    const struct pkt_metadata *md = &packet->md;
    const void *data = dp_packet_data(packet);
    size_t size = dp_packet_size(packet);
    ovs_be32 packet_type = packet->packet_type;
    uint64_t *values = miniflow_values(dst);
    /* 'mf' tracks the output write position and the map of set fields. */
    struct mf_ctx mf = { FLOWMAP_EMPTY_INITIALIZER, values,
                         values + FLOW_U64S };
    const char *frame;
    ovs_be16 dl_type = OVS_BE16_MAX;
    uint8_t nw_frag, nw_tos, nw_ttl, nw_proto;
    uint8_t *ct_nw_proto_p = NULL;
    ovs_be16 ct_tp_src = 0, ct_tp_dst = 0;

    ..............................

    /* Network layer. */
    packet->l3_ofs = (char *)data - frame;

    nw_frag = 0;
    if (OVS_LIKELY(dl_type == htons(ETH_TYPE_IP))) {
        const struct ip_header *nh = data;
        int ip_len;
        uint16_t tot_len;

        /* Validate the IPv4 header: enough bytes for the fixed header,
         * a sane IHL, and a total length consistent with the buffer. */
        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
            goto out;
        }
        ip_len = IP_IHL(nh->ip_ihl_ver) * 4;

        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
            goto out;
        }
        if (OVS_UNLIKELY(size < ip_len)) {
            goto out;
        }
        tot_len = ntohs(nh->ip_tot_len);
        if (OVS_UNLIKELY(tot_len > size || ip_len > tot_len)) {
            goto out;
        }
        if (OVS_UNLIKELY(size - tot_len > UINT8_MAX)) {
            goto out;
        }
        /* Anything past 'tot_len' is L2 padding, not packet data. */
        dp_packet_set_l2_pad_size(packet, size - tot_len);
        size = tot_len;   /* Never pull padding. */

        /* Push both source and destination address at once. */
        miniflow_push_words(mf, nw_src, &nh->ip_src, 1);

        /* Conntrack original-direction tuple, if metadata carries one. */
        if (ct_nw_proto_p && !md->ct_orig_tuple_ipv6) {
            *ct_nw_proto_p = md->ct_orig_tuple.ipv4.ipv4_proto;
            if (*ct_nw_proto_p) {
                miniflow_push_words(mf, ct_nw_src,
                                    &md->ct_orig_tuple.ipv4.ipv4_src, 1);
                ct_tp_src = md->ct_orig_tuple.ipv4.src_port;
                ct_tp_dst = md->ct_orig_tuple.ipv4.dst_port;
            }
        }

        miniflow_push_be32(mf, ipv6_label, 0); /* Padding for IPv4. */

        nw_tos = nh->ip_tos;
        nw_ttl = nh->ip_ttl;
        nw_proto = nh->ip_proto;
        if (OVS_UNLIKELY(IP_IS_FRAGMENT(nh->ip_frag_off))) {
            nw_frag = FLOW_NW_FRAG_ANY;
            if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
                nw_frag |= FLOW_NW_FRAG_LATER;
            }
        }
        /* Advance past the IP header to the transport header. */
        data_pull(&data, &size, ip_len);
    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
        const struct ovs_16aligned_ip6_hdr *nh;
        ovs_be32 tc_flow;
        uint16_t plen;

    ..............................

out:
    /* Publish the bitmap of fields that were actually extracted. */
    dst->map = mf.map;
}
emc_lookup
/* Looks up 'key' in the exact-match cache 'cache'.
 *
 * Probes each candidate EMC position for 'key->hash'.  An entry matches
 * when its stored hash equals 'key->hash', the entry is still alive, and
 * its stored miniflow equals 'key->mf'.  Returns the cached flow on a
 * hit, or NULL on a miss. */
static inline struct dp_netdev_flow *
emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
{
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
        if (current_entry->key.hash == key->hash
            && emc_entry_alive(current_entry)
            /* BUGFIX: the pasted text read '¤t_entry' here -- the HTML
             * entity '&curren;' had corrupted '&current_entry'.  Restored. */
            && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
            /* We found the entry with the 'key->mf' miniflow */
            return current_entry->flow;
        }
    }

    return NULL;
}
struct emc_entry
/* One slot of the exact-match cache. */
struct emc_entry {
    struct dp_netdev_flow *flow;  /* Cached flow for this entry. */
    struct netdev_flow_key key;   /* key.hash used for emc hash value. */
};
struct dp_netdev_flow
/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
 *
 *
 * Thread-safety
 * =============
 *
 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
 * its pmd thread's classifier.  The text below calls this classifier 'cls'.
 *
 * Motivation
 * ----------
 *
 * The thread safety rules described here for "struct dp_netdev_flow" are
 * motivated by two goals:
 *
 *    - Prevent threads that read members of "struct dp_netdev_flow" from
 *      reading bad data due to changes by some thread concurrently modifying
 *      those members.
 *
 *    - Prevent two threads making changes to members of a given "struct
 *      dp_netdev_flow" from interfering with each other.
 *
 *
 * Rules
 * -----
 *
 * A flow 'flow' may be accessed without a risk of being freed during an RCU
 * grace period.  Code that needs to hold onto a flow for a while
 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
 *
 * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 * from modification.
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below. */
struct dp_netdev_flow {
    const struct flow flow;      /* Unmasked flow that created this entry. */
    /* Hash table index by unmasked flow. */
    const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
                                 /* 'flow_table'. */
    const ovs_u128 ufid;         /* Unique flow identifier. */
    const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
                                 /* flow. */

    /* Number of references.
     * The classifier owns one reference.
     * Any thread trying to keep a rule from being freed should hold its own
     * reference. */
    struct ovs_refcount ref_cnt;

    bool dead;

    /* Statistics. */
    struct dp_netdev_flow_stats stats;

    /* Actions. */
    OVSRCU_TYPE(struct dp_netdev_actions *) actions;

    /* While processing a group of input packets, the datapath uses the next
     * member to store a pointer to the output batch for the flow.  It is
     * reset after the batch has been sent out (See dp_netdev_queue_batches(),
     * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
    struct packet_batch_per_flow *batch;

    /* Packet classification. */
    struct dpcls_rule cr;        /* In owning dp_netdev's 'cls'. */
    /* 'cr' must be the last member. */
};
struct miniflow
/* A sparse representation of a "struct flow".
 *
 * A "struct flow" is fairly large and tends to be mostly zeros.  Sparse
 * representation has two advantages.  First, it saves memory and, more
 * importantly, minimizes the number of accessed cache lines.  Second, it saves
 * time when the goal is to iterate over only the nonzero parts of the struct.
 *
 * The map member hold one bit for each uint64_t in a "struct flow".  Each
 * 0-bit indicates that the corresponding uint64_t is zero, each 1-bit that it
 * *may* be nonzero (see below how this applies to minimasks).
 *
 * The values indicated by 'map' always follow the miniflow in memory.  The
 * user of the miniflow is responsible for always having enough storage after
 * the struct miniflow corresponding to the number of 1-bits in maps.
 *
 * Elements in values array are allowed to be zero.  This is useful for "struct
 * minimatch", for which ensuring that the miniflow and minimask members have
 * same maps allows optimization.  This allowance applies only to a miniflow
 * that is not a mask.  That is, a minimask may NOT have zero elements in its
 * values.
 *
 * A miniflow is always dynamically allocated so that the maps are followed by
 * at least as many elements as there are 1-bits in maps. */
struct miniflow {
    struct flowmap map;
    /* Followed by:
     *     uint64_t values[n];
     * where 'n' is miniflow_n_values(miniflow). */
};
struct flowmap
/* Bitmap with one bit per uint64_t of a "struct flow" (see the
 * "struct miniflow" comment above). */
struct flowmap {
    map_t bits[FLOWMAP_UNITS];
};
struct netdev_flow_key
/* Stores a miniflow with inline values */
struct netdev_flow_key {
    uint32_t hash;      /* Hash function differs for different users. */
    uint32_t len;       /* Length of the following miniflow (incl. map). */
    struct miniflow mf;
    uint64_t buf[FLOW_MAX_PACKET_U64S];  /* Inline storage for mf's values. */
};