虚拟交换机项目Open vSwitch源码简单阅读--网络数据包流向

i龙家小少

2877人浏览 · 2012-08-22 10:10:55

i龙家小少 · 2012-08-22 10:10:55 发布

听说 Xen Cloud Platform 就是用它来管理各个虚拟机之间的网络接口。大概看了一下文档，感觉它的特点是管理控制接口很方便，这样虚拟机主机就可以方便地控制虚拟机的网络，比如把一个网卡迁移到另外一个网卡上面等，动态配置这个 vswitch 应该是很方便的。比如物理交换机里面查看 mac table 这类操作估计不好做，但使用 vswitch 这种虚拟设备的话应该是很容易控制的。总之管理各个交换机端口等很方便，特别是你有很多虚拟机群、有大量网络配置要做的时候。另外一个特点是支持 vlan、openflow、QoS 流量控制等功能，好像其他常见的交换机管理协议也都是支持的。

下载了代码回来，大概看了一下网络数据包在这个 open vswitch 中的流向，关键代码都在 datapath 目录下。

/**
 * struct vport - one port within a datapath
 * @port_no: Index into @dp's @ports array.
 * @dp: Datapath to which this port belongs.
 * @kobj: Represents /sys/class/net/<devname>/brport.
 * @linkname: The name of the link from /sys/class/net/<datapath>/brif to this
 * &struct vport.  (We keep this around so that we can delete it if the
 * device gets renamed.)  Set to the null string when no link exists.
 * @node: Element in @dp's @port_list.
 * @sflow_pool: Number of packets that were candidates for sFlow sampling,
 * regardless of whether they were actually chosen and sent down to userspace.
 * @hash_node: Element in @dev_table hash table in vport.c.
 * @ops: Class structure.
 * @percpu_stats: Points to per-CPU statistics used and maintained by the vport
 * code if %VPORT_F_GEN_STATS is set to 1 in @ops flags, otherwise unused.
 * @stats_lock: Protects @err_stats and @offset_stats.
 * @err_stats: Points to error statistics used and maintained by the vport code
 * if %VPORT_F_GEN_STATS is set to 1 in @ops flags, otherwise unused.
 * @offset_stats: Added to actual statistics as a sop to compatibility with
 * XAPI for Citrix XenServer.  Deprecated.
 */
struct vport {
    u16 port_no;                    /* Index into dp's ports array. */
    struct datapath    *dp;         /* Owning datapath. */
    struct kobject kobj;            /* sysfs "brport" object. */
    char linkname[IFNAMSIZ];        /* sysfs "brif" link name, "" if none. */
    struct list_head node;          /* Element in dp's port_list. */
    atomic_t sflow_pool;            /* sFlow sampling candidate count. */

    struct hlist_node hash_node;    /* Element in dev_table (vport.c). */
    const struct vport_ops *ops;    /* Per-type operations ("class"). */

    /* Generic per-CPU stats; only used with VPORT_F_GEN_STATS. */
    struct vport_percpu_stats *percpu_stats;

    spinlock_t stats_lock;          /* Protects the two members below. */
    struct vport_err_stats err_stats;
    struct rtnl_link_stats64 offset_stats;  /* Deprecated XAPI offset. */
};

/* VPORT_F_* flag bits, stored in the 'flags' member of struct vport_ops. */
#define VPORT_F_REQUIRED    (1 << 0) /* If init fails, module loading fails. */
#define VPORT_F_GEN_STATS    (1 << 1) /* Track stats at the generic layer. */
#define VPORT_F_FLOW        (1 << 2) /* Sets OVS_CB(skb)->flow. */
#define VPORT_F_TUN_ID        (1 << 3) /* Sets OVS_CB(skb)->tun_id. */

/**
 * struct vport_parms - parameters for creating a new vport
 *
 * @name: New vport's name.
 * @type: New vport's type.
 * @config: Kernel copy of 'config' member of &struct odp_port describing
 * configuration for new port.  Exactly %VPORT_CONFIG_SIZE bytes.
 * @dp: New vport's datapath.
 * @port_no: New vport's port number.
 */
struct vport_parms {
    /* Name, type and type-specific configuration of the new vport. */
    const char *name;
    const char *type;
    const void *config;

    /* For vport_alloc(). */
    struct datapath *dp;
    u16 port_no;
};

/**
 * struct vport_ops - definition of a type of virtual port
 *
 * @type: Name of port type, such as "netdev" or "internal" to be matched
 * against the device type when a new port needs to be created.
 * @flags: Flags of type VPORT_F_* that influence how the generic vport layer
 * handles this vport.
 * @init: Called at module initialization.  If VPORT_F_REQUIRED is set then the
 * failure of this function will cause the module to not load.  If the flag is
 * not set and initialization fails then no vports of this type can be created.
 * @exit: Called at module unload.
 * @create: Create a new vport configured as specified.  On success returns
 * a new vport allocated with vport_alloc(), otherwise an ERR_PTR() value.
 * @modify: Modify the configuration of an existing vport.  May be null if
 * modification is not supported.
 * @destroy: Detach and destroy a vport.
 * @set_mtu: Set the device's MTU.  May be null if not supported.
 * @set_addr: Set the device's MAC address.  May be null if not supported.
 * @set_stats: Provides stats as an offset to be added to the device stats.
 * May be null if not supported.
 * @get_name: Get the device's name.
 * @get_addr: Get the device's MAC address.
 * @get_kobj: Get the kobj associated with the device (may return null).
 * @get_stats: Fill in the transmit/receive stats.  May be null if stats are
 * not supported or if generic stats are in use.  If defined and
 * VPORT_F_GEN_STATS is also set, the error stats are added to those already
 * collected.
 * @get_dev_flags: Get the device's flags.
 * @is_running: Checks whether the device is running.
 * @get_operstate: Get the device's operating state.
 * @get_ifindex: Get the system interface index associated with the device.
 * May be null if the device does not have an ifindex.
 * @get_iflink: Get the system interface index associated with the device that
 * will be used to send packets (may be different than ifindex for tunnels).
 * May be null if the device does not have an iflink.
 * @get_mtu: Get the device's MTU.
 * @send: Send a packet on the device.  Returns the length of the packet sent.
 */
struct vport_ops {
    const char *type;
    u32 flags;

    /* Called at module init and exit respectively. */
    int (*init)(void);
    void (*exit)(void);

    /* Called with RTNL lock. */
    struct vport *(*create)(const struct vport_parms *);
    int (*modify)(struct vport *, struct odp_port *);
    int (*destroy)(struct vport *);

    int (*set_mtu)(struct vport *, int mtu);
    int (*set_addr)(struct vport *, const unsigned char *);
    int (*set_stats)(const struct vport *, struct rtnl_link_stats64 *);

    /* Called with rcu_read_lock or RTNL lock. */
    const char *(*get_name)(const struct vport *);
    const unsigned char *(*get_addr)(const struct vport *);
    struct kobject *(*get_kobj)(const struct vport *);
    int (*get_stats)(const struct vport *, struct rtnl_link_stats64 *);

    unsigned (*get_dev_flags)(const struct vport *);
    int (*is_running)(const struct vport *);
    unsigned char (*get_operstate)(const struct vport *);

    int (*get_ifindex)(const struct vport *);
    int (*get_iflink)(const struct vport *);

    int (*get_mtu)(const struct vport *);

    /* Transmit path: the datapath calls this to emit a packet on the port. */
    int (*send)(struct vport *, struct sk_buff *);
};

如上图所示，open vswitch 有 eth0、eth1、tap1、tap2 这 4 个虚拟端口，这些端口应该是自己使用 open vswitch 的控制工具挂载到某个系统接口上得到的。像 tap1（在 xen 里面对应那个 vif0 等虚拟设备）这些网络设备应该由你自己负责创建，然后自己控制虚拟设备上的处理办法，比如 vswitch 转发包给你的 tap0，你自己负责再通过 hypercall 转发给对应的虚拟机等。open vswitch 复杂的地方是在不同的网络设备之间转发的逻辑的控制。

vswitch关键的一个结构是struct vport，就是用来表示对应的物理端口的，比如上图连接到eth0就是vswitch的一个vport连过去的。
================vport.h============================

========================================================

是可以自己实现 vport 端口，然后往相应的 datapath 上面注册的吧，关键是要实现 vport_ops 这个接口的各个函数，比如 int (*send)(struct vport *, struct sk_buff *); 这个是 vswitch 用来往某个 port 上发送数据包的。然后你自己的 vport 的实现里面调用 vport_receive 这个函数，通知 vswitch 核心你这个 port 有包要从 switch 通过了。在 vswitch 的角度看来，它只关注一个端口上面的发送和接收两个数据流向，其他的它就不管了吧。

我们再看看把一个net_device 挂载为一个 vswitch的vport端口后的，数据包的流向是怎么样的,他这种 net_device 的vport是怎么实现的。
首先，vswitch根据你传过来的interface的名字，比如eth0，用dev_get_by_name找到对应的 net_device结构。然后给这个net_device注册
rx_handler函数。这样linux系统就会在这个net-device收到数据包的时候调用我们的rx_handler函数了。

====================vport-netdev.c======================================

static struct vport *netdev_create(const struct vport_parms *parms)
{
    struct vport *vport;
    struct netdev_vport *netdev_vport;
    int err;

    vport = vport_alloc(sizeof(struct netdev_vport), &netdev_vport_ops, parms);
    if (IS_ERR(vport)) {
        err = PTR_ERR(vport);
        goto error;
    }

    netdev_vport = netdev_vport_priv(vport);

    netdev_vport->dev = dev_get_by_name(&init_net, parms->name);
    if (!netdev_vport->dev) {
        err = -ENODEV;
        goto error_free_vport;
    }

    if (netdev_vport->dev->flags & IFF_LOOPBACK ||
        netdev_vport->dev->type != ARPHRD_ETHER ||
        is_internal_dev(netdev_vport->dev)) {
        err = -EINVAL;
        goto error_put;
    }

    /* If we are using the vport stats layer initialize it to the current
     * values so we are roughly consistent with the device stats. */
    if (USE_VPORT_STATS) {
        struct rtnl_link_stats64 stats;

        err = netdev_get_stats(vport, &stats);
        if (!err)
            vport_set_stats(vport, &stats);
    }

    err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
                     vport);
    if (err)
        goto error_put;

    dev_set_promiscuity(netdev_vport->dev, 1);
    dev_disable_lro(netdev_vport->dev);
    netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;

    return vport;

error_put:
    dev_put(netdev_vport->dev);
error_free_vport:
    vport_free(vport);
error:
    return ERR_PTR(err);
}

================================================================

rx_handler 应该是 2.6.36 里面改动后才有的，看样子是专门为 bridge 网桥的实现而做的。之前的做法都是直接在内核里面导出一个 br_handle_frame_hook 函数指针，然后内核在网络数据包的收包的地方调用这个函数来处理网桥相关的逻辑。不过看现在的代码，一个 net_device 只能注册一个 rx_handler 函数。之前看的 cisco vpn 客户端，其实也可以用这种办法来实现，轻松挂钩某个网络设备的收包点，然后如果这个 rx_handler 消耗了某个 skb，内核的代码也是不会继续往下传的。
看看内核里面注册处理函数相关的代码：

/**
 *      netdev_rx_handler_register - register receive handler
 *      @dev: device to register a handler for
 *      @rx_handler: receive handler to register
 *      @rx_handler_data: data pointer that is used by rx handler
 *
 *      Register a receive handler for a device. This handler will then be
 *      called from __netif_receive_skb. A negative errno code is returned
 *      on a failure: -EBUSY if the device already has a handler. Returns 0
 *      on success.
 *
 *      The caller must hold the rtnl_mutex.
 */
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data)
{
        ASSERT_RTNL();
        /* Only one rx_handler may be attached to a device at a time. */
        if (dev->rx_handler)
                return -EBUSY;

        /* The data pointer is published before the handler pointer;
         * presumably so a reader that sees the handler also sees its
         * data -- confirm against the reader side before reordering. */
        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
        rcu_assign_pointer(dev->rx_handler, rx_handler);

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 *      netdev_rx_handler_unregister - unregister receive handler
 *      @dev: device to unregister a handler from
 *
 *      Unregister a receive handler from a device.
 *
 *      The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
        ASSERT_RTNL();
        /* Cleared in the reverse order of registration: handler first,
         * then its data pointer. */
        rcu_assign_pointer(dev->rx_handler, NULL);
        rcu_assign_pointer(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);


/* Abridged excerpt of the kernel receive path: only the rx_handler hook is
 * shown.  The declarations of rx_handler, pt_prev, ret and orig_dev, the
 * 'out' label and the rest of the function are not part of this excerpt. */
static int __netif_receive_skb(struct sk_buff *skb)
{
       /* Handle special case of bridge or macvlan */
        rx_handler = rcu_dereference(skb->dev->rx_handler);   /* the registered handler is invoked from here */
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                skb = rx_handler(skb);
                /* A NULL return means the handler consumed the skb. */
                if (!skb)
                        goto out;
        }

=======================vport-netdev.c================

/*
 * Install netdev_frame_hook as the bridge frame hook.  Always returns 0.
 * (Presumably the .init callback of the "netdev" vport type on kernels that
 * lack rx_handler support -- the ops table is not shown here.)
 */
static int netdev_init(void)
{
    /* Hook into callback used by the bridge to intercept packets.
     * Parasites we are.
     *
     * On older kernels there is no per-device rx_handler, so the bridge
     * handler exported by the kernel is replaced directly instead. */
    br_handle_frame_hook = netdev_frame_hook;

    return 0;
}





/*
 * rx_handler for devices attached as "netdev" vports.
 *
 * Loopback packets are returned untouched so the caller keeps processing
 * them.  Every other packet is handed to the vport that owns skb->dev and
 * NULL is returned to tell the caller the skb has been taken.
 */
static struct sk_buff *netdev_frame_hook(struct sk_buff *skb)
{
    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return skb;

    netdev_port_receive(netdev_get_vport(skb->dev), skb);
    return NULL;
}


/*
 * Prepare a packet received on a netdev vport and pass it to the generic
 * vport layer.
 *
 * Must be called with rcu_read_lock.
 */
static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
{
    /* Make our own copy of the packet.  Otherwise we will mangle the
     * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
     * (No one comes after us, since we tell handle_bridge() that we took
     * the packet.) */
    skb = skb_share_check(skb, GFP_ATOMIC);
    if (unlikely(!skb))
        return;

    skb_warn_if_lro(skb);

    /* vport_receive() expects skb->data to point at the Ethernet header
     * and the checksum fields to be initialized. */
    skb_push(skb, ETH_HLEN);
    compute_ip_summed(skb, false);

    /* Notify the vswitch core that data has arrived on this port. */
    vport_receive(vport, skb);
}

===========================vport.c=============================

/**
 *    vport_receive - hand a received packet to the datapath
 *
 * @vport: vport on which the packet arrived
 * @skb: the received packet
 *
 * Must be called with rcu_read_lock.  The packet cannot be shared and
 * skb->data should point to the Ethernet header.  The caller must have already
 * called compute_ip_summed() to initialize the checksumming fields.
 */
void vport_receive(struct vport *vport, struct sk_buff *skb)
{
    /* Account the packet in the generic per-CPU RX counters, if enabled. */
    if (vport->ops->flags & VPORT_F_GEN_STATS) {
        struct vport_percpu_stats *cpu_stats;

        local_bh_disable();
        cpu_stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());

        write_seqcount_begin(&cpu_stats->seqlock);
        cpu_stats->rx_bytes += skb->len;
        cpu_stats->rx_packets++;
        write_seqcount_end(&cpu_stats->seqlock);

        local_bh_enable();
    }

    /* Port types that do not fill in these control-block fields themselves
     * get them reset here before the datapath reads them. */
    if ((vport->ops->flags & VPORT_F_FLOW) == 0)
        OVS_CB(skb)->flow = NULL;
    if ((vport->ops->flags & VPORT_F_TUN_ID) == 0)
        OVS_CB(skb)->tun_id = 0;

    /* Enter the datapath core for flow lookup and action execution. */
    dp_process_received_packet(vport, skb);
}

============================datapath.c=========================================
这个函数里面会进行处理逻辑的判断：先判断数据包属于哪条 flow（流），然后执行该 flow 对应的控制规则 action 等等，这些都是根据你的配置来进行的吧。这里才是 open vswitch 的控制核心所在。

/* Must be called with rcu_read_lock. */
void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{
    struct datapath *dp = p->dp;
    struct dp_stats_percpu *stats;
    int stats_counter_off;
    struct sw_flow_actions *acts;
    struct loop_counter *loop;
    int error;

    OVS_CB(skb)->vport = p;

    if (!OVS_CB(skb)->flow) {
        struct odp_flow_key key;
        struct tbl_node *flow_node;
        bool is_frag;

        /* Extract flow from 'skb' into 'key'. */
        error = flow_extract(skb, p ? p->port_no : ODPP_NONE, &key, &is_frag);
        if (unlikely(error)) {
            kfree_skb(skb);
            return;
        }

        if (is_frag && dp->drop_frags) {
            kfree_skb(skb);
            stats_counter_off = offsetof(struct dp_stats_percpu, n_frags);
            goto out;
        }

        /* Look up flow. */   /搜索匹配的 流类型的，比如是不是某个tcp连接来的阿 等等？？？？？
        flow_node = tbl_lookup(rcu_dereference(dp->table), &key,
                    flow_hash(&key), flow_cmp);
        if (unlikely(!flow_node)) {
            dp_output_control(dp, skb, _ODPL_MISS_NR, OVS_CB(skb)->tun_id);
            stats_counter_off = offsetof(struct dp_stats_percpu, n_missed);
            goto out;
        }

        OVS_CB(skb)->flow = flow_cast(flow_node);
    }

    stats_counter_off = offsetof(struct dp_stats_percpu, n_hit);
    flow_used(OVS_CB(skb)->flow, skb);

    acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);

    /* Check whether we've looped too much. */
    loop = loop_get_counter();
    if (unlikely(++loop->count > MAX_LOOPS))
        loop->looping = true;
    if (unlikely(loop->looping)) {
        loop_suppress(dp, acts);
        kfree_skb(skb);
        goto out_loop;
    }

    /* Execute actions. */
    execute_actions(dp, skb, &OVS_CB(skb)->flow->key, acts->actions,
            acts->actions_len);    执行相应的规则？？？？？？？？？？？？？？？

    /* Check whether sub-actions looped too much. */
    if (unlikely(loop->looping))
        loop_suppress(dp, acts);

out_loop:
    /* Decrement loop counter. */
    if (!--loop->count)
        loop->looping = false;
    loop_put_counter();

out:
    /* Update datapath statistics. */
    local_bh_disable();
    stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

    write_seqcount_begin(&stats->seqlock);
    (*(u64 *)((u8 *)stats + stats_counter_off))++;
    write_seqcount_end(&stats->seqlock);

    local_bh_enable();
}

=========================actions.c============================

/*
 * Execute a list of actions against 'skb'.
 *
 * @dp: datapath the packet is being processed in.
 * @skb: the packet; it is handed to do_output() or freed on the paths
 * visible here.
 * @key: flow key of the packet, passed through to the header-modifying
 * helpers.
 * @actions, @actions_len: netlink-attribute list of ODPAT_* actions.
 *
 * Returns 0 on success or a negative errno value.
 */
int execute_actions(struct datapath *dp, struct sk_buff *skb,
            const struct odp_flow_key *key,
            const struct nlattr *actions, u32 actions_len)
{
    /* Every output action needs a separate clone of 'skb', but the common
     * case is just a single output action, so that doing a clone and
     * then freeing the original skbuff is wasteful.  So the following code
     * is slightly obscure just to avoid that. */
    int prev_port = -1;
    u32 priority = skb->priority;
    const struct nlattr *a;
    int rem, err;

    if (dp->sflow_probability) {
        struct vport *p = OVS_CB(skb)->vport;
        if (p) {
            atomic_inc(&p->sflow_pool);
            if (dp->sflow_probability == UINT_MAX ||
                net_random() < dp->sflow_probability)
                sflow_sample(dp, skb, actions, actions_len, p);
        }
    }

    OVS_CB(skb)->tun_id = 0;

    for (a = actions, rem = actions_len; rem > 0; a = nla_next(a, &rem)) {
        if (prev_port != -1) {
            /* A previous output action is still pending and more
             * actions follow: send a clone out of the port that the
             * rule selected and keep the original for later actions. */
            do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port);
            prev_port = -1;
        }

        switch (nla_type(a)) {
        case ODPAT_OUTPUT:
            /* Deferred: performed by the next iteration or at 'exit'. */
            prev_port = nla_get_u32(a);
            break;

        case ODPAT_CONTROLLER:
            err = output_control(dp, skb, nla_get_u64(a));
            if (err) {
                kfree_skb(skb);
                return err;
            }
            break;

        case ODPAT_SET_TUNNEL:
            OVS_CB(skb)->tun_id = nla_get_be64(a);
            break;

        case ODPAT_SET_DL_TCI:
            skb = modify_vlan_tci(dp, skb, key, a, rem);
            if (IS_ERR(skb))
                return PTR_ERR(skb);
            break;

        case ODPAT_STRIP_VLAN:
            skb = strip_vlan(skb);
            break;

        case ODPAT_SET_DL_SRC:
            skb = make_writable(skb, 0);
            if (!skb)
                return -ENOMEM;
            memcpy(eth_hdr(skb)->h_source, nla_data(a), ETH_ALEN);
            break;

        case ODPAT_SET_DL_DST:
            skb = make_writable(skb, 0);
            if (!skb)
                return -ENOMEM;
            memcpy(eth_hdr(skb)->h_dest, nla_data(a), ETH_ALEN);
            break;

        case ODPAT_SET_NW_SRC:
        case ODPAT_SET_NW_DST:
            skb = set_nw_addr(skb, key, a);
            break;

        case ODPAT_SET_NW_TOS:
            skb = set_nw_tos(skb, key, nla_get_u8(a));
            break;

        case ODPAT_SET_TP_SRC:
        case ODPAT_SET_TP_DST:
            skb = set_tp_port(skb, key, a);
            break;

        case ODPAT_SET_PRIORITY:
            skb->priority = nla_get_u32(a);
            break;

        case ODPAT_POP_PRIORITY:
            /* Restore the priority the packet had on entry. */
            skb->priority = priority;
            break;

        case ODPAT_DROP_SPOOFED_ARP:
            if (unlikely(is_spoofed_arp(skb, key)))
                goto exit;
            break;
        }
        /* The header-modifying helpers above return NULL on failure. */
        if (!skb)
            return -ENOMEM;
    }
exit:
    /* The last pending output gets the original skb instead of a clone. */
    if (prev_port != -1)
        do_output(dp, skb, prev_port);
    else
        kfree_skb(skb);
    return 0;
}


/*
 * Send 'skb' out of datapath port number 'out_port'.
 *
 * The packet is freed instead when 'skb' is NULL (kfree_skb() is still
 * called in that case, as before) or when no vport is present at that
 * port index.
 */
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
{
    if (skb) {
        struct vport *port = rcu_dereference(dp->ports[out_port]);

        if (port) {
            /* Transmit through the destination port. */
            vport_send(port, skb);
            return;
        }
    }

    kfree_skb(skb);
}

=========================vport.c=======================================

/**
 *    vport_send - send a packet on a device
 *
 * @vport: vport on which to send the packet
 * @skb: skb to send
 *
 * Sends the given packet and returns the length of data sent.  Either RTNL
 * lock or rcu_read_lock must be held.
 */
int vport_send(struct vport *vport, struct sk_buff *skb)
{
    int mtu;
    int sent;

    mtu = vport_get_mtu(vport);
    if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
        if (net_ratelimit())
            pr_warn("%s: dropped over-mtu packet: %d > %d\n",
                dp_name(vport->dp), packet_length(skb), mtu);
        goto error;
    }

    sent = vport->ops->send(vport, skb);    我们注册vport时候的发送出去的函数。

    if (vport->ops->flags & VPORT_F_GEN_STATS && sent > 0) {
        struct vport_percpu_stats *stats;

        local_bh_disable();
        stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());

        write_seqcount_begin(&stats->seqlock);
        stats->tx_packets++;
        stats->tx_bytes += sent;
        write_seqcount_end(&stats->seqlock);

        local_bh_enable();
    }

    return sent;

error:
    kfree_skb(skb);
    vport_record_error(vport, VPORT_E_TX_DROPPED);
    return 0;
}

======================vport-netdev.c==================
看看我们net_device attach类型的vport的处理函数。

static int netdev_send(struct vport *vport, struct sk_buff *skb)
{
    struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
    int len = skb->len;

    skb->dev = netdev_vport->dev;
    forward_ip_summed(skb);
    dev_queue_xmit(skb);   ///加到网络设备的发送队列里面，从net_device 发送出去外面网络

    return len;
}

=============================vport-internal_dev.c=====================================
上面的 netdev 类型的 port，我们可以看到，被 vswitch 使用之后，那个网络设备就没有办法正常地把网络包分发给系统上层协议来处理了。比如说 eth0 被 vswitch 接管了，linux 内核就不能直接收到 eth0 过来的包了，而是由 open vswitch 接管；vswitch 可能根据规则直接转发给另外一个 vport 的 net_device，这个 net_device 可能是对应虚拟机的接口，比如 xen 里面的 vif 网络设备，然后包就通过 vif 到达虚拟机了。这样 eth0 收到的包，host 主机自己是看不到的。不过 open vswitch 还实现了另外一种 internal_dev 类型的 vport。这种 vport 会自己注册一个网络设备，通过这个特定的网络设备，host 主机可以给 vswitch 发送网络包；而这个 vport 收到 vswitch 过来的包的话，也是往上传给内核协议栈的。

/*
 * Transmit callback of the "internal" vport type.  A packet that the
 * vswitch sends to this port is injected into the host network stack as if
 * it had been received on the vport's own network device.  Returns the
 * packet length as it was before eth_type_trans() processed the skb.
 */
static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
{
    struct net_device *netdev = netdev_vport_priv(vport)->dev;
    int len;

    /* Hand the packet to this vport's network device. */
    skb->dev = netdev;
    len = skb->len;
    skb->pkt_type = PACKET_HOST;
    skb->protocol = eth_type_trans(skb, netdev);

    /* The net_device "receives" the packet and passes it up to the upper
     * layers; the variant used depends on the calling context. */
    if (in_interrupt())
        netif_rx(skb);
    else
        netif_rx_ni(skb);

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29)
    netdev->last_rx = jiffies;
#endif

    return len;
}

const struct vport_ops internal_vport_ops = {
    .type        = "internal",
    .flags        = VPORT_F_REQUIRED | VPORT_F_GEN_STATS | VPORT_F_FLOW,
    .create        = internal_dev_create,                //创建vport的函数
    .destroy    = internal_dev_destroy,
    .set_mtu    = netdev_set_mtu,
    .set_addr    = netdev_set_addr,
    .get_name    = netdev_get_name,
    .get_addr    = netdev_get_addr,
    .get_kobj    = netdev_get_kobj,
    .get_dev_flags    = netdev_get_dev_flags,
    .is_running    = netdev_is_running,
    .get_operstate    = netdev_get_operstate,
    .get_ifindex    = netdev_get_ifindex,
    .get_iflink    = netdev_get_iflink,
    .get_mtu    = netdev_get_mtu,
    .send        = internal_dev_recv,                vswitch 要给你这个vport发包的时候，就调用的这个。
};

总结：大概看了一下之后，vswitch 的流程和大概实现就清楚一点了。它也是通过内核里面的 net_device 结构，挂钩网络设备的发包出口点和接收点来做到的。然后让数据包在不同的 net_device 之间转发，修改包的流向等，这就是一个虚拟交换机的功能了。当然它里面的逻辑控制还是要做很多工作的。不过这些在 net_device 之间操纵网络 skb 数据包的办法也可以学习一下。

原文地址：http://hi.baidu.com/widebright/item/e1b561c3c44660bb0d0a7ba7