[PATCH 20 of 20] ipath - ethernet emulation driver

From: Bryan O'Sullivan
Date: Thu Mar 09 2006 - 19:39:34 EST


The ethernet emulation driver makes an eth* interface available. It
uses InfiniBand UD packets, but is not IPoIB-compatible. It provides
higher bandwidth and lower latency than IPoIB.

The driver is implemented using the ipath_layer code, as is the ipath
driver's OpenIB support.

Signed-off-by: Bryan O'Sullivan <bos@xxxxxxxxxxxxx>

diff -r d5a8cb977923 -r 7f00f404094f drivers/infiniband/hw/ipath/Kconfig
--- a/drivers/infiniband/hw/ipath/Kconfig Thu Mar 9 16:17:14 2006 -0800
+++ b/drivers/infiniband/hw/ipath/Kconfig Thu Mar 9 16:17:14 2006 -0800
@@ -16,3 +16,10 @@ config INFINIBAND_IPATH
allows these devices to be used with both kernel upper level
protocols such as IP-over-InfiniBand as well as with userspace
applications (in conjunction with InfiniBand userspace access).
+
+config IPATH_ETHER
+ tristate "PathScale InfiniPath ethernet driver"
+ depends on IPATH_CORE
+ ---help---
+ This is an ethernet emulator layer for the PathScale InfiniPath
+ host channel adapters (HCAs).
diff -r d5a8cb977923 -r 7f00f404094f drivers/infiniband/hw/ipath/ipath_eth.c
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/drivers/infiniband/hw/ipath/ipath_eth.c Thu Mar 9 16:17:14 2006 -0800
@@ -0,0 +1,1187 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ */
+
+/*
+ * ipath_eth.c - ethernet driver emulation over PathScale InfiniPath
+ * for Linux.
+ */
+
+#define ipath_ether_ioctl_support
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+
+#include "ipath_debug.h"
+#include "ips_common.h"
+#include "ipath_layer.h"
+
+#define DRV_NAME "ipath_ether"
+#define DRV_VERSION "1.0"
+
+/* Not static, because we don't want the compiler removing it */
+const char ipath_ether_version[] = DRV_NAME " " IPATH_IDSTR;
+
+#if _IPATH_DEBUGGING
+
+#define __IPATH_DBG_WHICH(which,fmt,...) \
+ do { \
+ if (unlikely(ipath_debug&(which))) \
+ printk(KERN_DEBUG DRV_NAME ": %s: " fmt, \
+ __func__,##__VA_ARGS__); \
+ } while (0)
+
+#define ipath_eth_dbg(fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_IPATHDBG,fmt,##__VA_ARGS__)
+#define ipath_eth_cdbg(which,fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
+#define ipath_eth_warn(fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_IPATHWARN,fmt,##__VA_ARGS__)
+#define ipath_eth_err(fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_IPATHERR ,fmt,##__VA_ARGS__)
+#define ipath_eth_table(fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_IPATHTABLE ,fmt,##__VA_ARGS__)
+
+#else
+
+#define ipath_eth_dbg(fmt,...)
+#define ipath_eth_cdbg(which,fmt,...)
+#define ipath_eth_warn(fmt,...)
+#define ipath_eth_err(fmt,...)
+#define ipath_eth_table(fmt,...)
+
+#endif
+
+#define MAX_IPATH_LAYER_DEVICE 4
+#define ETHER_MAC_SIZE 6
+
+#define TX_TIMEOUT 2000
+
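+/* the low-order bit of the first destination MAC byte marks broadcast/multicast */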
+#define BROADCAST_MASK 0x0001
+
+#define IPATH_LAYER_DOWN 0
+#define IPATH_LAYER_UP 1
+
+#define MAC_LENGTH 6
+
+#define MAX_HASH_ENTRIES 4129
+
+#define LID_ARP_REQUEST 1
+#define LID_ARP_RESPONSE 2
+
+#define ETH_ARP_PROTOCOL 0x0806 /* ARP protocol ID */
+
+#define HASH_ALLOC_ENTRIES 256
+
+#define priv_data(dev) ((struct ipath_ether_priv *)(dev)->priv)
+
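+/* the MAC is handled as three 16-bit words; sum them for a cheap hash key */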
+#define make_hash_key(mac) (((mac)[0] + (mac)[1] + (mac)[2]) % MAX_HASH_ENTRIES)
+
+/* This structure is used to reassemble packets for large MTUs. */
+struct ipath_frag_state {
+ spinlock_t lock;
+ struct sk_buff *skb;
+ struct sk_buff *last_skb;
+ uint16_t lid;
+ uint8_t frag_num; /* ips_message_header.unused */
+ uint8_t seq_num; /* ips_message_header.tinylen */
+ uint32_t len; /* ips_message_header.ack_seq_num */
+};
+
+struct ipath_ether_priv {
+ struct ipath_devdata *dd;
+ int device_id;
+ uint16_t my_lid; /* set in network order */
+ uint16_t my_bcast; /* set in network order */
+ uint16_t my_mac_addr[3];
+ int ipath_ether_if_stat;
+ struct net_device_stats ipath_ether_stats;
+ wait_queue_head_t lid_wait; /* when waiting for LID at open */
+ struct copy_data_s cpc;
+ struct ipath_frag_state *fstate; /* Fragment reassembly table */
+ struct ether_header protocol_header;
+};
+
+struct ether_hash { /* MAC-to-LID mapping; cf. _ips_message_header */
+ struct ether_hash *next;
+ uint16_t mac[3];
+ uint16_t lid;
+};
+
+static struct net_device *dev_ipath_ether[MAX_IPATH_LAYER_DEVICE];
+static struct ipath_ether_priv private_data[MAX_IPATH_LAYER_DEVICE];
+static int number_of_devices;
+
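+/* nonzero while a transmit is deferred, waiting for a PIO buffer-available interrupt */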
+static atomic_t send_continue;
+
+/*
+ * This will have to become per-device when we support multiple
+ * infinipath devices that aren't all on the same fabric.
+ */
+static struct ether_hash hash_table[MAX_HASH_ENTRIES];
+static struct ether_hash *all_hash_entries;
+static struct ether_hash *free_hash_entries;
+static DEFINE_SPINLOCK(ipath_ether_lock);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("PathScale <support@xxxxxxxxxxxxx>");
+MODULE_DESCRIPTION("PathScale InfiniPath ethernet driver");
+
+static struct net_device_stats *ipath_ether_get_stats(
+ struct net_device *dev);
+int ipath_ether_init(void);
+void ipath_ether_exit(void);
+
+module_init(ipath_ether_init);
+module_exit(ipath_ether_exit);
+
+static unsigned int ipath_fragtable_size = 1033;
+module_param_named(fragtable_size, ipath_fragtable_size, uint, S_IRUGO);
+MODULE_PARM_DESC(fragtable_size,
+ "size of the fragment reassembly hash table (prime)");
+
+static int _send_lid_message(uint16_t * mac,
+ uint8_t cmd,
+ uint16_t dest_lid, struct net_device *dev)
+{
+ struct ether_header protocol_header;
+ uint32_t total_frame_size_in_words = sizeof(protocol_header) >> 2;
+
+ protocol_header.lrh[0] = htons(IPS_LRH_BTH);
+ protocol_header.lrh[1] = dest_lid; /* DEST LID */
+ protocol_header.lrh[2] =
+ htons(total_frame_size_in_words + SIZE_OF_CRC);
+ protocol_header.lrh[3] =
+ priv_data(dev)->my_lid; /* SRC LID */
+
+ protocol_header.bth[0] =
+ htonl((OPCODE_ITH4X << 24) + IPS_DEFAULT_P_KEY);
+ protocol_header.sub_opcode = OPCODE_LID_ARP;
+
+ protocol_header.bth[1] = htonl(IPATH_KD_QP);
+ protocol_header.bth[2] = 0;
+
+ /* port, version, and TID are already known to be in range, no
+ * masking needed; offset in low INFINIPATH_I_OFFSET_MASK bits */
+ protocol_header.iph.ver_port_tid_offset =
+ (IPS_PROTO_VERSION << INFINIPATH_I_VERS_SHIFT) +
+ (EAGER_TID_ID << INFINIPATH_I_TID_SHIFT);
+ /* port is zero */
+ /* offset is zero */
+
+ /* generate an interrupt on the receive side */
+ protocol_header.iph.pkt_flags = INFINIPATH_KPF_INTR;
+
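+ /*
+ * The checksum is chosen so that adding it to the other iph
+ * words reproduces the LRH-derived values, letting the
+ * receiver validate the header with a simple sum.
+ */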
+ protocol_header.iph.chksum =
+ (uint16_t) IPS_LRH_BTH +
+ (uint16_t) (total_frame_size_in_words + SIZE_OF_CRC) -
+ (uint16_t) ((protocol_header.iph.
+ ver_port_tid_offset >> 16) & 0xFFFF) -
+ (uint16_t) (protocol_header.iph.ver_port_tid_offset &
+ 0xFFFF) -
+ (uint16_t) protocol_header.iph.pkt_flags;
+
+ protocol_header.cmd = cmd;
+ protocol_header.lid =
+ (cmd == LID_ARP_RESPONSE) ? priv_data(dev)->my_lid : 0;
+ protocol_header.mac[0] = mac[0];
+ protocol_header.mac[1] = mac[1];
+ protocol_header.mac[2] = mac[2];
+
+ return ipath_layer_send_hdr(priv_data(dev)->dd, &protocol_header);
+}
+
+/**
+ * _add_mac_lid - add a MAC-to-LID mapping
+ * @mac: the MAC
+ * @lid: the LID
+ *
+ * XXX: the table can grow without bound.
+ * NOTE: this should only be called from interrupt context.
+ */
+static int _add_mac_lid(uint16_t * mac, uint16_t lid)
+{
+ uint16_t hashkey = make_hash_key(mac);
+ struct ether_hash *hash_entry = &hash_table[hashkey];
+ struct ether_hash *last_entry = NULL;
+ struct ether_hash *index;
+ int counter;
+ int rc = 0;
+
+ /* callers must serialize updates; see ipath_ether_lock */
+
+ if (!hash_entry->lid) {
+ memcpy(hash_entry->mac, mac, MAC_LENGTH);
+ hash_entry->lid = lid;
+ hash_entry->next = NULL;
+
+ goto _add_mac_lid_complete;
+ }
+
+ while (hash_entry) {
+ if ((hash_entry->mac[0] == mac[0]) &&
+ (hash_entry->mac[1] == mac[1]) &&
+ (hash_entry->mac[2] == mac[2])
+ ) {
+ hash_entry->lid = lid;
+ goto _add_mac_lid_complete;
+ }
+
+ last_entry = hash_entry;
+ hash_entry = hash_entry->next;
+ }
+
+ /* MAC address was not found - so add it! */
+ if (!free_hash_entries) {
+ index = kmalloc(HASH_ALLOC_ENTRIES *
+ sizeof(struct ether_hash), GFP_ATOMIC);
+
+ if (!index) {
+ rc = -ENOMEM;
+ goto _add_mac_lid_complete;
+ }
+
+ /*
+ * The first entry of each allocated block links the blocks
+ * together (so they can be freed at module exit); the
+ * remaining entries are threaded onto the free list.
+ */
+ index->next = all_hash_entries;
+ all_hash_entries = index;
+ free_hash_entries = ++index;
+
+ for (counter = 2; counter < HASH_ALLOC_ENTRIES; counter++) {
+ index->next = index + 1;
+ index++;
+ }
+
+ index->next = NULL;
+ }
+
+ hash_entry = free_hash_entries;
+ free_hash_entries = free_hash_entries->next;
+
+ /* Initialize the new entry before linking into the list. */
+ memcpy(hash_entry->mac, mac, MAC_LENGTH);
+ hash_entry->lid = lid;
+ hash_entry->next = NULL;
+
+ last_entry->next = hash_entry;
+
+_add_mac_lid_complete:
+ return rc;
+}
+
+/**
+ * _lookup_lid - look up the LID for a MAC
+ * @mac: the MAC
+ * @dev: the network device
+ *
+ * A LID of zero is returned when the lookup fails.
+ * Since only one CPU can update the list at a time, each entry is
+ * fully initialized before being linked in; because the list is
+ * singly linked, readers can safely walk it without holding the lock.
+ */
+static uint16_t _lookup_lid(uint16_t * mac, struct net_device *dev)
+{
+ uint16_t hashkey = make_hash_key(mac);
+ struct ether_hash *hash_entry = &hash_table[hashkey];
+
+ while (hash_entry) {
+ if ((hash_entry->mac[0] == mac[0]) &&
+ (hash_entry->mac[1] == mac[1]) &&
+ (hash_entry->mac[2] == mac[2])
+ )
+ break;
+
+ hash_entry = hash_entry->next;
+ }
+
+ if (hash_entry) {
+ return hash_entry->lid;
+ } else {
+ _send_lid_message(mac, LID_ARP_REQUEST,
+ priv_data(dev)->my_bcast, dev);
+
+ return 0;
+ }
+}
+
+static int ipath_ether_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ether_header *protocol_header =
+ &priv_data(dev)->protocol_header;
+ int rc = 0;
+ uint16_t dest_lid;
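+ /*
+ * Shared across all devices; hard_start_xmit is serialized
+ * per device, and receivers match fragments by source LID,
+ * so cross-device races here are harmless.
+ */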
+ static uint32_t jumboseq;
+
+ if (skb->data[0] & BROADCAST_MASK) {
+
+ ipath_eth_dbg("Dest MAC: %x:%x:%x:%x:%x:%x\n",
+ ((uint8_t *)skb->data)[0],
+ ((uint8_t *)skb->data)[1],
+ ((uint8_t *)skb->data)[2],
+ ((uint8_t *)skb->data)[3],
+ ((uint8_t *)skb->data)[4],
+ ((uint8_t *)skb->data)[5]);
+
+ dest_lid = priv_data(dev)->my_bcast;
+ if (!dest_lid) {
+ /*
+ * Can't broadcast; the broadcast LID isn't set yet.
+ * Free the skb and return 0: a non-zero return would
+ * make the stack requeue the already-freed skb.
+ */
+ dev_kfree_skb(skb);
+ priv_data(dev)->ipath_ether_stats.tx_dropped++;
+ return 0;
+ }
+ } else {
+ dest_lid = _lookup_lid((uint16_t *) skb->data, dev);
+ if (!dest_lid) {
+ dev_kfree_skb(skb);
+ priv_data(dev)->ipath_ether_stats.tx_dropped++;
+ /* skb freed; returning non-zero would make the stack requeue it */
+ return 0;
+ }
+ }
+
+ if (ipath_debug & __IPATH_IPATHPD) {
+ int loop_count;
+
+ printk("Send:\n");
+ for (loop_count = 0; loop_count < skb->len;
+ loop_count++) {
+ if (!(loop_count % 16))
+ printk("\n");
+
+ printk("%02X ",
+ ((uint8_t *) skb->data)[loop_count]);
+ }
+
+ printk("\n\n");
+ }
+
+ /* This is used as the sequence ID of a jumbo packet. */
+ protocol_header->seq_num = ++jumboseq;
+ /* This is the total length the receiver should expect. */
+ protocol_header->len = skb->len;
+ /*
+ * This is used as the fragment number for segmented jumbo packets.
+ */
+ protocol_header->frag_num = 0;
+
+ /*
+ * Copy 2 bytes of the ethernet header into the infinipath header so
+ * the rest of the data is 32-bit aligned.
+ */
+ protocol_header->first_2_bytes = *((uint16_t *) skb->data);
+ skb_pull(skb, sizeof(uint16_t));
+
+ protocol_header->lrh[0] = htons(IPS_LRH_BTH);
+ protocol_header->lrh[1] = dest_lid;
+ protocol_header->lrh[3] =
+ priv_data(dev)->my_lid; /* SRC LID */
+
+ protocol_header->bth[0] =
+ htonl((OPCODE_ITH4X << 24) + IPS_DEFAULT_P_KEY);
+ protocol_header->sub_opcode = OPCODE_ENCAP;
+
+ protocol_header->bth[1] = htonl(IPATH_KD_QP);
+
+ /* port, version, and TID are already known to be in range, no
+ * masking needed; offset in low INFINIPATH_I_OFFSET_MASK bits */
+ protocol_header->iph.ver_port_tid_offset =
+ (IPS_PROTO_VERSION << INFINIPATH_I_VERS_SHIFT) +
+ (EAGER_TID_ID << INFINIPATH_I_TID_SHIFT);
+ /* port is zero */
+ /* offset is zero */
+
+ protocol_header->flags = NETIF_F_SG;
+ if ((dev->features & NETIF_F_HW_CSUM) &&
+ skb->ip_summed == CHECKSUM_HW)
+ protocol_header->flags |= NETIF_F_HW_CSUM;
+
+ /* init cpc */
+ priv_data(dev)->cpc.hdr = protocol_header;
+ priv_data(dev)->cpc.to = NULL;
+ priv_data(dev)->cpc.error = 0;
+ priv_data(dev)->cpc.extra = 0;
+ priv_data(dev)->cpc.len = skb->len;
+ priv_data(dev)->cpc.flen = 0;
+ priv_data(dev)->cpc.skb = skb;
+ priv_data(dev)->cpc.csum = 0;
+ priv_data(dev)->cpc.pos = 0;
+ priv_data(dev)->cpc.offset = 0;
+ priv_data(dev)->cpc.checksum_calc = 0;
+
+ rc = ipath_layer_send_skb(priv_data(dev)->dd, &priv_data(dev)->cpc);
+ if (rc == 0) {
+ priv_data(dev)->ipath_ether_stats.tx_packets++;
+ priv_data(dev)->ipath_ether_stats.tx_bytes += skb->len;
+
+ priv_data(dev)->cpc.skb = NULL;
+ dev_kfree_skb(skb);
+ } else {
+ netif_stop_queue(dev);
+
+ if (rc == -ENOBUFS)
+ priv_data(dev)->ipath_ether_stats.tx_fifo_errors++;
+
+ if (rc == -EBUSY) {
+ atomic_set(&send_continue, 1);
+ ipath_layer_set_piointbufavail_int(
+ priv_data(dev)->dd);
+ rc = 0;
+ }
+ }
+
+ return rc;
+}
+
+/**
+ * ipath_ether_process_lid_arp - process an ARP message for a LID
+ * @device: the network device
+ * @hdr: the message header
+ */
+static int ipath_ether_process_lid_arp(int device, void *hdr)
+{
+ struct ipath_ether_priv *priv = &private_data[device];
+ struct ether_header *ihdr = (struct ether_header *) hdr;
+ unsigned long flags;
+
+ switch (ihdr->cmd) {
+ case LID_ARP_REQUEST:
+ if ((priv->my_mac_addr[0] == ihdr->mac[0]) &&
+ (priv->my_mac_addr[1] == ihdr->mac[1]) &&
+ (priv->my_mac_addr[2] == ihdr->mac[2])) {
+ _send_lid_message(priv->my_mac_addr,
+ LID_ARP_RESPONSE,
+ ihdr->lrh[3],
+ dev_ipath_ether[device]);
+ }
+ break;
+
+ case LID_ARP_RESPONSE:
+ /* may run in hardirq context, so save and restore irq state */
+ spin_lock_irqsave(&ipath_ether_lock, flags);
+ _add_mac_lid(ihdr->mac, ihdr->lid);
+ spin_unlock_irqrestore(&ipath_ether_lock, flags);
+ break;
+ }
+
+ return 0;
+}
+
+/**
+ * get_state - get fragment state
+ * @table: the fragment state table
+ * @lid: the LID
+ *
+ * The purpose of the fragment reassembly hash table is to reduce the
+ * probability of losing state due to hash collisions.
+ * In order to add the least amount of overhead, there is no locking used
+ * on the lookup and a LID hashes to only one entry.
+ * Locking would be required if we tried to support removal of entries or
+ * dynamically growing the hash table.
+ */
+static inline struct ipath_frag_state *get_state(
+ struct ipath_frag_state *table, uint16_t lid)
+{
+ unsigned int probe = lid % ipath_fragtable_size;
+ struct ipath_frag_state *entry = table + probe;
+
+ return entry;
+}
+
+/**
+ * ipath_ether_rx - receive an ethernet packet
+ * @device: the network device
+ * @hdr: the packet header
+ * @skb: the sk_buff
+ *
+ * Callback handler called by the lower layer.
+ * Note that the skb is now our responsibility to either pass to the
+ * network stack or free it.
+ */
+static int ipath_ether_rx(int device, void *hdr, struct sk_buff *skb)
+{
+ struct ether_header *ihdr = (struct ether_header *) hdr;
+ struct ipath_ether_priv *priv = &private_data[device];
+ struct ipath_frag_state *fs = get_state(priv->fstate, ihdr->lrh[3]);
+ struct sk_buff *lskb;
+ uint16_t *h;
+ unsigned long flags;
+
+ spin_lock_irqsave(&fs->lock, flags);
+ lskb = fs->skb;
+ if (lskb != NULL) {
+ if (fs->lid != ihdr->lrh[3] ||
+ fs->seq_num != ihdr->seq_num ||
+ fs->frag_num != ihdr->frag_num) {
+
+ ipath_eth_warn("Drop %x %x, %u %u, %u %u, "
+ "%u %u, %x\n", fs->lid,
+ ihdr->lrh[3], fs->seq_num,
+ ihdr->seq_num, fs->frag_num,
+ ihdr->frag_num, fs->len,
+ ihdr->len,
+ ihdr->flags); /* XXX */
+
+ dev_kfree_skb_irq(lskb);
+ fs->skb = NULL;
+ if (fs->lid != ihdr->lrh[3])
+ priv->ipath_ether_stats.collisions++;
+ else
+ priv->ipath_ether_stats.rx_dropped++;
+ goto restart;
+ }
+ fs->frag_num++;
+ /*
+ * Linux network stack expects the last buff's next pointer
+ * to be NULL.
+ */
+ if (skb_shinfo(lskb)->frag_list == NULL)
+ skb_shinfo(lskb)->frag_list = skb;
+ else
+ fs->last_skb->next = skb;
+ fs->last_skb = skb;
+ lskb->len += skb->len;
+ lskb->data_len += skb->len;
+ } else {
+ restart:
+ /* Check to be sure this is the first fragment. */
+ if (ihdr->frag_num != 0) {
+ spin_unlock_irqrestore(&fs->lock, flags);
+ dev_kfree_skb_irq(skb);
+ priv->ipath_ether_stats.rx_dropped++;
+ return 0;
+ }
+ skb->dev = dev_ipath_ether[device];
+ skb->ip_summed = (ihdr->flags & NETIF_F_NO_CSUM) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ fs->skb = skb;
+ fs->lid = ihdr->lrh[3]; /* src LID */
+ fs->len = ihdr->len;
+ fs->frag_num = 1; /* next expected frag number */
+ fs->seq_num = ihdr->seq_num;
+ /*
+ * Copy two bytes of the ethernet hdr from the infinipath
+ * hdr
+ */
+ h = (uint16_t *) skb_push(skb, 2);
+ *h = ihdr->first_2_bytes;
+
+ /*
+ * Is this an ARP frame?
+ * The data should now contain the 6 byte destination
+ * ether address, the source ether address, and then
+ * the protocol field.
+ */
+ if (h[6] == htons(ETH_ARP_PROTOCOL)) {
+ /* irqs are already off; serialize hash updates */
+ spin_lock(&ipath_ether_lock);
+ _add_mac_lid(&h[3], ihdr->lrh[3]);
+ spin_unlock(&ipath_ether_lock);
+ }
+ }
+
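+ /* fs->len tracks the bytes still expected; it reaches zero on the last fragment */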
+ fs->len -= skb->len;
+ if (fs->len == 0) {
+ skb = fs->skb;
+ fs->skb = NULL;
+ spin_unlock_irqrestore(&fs->lock, flags);
+
+ /* Stuff the checksum back into the message. */
+ if (ihdr->flags & NETIF_F_HW_CSUM) {
+ /*
+ * Check to be sure the offset is in the first
+ * fragment.
+ */
+ if (ihdr->csum_offset + sizeof(uint16_t) <=
+ skb_headlen(skb)) {
+ *((uint16_t *) (skb->data +
+ ihdr->csum_offset)) =
+ ihdr->csum;
+ } else {
+ /*
+ * This should "never happen" so drop
+ * packet to be safe.
+ */
+ dev_kfree_skb_irq(skb);
+ priv->ipath_ether_stats.rx_dropped++;
+ return 0;
+ }
+ }
+
+ if (ipath_debug & __IPATH_IPATHPD) {
+ int loop_count;
+
+ printk("Recv:\n");
+ for (loop_count = 0; loop_count < skb->len;
+ loop_count++) {
+ if (!(loop_count % 16))
+ printk("\n");
+
+ printk("%02X ", skb->data[loop_count]);
+ }
+
+ printk("\n\n");
+ }
+
+ priv->ipath_ether_stats.rx_packets++;
+ priv->ipath_ether_stats.rx_bytes += skb->len;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ netif_rx(skb);
+ } else
+ spin_unlock_irqrestore(&fs->lock, flags);
+ dev_ipath_ether[device]->last_rx = jiffies;
+
+ return 0;
+}
+
+/**
+ * ipath_ether_interrupt - ether driver interrupt handler
+ * @device: the infinipath device number
+ * @interrupts: the interrupt mask
+ */
+static int ipath_ether_interrupt(int device, uint32_t interrupts)
+{
+ struct ipath_ether_priv *priv = &private_data[device];
+ unsigned wakeup_needed = 0;
+ int rc = 0;
+
+ ipath_eth_cdbg(VERBOSE, "Took ipath_ether_interrupt\n");
+
+ /*
+ * This can happen when hardware initialization fails in some way,
+ * and may avoid other bugs as well.
+ */
+ if ((uint32_t) device >= MAX_IPATH_LAYER_DEVICE ||
+ !dev_ipath_ether[device]) {
+ ipath_eth_warn("ipath device %u not initialized, "
+ "ignoring interrupt\n", device);
+ return 0;
+ }
+
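+ /* retry a transmit that was deferred for want of a PIO buffer */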
+ if (interrupts & IPATH_LAYER_INT_SEND_CONTINUE) {
+ if (atomic_dec_and_test(&send_continue)) {
+ if (priv->cpc.skb)
+ ipath_layer_send_skb(priv->dd, &priv->cpc);
+
+ if (priv->cpc.error == 0) {
+ if (priv->cpc.skb) {
+ dev_kfree_skb_any(priv->cpc.skb);
+ priv->cpc.skb = NULL;
+ }
+
+ netif_wake_queue(dev_ipath_ether[device]);
+ } else {
+ atomic_set(&send_continue, 1);
+
+ rc = 1; /* don't clean the interrupt */
+ }
+ }
+ }
+
+ if (interrupts & IPATH_LAYER_INT_IF_DOWN) {
+ priv->ipath_ether_if_stat = IPATH_LAYER_DOWN;
+ dev_ipath_ether[device]->flags &= ~IFF_UP;
+ netif_stop_queue(dev_ipath_ether[device]);
+ }
+
+ if (interrupts & IPATH_LAYER_INT_LID) {
+ wakeup_needed = 1;
+
+ if (!priv->my_mac_addr[0] && !priv->my_mac_addr[1] &&
+ !priv->my_mac_addr[2]) {
+ if (ipath_layer_get_mac
+ (priv->dd, (uint8_t *) priv->my_mac_addr)) {
+ ipath_eth_warn("Fall back to default OUI, "
+ "couldn't get MAC\n");
+ priv->my_mac_addr[0] = IPATH_SRC_OUI_1 |
+ (IPATH_SRC_OUI_2 << 8);
+ priv->my_mac_addr[1] = IPATH_SRC_OUI_3;
+ }
+ }
+
+ /* convert to network order */
+ priv->my_lid = htons(ipath_layer_get_lid(priv->dd));
+
+ memcpy(dev_ipath_ether[device]->dev_addr,
+ priv->my_mac_addr,
+ dev_ipath_ether[device]->addr_len);
+
+ /*
+ * Otherwise the LID is picked up below, after possible
+ * BCAST processing, or at open time if the mcast LID
+ * times out.
+ */
+ }
+
+ if (interrupts & IPATH_LAYER_INT_BCAST) {
+ wakeup_needed = 1;
+ /*
+ * we may never get this, because some SMs don't support
+ * multicast, so at open, we will do the same thing if the
+ * wait for bcast times out
+ */
+ priv->my_bcast = htons(ipath_layer_get_bcast(priv->dd));
+ }
+
+ if (interrupts & IPATH_LAYER_INT_IF_UP) {
+ /* after LID/MLID processing */
+ priv->ipath_ether_if_stat = IPATH_LAYER_UP;
+ /* in case we get both set as result of open */
+ if (priv->my_lid && priv->my_bcast) {
+ netif_wake_queue(dev_ipath_ether[device]);
+ dev_ipath_ether[device]->flags |= IFF_UP;
+ }
+ }
+
+ if (wakeup_needed) {
+ /*
+ * arguably this should be waiting for lid and mlid,
+ * but since mlid isn't the only possible path for now,
+ * just wait for the lid.
+ */
+ wake_up_interruptible(&priv->lid_wait);
+ }
+
+ return rc;
+}
+
+static int ipath_ether_open(struct net_device *dev)
+{
+ uint32_t mtu;
+ int rc;
+
+ rc = ipath_layer_open(priv_data(dev)->dd, &mtu);
+ if (rc != 0)
+ return rc;
+
+ /*
+ * Wait here until the LID is set; otherwise "standard" networking
+ * over ipath won't work: we would continue on through starting
+ * networking services while ipath is not yet usable, since it can
+ * take up to 30 seconds for the SM and the SMA to exchange
+ * messages and assign our LID.
+ */
+ wait_event_interruptible_timeout(priv_data(dev)->lid_wait,
+ priv_data(dev)->my_lid, 75 * HZ);
+
+ if (!priv_data(dev)->my_lid) {
+ ipath_eth_err("ipath_ether_open timed out waiting for LID -"
+ " can't send packets\n");
+
+ return -EPERM;
+ }
+
+ wait_event_interruptible_timeout(priv_data(dev)->lid_wait,
+ priv_data(dev)->my_bcast, 75 * HZ);
+
+ if (!priv_data(dev)->my_bcast) {
+ ipath_eth_err("ipath_ether_open timed out waiting for "
+ "MLID - can't send packets\n");
+
+ return -EPERM;
+ }
+
+ _send_lid_message(priv_data(dev)->my_mac_addr,
+ LID_ARP_RESPONSE, priv_data(dev)->my_bcast, dev);
+
+ dev->flags |= IFF_UP;
+ netif_wake_queue(dev);
+
+ return 0;
+}
+
+static int ipath_ether_close(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+
+ return 0;
+}
+
+static struct net_device_stats *ipath_ether_get_stats(
+ struct net_device *dev)
+{
+ return &priv_data(dev)->ipath_ether_stats;
+}
+
+static int ipath_ether_change_mtu(struct net_device *dev, int new_mtu)
+{
+ /*
+ * The MTU isn't really limited, but we set an arbitrary limit of
+ * 16 * 2108 - 12 = 33716: sixteen maximum-size infiniband packets,
+ * minus the ethernet header (except for the 2 bytes we carry in
+ * the ether_header header).
+ */
+ if ((new_mtu < 68) || new_mtu > 33716 || (new_mtu & 3))
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static void ipath_ether_set_multicast_list(struct net_device *dev)
+{
+ struct dev_mc_list *mc_mac_entry = dev->mc_list;
+
+ /*
+ * No entries are actually added, but they can be displayed for
+ * debugging purposes.
+ */
+
+ while (mc_mac_entry) {
+ ipath_eth_table("Adding multicast MAC "
+ "[%02x:%02x:%02x:%02x:%02x:%02x]\n",
+ mc_mac_entry->dmi_addr[0],
+ mc_mac_entry->dmi_addr[1],
+ mc_mac_entry->dmi_addr[2],
+ mc_mac_entry->dmi_addr[3],
+ mc_mac_entry->dmi_addr[4],
+ mc_mac_entry->dmi_addr[5]);
+
+ mc_mac_entry = mc_mac_entry->next;
+ }
+}
+
+#ifdef ipath_ether_ioctl_support
+
+/**
+ * ipath_ether_get_settings - get ethernet device settings
+ * @dev: the network device
+ * @ecmd: the results are placed here
+ *
+ * This function is here to allow "ethtool eth<N>" to report something
+ * reasonable for infinipath. We report the values for 10Gb
+ * ethernet, which is reasonably similar.
+ */
+static int ipath_ether_get_settings(struct net_device *dev,
+ struct ethtool_cmd *ecmd)
+{
+ ecmd->supported = SUPPORTED_10000baseT_Full;
+ ecmd->port = PORT_TP;
+ ecmd->transceiver = XCVR_INTERNAL;
+ ecmd->advertising = ADVERTISED_10000baseT_Full;
+ ecmd->speed = SPEED_10000;
+ ecmd->duplex = DUPLEX_FULL;
+ return 0;
+}
+
+static void ipath_ether_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strcpy(info->driver, DRV_NAME);
+ strcpy(info->version, DRV_VERSION);
+ sprintf(info->bus_info, "InfiniPath");
+}
+
+static u32 ipath_ether_get_rx_csum(struct net_device *dev)
+{
+ return 0;
+}
+
+static u32 ipath_ether_get_tx_csum(struct net_device *dev)
+{
+ return (dev->features & NETIF_F_HW_CSUM) != 0;
+}
+
+static int ipath_ether_set_tx_csum(struct net_device *dev, u32 data)
+{
+ if (data)
+ dev->features |= NETIF_F_HW_CSUM;
+ else
+ dev->features &= ~NETIF_F_HW_CSUM;
+ return 0;
+}
+
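+/*
+ * Hooked up as ethtool's get_stats_count; used here as a convenient
+ * trigger for dumping the MAC-to-LID hash table to the debug log.
+ * It always reports zero statistics.
+ */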
+static int ipath_ether_get_tables(struct net_device *dev)
+{
+ int counter;
+ int index;
+ struct ether_hash *hash_entry;
+ uint8_t *mac_byte_ptr;
+ int num_of_entries = 0;
+
+ /* Only dump the hash table if the interface is up */
+ if (!(dev->flags & IFF_UP))
+ return 0;
+
+ ipath_eth_table("Dumping hash table ..\n");
+ for (counter = 0; counter < MAX_HASH_ENTRIES; counter++) {
+ hash_entry = &hash_table[counter];
+
+ if (hash_entry->lid == 0)
+ continue;
+
+ index = 1;
+ do {
+ num_of_entries++;
+
+ mac_byte_ptr = (uint8_t *) hash_entry->mac;
+ ipath_eth_table("%4d.%02d MAC = "
+ "%02x:%02x:%02x:%02x:%02x:%02x, "
+ "LID = %4d [0x%04x]\n",
+ counter, index++, mac_byte_ptr[0],
+ mac_byte_ptr[1], mac_byte_ptr[2],
+ mac_byte_ptr[3], mac_byte_ptr[4],
+ mac_byte_ptr[5],
+ ntohs(hash_entry->lid),
+ ntohs(hash_entry->lid));
+ } while ((hash_entry = hash_entry->next) != NULL);
+ }
+
+ ipath_eth_table("# of entries is %i\n", num_of_entries);
+
+ return 0;
+}
+
+static struct ethtool_ops ipath_ether_ethtool_ops = {
+ .get_settings = ipath_ether_get_settings,
+ .get_drvinfo = ipath_ether_get_drvinfo,
+ .get_rx_csum = ipath_ether_get_rx_csum,
+ .get_tx_csum = ipath_ether_get_tx_csum,
+ .set_tx_csum = ipath_ether_set_tx_csum,
+ .set_sg = ethtool_op_set_sg,
+ .get_sg = ethtool_op_get_sg,
+ .get_tso = ethtool_op_get_tso,
+ .get_stats_count = ipath_ether_get_tables
+};
+
+static int ipath_ether_ioctl(struct net_device *dev, struct ifreq *ifr,
+ int cmd)
+{
+ switch (cmd) {
+
+ case SIOCGMIIPHY: /* Get address of MII PHY in use. */
+ ipath_eth_dbg("Get address of MII PHY in use [%x]\n", cmd);
+ return 0;
+ case SIOCGMIIREG: /* Read MII PHY register. */
+ ipath_eth_dbg("Read MII PHY register [%x]\n", cmd);
+ return 0;
+ case SIOCSMIIREG: /* Write to MII PHY register. */
+ ipath_eth_dbg("Write to MII PHY register [%x]\n", cmd);
+ return 0;
+
+ case 0x8b01 /*SIOCGIWNAME*/:
+ /*
+ * Wireless getname; see this on every startup, so
+ * don't complain about it; don't want to include
+ * wireless.h, so just use the value
+ */
+ return -EOPNOTSUPP;
+
+ default:
+ /*
+ * need to make this conditional, or remove it, some day
+ * for now, we want to know about ioctls we get that we
+ * don't support
+ */
+ ipath_eth_dbg("got unsupported ipath_ether_ioctl with "
+ "cmd = %x\n", cmd);
+ return -EOPNOTSUPP;
+ }
+}
+#endif
+
+int __init ipath_ether_probe(int device)
+{
+ struct ipath_devdata *dd;
+ int rc = -ENODEV;
+ unsigned int i;
+
+ /*
+ * check for being able to register first, in case fewer infinipath
+ * devices are present than are supported; we don't want to register
+ * network devices for non-existent infinipath devices.
+ */
+ rc = ipath_layer_register(device,
+ ipath_ether_interrupt,
+ ipath_ether_rx,
+ OPCODE_ITH4X,
+ ipath_ether_process_lid_arp, OPCODE_ITH4X,
+ &dd);
+ if (rc < 0) {
+ ipath_eth_warn("Unable to register device %u: %d\n", device,
+ -rc);
+ /*
+ * this could be just fine, since we may have fewer than the
+ * max supported chips present
+ */
+ return -ENODEV;
+ }
+
+ dev_ipath_ether[device] = alloc_etherdev(32);
+ if (dev_ipath_ether[device] == NULL) {
+ rc = -ENOMEM;
+ goto ipath_ether_probe_exit_level_0;
+ }
+
+ dev_ipath_ether[device]->priv = &private_data[device];
+
+ memset(&private_data[device], 0, sizeof(struct ipath_ether_priv));
+
+ SET_MODULE_OWNER(dev_ipath_ether[device]);
+
+ private_data[device].dd = dd;
+ private_data[device].device_id = device;
+ init_waitqueue_head(&private_data[device].lid_wait);
+
+ dev_ipath_ether[device]->flags &= ~IFF_UP;
+
+ dev_ipath_ether[device]->mtu = 16384;
+
+ /* The ipath_ether-specific entries in the device structure. */
+ dev_ipath_ether[device]->open = ipath_ether_open;
+ dev_ipath_ether[device]->hard_start_xmit = ipath_ether_start_xmit;
+ dev_ipath_ether[device]->stop = ipath_ether_close;
+ dev_ipath_ether[device]->get_stats = ipath_ether_get_stats;
+ dev_ipath_ether[device]->change_mtu = ipath_ether_change_mtu;
+ dev_ipath_ether[device]->set_multicast_list =
+ ipath_ether_set_multicast_list;
+ dev_ipath_ether[device]->tx_timeout = NULL;
+ dev_ipath_ether[device]->watchdog_timeo = TX_TIMEOUT;
+ dev_ipath_ether[device]->features |= NETIF_F_HW_CSUM | NETIF_F_SG |
+ NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;
+
+#ifdef ipath_ether_ioctl_support
+ dev_ipath_ether[device]->do_ioctl = ipath_ether_ioctl;
+ dev_ipath_ether[device]->ethtool_ops = &ipath_ether_ethtool_ops;
+#else
+ dev_ipath_ether[device]->do_ioctl = NULL;
+#endif
+
+ /* probe runs in process context, so a sleeping allocation is fine */
+ private_data[device].fstate =
+ kzalloc(ipath_fragtable_size *
+ sizeof(struct ipath_frag_state), GFP_KERNEL);
+
+ if (private_data[device].fstate == NULL) {
+ rc = -ENOMEM;
+ goto ipath_ether_probe_exit_level_1;
+ }
+
+ for (i = 0; i < ipath_fragtable_size; i++)
+ spin_lock_init(&private_data[device].fstate[i].lock);
+
+ /* make sure that the queue is inactive */
+ netif_stop_queue(dev_ipath_ether[device]);
+
+ /*
+ * make an attempt to get our MAC address before registering with
+ * the network layer. This works as long as we are not overriding
+ * the GUID or getting it from some method other than the flash. It
+ * increases the likelihood of SuSE network configuration working,
+ * and is pretty much the right thing to do, in any case.
+ */
+ (void)ipath_layer_get_mac(
+ private_data[device].dd,
+ (uint8_t *) private_data[device].my_mac_addr);
+ memcpy(dev_ipath_ether[device]->dev_addr,
+ private_data[device].my_mac_addr,
+ dev_ipath_ether[device]->addr_len);
+
+ strcpy(dev_ipath_ether[device]->name, "eth%d");
+ rc = register_netdev(dev_ipath_ether[device]);
+ if (rc != 0)
+ goto ipath_ether_probe_exit_level_1;
+
+ private_data[device].ipath_ether_stats.tx_fifo_errors = 0;
+ private_data[device].ipath_ether_stats.tx_carrier_errors = 0;
+
+ return 0;
+
+ipath_ether_probe_exit_level_1:
+ if (private_data[device].fstate != NULL) {
+ kfree(private_data[device].fstate);
+ private_data[device].fstate = NULL;
+ }
+
+ free_netdev(dev_ipath_ether[device]);
+
+ipath_ether_probe_exit_level_0:
+ return rc;
+}
+
+int __init ipath_ether_init(void)
+{
+ int counter, nfound = 0;
+ int rc, lasterr = 0;
+
+ /* safety checks */
+ if (!&ipath_debug) {
+ /*
+ * This has occasionally been seen when the module load code
+ * has errors loading dependent modules. This prevents an
+ * oops, and makes it more obvious what happened. Have to
+ * use printk() directly for this one
+ */
+ printk(KERN_ERR "Module error, %s loading, but ipath_core "
+ "not loaded!\n", DRV_NAME);
+ return -ENODEV;
+ }
+ if (sizeof(struct ips_message_header) !=
+ sizeof(struct ether_header)) {
+ ipath_eth_err("FATAL ERROR (ipath_ether_init): header size "
+ "is wrong [%i<>%i]!!!\n",
+ (int)sizeof(struct ips_message_header),
+ (int)sizeof(struct ether_header));
+ return -ENODEV;
+ }
+
+ number_of_devices = ipath_layer_get_num_of_dev();
+
+ for (counter = 0; counter < number_of_devices; counter++) {
+ rc = ipath_ether_probe(counter);
+ if (rc)
+ lasterr = rc;
+ else
+ nfound++;
+ }
+ if (!nfound)
+ return lasterr; /* no usable devices were found */
+
+ return 0;
+}
+
+void __exit ipath_ether_exit(void)
+{
+ int counter;
+
+ for (counter = 0; counter < number_of_devices; counter++) {
+ ipath_layer_close(private_data[counter].dd);
+
+ if (!dev_ipath_ether[counter])
+ /*
+ * never registered, probably infinipath device
+ * not present
+ */
+ continue;
+ unregister_netdev(dev_ipath_ether[counter]);
+ free_netdev(dev_ipath_ether[counter]);
+
+ dev_ipath_ether[counter] = NULL;
+
+ if (private_data[counter].fstate != NULL) {
+ kfree(private_data[counter].fstate);
+ private_data[counter].fstate = NULL;
+ }
+ }
+
+ /* every interface is unregistered now; free the shared hash entries */
+ while (all_hash_entries) {
+ struct ether_hash *next = all_hash_entries->next;
+
+ kfree(all_hash_entries);
+ all_hash_entries = next;
+ }
+}