[PATCH 05/13] [RFC] ipath LLD core, part 2

From: Roland Dreier
Date: Fri Dec 16 2005 - 18:53:21 EST


Next part of ipath core driver

---

drivers/infiniband/hw/ipath/ipath_driver.c | 2290 ++++++++++++++++++++++++++++
1 files changed, 2290 insertions(+), 0 deletions(-)

fc2b052ff2abadc8547dc1b319883f9c942b0ae4
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
index df650d6..0dee4ce 100644
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -2587,3 +2587,2293 @@ static int ipath_get_unit_counters(struc
return -EFAULT;
return ipath_get_counters(c.unit, (struct infinipath_counters *)c.data);
}
+
+/*
+ * ioctls for the control device, which is useful when you don't want
+ * to open the main device and use up a port.
+ */
+
+static int ipath_ctrl_ioctl(struct file *fp, unsigned int cmd, unsigned long a)
+{
+ int ret = 0;
+
+ switch (cmd) {
+ case IPATH_GETSTATS: /* return driver stats */
+ ret = ipath_get_stats((struct infinipath_stats *) a);
+ break;
+ case IPATH_GETUNITCOUNTERS: /* return chip counters */
+ ret = ipath_get_unit_counters((struct infinipath_getunitcounters *) a);
+ break;
+ default:
+ _IPATH_DBG("%x not a valid CTRL ioctl for infinipath\n", cmd);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+long ipath_ioctl(struct file *fp, unsigned int cmd, unsigned long a)
+{
+ int ret = 0;
+ ipath_portdata *pd;
+ ipath_type unit;
+ uint32_t tmp, i, nactive = 0;
+
+ if (cmd == IPATH_GETUNITS) {
+ /*
+ * Return number of units supported. This is called
+ * here as this ioctl is needed via both the normal and
+ * diags interface, and it does not need the device to
+ * be opened.
+ */
+ return ipath_get_units();
+ }
+
+ pd = port_fp(fp);
+ if (!pd) {
+ if (IPATH_SMA == (unsigned long)fp->private_data)
+ /* sma separate; no pd */
+ return (long)ipath_sma_ioctl(fp, cmd, a);
+#ifdef IPATH_DIAG
+ else if (IPATH_DIAG == (unsigned long)fp->private_data)
+ /* diags separate; no pd */
+ return (long)ipath_diags_ioctl(fp, cmd, a);
+#endif
+ else if (IPATH_CTRL == (unsigned long)fp->private_data)
+ /* ctrl separate; no pd */
+ return (long)ipath_ctrl_ioctl(fp, cmd, a);
+ else {
+ _IPATH_DBG("NULL pd from fp (%p), cmd=%x\n", fp, cmd);
+ return -ENODEV; /* bad; shouldn't ever happen */
+ }
+ }
+
+ unit = pd->port_unit;
+
+ if ((devdata[unit].ipath_flags & IPATH_PRESENT)
+ && (cmd == IPATH_GETCOUNTERS || cmd == IPATH_GETSTATS
+ || cmd == IPATH_READ_EEPROM || cmd == IPATH_WRITE_EEPROM)) {
+ /* allowed to do these, as long as chip is accessible */
+ } else if (!(devdata[unit].ipath_flags & IPATH_INITTED)) {
+ _IPATH_DBG
+ ("%s not initialized (flags=0x%x), failing ioctl #%u\n",
+ ipath_get_unit_name(unit), devdata[unit].ipath_flags,
+ _IOC_NR(cmd));
+ ret = -ENODEV;
+ } else
+ if ((devdata[unit].
+ ipath_flags & (IPATH_LINKDOWN | IPATH_LINKUNK))) {
+ _IPATH_DBG("%s link is down, failing ioctl #%u\n",
+ ipath_get_unit_name(unit), _IOC_NR(cmd));
+ ret = -ENETDOWN;
+ }
+
+ if (ret)
+ return ret;
+
+ /* normal driver ioctls, not sim-specific */
+ switch (cmd) {
+ case IPATH_USERINIT:
+ /* real application is starting on a port */
+ ret = ipath_do_user_init(pd, (struct ipath_user_info *) a);
+ break;
+ case IPATH_BASEINFO:
+ /* it's done the init, now return the info it needs */
+ ret = ipath_get_baseinfo(pd, (struct ipath_base_info *) a);
+ break;
+ case IPATH_GETPORT:
+ /*
+ * just return the unit:port that we were assigned,
+ * and the number of active chips. This is is used for
+ * doing sched_setaffinity() before initialization.
+ */
+ for (i = 0; i < infinipath_max; i++)
+ if ((devdata[i].ipath_flags & IPATH_PRESENT)
+ && devdata[i].ipath_kregbase
+ && devdata[i].ipath_lid
+ && !(devdata[i].ipath_flags &
+ (IPATH_LINKDOWN | IPATH_LINKUNK)))
+ nactive++;
+ tmp = (nactive << 24) | (unit << 16) | unit;
+ if(copy_to_user((void *)a, &tmp, sizeof(unit)))
+ ret = EFAULT;
+ break;
+ case IPATH_GETLID:
+ /* get LID for given unit # */
+ ret = ipath_layer_get_lid(a);
+ break;
+ case IPATH_UPDM_TID: /* update expected TID entries */
+ ret = ipath_tid_update(pd, (struct _tidupd *)a);
+ break;
+ case IPATH_FREE_TID: /* free expected TID entries */
+ ret = ipath_tid_free(pd, (struct _tidupd *)a);
+ break;
+ case IPATH_GETCOUNTERS: /* return chip counters */
+ ret = ipath_get_counters(unit, (struct infinipath_counters *)a);
+ break;
+ case IPATH_GETSTATS: /* return driver stats */
+ ret = ipath_get_stats((struct infinipath_stats *) a);
+ break;
+ case IPATH_GETUNITCOUNTERS: /* return chip counters */
+ ret = ipath_get_unit_counters((struct infinipath_getunitcounters *) a);
+ break;
+ case IPATH_SET_PKEY: /* set a partition key */
+ ret = ipath_set_partkey(pd, (uint16_t) a);
+ break;
+ case IPATH_RCVCTRL: /* error handling to manage the rcvq */
+ ret = ipath_manage_rcvq(pd, (uint16_t) a);
+ break;
+ case IPATH_WRITE_EEPROM:
+ /* write the eeprom (for GUID) */
+ ret = ipath_wr_eeprom(pd, (struct ipath_eeprom_req *)a);
+ break;
+ case IPATH_READ_EEPROM: /* read the eeprom (for GUID) */
+ ret = ipath_rd_eeprom(pd->port_unit,
+ (struct ipath_eeprom_req *)a);
+ break;
+ case IPATH_WAIT:
+ /*
+ * wait for a receive intr for this port, or PIO avail
+ */
+ ret = ipath_wait_intr(pd, (uint32_t) a);
+ break;
+
+ default:
+ _IPATH_DBG("cmd %x (%c,%u) not a valid ioctl\n", cmd,
+ _IOC_TYPE(cmd), _IOC_NR(cmd));
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static loff_t ipath_llseek(struct file *fp, loff_t off, int whence)
+{
+ loff_t ret;
+
+ /* range checking is done where offset is used, not here. */
+ down(&fp->f_dentry->d_inode->i_sem);
+ if (!whence)
+ ret = fp->f_pos = off;
+ else if (whence == 1) {
+ fp->f_pos += off;
+ ret = fp->f_pos;
+ } else
+ ret = -EINVAL;
+ up(&fp->f_dentry->d_inode->i_sem);
+ _IPATH_DBG("New offset %llx from seek %llx whence=%d\n", fp->f_pos, off,
+ whence);
+
+ return ret;
+}
+
+/*
+ * We use this to have a shared buffer between the kernel and the user
+ * code for the rcvhdr queue, egr buffers, and the per-port user regs and pio
+ * buffers in the chip. We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+
+static struct vm_operations_struct ipath_vmops = {
+ .nopage = ipath_nopage,
+};
+
+static int ipath_mmap(struct file *fp, struct vm_area_struct *vm)
+{
+ int setlen = 0, ret = -EINVAL;
+ ipath_portdata *pd;
+
+ if (fp->private_data && 255UL < (unsigned long)fp->private_data) {
+ pd = port_fp(fp);
+ {
+ /*
+ * This is the ipath_do_user_init() code,
+ * mapping the shared buffers into the user
+ * process. The address referred to by vm_pgoff
+ * is the virtual, not physical, address; we only
+ * do one mmap for each space mapped.
+ */
+ uint64_t pgaddr, ureg;
+
+ pgaddr = vm->vm_pgoff << PAGE_SHIFT;
+
+ /*
+ * note that ureg does *NOT* have the kregvirt
+ * as part of it, to be sure that for 32 bit
+ * programs, we don't end up trying to map
+ * a > 44 address. Has to match ipath_get_baseinfo()
+ * code that sets __spi_uregbase
+ */
+
+ ureg = devdata[pd->port_unit].ipath_uregbase +
+ devdata[pd->port_unit].ipath_palign * pd->port_port;
+
+ _IPATH_MMDBG
+ ("ushare: pgaddr %llx vm_start=%lx, vmlen %lx\n",
+ pgaddr, vm->vm_start, vm->vm_end - vm->vm_start);
+
+ if (pgaddr == ureg) {
+ /* it's the real hardware, so io_remap works */
+ unsigned long phys;
+ if ((vm->vm_end - vm->vm_start) > PAGE_SIZE) {
+ _IPATH_INFO
+ ("FAIL mmap userreg: reqlen %lx > PAGE\n",
+ vm->vm_end - vm->vm_start);
+ ret = -EFAULT;
+ } else {
+ phys =
+ devdata[pd->port_unit].
+ ipath_physaddr + ureg;
+ vm->vm_page_prot =
+ pgprot_noncached(vm->vm_page_prot);
+
+ vm->vm_flags |=
+ VM_DONTCOPY | VM_DONTEXPAND | VM_IO
+ | VM_SHM | VM_LOCKED;
+ ret =
+ io_remap_pfn_range(vm, vm->vm_start, phys >> PAGE_SHIFT,
+ vm->vm_end - vm->vm_start,
+ vm->vm_page_prot);
+ }
+ } else if (pgaddr == pd->port_piobufs) {
+ /*
+ * We use io_remap, so there is not a
+ * nopage handler for this case!
+ * when we map the PIO buffers, we want
+ * to map them as writeonly, no read possible.
+ */
+
+ unsigned long phys;
+ if ((vm->vm_end - vm->vm_start) >
+ (devdata[pd->port_unit].ipath_pbufsport *
+ devdata[pd->port_unit].ipath_palign)) {
+ _IPATH_INFO
+ ("FAIL mmap userreg: reqlen %lx > PAGE\n",
+ vm->vm_end - vm->vm_start);
+ ret = -EFAULT;
+ } else {
+ phys =
+ devdata[pd->port_unit].
+ ipath_physaddr + pd->port_piobufs;
+ /*
+ * Do *NOT* mark this as
+ * non-cached (PWT bit), or we
+ * don't get the write combining
+ * behavior we want on the
+ * PIO buffers!
+ * vm->vm_page_prot = pgprot_noncached(vm->vm_page_prot);
+ */
+
+#if defined (pgprot_writecombine) && defined(_PAGE_MA_WC)
+ /* Enable WC */
+ vm->vm_page_prot =
+ pgprot_writecombine(vm->
+ vm_page_prot);
+#endif
+
+ if (vm->vm_flags & VM_READ) {
+ _IPATH_INFO
+ ("Can't map piobufs as readable (flags=%lx)\n",
+ vm->vm_flags);
+ ret = -EPERM;
+ } else {
+ /*
+ * don't allow them to
+ * later change to readable
+ * with mprotect
+ */
+
+ vm->vm_flags &= ~VM_MAYWRITE;
+
+ vm->vm_flags |=
+ VM_DONTCOPY | VM_DONTEXPAND
+ | VM_IO | VM_SHM |
+ VM_LOCKED;
+ ret =
+ io_remap_pfn_range(vm, vm->vm_start, phys >> PAGE_SHIFT,
+ vm->vm_end - vm->vm_start,
+ vm->vm_page_prot);
+ }
+ }
+ } else if (pgaddr == (uint64_t) pd->port_rcvegr_phys) {
+ if (!pd->port_rcvegrbuf_virt)
+ return -EFAULT;
+ /*
+ * page_alloc'ed egr memory, not
+ * physically contiguous
+ * *BUT* to work around the 32 bit mmap64
+ * only handling 44 bits, we have remapped
+ * the first page to kernel virtual, so
+ * we have to do the conversion here to
+ * get back to the original virtual
+ * address (not contig pages) so we have
+ * to mark this for special handling.
+ */
+
+ /*
+ * not egrbufs * egrsize since they are
+ * no longer virtually contiguous.
+ */
+ setlen = pd->port_rcvegrbuf_chunks * PAGE_SIZE *
+ (1 << pd->port_rcvegrbuf_order);
+ if ((vm->vm_end - vm->vm_start) > setlen) {
+ _IPATH_INFO
+ ("FAIL on egr bufs: reqlen %lx > actual %x\n",
+ vm->vm_end - vm->vm_start, setlen);
+ ret = -EFAULT;
+ } else {
+ vm->vm_ops = &ipath_vmops;
+ vm->vm_private_data =
+ (void *)(3 | (uint64_t) pd);
+ if (vm->vm_flags & VM_WRITE) {
+ _IPATH_INFO
+ ("Can't map eager buffers as writable (flags=%lx)\n",
+ vm->vm_flags);
+ ret = -EPERM;
+ } else {
+ /*
+ * don't allow them to
+ * later change to writeable
+ * with mprotect
+ */
+
+ vm->vm_flags &= ~VM_MAYWRITE;
+ _IPATH_MMDBG
+ ("egrbufs, set private to %p, not %llx\n",
+ vm->vm_private_data,
+ pgaddr);
+ ret = 0;
+ }
+ }
+ } else if (pgaddr == (uint64_t) pd->port_rcvhdrq_phys) {
+ /*
+ * kmalloc'ed memory, physically
+ * contiguous; this is from
+ * spi_rcvhdr_base; we allow user to
+ * map read-write so they can write
+ * hdrq entries to allow protocol code
+ * to directly poll whether a hdrq entry
+ * has been written.
+ */
+ setlen =
+ round_up(devdata[pd->port_unit].
+ ipath_rcvhdrcnt *
+ devdata[pd->port_unit].
+ ipath_rcvhdrentsize *
+ sizeof(uint32_t), PAGE_SIZE);
+ if ((vm->vm_end - vm->vm_start) > setlen) {
+ _IPATH_INFO
+ ("FAIL on rcvhdrq: reqlen %lx > actual %x\n",
+ vm->vm_end - vm->vm_start, setlen);
+ ret = -EFAULT;
+ } else {
+ vm->vm_ops = &ipath_vmops;
+ vm->vm_private_data =
+ (void *)(pgaddr | 1);
+ ret = 0;
+ }
+ }
+ /*
+ * when we map the PIO bufferavail registers,
+ * we want to map them as readonly, no read
+ * possible.
+ */
+ else if (pgaddr ==
+ devdata[pd->port_unit].
+ ipath_pioavailregs_phys) {
+ /*
+ * kmalloc'ed memory, physically
+ * contiguous, one page only, readonly
+ */
+ setlen = PAGE_SIZE;
+ if ((vm->vm_end - vm->vm_start) > setlen) {
+ _IPATH_INFO
+ ("FAIL on pioavailregs_dma: reqlen %lx > actual %x\n",
+ vm->vm_end - vm->vm_start, setlen);
+ ret = -EFAULT;
+ } else if (vm->vm_flags & VM_WRITE) {
+ _IPATH_INFO
+ ("Can't map pioavailregs as writable (flags=%lx)\n",
+ vm->vm_flags);
+ ret = -EPERM;
+ } else {
+ /*
+ * don't allow them to later
+ * change with mprotect
+ */
+ vm->vm_flags &= ~VM_MAYWRITE;
+ vm->vm_ops = &ipath_vmops;
+ vm->vm_private_data =
+ (void *)(pgaddr | 2);
+ ret = 0;
+ }
+ }
+ if (!ret && setlen) {
+ /* keep page(s) from being swapped, etc. */
+ vm->vm_flags |=
+ VM_DONTEXPAND | VM_DONTCOPY | VM_RESERVED |
+ VM_IO | VM_SHM;
+ } else {
+ /* failure, or io_remap case */
+ vm->vm_private_data = NULL;
+ if (ret)
+ _IPATH_INFO
+ ("Failure %d, setlen %d, on addr %lx, off %lx\n",
+ ret, setlen, vm->vm_start,
+ vm->vm_pgoff);
+ }
+ }
+ } else /* something very wrong */
+ _IPATH_INFO("fp_private wasn't set, no mmaping\n");
+
+ return ret;
+}
+
+/* page fault handler. For each page that is first faulted in from the
+ * mmap'ed shared address buffer, this routine is called.
+ * It's always for a single page.
+ * We use the low bits of the private_data field to tell us which case
+ * we are dealing with.
+ */
+
+static struct page *ipath_nopage(struct vm_area_struct *vma, unsigned long addr,
+ int *type)
+{
+ unsigned long avirt, /* the original [kv]malloc virtual address */
+ paddr, /* physical address */
+ off; /* calculated page offset */
+ uint32_t which, chunk;
+ void *vaddr = NULL;
+ ipath_portdata *pd;
+ struct page *vpage = NOPAGE_SIGBUS;
+
+ if (!(avirt = (unsigned long)vma->vm_private_data)) {
+ _IPATH_DBG("NULL private_data, vm_pgoff %lx\n", vma->vm_pgoff);
+ which = 0; /* quiet incorrect gcc warning */
+ goto done;
+ }
+ which = avirt & 3;
+ avirt &= ~3ULL;
+
+ if (addr > vma->vm_end) {
+ _IPATH_DBG("trying to fault in addr %lx past end\n", addr);
+ goto done;
+ }
+
+ /*
+ * most of our memory is vmalloc'ed, but rcvhdr Q is physically
+ * contiguous, either from kmalloc or alloc_pages()
+ * pgoff is virtual.
+ */
+ switch (which) {
+ case 1: /* rcvhdrq_phys */
+ /* should always be 0 */
+ off = vma->vm_pgoff - (avirt >> PAGE_SHIFT);
+ paddr = addr - vma->vm_start + (off << PAGE_SHIFT) + avirt;
+ _IPATH_MMDBG("hdrq %lx (u=%lx)\n", paddr, addr);
+ vpage = pfn_to_page(paddr >> PAGE_SHIFT);
+ break;
+ case 2: /* PIO buffer avail regs */
+ /* should always be 0 */
+ off = vma->vm_pgoff - (avirt >> PAGE_SHIFT);
+ paddr = (addr - vma->vm_start + (off << PAGE_SHIFT) + avirt);
+ _IPATH_MMDBG("pioav %lx\n", paddr);
+ vpage = pfn_to_page(paddr >> PAGE_SHIFT);
+ break;
+ case 3:
+ /*
+ * rcvegrbufs; page_alloc()'ed like rcvhdrq, but we
+ * have to pick out which page_alloc()'ed chunk it is.
+ */
+ pd = (ipath_portdata *) avirt;
+ /* this should always be 0 */
+ off =
+ vma->vm_pgoff -
+ ((unsigned long)pd->port_rcvegr_phys >> PAGE_SHIFT);
+ off = (addr - vma->vm_start + (off << PAGE_SHIFT));
+
+ chunk = off / (PAGE_SIZE * (1 << pd->port_rcvegrbuf_order));
+ if (chunk > pd->port_rcvegrbuf_chunks)
+ _IPATH_DBG("Bad egrbuf chunk %u (max %u); off = %lx\n",
+ chunk, pd->port_rcvegrbuf_chunks, off);
+ vaddr = pd->port_rcvegrbuf_virt[chunk] +
+ off % (PAGE_SIZE * (1 << pd->port_rcvegrbuf_order));
+ paddr = virt_to_phys(vaddr);
+ vpage = pfn_to_page(paddr >> PAGE_SHIFT);
+ _IPATH_MMDBG("egrb %p,%lx\n", vaddr, paddr);
+ break;
+ default:
+ _IPATH_DBG
+ ("trying to fault in mmap addr %lx (avirt %lx) that isn't known (case %u)\n",
+ addr, avirt, which);
+ }
+
+done:
+ if (vpage != NOPAGE_SIGBUS && vpage != NOPAGE_OOM) {
+ if (which == 2)
+ /*
+ * media/video/video-buf.c doesn't do get_page() for
+ * buffer from alloc_page(). Hmmm.
+ *
+ * keep it from being swapped, complaints if
+ * process exits before we [vf]free it, etc,
+ * and keep shared page counts correct, etc.
+ */
+ get_page(vpage);
+ mark_page_accessed(vpage);
+ if (type)
+ *type = VM_FAULT_MINOR;
+ } else
+ _IPATH_DBG("faultin of addr %lx vaddr %p avirt %lx failed\n",
+ addr, vaddr, avirt);
+
+ return vpage;
+}
+
+/* this is separate to allow for better optimization of ipath_intr() */
+
+static void ipath_bad_intr(const ipath_type t, uint32_t * unexpectp)
+{
+ ipath_devdata *dd = &devdata[t];
+
+ /*
+ * sometimes happen during driver init and unload, don't want
+ * to process any interrupts at that point
+ */
+
+ /* this is just a bandaid, not a fix, if something goes badly wrong */
+ if (++*unexpectp > 100) {
+ if (++*unexpectp > 105) {
+ /*
+ * ok, we must be taking somebody else's interrupts,
+ * due to a messed up mptable and/or PIRQ table, so
+ * unregister the interrupt. We've seen this
+ * during linuxbios development work, and it
+ * may happen in the future again.
+ */
+ if (dd->pcidev && dd->pcidev->irq) {
+ _IPATH_UNIT_ERROR(t,
+ "Now %u unexpected interrupts, unregistering interrupt handler\n",
+ *unexpectp);
+ _IPATH_DBG("free_irq of irq %x\n",
+ dd->pcidev->irq);
+ free_irq(dd->pcidev->irq, dd);
+ dd->pcidev->irq = 0;
+ }
+ }
+ if (ipath_kget_kreg32(t, kr_intmask)) {
+ _IPATH_UNIT_ERROR(t,
+ "%u unexpected interrupts, disabling interrupts completely\n",
+ *unexpectp);
+ /* disable all interrupts, something is very wrong */
+ ipath_kput_kreg(t, kr_intmask, 0ULL);
+ }
+ } else if (*unexpectp > 1)
+ _IPATH_DBG
+ ("Interrupt when not ready, should not happen, ignoring\n");
+}
+
+/* separate routine, for better optimization of ipath_intr() */
+
+static void ipath_bad_regread(const ipath_type t)
+{
+ static int allbits;
+ ipath_devdata *dd = &devdata[t];
+
+ /*
+ * We print the message and disable interrupts, in hope of
+ * having a better chance of debugging the problem.
+ */
+ _IPATH_UNIT_ERROR(t,
+ "Read of interrupt status failed (all bits set)\n");
+ if (allbits++) {
+ /* disable all interrupts, something is very wrong */
+ ipath_kput_kreg(t, kr_intmask, 0ULL);
+ if (allbits == 2) {
+ _IPATH_UNIT_ERROR(t,
+ "Still bad interrupt status, unregistering interrupt\n");
+ free_irq(dd->pcidev->irq, dd);
+ dd->pcidev->irq = 0;
+ } else if (allbits > 2) {
+ if ((allbits % 10000) == 0)
+ printk(".");
+ } else
+ _IPATH_UNIT_ERROR(t,
+ "Disabling interrupts, multiple errors\n");
+ }
+}
+
+static irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs)
+{
+ ipath_devdata *dd = data;
+ const ipath_type t = IPATH_UNIT(dd);
+ uint32_t istat = ipath_kget_kreg32(t, kr_intstatus);
+ uint64_t estat = 0;
+ static unsigned unexpected = 0;
+
+ if (unlikely(!istat)) {
+ ipath_stats.sps_nullintr++;
+ /* not our interrupt, or already handled */
+ return IRQ_NONE;
+ }
+ if (unlikely(istat == ~0)) {
+ ipath_bad_regread(t);
+ /* don't know if it was our interrupt or not */
+ return IRQ_NONE;
+ }
+
+ ipath_stats.sps_ints++;
+
+ /*
+ * this needs to be flags&initted, not statusp, so we keep
+ * taking interrupts even after link goes down, etc.
+ * Also, we *must* clear the interrupt at some point, or we won't
+ * take it again, which can be real bad for errors, etc...
+ */
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ ipath_bad_intr(t, &unexpected);
+ return IRQ_NONE;
+ }
+ if (unexpected)
+ unexpected = 0;
+
+ if (istat & ~infinipath_i_bitsextant)
+ _IPATH_UNIT_ERROR(t,
+ "interrupt with unknown interrupts %x set\n",
+ istat & (uint32_t) ~ infinipath_i_bitsextant);
+
+ if (istat & INFINIPATH_I_ERROR) {
+ ipath_stats.sps_errints++;
+ estat = ipath_kget_kreg64(t, kr_errorstatus);
+ if (!estat)
+ _IPATH_INFO
+ ("error interrupt (%x), but no error bits set!\n",
+ istat);
+ else if (estat == ~0ULL)
+ /*
+ * should we try clearing all, or hope next read
+ * works?
+ */
+ _IPATH_UNIT_ERROR(t,
+ "Read of error status failed (all bits set); ignoring\n");
+ else
+ ipath_handle_errors(t, estat);
+ }
+
+ if (istat & INFINIPATH_I_GPIO) {
+ /* Clear GPIO status bit 2 */
+ ipath_kput_kreg(t, kr_gpio_clear, (uint64_t)(1 << 2));
+
+ /*
+ * Packets are available in the port 0 receive queue.
+ * Eventually this needs to be generalized to check
+ * IPATH_GPIO_INTR, and the specific GPIO bit, when
+ * GPIO interrupts start being used for other things.
+ * We skip that now to improve performance.
+ */
+ ipath_kreceive(t);
+ }
+
+ /*
+ * clear the ones we will deal with on this round
+ * We clear it early, mostly for receive interrupts, so we
+ * know the chip will have seen this by the time we process
+ * the queue, and will re-interrupt if necessary. The processor
+ * itself won't take the interrupt again until we return.
+ */
+ ipath_kput_kreg(t, kr_intclear, istat);
+
+ if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
+ atomic_clear_mask(INFINIPATH_S_PIOINTBUFAVAIL,
+ &dd->ipath_sendctrl);
+ ipath_kput_kreg(t, kr_sendctrl, dd->ipath_sendctrl);
+
+ if (dd->ipath_portpiowait) {
+ uint32_t i;
+ /*
+ * start from port 1, since for now port 0 is
+ * never using wait_event for PIO
+ */
+ for (i = 1;
+ dd->ipath_portpiowait && i < dd->ipath_cfgports;
+ i++) {
+ if (dd->ipath_pd[i]
+ && dd->ipath_portpiowait & (1U << i)) {
+ atomic_clear_mask(1U << i,
+ &dd->
+ ipath_portpiowait);
+ if (dd->ipath_pd[i]->
+ port_flag & IPATH_PORT_WAITING_PIO)
+ {
+ dd->ipath_pd[i]->port_flag &=
+ ~IPATH_PORT_WAITING_PIO;
+ wake_up_interruptible(&dd->
+ ipath_pd
+ [i]->
+ port_wait);
+ }
+ }
+ }
+ }
+
+ if (dd->ipath_layer.l_intr) {
+ if (dd->ipath_layer.l_intr(t,
+ IPATH_LAYER_INT_SEND_CONTINUE)) {
+ atomic_set_mask(INFINIPATH_S_PIOINTBUFAVAIL,
+ &dd->ipath_sendctrl);
+ ipath_kput_kreg(t, kr_sendctrl,
+ dd->ipath_sendctrl);
+ }
+ }
+
+ if (dd->verbs_layer.l_piobufavail) {
+ if (!dd->verbs_layer.l_piobufavail(t)) {
+ atomic_set_mask(INFINIPATH_S_PIOINTBUFAVAIL,
+ &dd->ipath_sendctrl);
+ ipath_kput_kreg(t, kr_sendctrl,
+ dd->ipath_sendctrl);
+ }
+ }
+ }
+
+ /*
+ * we check for both transition from empty to non-empty, and urgent
+ * packets (those with the interrupt bit set in the header)
+ */
+
+ if (istat & ((infinipath_i_rcvavail_mask << INFINIPATH_I_RCVAVAIL_SHIFT)
+ | (infinipath_i_rcvurg_mask << INFINIPATH_I_RCVURG_SHIFT))) {
+ uint64_t portr;
+ int i;
+ uint32_t rcvdint = 0;
+
+ portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) &
+ infinipath_i_rcvavail_mask)
+ | ((istat >> INFINIPATH_I_RCVURG_SHIFT) &
+ infinipath_i_rcvurg_mask);
+ for (i = 0; i < dd->ipath_cfgports; i++) {
+ if (portr & (1 << i) && dd->ipath_pd[i]) {
+ if (i == 0)
+ ipath_kreceive(t);
+ else if (dd->ipath_pd[i]->
+ port_flag & IPATH_PORT_WAITING_RCV) {
+ atomic_clear_mask
+ (IPATH_PORT_WAITING_RCV,
+ &dd->ipath_pd[i]->port_flag);
+ wake_up_interruptible(&dd->ipath_pd[i]->
+ port_wait);
+ rcvdint |= 1U << i;
+ }
+ }
+ }
+ if (rcvdint) {
+ /*
+ * only want to take one interrupt, so turn off
+ * the rcv interrupt for all the ports that we
+ * did the wakeup on (but never for kernel port)
+ */
+ atomic_clear_mask(rcvdint <<
+ INFINIPATH_R_INTRAVAIL_SHIFT,
+ &dd->ipath_rcvctrl);
+ ipath_kput_kreg(t, kr_rcvctrl, dd->ipath_rcvctrl);
+ }
+ }
+
+ return IRQ_HANDLED;
+}
+
+static void ipath_decode_err(char *buf, size_t blen, uint64_t err)
+{
+ *buf = '\0';
+ if (err & INFINIPATH_E_RHDRLEN)
+ strlcat(buf, "rhdrlen ", blen);
+ if (err & INFINIPATH_E_RBADTID)
+ strlcat(buf, "rbadtid ", blen);
+ if (err & INFINIPATH_E_RBADVERSION)
+ strlcat(buf, "rbadversion ", blen);
+ if (err & INFINIPATH_E_RHDR)
+ strlcat(buf, "rhdr ", blen);
+ if (err & INFINIPATH_E_RLONGPKTLEN)
+ strlcat(buf, "rlongpktlen ", blen);
+ if (err & INFINIPATH_E_RSHORTPKTLEN)
+ strlcat(buf, "rshortpktlen ", blen);
+ if (err & INFINIPATH_E_RMAXPKTLEN)
+ strlcat(buf, "rmaxpktlen ", blen);
+ if (err & INFINIPATH_E_RMINPKTLEN)
+ strlcat(buf, "rminpktlen ", blen);
+ if (err & INFINIPATH_E_RFORMATERR)
+ strlcat(buf, "rformaterr ", blen);
+ if (err & INFINIPATH_E_RUNSUPVL)
+ strlcat(buf, "runsupvl ", blen);
+ if (err & INFINIPATH_E_RUNEXPCHAR)
+ strlcat(buf, "runexpchar ", blen);
+ if (err & INFINIPATH_E_RIBFLOW)
+ strlcat(buf, "ribflow ", blen);
+ if (err & INFINIPATH_E_REBP)
+ strlcat(buf, "EBP ", blen);
+ if (err & INFINIPATH_E_SUNDERRUN)
+ strlcat(buf, "sunderrun ", blen);
+ if (err & INFINIPATH_E_SPIOARMLAUNCH)
+ strlcat(buf, "spioarmlaunch ", blen);
+ if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
+ strlcat(buf, "sunexperrpktnum ", blen);
+ if (err & INFINIPATH_E_SDROPPEDDATAPKT)
+ strlcat(buf, "sdroppeddatapkt ", blen);
+ if (err & INFINIPATH_E_SDROPPEDSMPPKT)
+ strlcat(buf, "sdroppedsmppkt ", blen);
+ if (err & INFINIPATH_E_SMAXPKTLEN)
+ strlcat(buf, "smaxpktlen ", blen);
+ if (err & INFINIPATH_E_SMINPKTLEN)
+ strlcat(buf, "sminpktlen ", blen);
+ if (err & INFINIPATH_E_SUNSUPVL)
+ strlcat(buf, "sunsupVL ", blen);
+ if (err & INFINIPATH_E_SPKTLEN)
+ strlcat(buf, "spktlen ", blen);
+ if (err & INFINIPATH_E_INVALIDADDR)
+ strlcat(buf, "invalidaddr ", blen);
+ if (err & INFINIPATH_E_RICRC)
+ strlcat(buf, "CRC ", blen);
+ if (err & INFINIPATH_E_RVCRC)
+ strlcat(buf, "VCRC ", blen);
+ if (err & INFINIPATH_E_RRCVEGRFULL)
+ strlcat(buf, "rcvegrfull ", blen);
+ if (err & INFINIPATH_E_RRCVHDRFULL)
+ strlcat(buf, "rcvhdrfull ", blen);
+ if (err & INFINIPATH_E_IBSTATUSCHANGED)
+ strlcat(buf, "ibcstatuschg ", blen);
+ if (err & INFINIPATH_E_RIBLOSTLINK)
+ strlcat(buf, "riblostlink ", blen);
+ if (err & INFINIPATH_E_HARDWARE)
+ strlcat(buf, "hardware ", blen);
+ if (err & INFINIPATH_E_RESET)
+ strlcat(buf, "reset ", blen);
+}
+
+/* decode RHF errors; only used one place now, may want more later */
+static void get_rhf_errstring(uint32_t err, char *msg, size_t len)
+{
+ /* if no errors, and so don't need to check what's first */
+ *msg = '\0';
+
+ if (err & INFINIPATH_RHF_H_ICRCERR)
+ strlcat(msg, "icrcerr ", len);
+ if (err & INFINIPATH_RHF_H_VCRCERR)
+ strlcat(msg, "vcrcerr ", len);
+ if (err & INFINIPATH_RHF_H_PARITYERR)
+ strlcat(msg, "parityerr ", len);
+ if (err & INFINIPATH_RHF_H_LENERR)
+ strlcat(msg, "lenerr ", len);
+ if (err & INFINIPATH_RHF_H_MTUERR)
+ strlcat(msg, "mtuerr ", len);
+ if (err & INFINIPATH_RHF_H_IHDRERR)
+ /* infinipath hdr checksum error */
+ strlcat(msg, "ipathhdrerr ", len);
+ if (err & INFINIPATH_RHF_H_TIDERR)
+ strlcat(msg, "tiderr ", len);
+ if (err & INFINIPATH_RHF_H_MKERR)
+ /* bad port, offset, etc. */
+ strlcat(msg, "invalid ipathhdr ", len);
+ if (err & INFINIPATH_RHF_H_IBERR)
+ strlcat(msg, "iberr ", len);
+ if (err & INFINIPATH_RHF_L_SWA)
+ strlcat(msg, "swA ", len);
+ if (err & INFINIPATH_RHF_L_SWB)
+ strlcat(msg, "swB ", len);
+}
+
+static void ipath_handle_errors(const ipath_type t, uint64_t errs)
+{
+ char msg[512];
+ uint32_t piobcnt;
+ uint64_t sbuf[4], ignore_this_time = 0;
+ int i;
+ int chkerrpkts = 0, noprint = 0;
+ cycles_t nc;
+ static cycles_t nextmsg_time;
+ static unsigned nmsgs, supp_msgs;
+ ipath_devdata *dd = &devdata[t];
+
+#define E_SUM_PKTERRS (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID \
+ | INFINIPATH_E_RBADVERSION \
+ | INFINIPATH_E_RHDR | INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN \
+ | INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN \
+ | INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | INFINIPATH_E_RUNEXPCHAR \
+ | INFINIPATH_E_REBP)
+
+#define E_SUM_ERRS ( INFINIPATH_E_SPIOARMLAUNCH \
+ | INFINIPATH_E_SUNEXPERRPKTNUM | INFINIPATH_E_SDROPPEDDATAPKT \
+ | INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SMAXPKTLEN \
+ | INFINIPATH_E_SUNSUPVL | INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN \
+ | INFINIPATH_E_INVALIDADDR)
+
+ /*
+ * throttle back "fast" messages to no more than 10 per 5 seconds
+ * (1.4-2GHz clock). This isn't perfect, but it's a reasonable
+ * heuristic
+ * If we get more than 10, give a 5x longer delay
+ */
+ nc = get_cycles();
+ if (nmsgs > 10) {
+ if (nc < nextmsg_time) {
+ noprint = 1;
+ if (!supp_msgs++)
+ nextmsg_time = nc + 50000000000ULL;
+ } else if (supp_msgs) {
+ /*
+ * Print the message unless it's ibc status
+ * change only, which happens so often we never
+ * want to count it.
+ */
+ if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
+ ipath_decode_err(msg, sizeof msg,
+ dd->
+ ipath_lasterror &
+ ~INFINIPATH_E_IBSTATUSCHANGED);
+ if (dd->
+ ipath_lasterror & ~(INFINIPATH_E_RRCVEGRFULL
+ |
+ INFINIPATH_E_RRCVHDRFULL))
+ _IPATH_UNIT_ERROR(t,
+ "Suppressed %u messages for fast-repeating errors (%s) (%llx)\n",
+ supp_msgs, msg,
+ dd->ipath_lasterror);
+ else {
+ /*
+ * rcvegrfull and rcvhdrqfull are
+ * "normal", for some types of
+ * processes (mostly benchmarks)
+ * that send huge numbers of
+ * messages, while not processing
+ * them. So only complain about
+ * these at debug level.
+ */
+ _IPATH_DBG
+ ("Suppressed %u messages for %s\n",
+ supp_msgs, msg);
+ }
+ }
+ supp_msgs = 0;
+ nmsgs = 0;
+ }
+ } else if (!nmsgs++ || nc > nextmsg_time) /* start timer */
+ nextmsg_time = nc + 10000000000ULL;
+
+ /*
+ * don't report errors that are masked (includes those always
+ * ignored)
+ */
+ errs &= ~dd->ipath_maskederrs;
+
+ /* do these first, they are most important */
+ if (errs & INFINIPATH_E_HARDWARE) {
+ /* reuse same msg buf */
+ ipath_handle_hwerrors(t, msg, sizeof msg);
+ }
+
+ if (!noprint && (errs & ~infinipath_e_bitsextant))
+ _IPATH_UNIT_ERROR(t,
+ "error interrupt with unknown errors %llx set\n",
+ errs & ~infinipath_e_bitsextant);
+
+ if (errs & E_SUM_ERRS) {
+ /* if possible that sendbuffererror could be valid */
+ piobcnt = dd->ipath_piobcnt;
+ /* read these before writing errorclear */
+ sbuf[0] = ipath_kget_kreg64(t, kr_sendbuffererror);
+ sbuf[1] = ipath_kget_kreg64(t, kr_sendbuffererror + 1);
+ if (piobcnt > 128) {
+ sbuf[2] = ipath_kget_kreg64(t, kr_sendbuffererror + 2);
+ sbuf[3] = ipath_kget_kreg64(t, kr_sendbuffererror + 3);
+ }
+
+ if (sbuf[0] || sbuf[1]
+ || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+ _IPATH_PDBG("SendbufErrs %llx %llx ", sbuf[0], sbuf[1]);
+ if (infinipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+ printk("%llx %llx ", sbuf[2], sbuf[3]);
+ for (i = 0; i < piobcnt; i++) {
+ if (test_bit(i, sbuf)) {
+ uint32_t sendctrl;
+ if (infinipath_debug & __IPATH_PKTDBG)
+ printk("%u ", i);
+ sendctrl =
+ dd->
+ ipath_sendctrl | INFINIPATH_S_DISARM
+ | (i <<
+ INFINIPATH_S_DISARMPIOBUF_SHIFT);
+ ipath_kput_kreg(t, kr_sendctrl,
+ sendctrl);
+ }
+ }
+ if (infinipath_debug & __IPATH_PKTDBG)
+ printk("\n");
+ }
+ if ((errs &
+ (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT
+ | INFINIPATH_E_SMINPKTLEN))
+ && !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ /*
+ * This can happen when SMA is trying to bring
+ * the link up, but the IB link changes state
+ * at the "wrong" time. The IB logic then
+ * complains that the packet isn't valid.
+ * We don't want to confuse people, so we just
+ * don't print them, except at debug
+ */
+ _IPATH_DBG
+ ("Ignoring pktsend errors %llx, because not yet active\n",
+ errs);
+ ignore_this_time |=
+ INFINIPATH_E_SDROPPEDDATAPKT |
+ INFINIPATH_E_SDROPPEDSMPPKT |
+ INFINIPATH_E_SMINPKTLEN;
+ }
+ }
+
+ if (supp_msgs == 250000) {
+ /*
+ * It's not entirely reasonable assuming that the errors
+ * set in the last clear period are all responsible for
+ * the problem, but the alternative is to assume it's the only
+ * ones on this particular interrupt, which also isn't great
+ */
+ dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+ ipath_kput_kreg(t, kr_errormask, ~dd->ipath_maskederrs);
+ ipath_decode_err(msg, sizeof msg,
+ (dd->ipath_maskederrs & ~dd->
+ ipath_ignorederrs));
+
+ if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs)
+ & ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL))
+ _IPATH_UNIT_ERROR(t,
+ "Disabling error(s) %llx because occuring too frequently (%s)\n",
+ (dd->ipath_maskederrs & ~dd->
+ ipath_ignorederrs), msg);
+ else {
+ /*
+ * rcvegrfull and rcvhdrqfull are "normal",
+ * for some types of processes (mostly benchmarks)
+ * that send huge numbers of messages, while not
+ * processing them. So only complain about
+ * these at debug level.
+ */
+ _IPATH_DBG
+ ("Disabling frequent queue full errors (%s)\n",
+ msg);
+ }
+
+ /*
+ * re-enable the masked errors after around 3 minutes.
+ * in ipath_get_faststats(). If we have a series of
+ * fast repeating but different errors, the interval will keep
+ * stretching out, but that's OK, as that's pretty catastrophic.
+ */
+ dd->ipath_unmasktime = nc + 400000000000ULL;
+ }
+
+ ipath_kput_kreg(t, kr_errorclear, errs);
+ if (ignore_this_time)
+ errs &= ~ignore_this_time;
+ if (errs & ~dd->ipath_lasterror) {
+ errs &= ~dd->ipath_lasterror;
+ /* never suppress duplicate hwerrors or ibstatuschange */
+ dd->ipath_lasterror |= errs &
+ ~(INFINIPATH_E_HARDWARE | INFINIPATH_E_IBSTATUSCHANGED);
+ }
+ if (!errs)
+ return;
+
+ if (!noprint)
+ /* the ones we mask off are handled specially below or above */
+ ipath_decode_err(msg, sizeof msg,
+ errs & ~(INFINIPATH_E_IBSTATUSCHANGED |
+ INFINIPATH_E_RRCVEGRFULL |
+ INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_HARDWARE));
+ else
+ /* so we don't need if(!noprint) at strlcat's below */
+ *msg = 0;
+
+ if (errs & E_SUM_PKTERRS) {
+ ipath_stats.sps_pkterrs++;
+ chkerrpkts = 1;
+ }
+ if (errs & E_SUM_ERRS)
+ ipath_stats.sps_errs++;
+
+ if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
+ ipath_stats.sps_crcerrs++;
+ chkerrpkts = 1;
+ }
+
+ /*
+ * We don't want to print these two as they happen, or we can make
+ * the situation even worse, because it takes so long to print messages.
+ * to serial consoles. kernel ports get printed from fast_stats, no
+ * more than every 5 seconds, user ports get printed on close
+ */
+ if (errs & INFINIPATH_E_RRCVHDRFULL) {
+ int any;
+ uint32_t hd, tl;
+ ipath_stats.sps_hdrqfull++;
+ for (any = i = 0; i < dd->ipath_cfgports; i++) {
+ if (i == 0) {
+ hd = dd->ipath_port0head;
+ tl = *dd->ipath_hdrqtailptr;
+ } else if (dd->ipath_pd[i] &&
+ dd->ipath_pd[i]->port_rcvhdrtail_kvaddr) {
+ /*
+ * don't report same point multiple times,
+ * except kernel
+ */
+ tl = (uint32_t) *
+ dd->ipath_pd[i]->port_rcvhdrtail_kvaddr;
+ if (tl == dd->ipath_lastrcvhdrqtails[i])
+ continue;
+ hd = ipath_kget_ureg32(t, ur_rcvhdrhead, i);
+ } else
+ continue;
+ if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
+ dd->ipath_lastrcvhdrqtails[i] = tl;
+ dd->ipath_pd[i]->port_hdrqfull++;
+ if (i == 0)
+ chkerrpkts = 1;
+ }
+ }
+ }
+ if (errs & INFINIPATH_E_RRCVEGRFULL) {
+ /*
+ * since this is of less importance and not likely to
+ * happen without also getting hdrfull, only count
+ * occurrences; don't check each port (or even the kernel
+ * vs user)
+ */
+ ipath_stats.sps_etidfull++;
+ if (dd->ipath_port0head != *dd->ipath_hdrqtailptr)
+ chkerrpkts = 1;
+ }
+
+ /*
+ * do this before IBSTATUSCHANGED, in case both bits set in a single
+ * interrupt; we want the STATUSCHANGE to "win", so we do our
+ * internal copy of state machine correctly
+ */
+ if (errs & INFINIPATH_E_RIBLOSTLINK) {
+ /* force through block below */
+ errs |= INFINIPATH_E_IBSTATUSCHANGED;
+ ipath_stats.sps_iblink++;
+ dd->ipath_flags |= IPATH_LINKDOWN;
+ dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+ | IPATH_LINKARMED | IPATH_LINKACTIVE);
+ if (!noprint)
+ _IPATH_DBG("Lost link, link now down (%s)\n",
+ ipath_ibcstatus_str[ipath_kget_kreg64
+ (t,
+ kr_ibcstatus) & 0xf]);
+ }
+
+ if ((errs & INFINIPATH_E_IBSTATUSCHANGED) && (!ipath_diags_enabled)) {
+ uint64_t val;
+ uint32_t ltstate;
+
+ val = ipath_kget_kreg64(t, kr_ibcstatus);
+ ltstate = val & 0xff;
+ if(ltstate == 0x11 || ltstate == 0x21 || ltstate == 0x31)
+ _IPATH_DBG("Link state changed unit %u to 0x%x, last was 0x%llx\n",
+ t, ltstate, dd->ipath_lastibcstat);
+ else {
+ ltstate = dd->ipath_lastibcstat & 0xff;
+ if(ltstate == 0x11 || ltstate == 0x21 || ltstate == 0x31)
+ _IPATH_DBG("Link state unit %u changed to down state 0x%llx, last was 0x%llx\n",
+ t, val, dd->ipath_lastibcstat);
+ else
+ _IPATH_VDBG("Link state unit %u changed to 0x%llx from one of down states\n",
+ t, val);
+ }
+ ltstate = (val >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+ INFINIPATH_IBCS_LINKTRAININGSTATE_MASK;
+
+ if (ltstate == 2 || ltstate == 3) {
+ uint32_t last_ltstate;
+
+ /*
+ * ignore cycling back and forth from states 2 to 3
+ * while waiting for other end of link to come up
+ * except that if it keeps happening, we switch between
+ * linkinitstate SLEEP and POLL. While we cycle
+ * back and forth between them, we aren't seeing
+ * any other device, either no cable plugged in,
+ * other device powered off, other device is
+ * switch that hasn't yet polled us, etc.
+ */
+ last_ltstate = (dd->ipath_lastibcstat >>
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)
+ & INFINIPATH_IBCS_LINKTRAININGSTATE_MASK;
+ if (last_ltstate == 2 || last_ltstate == 3) {
+ if (++dd->ipath_ibpollcnt > 4) {
+ uint64_t ibc;
+ dd->ipath_flags |=
+ IPATH_LINK_SLEEPING | IPATH_NOCABLE;
+ *dd->ipath_statusp |=
+ IPATH_STATUS_IB_NOCABLE;
+ _IPATH_VDBG
+ ("linkinitcmd POLL, move to SLEEP\n");
+ ibc = dd->ipath_ibcctrl;
+ ibc |= INFINIPATH_IBCC_LINKINITCMD_SLEEP
+ <<
+ INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+ /*
+ * don't put linkinitcmd in
+ * ipath_ibcctrl, want that to
+ * stay a NOP
+ */
+ ipath_kput_kreg(t, kr_ibcctrl, ibc);
+ dd->ipath_ibpollcnt = 0;
+ }
+ goto skip_ibchange;
+ }
+ }
+ /* some state other than 2 or 3 */
+ dd->ipath_ibpollcnt = 0;
+ ipath_stats.sps_iblink++;
+ /*
+ * Note: We try to match the Mellanox HCA LED behavior
+ * as best we can. That changed around Oct 2003.
+ * Green indicates link state (something is plugged in,
+ * and we can train). Amber indicates the link is
+ * logically up (ACTIVE). Mellanox further blinks the
+ * amber LED to indicate data packet activity, but we
+ * have no hardware support for that, so it would require
+ * waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if
+ * appropriate. That's visible overhead, so not something
+ * we will do.
+ */
+ if (ltstate != 1 || ((dd->ipath_lastibcstat & 0x30) == 0x30 &&
+ (val & 0x30) != 0x30)) {
+ dd->ipath_flags |= IPATH_LINKDOWN;
+ dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+ | IPATH_LINKACTIVE |
+ IPATH_LINKARMED);
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+ if (!noprint) {
+ if ((dd->ipath_lastibcstat & 0x30) == 0x30)
+ /* if from up to down be more vocal */
+ _IPATH_DBG("Link unit %u is now down (%s)\n",
+ t, ipath_ibcstatus_str
+ [ltstate]);
+ else
+ _IPATH_VDBG("Link unit %u is down (%s)\n",
+ t, ipath_ibcstatus_str
+ [ltstate]);
+ }
+
+ if (val & 0x30) {
+ /* leave just green on, 0x11 and 0x21 */
+ dd->ipath_extctrl &=
+ ~INFINIPATH_EXTC_LEDPRIPORTYELLOWON;
+ dd->ipath_extctrl |=
+ INFINIPATH_EXTC_LEDPRIPORTGREENON;
+ } else /* not up at all, so turn the leds off */
+ dd->ipath_extctrl &=
+ ~(INFINIPATH_EXTC_LEDPRIPORTGREENON |
+ INFINIPATH_EXTC_LEDPRIPORTYELLOWON);
+ ipath_kput_kreg(t, kr_extctrl,
+ (uint64_t) dd->ipath_extctrl);
+ if (ltstate == 1
+ && (dd->
+ ipath_flags & (IPATH_LINK_TOARMED |
+ IPATH_LINK_TOACTIVE))) {
+ ipath_set_ib_lstate(t,
+ INFINIPATH_IBCC_LINKCMD_INIT);
+ }
+ } else if ((val & 0x31) == 0x31) {
+ if (!noprint)
+ _IPATH_DBG("Link unit %u is now in active state\n", t);
+ dd->ipath_flags |= IPATH_LINKACTIVE;
+ dd->ipath_flags &=
+ ~(IPATH_LINKUNK | IPATH_LINKINIT | IPATH_LINKDOWN |
+ IPATH_LINKARMED | IPATH_NOCABLE |
+ IPATH_LINK_TOACTIVE | IPATH_LINK_SLEEPING);
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
+ *dd->ipath_statusp |=
+ IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
+ /* set the externally visible LEDs to indicate state */
+ dd->ipath_extctrl |= INFINIPATH_EXTC_LEDPRIPORTGREENON
+ | INFINIPATH_EXTC_LEDPRIPORTYELLOWON;
+ ipath_kput_kreg(t, kr_extctrl,
+ (uint64_t) dd->ipath_extctrl);
+
+ /*
+ * since we are now active, set the linkinitcmd
+ * to NOP (0) it was probably either POLL or SLEEP
+ */
+ dd->ipath_ibcctrl &=
+ ~(INFINIPATH_IBCC_LINKINITCMD_MASK <<
+ INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+ ipath_kput_kreg(t, kr_ibcctrl, dd->ipath_ibcctrl);
+
+ if (devdata[t].ipath_layer.l_intr)
+ devdata[t].ipath_layer.l_intr(t,
+ IPATH_LAYER_INT_IF_UP);
+ } else if ((val & 0x31) == 0x11) {
+ /*
+ * set set INIT and DOWN. Down is checked by
+ * most of the other code, but INIT is useful
+ * to know in a few places.
+ */
+ dd->ipath_flags |= IPATH_LINKINIT | IPATH_LINKDOWN;
+ dd->ipath_flags &=
+ ~(IPATH_LINKUNK | IPATH_LINKACTIVE | IPATH_LINKARMED
+ | IPATH_NOCABLE | IPATH_LINK_SLEEPING);
+ *dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE
+ | IPATH_STATUS_IB_READY);
+
+ /* set the externally visible LEDs to indicate state */
+ dd->ipath_extctrl &=
+ ~INFINIPATH_EXTC_LEDPRIPORTYELLOWON;
+ dd->ipath_extctrl |= INFINIPATH_EXTC_LEDPRIPORTGREENON;
+ ipath_kput_kreg(t, kr_extctrl,
+ (uint64_t) dd->ipath_extctrl);
+ if (dd->
+ ipath_flags & (IPATH_LINK_TOARMED |
+ IPATH_LINK_TOACTIVE)) {
+ /*
+ * if we got here while trying to bring
+ * the link up, try again, but only once more!
+ */
+ ipath_set_ib_lstate(t,
+ INFINIPATH_IBCC_LINKCMD_ARMED);
+ dd->ipath_flags &=
+ ~(IPATH_LINK_TOARMED | IPATH_LINK_TOACTIVE);
+ }
+ } else if ((val & 0x31) == 0x21) {
+ dd->ipath_flags |= IPATH_LINKARMED;
+ dd->ipath_flags &=
+ ~(IPATH_LINKUNK | IPATH_LINKDOWN | IPATH_LINKINIT |
+ IPATH_LINKACTIVE | IPATH_NOCABLE |
+ IPATH_LINK_TOARMED | IPATH_LINK_SLEEPING);
+ *dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE
+ | IPATH_STATUS_IB_READY);
+ /*
+ * set the externally visible LEDs to indicate
+ * state (same as 0x11)
+ */
+ dd->ipath_extctrl &=
+ ~INFINIPATH_EXTC_LEDPRIPORTYELLOWON;
+ dd->ipath_extctrl |= INFINIPATH_EXTC_LEDPRIPORTGREENON;
+ ipath_kput_kreg(t, kr_extctrl,
+ (uint64_t) dd->ipath_extctrl);
+ if (dd->ipath_flags & IPATH_LINK_TOACTIVE) {
+ /*
+ * if we got here while trying to bring
+ * the link up, try again, but only once more!
+ */
+ ipath_set_ib_lstate(t,
+ INFINIPATH_IBCC_LINKCMD_ACTIVE);
+ dd->ipath_flags &= ~IPATH_LINK_TOACTIVE;
+ }
+ } else {
+ if (dd->
+ ipath_flags & (IPATH_LINK_TOARMED |
+ IPATH_LINK_TOACTIVE))
+ ipath_set_ib_lstate(t,
+ INFINIPATH_IBCC_LINKCMD_INIT);
+ else if (!noprint)
+ _IPATH_DBG("IBstatuschange unit %u: %s\n",
+ t, ipath_ibcstatus_str[ltstate]);
+ }
+ dd->ipath_lastibcstat = val;
+ }
+
+skip_ibchange:
+
+ if (errs & INFINIPATH_E_RESET) {
+ if (!noprint)
+ _IPATH_UNIT_ERROR(t,
+ "Got reset, requires re-initialization (unload and reload driver)\n");
+ dd->ipath_flags &= ~IPATH_INITTED; /* needs re-init */
+ /* mark as having had error */
+ *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
+ }
+
+ if (!noprint && *msg)
+ _IPATH_UNIT_ERROR(t, "%s error\n", msg);
+ if (dd->ipath_sma_state_wanted & dd->ipath_flags) {
+ _IPATH_VDBG("sma wanted state %x, iflags now %x, waking\n",
+ dd->ipath_sma_state_wanted, dd->ipath_flags);
+ wake_up_interruptible(&ipath_sma_state_wait);
+ }
+
+ if (chkerrpkts)
+ /* process possible error packets in hdrq */
+ ipath_kreceive(t);
+}
+
+/* must only be called if ipath_pd[port] is known to be allocated */
+static __inline__ void *ipath_get_egrbuf(const ipath_type t, uint32_t bufnum,
+ int err)
+{
+ return devdata[t].ipath_port0_skbs ?
+ (void *)devdata[t].ipath_port0_skbs[bufnum]->data : NULL;
+
+#ifdef _USE_FOR_DEBUGGING_ONLY
+ /*
+ * want routine to be inlined and fast this is here so if we do ports
+ * other than 0, I don't have to rewrite the code, since it's slightly
+ * complicated
+ */
+ if (port != 1) {
+ void *chunkbase;
+ /*
+ * This calculation takes about 50 cycles. Could do
+ * what I did for protocol code, and have an array of
+ * addresses, getting it down to just a few cycles per
+ * lookup, at the cost of 16KB of memory.
+ */
+ if (!devdata[t].ipath_pd[port]->port_rcvegrbuf_virt)
+ return NULL;
+ chunkbase = devdata[t].ipath_pd[port]->port_rcvegrbuf_virt
+ [bufnum /
+ devdata[t].ipath_pd[port]->port_rcvegrbufs_perchunk];
+ return (void *)(chunkbase +
+ (bufnum %
+ devdata[t].ipath_pd[port]->
+ port_rcvegrbufs_perchunk)
+ * devdata[t].ipath_rcvegrbufsize);
+ }
+#endif
+}
+
+/* receive an sma packet. Separate for better overall optimization */
+static void ipath_rcv_sma(const ipath_type t, uint32_t tlen,
+ uint64_t * rc, void *ebuf)
+{
+ int sindex, slen, elen;
+ void *smbuf;
+ uint8_t pad, *bthbytes;
+
+ ipath_stats.sps_sma_rpkts++; /* another SMA packet received */
+
+ bthbytes = (uint8_t *) ((ips_message_header_typ *) & rc[1])->bth;
+
+ pad = (bthbytes[1] >> 4) & 3;
+ elen = tlen - (IPATH_SMA_HDRSZ + pad + (uint32_t) sizeof(uint32_t));
+ if (elen > (SMA_MAX_PKTSZ - IPATH_SMA_HDRSZ))
+ elen = SMA_MAX_PKTSZ - IPATH_SMA_HDRSZ;
+
+ spin_lock_irq(&ipath_sma_lock);
+ sindex = ipath_sma_next;
+ smbuf = ipath_sma_data[sindex].buf;
+ ipath_sma_data[sindex].unit = t;
+ slen = ipath_sma_data[ipath_sma_next].len;
+ memcpy(smbuf, &rc[1], IPATH_SMA_HDRSZ);
+ memcpy(smbuf + IPATH_SMA_HDRSZ, ebuf, elen);
+ if (slen) {
+ /*
+ * overwriting a yet unread old one (buffer wrap), have to
+ * advance ipath_sma_first to next oldest
+ */
+
+ /* count OK packets that we drop */
+ ipath_stats.sps_krdrops++;
+ if (++ipath_sma_first >= IPATH_NUM_SMAPKTS)
+ ipath_sma_first = 0;
+ }
+ slen = ipath_sma_data[sindex].len = elen + IPATH_SMA_HDRSZ;
+ if (++ipath_sma_next >= IPATH_NUM_SMAPKTS)
+ ipath_sma_next = 0;
+ spin_unlock_irq(&ipath_sma_lock);
+}
+
+/*
+ * receive a packet for the layered (ethernet) driver.
+ * Separate routine for better overall optimization
+ */
+static void ipath_rcv_layer(const ipath_type t, uint32_t etail,
+ uint32_t tlen, ether_header_typ * hdr)
+{
+ uint32_t elen;
+ uint8_t pad, *bthbytes;
+ struct sk_buff *skb;
+ struct sk_buff *nskb;
+ ipath_devdata *dd = &devdata[t];
+ ipath_portdata *pd;
+ unsigned long pa, pent;
+ uint64_t *egrbase;
+ uint64_t lenvalid; /* in words */
+
+ if (dd->ipath_port0_skbs && hdr->sub_opcode == OPCODE_ENCAP) {
+ /*
+ * Allocate a new sk_buff to replace the one we give
+ * to the network stack.
+ */
+ if (!(nskb = dev_alloc_skb(dd->ipath_ibmaxlen + 4))) {
+ /* count OK packets that we drop */
+ ipath_stats.sps_krdrops++;
+ return;
+ }
+
+ bthbytes = (uint8_t *) hdr->bth;
+ pad = (bthbytes[1] >> 4) & 3;
+ /* +CRC32 */
+ elen = tlen - (sizeof(*hdr) + pad + sizeof(uint32_t));
+
+ skb_reserve(nskb, 4);
+
+ skb = dd->ipath_port0_skbs[etail];
+ dd->ipath_port0_skbs[etail] = nskb;
+ skb_put(skb, elen);
+
+ pd = dd->ipath_pd[0];
+ lenvalid = (dd->ipath_ibmaxlen - pd->port_egrskip) >> 2;
+ lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+ lenvalid |= INFINIPATH_RT_VALID;
+ pa = virt_to_phys(nskb->data);
+ pa += pd->port_egrskip;
+ pent = (pa & INFINIPATH_RT_ADDR_MASK) | lenvalid;
+ /* This is simplified for port 0 */
+ egrbase = (uint64_t *) ((char *)(dd->ipath_kregbase) +
+ dd->ipath_rcvegrbase);
+ ipath_kput_memq(t, &egrbase[etail], pent);
+
+ dd->ipath_layer.l_rcv(t, hdr, skb);
+
+ /* another ether packet received */
+ ipath_stats.sps_ether_rpkts++;
+ } else if (hdr->sub_opcode == OPCODE_LID_ARP) {
+ if (dd->ipath_layer.l_rcv_lid)
+ dd->ipath_layer.l_rcv_lid(t, hdr);
+ }
+
+}
+
+/* called from interrupt handler for errors or receive interrupt */
+void ipath_kreceive(const ipath_type t)
+{
+ uint64_t *rc;
+ void *ebuf;
+ ipath_devdata *dd = &devdata[t];
+ const uint32_t rsize = dd->ipath_rcvhdrentsize; /* words */
+ const uint32_t maxcnt = dd->ipath_rcvhdrcnt * rsize; /* in words */
+ uint32_t etail = ~0U, l, hdrqtail, sma_this_time = 0;
+ ips_message_header_typ *hdr;
+ uint32_t eflags, i, etype, tlen, pkttot=0;
+ static uint64_t totcalls; /* stats, may eventually remove */
+ char emsg[128];
+
+ if (!dd->ipath_hdrqtailptr) {
+ _IPATH_UNIT_ERROR(t,
+ "hdrqtailptr not set, can't do receives\n");
+ return;
+ }
+
+ if (test_and_set_bit(0, &dd->ipath_rcv_pending)) {
+ /* There is already a thread processing this queue. */
+ return;
+ }
+
+ if (dd->ipath_port0head == *dd->ipath_hdrqtailptr)
+ goto done;
+
+gotmore:
+ /*
+ * read only once at start. If in flood situation, this helps
+ * performance slightly. If more arrive while we are processing,
+ * we'll come back here and do them
+ */
+ hdrqtail = *dd->ipath_hdrqtailptr;
+
+ for (i = 0, l = dd->ipath_port0head; l != hdrqtail; i++) {
+ uint32_t qp;
+ uint8_t *bthbytes;
+
+
+ rc = (uint64_t *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2));
+ hdr = (ips_message_header_typ *) & rc[1];
+ /*
+ * could make a network order version of IPATH_KD_QP, and
+ * do the obvious shift before masking to speed this up.
+ */
+ qp = ntohl(hdr->bth[1]) & 0xffffff;
+ bthbytes = (uint8_t *) hdr->bth;
+
+ eflags = ips_get_hdr_err_flags(rc);
+ etype = ips_get_rcv_type(rc);
+ tlen = ips_get_length_in_bytes(rc); /* total length */
+ ebuf = NULL;
+ if (etype != RCVHQ_RCV_TYPE_EXPECTED) {
+ /*
+ * it turns out that the chips uses an eager buffer for
+ * all non-expected packets, whether it "needs"
+ * one or not. So always get the index, but
+ * don't set ebuf (so we try to copy data)
+ * unless the length requires it.
+ */
+ etail = ips_get_index(rc);
+ if (tlen > sizeof(*hdr)
+ || etype == RCVHQ_RCV_TYPE_NON_KD) {
+ ebuf = ipath_get_egrbuf(t, etail, 0);
+ }
+ }
+
+ /*
+ * both tiderr and ipathhdrerr are set for all plain IB
+ * packets; only ipathhdrerr should be set.
+ */
+
+ if (etype != RCVHQ_RCV_TYPE_NON_KD
+ && etype != RCVHQ_RCV_TYPE_ERROR
+ && ips_get_ipath_ver(hdr->iph.ver_port_tid_offset) !=
+ IPS_PROTO_VERSION) {
+ _IPATH_PDBG("Bad InfiniPath protocol version %x\n",
+ etype);
+ }
+
+ if (eflags &
+ ~(INFINIPATH_RHF_H_TIDERR | INFINIPATH_RHF_H_IHDRERR)) {
+ get_rhf_errstring(eflags, emsg, sizeof emsg);
+ _IPATH_PDBG
+ ("RHFerrs %x hdrqtail=%x typ=%u tlen=%x opcode=%x egridx=%x: %s\n",
+ eflags, l, etype, tlen, bthbytes[0],
+ ips_get_index(rc), emsg);
+ } else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+ /*
+ * If there is a userland SMA and this is a MAD packet,
+ * then pass it to the userland SMA.
+ */
+ if (ipath_sma_alive && qp <= 1) {
+ /*
+ * count OK packets that we drop because
+ * SMA isn't yet running, or because we
+ * are in an sma flood (no point in
+ * constantly acquiring the spin lock, and
+ * overwriting previous packets).
+ * Eventually things will recover.
+ * Similarly if the sma consumer is
+ * so far behind that we would overwrite
+ * (yes, it's outside the lock)
+ */
+ if (!ipath_sma_data_spare ||
+ ipath_sma_data[ipath_sma_next].len ||
+ ++sma_this_time > IPATH_NUM_SMAPKTS) {
+ ipath_stats.sps_krdrops++;
+ } else if (ebuf) {
+ ipath_rcv_sma(t, tlen, rc, ebuf);
+ }
+ } else if (dd->verbs_layer.l_rcv) {
+ dd->verbs_layer.l_rcv(t, rc + 1, ebuf, tlen);
+ } else {
+ _IPATH_VDBG("received IB packet, not SMA (QP=%x)\n",
+ qp);
+ }
+ } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
+ if (qp == IPATH_KD_QP && bthbytes[0] ==
+ dd->ipath_layer.l_rcv_opcode && ebuf)
+ ipath_rcv_layer(t, etail, tlen,
+ (ether_header_typ *) hdr);
+ else
+ _IPATH_PDBG
+ ("typ %x, opcode %x (eager, qp=%x), len %x; ignored\n",
+ etype, bthbytes[0], qp, tlen);
+ } else if (etype == RCVHQ_RCV_TYPE_EXPECTED) {
+ _IPATH_DBG("Bug: Expected TID, opcode %x; ignored\n",
+ hdr->bth[0] & 0xff);
+ } else if (eflags &
+ (INFINIPATH_RHF_H_TIDERR | INFINIPATH_RHF_H_IHDRERR))
+ {
+ /*
+ * This is a type 3 packet, only the LRH is in
+ * the rcvhdrq, the rest of the header is in
+ * the eager buffer.
+ */
+ uint8_t opcode;
+ if (ebuf) {
+ bthbytes = (uint8_t *) ebuf;
+ opcode = *bthbytes;
+ } else
+ opcode = 0;
+ get_rhf_errstring(eflags, emsg, sizeof emsg);
+ _IPATH_DBG
+ ("Err %x (%s), opcode %x, egrbuf %x, len %x\n",
+ eflags, emsg, opcode, etail, tlen);
+ } else {
+ /*
+ * error packet, type of error unknown.
+ * Probably type 3, but we don't know, so don't
+ * even try to print the opcode, etc.
+ */
+ _IPATH_DBG
+ ("Error Pkt, but no eflags! egrbuf %x, len %x\n"
+ "hdrq@%lx;hdrq+%x rhf: %llx; hdr %llx %llx %llx %llx %llx\n",
+ etail, tlen, (unsigned long)rc, l, rc[0], rc[1],
+ rc[2], rc[3], rc[4], rc[5]);
+ }
+ l += rsize;
+ if (l >= maxcnt)
+ l = 0;
+ /*
+ * update for each packet, to help prevent overflows if we have
+ * lots of packets.
+ */
+ (void)ipath_kput_ureg(t, ur_rcvhdrhead, l, 0);
+ if (etype != RCVHQ_RCV_TYPE_EXPECTED)
+ (void)ipath_kput_ureg(t, ur_rcvegrindexhead, etail, 0);
+ }
+
+ pkttot += i;
+
+ dd->ipath_port0head = l;
+
+ if (hdrqtail != *dd->ipath_hdrqtailptr)
+ goto gotmore; /* more arrived while we handled first batch */
+
+ if(pkttot > ipath_stats.sps_maxpkts_call)
+ ipath_stats.sps_maxpkts_call = pkttot;
+ ipath_stats.sps_port0pkts += pkttot;
+ ipath_stats.sps_avgpkts_call = ipath_stats.sps_port0pkts / ++totcalls;
+
+ if (sma_this_time) /* only once at end, not each time */
+ wake_up_interruptible(&ipath_sma_wait);
+
+done:
+ clear_bit(0, &dd->ipath_rcv_pending);
+ smp_mb__after_clear_bit();
+}
+
+/*
+ * Update our shadow copy of the PIO availability register map, called
+ * whenever our local copy indicates we have run out of send buffers
+ * NOTE: This can be called from interrupt context by ipath_bufavail()
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(const ipath_type t)
+{
+ unsigned long flags;
+ int i;
+ const unsigned piobregs = (unsigned)devdata[t].ipath_pioavregs;
+
+ /* If the generation (check) bits have changed, then we update the
+ * busy bit for the corresponding PIO buffer. This algorithm will
+ * modify positions to the value they already have in some cases
+ * (i.e., no change), but it's faster than changing only the bits
+ * that have changed.
+ *
+ * We would like to do this atomicly, to avoid spinlocks in the
+ * critical send path, but that's not really possible, given the
+ * type of changes, and that this routine could be called on multiple
+ * cpu's simultaneously, so we lock in this routine only, to avoid
+ * conflicting updates; all we change is the shadow, and it's a
+ * single 64 bit memory location, so by definition the update is
+ * atomic in terms of what other cpu's can see in testing the
+ * bits. The spin_lock overhead isn't too bad, since it only
+ * happens when all buffers are in use, so only cpu overhead,
+ * not latency or bandwidth is affected.
+ */
+#define _IPATH_ALL_CHECKBITS 0x5555555555555555ULL
+ if (!devdata[t].ipath_pioavailregs_dma) {
+ _IPATH_DBG("Update shadow pioavail, but regs_dma NULL!\n");
+ return;
+ }
+ if (infinipath_debug & __IPATH_VERBDBG) {
+ /* only if packet debug and verbose */
+ _IPATH_PDBG("Refill avail, dma0=%llx shad0=%llx, "
+ "d1=%llx s1=%llx, d2=%llx s2=%llx, d3=%llx s3=%llx\n",
+ devdata[t].ipath_pioavailregs_dma[0],
+ devdata[t].ipath_pioavailshadow[0],
+ devdata[t].ipath_pioavailregs_dma[1],
+ devdata[t].ipath_pioavailshadow[1],
+ devdata[t].ipath_pioavailregs_dma[2],
+ devdata[t].ipath_pioavailshadow[2],
+ devdata[t].ipath_pioavailregs_dma[3],
+ devdata[t].ipath_pioavailshadow[3]);
+ if (piobregs > 4)
+ _IPATH_PDBG("2nd group, dma4=%llx shad4=%llx, "
+ "d5=%llx s5=%llx, d6=%llx s6=%llx, d7=%llx s7=%llx\n",
+ devdata[t].ipath_pioavailregs_dma[4],
+ devdata[t].ipath_pioavailshadow[4],
+ devdata[t].ipath_pioavailregs_dma[5],
+ devdata[t].ipath_pioavailshadow[5],
+ devdata[t].ipath_pioavailregs_dma[6],
+ devdata[t].ipath_pioavailshadow[6],
+ devdata[t].ipath_pioavailregs_dma[7],
+ devdata[t].ipath_pioavailshadow[7]);
+ }
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ for (i = 0; i < piobregs; i++) {
+ uint64_t pchbusy, pchg, piov, pnew;
+ /* Chip Errata: bug 6641; even and odd qwords>3 are swapped */
+ piov = devdata[t].ipath_pioavailregs_dma[i > 3 ? i ^ 1 : i];
+ pchg =
+ _IPATH_ALL_CHECKBITS & ~(devdata[t].
+ ipath_pioavailshadow[i] ^ piov);
+ pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
+ if (pchg && (pchbusy & devdata[t].ipath_pioavailshadow[i])) {
+ pnew = devdata[t].ipath_pioavailshadow[i] & ~pchbusy;
+ pnew |= piov & pchbusy;
+ devdata[t].ipath_pioavailshadow[i] = pnew;
+ }
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+static int ipath_do_user_init(ipath_portdata * pd,
+ struct ipath_user_info *uinfo)
+{
+ int ret = 0;
+ ipath_type t = pd->port_unit;
+ ipath_devdata *dd = &devdata[t];
+ struct ipath_user_info kinfo;
+
+ if (copy_from_user(&kinfo, uinfo, sizeof kinfo))
+ ret = -EFAULT;
+ else {
+ /* for now, if major version is different, bail */
+ if ((kinfo.spu_userversion >> 16) != IPATH_USER_SWMAJOR) {
+ _IPATH_INFO
+ ("User major version %d not same as driver major %d\n",
+ kinfo.spu_userversion >> 16, IPATH_USER_SWMAJOR);
+ ret = -ENODEV;
+ } else {
+ if ((kinfo.spu_userversion & 0xffff) !=
+ IPATH_USER_SWMINOR)
+ _IPATH_DBG
+ ("User minor version %d not same as driver minor %d\n",
+ kinfo.spu_userversion & 0xffff,
+ IPATH_USER_SWMINOR);
+ if (kinfo.spu_rcvhdrsize) {
+ if ((ret =
+ ipath_setrcvhdrsize(t,
+ kinfo.spu_rcvhdrsize)))
+ goto done;
+ } else if (!dd->ipath_rcvhdrsize) {
+ /*
+ * first user of field, kernel or user
+ * code, and using default
+ */
+ dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+ ipath_kput_kreg(pd->port_unit, kr_rcvhdrsize,
+ dd->ipath_rcvhdrsize);
+ _IPATH_VDBG
+ ("Use default protocol header size %u\n",
+ dd->ipath_rcvhdrsize);
+ }
+
+ pd->port_egrskip = kinfo.spu_egrskip;
+ if (pd->port_egrskip) {
+ if (pd->port_egrskip & 3) {
+ _IPATH_DBG
+ ("eager skip 0x%x invalid, must be word multiple; using 0x%x\n",
+ pd->port_egrskip,
+ pd->port_egrskip & ~3);
+ pd->port_egrskip &= ~3;
+ }
+ _IPATH_DBG
+ ("user reserves 0x%x bytes at start of eager TIDs\n",
+ pd->port_egrskip);
+ }
+
+ /*
+ * for now we do nothing with rcvhdrcnt:
+ * kinfo.spu_rcvhdrcnt
+ */
+
+ /*
+ * set up for the rcvhdr Q tail register writeback
+ * to user memory
+ */
+ if (kinfo.spu_rcvhdraddr &&
+ access_ok(VERIFY_WRITE, kinfo.spu_rcvhdraddr,
+ sizeof(uint64_t))) {
+ uint64_t physaddr, uaddr, off, atmp;
+ struct page *pagep;
+ off = offset_in_page(kinfo.spu_rcvhdraddr);
+ uaddr =
+ PAGE_MASK & (unsigned long)kinfo.
+ spu_rcvhdraddr;
+ if ((ret = ipath_mlock_nocopy(uaddr, &pagep))) {
+ _IPATH_INFO
+ ("Failed to lookup and lock address %llx for rcvhdrtail: errno %d\n",
+ kinfo.spu_rcvhdraddr, -ret);
+ goto done;
+ }
+ ipath_stats.sps_pagelocks++;
+ pd->port_rcvhdrtail_uaddr = uaddr;
+ pd->port_rcvhdrtail_pagep = pagep;
+ pd->port_rcvhdrtail_kvaddr =
+ page_address(pagep);
+ pd->port_rcvhdrtail_kvaddr += off;
+ physaddr = page_to_phys(pagep) + off;
+ _IPATH_VDBG
+ ("port %d user addr %llx hdrtailaddr, %llx physical (off=%llx)\n",
+ pd->port_port, kinfo.spu_rcvhdraddr,
+ physaddr, off);
+ ipath_kput_kreg_port(t, kr_rcvhdrtailaddr,
+ pd->port_port, physaddr);
+ atmp =
+ ipath_kget_kreg64_port(t, kr_rcvhdrtailaddr,
+ pd->port_port);
+ if (physaddr != atmp) {
+ _IPATH_UNIT_ERROR(t,
+ "Catastrophic software error, RcvHdrTailAddr%u written as %llx, read back as %llx\n",
+ pd->port_port,
+ physaddr, atmp);
+ ret = -EINVAL;
+ goto done;
+ }
+ } else {
+ _IPATH_DBG
+ ("Port %d rcvhdrtail addr %llx not valid\n",
+ pd->port_port, kinfo.spu_rcvhdraddr);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * for right now, kernel piobufs are at end,
+ * so port 1 is at 0
+ */
+ pd->port_piobufs = dd->ipath_piobufbase +
+ dd->ipath_pbufsport * (pd->port_port -
+ 1) * dd->ipath_palign;
+ _IPATH_VDBG("Set base of piobufs for port %u to 0x%x\n",
+ pd->port_port, pd->port_piobufs);
+
+ /*
+ * Now allocate the rcvhdr Q and eager TIDs;
+ * skip the TID array for time being.
+ * If pd->port_port > chip-supported, we need
+ * to do extra stuff here to handle by handling
+ * overflow through port 0, someday
+ */
+ if (!(ret = ipath_create_rcvhdrq(pd)))
+ ret = ipath_create_user_egr(pd);
+ if (!ret) { /* enable receives now */
+ uint64_t head;
+ uint32_t head32;
+ /* atomically set enable bit for this port */
+ atomic_set_mask(1U <<
+ (INFINIPATH_R_PORTENABLE_SHIFT +
+ pd->port_port),
+ &dd->ipath_rcvctrl);
+
+ /*
+ * set the head registers for this port
+ * to the current values of the tail
+ * pointers, since we don't know if they
+ * were updated on last use of the port.
+ */
+ head32 =
+ ipath_kget_ureg32(t, ur_rcvhdrtail,
+ pd->port_port);
+ head = (uint64_t) head32;
+ ipath_kput_ureg(t, ur_rcvhdrhead, head,
+ pd->port_port);
+ head32 =
+ ipath_kget_ureg32(t, ur_rcvegrindextail,
+ pd->port_port);
+ ipath_kput_ureg(t, ur_rcvegrindexhead, head32,
+ pd->port_port);
+ dd->ipath_lastegrheads[pd->port_port] = ~0;
+ dd->ipath_lastrcvhdrqtails[pd->port_port] = ~0;
+ _IPATH_VDBG
+ ("Wrote port%d head %llx, egrhead %x from tail regs\n",
+ pd->port_port, head, head32);
+ /* start at beginning after open */
+ pd->port_tidcursor = 0;
+ {
+ /*
+ * now enable the port; the tail
+ * registers will be written to
+ * memory by the chip as soon
+ * as it sees the write to
+ * kr_rcvctrl. The update only
+ * happens on transition from 0
+ * to 1, so clear it first, then
+ * set it as part of enabling
+ * the port. This will (very
+ * briefly) affect any other open
+ * ports, but it shouldn't be long
+ * enough to be an issue.
+ */
+ ipath_kput_kreg(t, kr_rcvctrl,
+ dd->
+ ipath_rcvctrl &
+ ~INFINIPATH_R_TAILUPD);
+ ipath_kput_kreg(t, kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ }
+ }
+ }
+ }
+
+done:
+ return ret;
+}
+
+static int ipath_get_baseinfo(ipath_portdata * pd,
+ struct ipath_base_info *ubase)
+{
+ int ret = 0;
+ struct ipath_base_info kbase;
+ ipath_devdata *dd = &devdata[pd->port_unit];
+
+ /* be sure anything we don't set is 0ed */
+ memset(&kbase, 0, sizeof kbase);
+ kbase.spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
+ kbase.spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
+ kbase.spi_tidegrcnt = dd->ipath_rcvegrcnt;
+ kbase.spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
+ kbase.spi_rcv_egrbuftotlen = pd->port_rcvegrbuf_chunks * PAGE_SIZE * (1 << pd->port_rcvegrbuf_order); /* have to mmap whole thing */
+ kbase.spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
+ kbase.spi_rcv_egrchunksize = kbase.spi_rcv_egrbuftotlen /
+ pd->port_rcvegrbuf_chunks;
+ kbase.spi_tidcnt = dd->ipath_rcvtidcnt;
+ /*
+ * for this use, may be ipath_cfgports summed over all chips that
+ * are are configured and present
+ */
+ kbase.spi_nports = dd->ipath_cfgports;
+ kbase.spi_unit = pd->port_unit; /* unit (chip/board) our port is on */
+ /* for now, only a single page */
+ kbase.spi_tid_maxsize = PAGE_SIZE;
+
+ /*
+ * doing this per port, and based on the skip value, etc.
+ * This has to be the actual buffer size, since the protocol
+ * code treats it as an array.
+ *
+ * These have to be set to user addresses in the user code via mmap
+ * These values are used on return to user code for the mmap target
+ * addresses only. For 32 bit, same 44 bit address problem, so use
+ * the physical address, not virtual. Before 2.6.11, using the
+ * page_address() macro worked, but in 2.6.11, even that returns
+ * the full 64 bit address (upper bits all 1's).
+ * So far, using the physical addresses (or chip offsets, for
+ * chip mapping) works, but no doubt some future kernel release
+ * will chang that, and we'll be on to yet another method of
+ * dealing with this
+ */
+ kbase.spi_rcvhdr_base = (uint64_t) pd->port_rcvhdrq_phys;
+ kbase.spi_rcv_egrbufs = (uint64_t) pd->port_rcvegr_phys;
+ kbase.spi_pioavailaddr = (uint64_t) dd->ipath_pioavailregs_phys;
+ kbase.spi_status = (uint64_t) kbase.spi_pioavailaddr +
+ (void *)dd->ipath_statusp - (void *)dd->ipath_pioavailregs_dma;
+ kbase.spi_piobufbase = (uint64_t) pd->port_piobufs;
+ kbase.__spi_uregbase =
+ dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
+
+ kbase.spi_pioindex = dd->ipath_pbufsport * (pd->port_port - 1);
+ kbase.spi_piocnt = dd->ipath_pbufsport;
+ kbase.spi_pioalign = dd->ipath_palign;
+
+ kbase.spi_qpair = IPATH_KD_QP;
+ kbase.spi_piosize = dd->ipath_ibmaxlen;
+ kbase.spi_mtu = dd->ipath_ibmaxlen; /* maxlen, not ibmtu */
+ kbase.spi_port = pd->port_port;
+ kbase.spi_sw_version = IPATH_KERN_SWVERSION;
+ kbase.spi_hw_version = dd->ipath_revision;
+
+ if (copy_to_user(ubase, &kbase, sizeof kbase))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+/*
+ * return number of units supported by driver. This is infinipath_max,
+ * unless there are no initted units.
+ */
+static int ipath_get_units(void)
+{
+ int i;
+
+ for (i = 0; i < infinipath_max; i++)
+ if (devdata[i].ipath_flags & IPATH_INITTED)
+ return infinipath_max;
+ return 0;
+}
+
+/* write data to the EEPROM on the board */
+static int ipath_wr_eeprom(ipath_portdata * pd, struct ipath_eeprom_req *req)
+{
+ int ret = 0;
+ struct ipath_eeprom_req kreq;
+ void *buf = NULL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM; /* not just any old user can write flash */
+ if (copy_from_user(&kreq, req, sizeof kreq))
+ return -EFAULT;
+ if (!kreq.addr || (kreq.offset + kreq.len) > 128) {
+ _IPATH_DBG
+ ("called with NULL addr %llx, or bad cnt %u or offset %u\n",
+ kreq.addr, kreq.len, kreq.offset);
+ return -EINVAL;
+ }
+
+ if (!(buf = vmalloc(kreq.len))) {
+ ret = -ENOMEM;
+ _IPATH_UNIT_ERROR(pd->port_unit,
+ "Couldn't allocate memory to write %u bytes from eeprom\n",
+ kreq.len);
+ goto done;
+ }
+ if (copy_from_user(buf, (void *)kreq.addr, kreq.len)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ if (ipath_eeprom_write(pd->port_unit, kreq.offset, buf, kreq.len)) {
+ ret = -ENXIO;
+ _IPATH_UNIT_ERROR(pd->port_unit,
+ "Failed write to eeprom %u bytes offset %u\n",
+ kreq.len, kreq.offset);
+ }
+
+done:
+ if (buf)
+ vfree(buf);
+ return ret;
+}
+
+/* read data from the EEPROM on the board */
+int ipath_rd_eeprom(const ipath_type port_unit, struct ipath_eeprom_req *req)
+{
+ int ret = 0;
+ struct ipath_eeprom_req kreq;
+ void *buf = NULL;
+
+ if (copy_from_user(&kreq, req, sizeof kreq))
+ return -EFAULT;
+ if (!kreq.addr || (kreq.offset + kreq.len) > 128) {
+ _IPATH_DBG
+ ("called with NULL addr %llx, or bad cnt %u or offset %u\n",
+ kreq.addr, kreq.len, kreq.offset);
+ return -EINVAL;
+ }
+
+ if (!(buf = vmalloc(kreq.len))) {
+ ret = -ENOMEM;
+ _IPATH_UNIT_ERROR(port_unit,
+ "Couldn't allocate memory to read %u bytes from eeprom\n",
+ kreq.len);
+ goto done;
+ }
+ if (ipath_eeprom_read(port_unit, kreq.offset, buf, kreq.len)) {
+ ret = -ENXIO;
+ _IPATH_UNIT_ERROR(port_unit,
+ "Failed reading %u bytes offset %u from eeprom\n",
+ kreq.len, kreq.offset);
+ }
+ if (copy_to_user((void *)kreq.addr, buf, kreq.len))
+ ret = -EFAULT;
+
+done:
+ if (buf)
+ vfree(buf);
+ return ret;
+}
+
+/*
+ * wait for something to happen on a port. Currently this is
+ * PIO buffer available, or a packet being received. For now, at
+ * least, we wait no longer than 1/2 seconds on rcv, 1 tick on PIO, so
+ * we recover from any bugs (or, as we see in ips.c init and close, cases
+ * where other side isn't yet ready).
+ * NOTE: currently called only with PIO or RCV, never both, so path with both
+ * has not been tested
+ */
+static int ipath_wait_intr(ipath_portdata * pd, uint32_t flag)
+{
+ ipath_devdata *dd = &devdata[pd->port_unit];
+ /* stupid compiler can't tell it's initialized */
+ uint32_t im = 0;
+ uint32_t head, tail, timeo = 0, wflag = 0;
+
+ if (!(flag & (IPATH_WAIT_RCV | IPATH_WAIT_PIO)))
+ return -EINVAL;
+ if (flag & IPATH_WAIT_RCV) {
+ head = flag >> 16;
+ im = (1U << pd->port_port) << INFINIPATH_R_INTRAVAIL_SHIFT;
+ atomic_set_mask(im, &dd->ipath_rcvctrl);
+ /*
+ * now, before blocking, make sure that head is still == tail,
+ * reading from the chip, so we can be sure the interrupt enable
+ * has made it to the chip. If not equal, disable
+ * interrupt again and return immediately. This avoids
+ * races, and the overhead of the chip read doesn't
+ * matter much at this point, since we are waiting for
+ * something anyway.
+ */
+ ipath_kput_kreg(pd->port_unit, kr_rcvctrl, dd->ipath_rcvctrl);
+ tail =
+ ipath_kget_ureg32(pd->port_unit, ur_rcvhdrtail,
+ pd->port_port);
+ if (tail == head) {
+ timeo = HZ / 2;
+ wflag = IPATH_PORT_WAITING_RCV;
+ } else {
+ atomic_clear_mask(im, &dd->ipath_rcvctrl);
+ ipath_kput_kreg(pd->port_unit, kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ }
+ }
+ if (flag & IPATH_WAIT_PIO) {
+ /*
+ * this one's a bit worse than the receive case, in that we
+ * can't really verify that at least one interrupt
+ * will happen...
+ * We do use a really short timeout, however
+ */
+ timeo = 1; /* if both, the short PIO timeout wins */
+ atomic_set_mask(1U << pd->port_port, &dd->ipath_portpiowait);
+ wflag |= IPATH_PORT_WAITING_PIO;
+ /*
+ * this has a possible race with the ipath stuff, so do
+ * it atomicly
+ */
+ atomic_set_mask(INFINIPATH_S_PIOINTBUFAVAIL,
+ &dd->ipath_sendctrl);
+ ipath_kput_kreg(pd->port_unit, kr_sendctrl, dd->ipath_sendctrl);
+ }
+ if (wflag) {
+ pd->port_flag |= wflag;
+ wait_event_interruptible_timeout(pd->port_wait,
+ (pd->port_flag & wflag) !=
+ wflag, timeo);
+ if (wflag & pd->port_flag & IPATH_PORT_WAITING_PIO) {
+ /* timed out, no PIO interrupts */
+ atomic_clear_mask(IPATH_PORT_WAITING_PIO,
+ &pd->port_flag);
+ pd->port_piowait_to++;
+ atomic_clear_mask(1U << pd->port_port,
+ &dd->ipath_portpiowait);
+ /*
+ * *don't* clear the pio interrupt enable;
+ * let that happen in the interrupt handler;
+ * else we have a race condition.
+ */
+ }
+ if (wflag & pd->port_flag & IPATH_PORT_WAITING_RCV) {
+ /* timed out, no packets received */
+ atomic_clear_mask(IPATH_PORT_WAITING_RCV,
+ &pd->port_flag);
+ pd->port_rcvwait_to++;
+ atomic_clear_mask(im, &dd->ipath_rcvctrl);
+ ipath_kput_kreg(pd->port_unit, kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ }
+ } else {
+ /* else it's already happened, don't do wait_event overhead */
+ if (flag & IPATH_WAIT_RCV)
+ pd->port_rcvnowait++;
+ if (flag & IPATH_WAIT_PIO)
+ pd->port_pionowait++;
+ }
+ return 0;
+}
--
0.99.9n
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/