/*
 * dma_test.c -- Standalone bare-metal DMA RX stress test.
 *
 * Tests the PA6T-1682M Ethernet DMA receive path without any driver framework.
 * Sets up MAC + DMA from scratch, receives packets, recycles descriptors
 * in a tight polling loop.  Prints heartbeat to serial.
 *
 * Run WITHOUT pa6t_eth.device loaded (rename/move it from DEVS:).
 * Connect Ethernet cable (100M or 1G), run udp_flood.py from PC.
 * PC needs static ARP for X1000 MAC address (printed on serial at start).
 *
 * Ctrl+C to exit.
 *
 */

#include <exec/exec.h>
#include <exec/memory.h>
#include <proto/exec.h>
#include <proto/dos.h>
#include <expansion/pci.h>
#include <interfaces/expansion.h>
#include <dos/dos.h>

/* ---- PCI standard defines ------------------------------------------------- */

/*
 * Fallback definitions, compiled only when PCI_COMMAND is not already
 * defined by one of the included headers.  The guard tests PCI_COMMAND
 * alone, so the two bit masks are assumed to be provided together with it.
 * init() reads the 16-bit word at PCI_COMMAND and ORs both masks into it
 * for the MAC and the DMA device.
 */
#ifndef PCI_COMMAND
#define PCI_COMMAND         0x04
#define PCI_COMMAND_MEMORY  0x0002
#define PCI_COMMAND_MASTER  0x0004
#endif

/* ---- PASemi PCI device IDs ------------------------------------------------ */

/*
 * Vendor/device IDs handed to IPCI->FindDeviceTags() in init():
 * PASEMI_DEV_GMAC locates mac_dev, PASEMI_DEV_DMA locates dma_dev.
 */
#define PASEMI_VENDOR_ID    0x1959
#define PASEMI_DEV_GMAC     0xa005
#define PASEMI_DEV_DMA      0xa007

/* ---- DMA ring configuration ----------------------------------------------- */

/*
 * RX_RING_SIZE / RX_BUF_RING_SIZE are counted in 64-bit slots: both rings
 * are allocated as SIZE * sizeof(uint64) bytes.  Indices into the rings are
 * wrapped with "& (SIZE - 1)", so both sizes must stay powers of two.
 * NUM_RX_BUFS is the maximum number of data buffers init() tries to
 * allocate; RX_BUF_SIZE is the size in bytes of each one.
 */
#define RX_RING_SIZE        2048
#define RX_BUF_RING_SIZE    RX_RING_SIZE
#define NUM_RX_BUFS         512
#define RX_BUF_SIZE         2048

/* Index used for every PAS_DMA_RXCHAN_*() register and for the MAC's IPC_CHNL setting. */
#define RX_CHANNEL          1
#define MAC_DMA_INTF        5   /* DMA interface number for the MAC on X1000, always 5 */

/*
 * Wrap-around accessor for a ring slot.  Not referenced in the visible part
 * of this file (main() masks indices by hand).
 */
#define RX_DESC(ring, n)    ((ring)[(n) & (RX_RING_SIZE - 1)])

/* ---- RX descriptor bits --------------------------------------------------- */

#define XCT_MACRX_O         0x0400000000000000ULL

/* ---- RX buffer descriptor bits -------------------------------------------- */

#define XCT_RXB_LEN_S       44ULL
#define XCT_RXB_LEN_M       0x0ffff00000000000ULL
#define XCT_RXB_LEN(x)      ((((uint64)(x)) << XCT_RXB_LEN_S) & XCT_RXB_LEN_M)
#define XCT_RXB_ADDR_M      0x00000fffffffffffULL
#define XCT_RXB_ADDR(x)     ((((uint64)(x)) << 0) & XCT_RXB_ADDR_M)

/* ---- MAC config-space registers ------------------------------------------- */

/*
 * Register offsets for the MAC device.  They are accessed through
 * rd_mac()/wr_mac(), i.e. 32-bit PCI config-space reads/writes on mac_dev.
 * PAS_MAC_CFG_RMON(r) addresses counter r at a 4-byte stride; init() zeroes
 * counters 0..31 and dump_hw_state() prints counters 0 and 1.
 */
#define PAS_MAC_CFG_PCFG        0x80
#define PAS_MAC_CFG_MACCFG      0x84
#define PAS_MAC_CFG_ADR0        0x8c
#define PAS_MAC_CFG_ADR1        0x90
#define PAS_MAC_CFG_TXP         0x98
#define PAS_MAC_CFG_RMON(r)     (0x100 + (r)*4)
#define PAS_MAC_IPC_CHNL        0x208

/*
 * PCFG bits.  PE is the port-enable bit: init() clears it while
 * reconfiguring, cleanup() clears it to disable the MAC, and
 * dump_hw_state() reports an error when it is not set.  The remaining bits
 * are written together when the port is enabled in init().
 */
#define PAS_MAC_CFG_PCFG_PE       0x80000000
#define PAS_MAC_CFG_PCFG_CE       0x40000000
#define PAS_MAC_CFG_PCFG_TSR_1G   0x08000000
#define PAS_MAC_CFG_PCFG_PR       0x01000000
#define PAS_MAC_CFG_PCFG_S1       0x00000080
#define PAS_MAC_CFG_PCFG_SPD_1G   0x00000002

/* MACCFG maximum-frame-size field (bits 8..23); init() programs 1518. */
#define PAS_MAC_CFG_MACCFG_MAXF_M   0x00ffff00
#define PAS_MAC_CFG_MACCFG_MAXF(x)  (((x) << 8) & PAS_MAC_CFG_MACCFG_MAXF_M)

/* TXP field builders; each shifts its argument into place and masks it. */
#define PAS_MAC_CFG_TXP_FCE      0x00800000
#define PAS_MAC_CFG_TXP_FPC(x)   (((x) << 20) & 0x00300000)
#define PAS_MAC_CFG_TXP_SL(x)    (((x) << 16) & 0x00030000)
#define PAS_MAC_CFG_TXP_COB(x)   (((x) << 12) & 0x0000f000)
#define PAS_MAC_CFG_TXP_TIFT(x)  (((x) << 8)  & 0x00000f00)
#define PAS_MAC_CFG_TXP_TIFG(x)  ((x)         & 0x000000ff)

/* IPC_CHNL field builders; init() passes RX_CHANNEL to both. */
#define PAS_MAC_IPC_CHNL_DCHNO(x) (((x) << 16) & 0x003f0000)
#define PAS_MAC_IPC_CHNL_BCH(x)   ((x)         & 0x0000003f)

/* ---- DMA controller registers --------------------------------------------- */

/*
 * Register offsets for the DMA device.  They are accessed through
 * rd_dma()/wr_dma(), i.e. 32-bit PCI config-space reads/writes on dma_dev.
 * The COM_* and TXF_* registers are only touched by the controller-init
 * sequence in init().
 */
#define PAS_DMA_COM_TXCMD      0x100
#define PAS_DMA_COM_TXSTA      0x104
#define PAS_DMA_COM_RXCMD      0x108
#define PAS_DMA_COM_RXSTA      0x10c
#define PAS_DMA_COM_CFG        0x114
#define PAS_DMA_TXF_SFLG0     0x140
#define PAS_DMA_TXF_SFLG1     0x144
#define PAS_DMA_TXF_CFLG0     0x148
#define PAS_DMA_TXF_CFLG1     0x14c

#define PAS_DMA_COM_TXCMD_EN   0x00000001
#define PAS_DMA_COM_RXCMD_EN   0x00000001

/* RX interface */
/* Per-interface register block: interface i starts at 0x200 + i*0x20. */
#define PAS_DMA_RXINT_RCMDSTA(i)  (0x200 + (i)*0x20)
#define PAS_DMA_RXINT_CFG(i)      (0x204 + (i)*0x20)
#define PAS_DMA_RXINT_INCR(i)     (0x210 + (i)*0x20)
#define PAS_DMA_RXINT_BASEL(i)    (0x218 + (i)*0x20)
#define PAS_DMA_RXINT_BASEU(i)    (0x21c + (i)*0x20)

/*
 * RCMDSTA bits.  init() writes EN together with DROPS_M, BP, OO, DR and BT
 * when enabling the interface; cleanup() polls ACT while stopping it;
 * dump_hw_state() reports BP, OO and DR when set.
 */
#define PAS_DMA_RXINT_RCMDSTA_EN      0x00000001
#define PAS_DMA_RXINT_RCMDSTA_DROPS_M 0xfffe0000
#define PAS_DMA_RXINT_RCMDSTA_ACT     0x00010000
#define PAS_DMA_RXINT_RCMDSTA_BP      0x00004000
#define PAS_DMA_RXINT_RCMDSTA_OO      0x00002000
#define PAS_DMA_RXINT_RCMDSTA_DR      0x00001000
#define PAS_DMA_RXINT_RCMDSTA_BT      0x00000800

/* Interface CFG bits; init() sets all of them with DHL(2). */
#define PAS_DMA_RXINT_CFG_RBP    0x80000000
#define PAS_DMA_RXINT_CFG_DHL(x) (((x) << 24) & 0x07000000)
#define PAS_DMA_RXINT_CFG_LW     0x00200000
#define PAS_DMA_RXINT_CFG_L2     0x00100000
#define PAS_DMA_RXINT_CFG_HEN    0x00080000

/*
 * Ring base/size encoders: BASEL keeps the address with its low 6 bits
 * cleared (64-byte granularity), BASEU carries 12 upper address bits plus a
 * size field in bits 16..29.
 */
#define PAS_DMA_RXINT_BASEL_BRBL(x) ((x) & ~0x3f)
#define PAS_DMA_RXINT_BASEU_BRBH(x) ((x) & 0xfff)
#define PAS_DMA_RXINT_BASEU_SIZ(x)  (((x) << 16) & 0x3fff0000)

/* RX channel */
/* Per-channel register block: channel c starts at 0x800 + c*0x20. */
#define PAS_DMA_RXCHAN_CCMDSTA(c) (0x800 + (c)*0x20)
#define PAS_DMA_RXCHAN_CFG(c)     (0x804 + (c)*0x20)
#define PAS_DMA_RXCHAN_INCR(c)    (0x810 + (c)*0x20)
#define PAS_DMA_RXCHAN_BASEL(c)   (0x818 + (c)*0x20)
#define PAS_DMA_RXCHAN_BASEU(c)   (0x81c + (c)*0x20)

/*
 * CCMDSTA bits.  init() writes EN together with DU, OD, FD and DT when
 * enabling the channel and refuses to run if ACT is already set; cleanup()
 * writes ST and then polls ACT; dump_hw_state() reports DU, OD and FD.
 */
#define PAS_DMA_RXCHAN_CCMDSTA_EN  0x00000001
#define PAS_DMA_RXCHAN_CCMDSTA_ST  0x00000002
#define PAS_DMA_RXCHAN_CCMDSTA_ACT 0x00010000
#define PAS_DMA_RXCHAN_CCMDSTA_DU  0x00020000
#define PAS_DMA_RXCHAN_CCMDSTA_OD  0x00002000
#define PAS_DMA_RXCHAN_CCMDSTA_FD  0x00001000
#define PAS_DMA_RXCHAN_CCMDSTA_DT  0x00000800

/* Channel CFG / base encoders; same layout as the interface encoders above. */
#define PAS_DMA_RXCHAN_CFG_HBU(x) (((x) << 7) & 0x00000380)
#define PAS_DMA_RXCHAN_BASEL_BRBL(x) ((x) & ~0x3f)
#define PAS_DMA_RXCHAN_BASEU_BRBH(x) ((x) & 0xfff)
#define PAS_DMA_RXCHAN_BASEU_SIZ(x)  (((x) << 16) & 0x3fff0000)

/* ---- Memory barrier ------------------------------------------------------- */

/*
 * Write barrier used before telling the DMA engine about new ring
 * contents: issues a PowerPC "sync" and, through the "memory" clobber,
 * keeps the compiler from moving memory accesses across it.
 */
static inline void pasemi_wmb(void)
{
    __asm__ __volatile__ (
        "sync"
        : /* no outputs */
        : /* no inputs */
        : "memory");
}

/* ---- Globals -------------------------------------------------------------- */

/* Library/interface handles and the two PCI devices; acquired in init(), released in cleanup(). */
static struct Library     *ExpansionBase;
static struct PCIIFace    *IPCI;
static struct PCIDevice   *mac_dev;
static struct PCIDevice   *dma_dev;

/*
 * RX descriptor ring and RX buffer-pointer ring (CPU pointers), their
 * physical addresses as returned by get_phys(), and the entry counts
 * reported by get_phys().  A non-zero *_dma_entries value tells cleanup()
 * that the mapping must be ended with EndDMA() before the memory is freed.
 */
static volatile uint64    *rx_ring;
static volatile uint64    *rx_buf_ring;
static uint32              rx_ring_phys;
static uint32              rx_buf_ring_phys;
static uint32              rx_ring_dma_entries;
static uint32              rx_buf_ring_dma_entries;

/*
 * RX data buffers: CPU pointer, physical address and DMA entry count per
 * buffer.  num_bufs_alloced counts the buffers that were both allocated
 * and mapped successfully; only indices below it are valid.
 */
static uint8              *rx_bufs[NUM_RX_BUFS];
static uint32              rx_bufs_phys[NUM_RX_BUFS];
static uint32              rx_bufs_dma_entries[NUM_RX_BUFS];
static uint32              num_bufs_alloced;

/* Ring cursors advanced by the main loop; rx_next_clean indexes rx_ring. */
static uint32              rx_next_clean;
static uint32              rx_next_buf;    /* next slot in buf ring to repost */

/* ---- Register access via PCI interface ------------------------------------ */

/* Read a 32-bit MAC register at config-space offset 'off'. */
static inline uint32 rd_mac(uint32 off)
{
    return mac_dev->ReadConfigLong(off);
}

/* Write 'v' to the 32-bit MAC register at config-space offset 'off'. */
static inline void wr_mac(uint32 off, uint32 v)
{
    mac_dev->WriteConfigLong(off, v);
}

/* Read a 32-bit DMA-controller register at config-space offset 'off'. */
static inline uint32 rd_dma(uint32 off)
{
    return dma_dev->ReadConfigLong(off);
}

/* Write 'v' to the 32-bit DMA-controller register at config-space offset 'off'. */
static inline void wr_dma(uint32 off, uint32 v)
{
    dma_dev->WriteConfigLong(off, v);
}

/* ---- get_phys: DMA-map and return physical address ------------------------ */

/*
 * Start a DMA mapping for [virt, virt+size) and report where it lives
 * physically.
 *
 *   virt, size   region to map
 *   flags        passed unchanged to StartDMA() and GetDMAList()
 *   out_entries  receives the entry count returned by StartDMA(), or 0 on
 *                failure
 *
 * Returns the physical address of the FIRST DMA list entry, truncated to
 * 32 bits, or 0 on failure.  Later entries are not inspected; the callers
 * in this file allocate their memory with AVT_Contiguous.
 *
 * On success the mapping stays active and the caller must eventually call
 * EndDMA().  If the temporary DMA list cannot be allocated the mapping is
 * ended here before returning.
 */
static uint32 get_phys(void *virt, uint32 size, uint32 flags, uint32 *out_entries)
{
    uint32 first_phys = 0;
    uint32 reported   = 0;
    const uint32 count = IExec->StartDMA(virt, size, flags);

    if (count != 0) {
        struct DMAEntry *list = (struct DMAEntry *)IExec->AllocSysObjectTags(
            ASOT_DMAENTRY, ASODMAE_NumEntries, count, TAG_DONE);

        if (list != NULL) {
            IExec->GetDMAList(virt, size, flags, list);
            first_phys = (uint32)list[0].PhysicalAddress;
            IExec->FreeSysObject(ASOT_DMAENTRY, list);
            reported = count;
        } else {
            /* No list memory: undo the mapping so nothing is left dangling. */
            IExec->EndDMA(virt, size, flags | DMAF_NoModify);
        }
    }

    *out_entries = reported;
    return first_phys;
}

/* ---- Cleanup -------------------------------------------------------------- */

static void cleanup(void)
{
    uint32 i;

    /* Disable MAC */
    if (mac_dev) {
        uint32 pcfg = rd_mac(PAS_MAC_CFG_PCFG);
        wr_mac(PAS_MAC_CFG_PCFG, pcfg & ~PAS_MAC_CFG_PCFG_PE);
    }

    /* Stop RX channel */
    if (dma_dev) {
        wr_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL), PAS_DMA_RXCHAN_CCMDSTA_ST);
        for (i = 0; i < 50000; i++) {
            if (!(rd_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL)) & PAS_DMA_RXCHAN_CCMDSTA_ACT))
                break;
        }
        wr_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL), 0);

        /* Stop RX interface */
        wr_dma(PAS_DMA_RXINT_RCMDSTA(MAC_DMA_INTF), 0x00000002);  /* ST bit */
        for (i = 0; i < 50000; i++) {
            if (!(rd_dma(PAS_DMA_RXINT_RCMDSTA(MAC_DMA_INTF)) & PAS_DMA_RXINT_RCMDSTA_ACT))
                break;
        }
        wr_dma(PAS_DMA_RXINT_RCMDSTA(MAC_DMA_INTF), 0);
    }

    /* Free RX data buffers */
    for (i = 0; i < NUM_RX_BUFS; i++) {
        if (rx_bufs[i]) {
            if (rx_bufs_dma_entries[i])
                IExec->EndDMA(rx_bufs[i], RX_BUF_SIZE, DMAF_NoModify);
            IExec->FreeVec(rx_bufs[i]);
            rx_bufs[i] = NULL;
        }
    }

    /* Free RX buffer ring */
    if (rx_buf_ring) {
        if (rx_buf_ring_dma_entries)
            IExec->EndDMA((void *)rx_buf_ring,
                          RX_BUF_RING_SIZE * sizeof(uint64), DMAF_NoModify);
        IExec->FreeVec((void *)rx_buf_ring);
        rx_buf_ring = NULL;
    }

    /* Free RX descriptor ring */
    if (rx_ring) {
        if (rx_ring_dma_entries)
            IExec->EndDMA((void *)rx_ring,
                          RX_RING_SIZE * sizeof(uint64), DMAF_NoModify);
        IExec->FreeVec((void *)rx_ring);
        rx_ring = NULL;
    }

    /* Release PCI devices */
    if (IPCI) {
        if (dma_dev) { IPCI->FreeDevice(dma_dev); dma_dev = NULL; }
        if (mac_dev) { IPCI->FreeDevice(mac_dev); mac_dev = NULL; }
        IExec->DropInterface((struct Interface *)IPCI);
        IPCI = NULL;
    }
    if (ExpansionBase) {
        IExec->CloseLibrary(ExpansionBase);
        ExpansionBase = NULL;
    }
}

/* ---- dump_hw_state: print HW register diagnostics ------------------------- */

static void dump_hw_state(const char *label)
{
    uint32 pcfg   = rd_mac(PAS_MAC_CFG_PCFG);
    uint32 rxcsta = rd_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL));
    uint32 rxista = rd_dma(PAS_DMA_RXINT_RCMDSTA(MAC_DMA_INTF));
    uint32 rxcmd  = rd_dma(PAS_DMA_COM_RXCMD);

    IExec->DebugPrintF("[dma_test] HW %s: PCFG=0x%08lX RXCSTA=0x%08lX"
                       " RXISTA=0x%08lX RXCMD=0x%08lX\n",
                       label, (ULONG)pcfg, (ULONG)rxcsta,
                       (ULONG)rxista, (ULONG)rxcmd);

    if (!(pcfg & PAS_MAC_CFG_PCFG_PE))
        IExec->DebugPrintF("[dma_test] ERROR: MAC port NOT enabled!\n");
    if (!(rxcsta & PAS_DMA_RXCHAN_CCMDSTA_EN))
        IExec->DebugPrintF("[dma_test] ERROR: RX channel NOT enabled!\n");
    if (!(rxista & PAS_DMA_RXINT_RCMDSTA_EN))
        IExec->DebugPrintF("[dma_test] ERROR: RX interface NOT enabled!\n");

    /* DMA error flags */
    if (rxcsta & PAS_DMA_RXCHAN_CCMDSTA_DU)
        IExec->DebugPrintF("[dma_test] RX chan: descriptor underrun (DU)\n");
    if (rxcsta & PAS_DMA_RXCHAN_CCMDSTA_OD)
        IExec->DebugPrintF("[dma_test] RX chan: out-of-descriptors (OD)\n");
    if (rxcsta & PAS_DMA_RXCHAN_CCMDSTA_FD)
        IExec->DebugPrintF("[dma_test] RX chan: frame drop (FD)\n");
    if (rxista & PAS_DMA_RXINT_RCMDSTA_BP)
        IExec->DebugPrintF("[dma_test] RX intf: backpressure (BP)\n");
    if (rxista & PAS_DMA_RXINT_RCMDSTA_OO)
        IExec->DebugPrintF("[dma_test] RX intf: out-of-order (OO)\n");
    if (rxista & PAS_DMA_RXINT_RCMDSTA_DR)
        IExec->DebugPrintF("[dma_test] RX intf: descriptor read error (DR)\n");

    /* RMON counters */
    uint32 rmon0 = rd_mac(PAS_MAC_CFG_RMON(0));
    uint32 rmon1 = rd_mac(PAS_MAC_CFG_RMON(1));
    IExec->DebugPrintF("[dma_test] RMON[0]=0x%08lX RMON[1]=0x%08lX\n",
                       (ULONG)rmon0, (ULONG)rmon1);
}

/* ---- dump_ring: print first few ring entries ------------------------------ */

static void dump_ring(void)
{
    uint32 rx_count = 0;
    uint32 idx;
    for (idx = 0; idx < RX_RING_SIZE; idx += 4) {
        if (rx_ring[idx] & XCT_MACRX_O)
            rx_count++;
    }
    IExec->DebugPrintF("[dma_test] Verification: %lu/%lu descriptors filled by DMA\n",
                       (ULONG)rx_count, (ULONG)(RX_RING_SIZE / 4));

    IExec->DebugPrintF("[dma_test] ring[0]=0x%08lX_%08lX\n",
                       (ULONG)(rx_ring[0] >> 32), (ULONG)(rx_ring[0] & 0xFFFFFFFF));
    IExec->DebugPrintF("[dma_test] ring[1]=0x%08lX_%08lX\n",
                       (ULONG)(rx_ring[1] >> 32), (ULONG)(rx_ring[1] & 0xFFFFFFFF));
    IExec->DebugPrintF("[dma_test] ring[2]=0x%08lX_%08lX\n",
                       (ULONG)(rx_ring[2] >> 32), (ULONG)(rx_ring[2] & 0xFFFFFFFF));
    IExec->DebugPrintF("[dma_test] ring[3]=0x%08lX_%08lX\n",
                       (ULONG)(rx_ring[3] >> 32), (ULONG)(rx_ring[3] & 0xFFFFFFFF));

    IExec->DebugPrintF("[dma_test] bufring[0]=0x%08lX_%08lX\n",
                       (ULONG)(rx_buf_ring[0] >> 32), (ULONG)(rx_buf_ring[0] & 0xFFFFFFFF));
    IExec->DebugPrintF("[dma_test] bufring[1]=0x%08lX_%08lX\n",
                       (ULONG)(rx_buf_ring[1] >> 32), (ULONG)(rx_buf_ring[1] & 0xFFFFFFFF));

    if (rx_count == 0)
        IExec->DebugPrintF("[dma_test] WARNING: NO packets received! Test invalid.\n");
}

/* ---- init: one-time hardware setup ---------------------------------------- */

static int init(void)
{
    uint32 i;

    IExec->DebugPrintF("\n[dma_test] ==============================\n");
    IExec->DebugPrintF("[dma_test] Standalone DMA RX stress test\n");
    IExec->DebugPrintF("[dma_test] ==============================\n");

    /* ---- Open PCI ---- */
    ExpansionBase = IExec->OpenLibrary("expansion.library", 54);
    if (!ExpansionBase) {
        IExec->DebugPrintF("[dma_test] FAIL: no expansion.library\n");
        return 0;
    }
    IPCI = (struct PCIIFace *)IExec->GetInterface(ExpansionBase, "pci", 1, NULL);
    if (!IPCI) {
        IExec->DebugPrintF("[dma_test] FAIL: no PCI interface\n");
        return 0;
    }

    /* ---- Find PCI devices ---- */
    mac_dev = IPCI->FindDeviceTags(
        FDT_VendorID, PASEMI_VENDOR_ID, FDT_DeviceID, PASEMI_DEV_GMAC, TAG_DONE);
    dma_dev = IPCI->FindDeviceTags(
        FDT_VendorID, PASEMI_VENDOR_ID, FDT_DeviceID, PASEMI_DEV_DMA, TAG_DONE);

    if (!mac_dev || !dma_dev) {
        IExec->DebugPrintF("[dma_test] FAIL: PCI devices not found"
                           " (mac=%p dma=%p)\n", mac_dev, dma_dev);
        return 0;
    }
    IExec->DebugPrintF("[dma_test] PCI: MAC=%p DMA=%p\n", mac_dev, dma_dev);

    /* ---- Safety: check RX channel not already in use ---- */
    {
        uint32 rxsta = rd_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL));
        if (rxsta & PAS_DMA_RXCHAN_CCMDSTA_ACT) {
            IExec->DebugPrintF("[dma_test] FAIL: RX channel already active!"
                               "  RX_STA=0x%08lX\n"
                               "  Is pa6t_eth.device still loaded?\n", rxsta);
            return 0;
        }
    }

    /* ---- Read and display MAC address ---- */
    {
        uint32 adr0 = rd_mac(PAS_MAC_CFG_ADR0);
        uint32 adr1 = rd_mac(PAS_MAC_CFG_ADR1);
        IExec->DebugPrintF("[dma_test] MAC addr: %02lX:%02lX:%02lX:%02lX:%02lX:%02lX\n",
            (ULONG)((adr1 >> 8) & 0xff), (ULONG)(adr1 & 0xff),
            (ULONG)((adr0 >> 24) & 0xff), (ULONG)((adr0 >> 16) & 0xff),
            (ULONG)((adr0 >> 8) & 0xff), (ULONG)(adr0 & 0xff));
        IExec->DebugPrintF("[dma_test] Set static ARP on PC for this MAC!\n");
    }

    /* ---- DMA controller init (matches Linux pasemi_dma_init) ---- */
    {
        /* Disable TX section, wait idle */
        wr_dma(PAS_DMA_COM_TXCMD, 0);
        for (i = 0; i < 100000; i++)
            if (!(rd_dma(PAS_DMA_COM_TXSTA) & 0x01)) break;

        /* COM_CFG bits [28:27] = 0b11 (undocumented but Linux always sets) */
        uint32 cfg = rd_dma(PAS_DMA_COM_CFG);
        wr_dma(PAS_DMA_COM_CFG, cfg | 0x18000000);

        /* Re-enable TX section */
        wr_dma(PAS_DMA_COM_TXCMD, PAS_DMA_COM_TXCMD_EN);

        /* Clear TX status flags */
        wr_dma(PAS_DMA_TXF_SFLG0, 0);
        wr_dma(PAS_DMA_TXF_SFLG1, 0);
        wr_dma(PAS_DMA_TXF_CFLG0, 0xffffffff);
        wr_dma(PAS_DMA_TXF_CFLG1, 0xffffffff);

        /* Stop RX section */
        wr_dma(PAS_DMA_COM_RXCMD, 0);
        for (i = 0; i < 100000; i++)
            if (!(rd_dma(PAS_DMA_COM_RXSTA) & 0x01)) break;

        /* Hard-reset RX channel and interface */
        wr_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL), 0);
        wr_dma(PAS_DMA_RXINT_RCMDSTA(MAC_DMA_INTF), 0);

        IExec->DebugPrintF("[dma_test] DMA controller init done\n");
    }

    /* ---- Allocate RX descriptor ring ---- */
    {
        uint32 bytes = RX_RING_SIZE * sizeof(uint64);
        rx_ring = (volatile uint64 *)IExec->AllocVecTags(bytes,
            AVT_Type, MEMF_SHARED, AVT_Contiguous, TRUE,
            AVT_Alignment, 64, AVT_ClearWithValue, 0, TAG_DONE);
        if (!rx_ring) {
            IExec->DebugPrintF("[dma_test] FAIL: RX ring alloc\n");
            return 0;
        }
        rx_ring_phys = get_phys((void *)rx_ring, bytes, 0, &rx_ring_dma_entries);
        if (!rx_ring_phys) {
            IExec->DebugPrintF("[dma_test] FAIL: RX ring DMA-map\n");
            return 0;
        }
        IExec->DebugPrintF("[dma_test] RX ring: virt=%p phys=0x%08lX\n",
                           rx_ring, (ULONG)rx_ring_phys);
    }

    /* ---- Allocate RX buffer pointer ring ---- */
    {
        uint32 bytes = RX_BUF_RING_SIZE * sizeof(uint64);
        rx_buf_ring = (volatile uint64 *)IExec->AllocVecTags(bytes,
            AVT_Type, MEMF_SHARED, AVT_Contiguous, TRUE,
            AVT_Alignment, 64, AVT_ClearWithValue, 0, TAG_DONE);
        if (!rx_buf_ring) {
            IExec->DebugPrintF("[dma_test] FAIL: RX buf ring alloc\n");
            return 0;
        }
        rx_buf_ring_phys = get_phys((void *)rx_buf_ring, bytes, 0,
                                    &rx_buf_ring_dma_entries);
        if (!rx_buf_ring_phys) {
            IExec->DebugPrintF("[dma_test] FAIL: RX buf ring DMA-map\n");
            return 0;
        }
        IExec->DebugPrintF("[dma_test] RX buf ring: virt=%p phys=0x%08lX\n",
                           rx_buf_ring, (ULONG)rx_buf_ring_phys);
    }

    /* ---- Mark rings cache-inhibited + coherent + guarded ---- */
    {
        struct MMUIFace *IMMU = (struct MMUIFace *)
            IExec->GetInterface(IExec->Data.LibBase, "mmu", 1, NULL);
        if (IMMU) {
            /* All three flags as recommended by SDK fsldma.h for DMA memory */
            uint32 attrs = MEMATTRF_CACHEINHIBIT |
                           MEMATTRF_COHERENT     |
                           MEMATTRF_GUARDED      |
                           MEMATTRF_SUPER_RW_USER_RW;
            IMMU->SetMemoryAttrs((void *)rx_ring,
                                 RX_RING_SIZE * sizeof(uint64), attrs);
            IMMU->SetMemoryAttrs((void *)rx_buf_ring,
                                 RX_BUF_RING_SIZE * sizeof(uint64), attrs);
            IExec->DropInterface((struct Interface *)IMMU);
            IExec->DebugPrintF("[dma_test] rings: CI + coherent + guarded\n");
        } else {
            IExec->DebugPrintF("[dma_test] WARNING: MMU unavailable!"
                               "  Rings NOT cache-inhibited\n");
        }
    }

    /* ---- Allocate RX data buffers ---- */
    num_bufs_alloced = 0;
    for (i = 0; i < NUM_RX_BUFS; i++) {
        rx_bufs[i] = (uint8 *)IExec->AllocVecTags(RX_BUF_SIZE,
            AVT_Type, MEMF_SHARED, AVT_Contiguous, TRUE,
            AVT_Alignment, 64, AVT_ClearWithValue, 0, TAG_DONE);
        if (!rx_bufs[i]) break;
        rx_bufs_phys[i] = get_phys(rx_bufs[i], RX_BUF_SIZE, 0,
                                   &rx_bufs_dma_entries[i]);
        if (!rx_bufs_phys[i]) {
            IExec->FreeVec(rx_bufs[i]);
            rx_bufs[i] = NULL;
            break;
        }
        num_bufs_alloced++;
    }
    IExec->DebugPrintF("[dma_test] %lu/%lu RX buffers allocated\n",
                       (ULONG)num_bufs_alloced, (ULONG)NUM_RX_BUFS);
    IExec->DebugPrintF("[dma_test] RX data buffers: normal (cached)\n");

    if (num_bufs_alloced == 0) {
        IExec->DebugPrintF("[dma_test] FAIL: no RX buffers\n");
        return 0;
    }

    /* ---- Configure RX channel ---- */
    {
        uint32 siz = RX_RING_SIZE >> 3;
        wr_dma(PAS_DMA_RXCHAN_BASEL(RX_CHANNEL),
               PAS_DMA_RXCHAN_BASEL_BRBL(rx_ring_phys));
        wr_dma(PAS_DMA_RXCHAN_BASEU(RX_CHANNEL),
               PAS_DMA_RXCHAN_BASEU_BRBH(0) | PAS_DMA_RXCHAN_BASEU_SIZ(siz));
        wr_dma(PAS_DMA_RXCHAN_CFG(RX_CHANNEL), PAS_DMA_RXCHAN_CFG_HBU(2));
    }

    /* ---- Configure RX interface ---- */
    {
        uint32 siz = RX_RING_SIZE >> 3;
        wr_dma(PAS_DMA_RXINT_BASEL(MAC_DMA_INTF),
               PAS_DMA_RXINT_BASEL_BRBL(rx_buf_ring_phys));
        wr_dma(PAS_DMA_RXINT_BASEU(MAC_DMA_INTF),
               PAS_DMA_RXINT_BASEU_BRBH(0) | PAS_DMA_RXINT_BASEU_SIZ(siz));

        uint32 cfg = PAS_DMA_RXINT_CFG_RBP      |
                     PAS_DMA_RXINT_CFG_DHL(2)    |
                     PAS_DMA_RXINT_CFG_LW        |
                     PAS_DMA_RXINT_CFG_L2        |
                     PAS_DMA_RXINT_CFG_HEN;
        wr_dma(PAS_DMA_RXINT_CFG(MAC_DMA_INTF), cfg);
    }

    /* ---- Re-enable RX section ---- */
    wr_dma(PAS_DMA_COM_RXCMD, PAS_DMA_COM_RXCMD_EN);

    /* ---- Enable RX interface + channel ---- */
    wr_dma(PAS_DMA_RXINT_RCMDSTA(MAC_DMA_INTF),
           PAS_DMA_RXINT_RCMDSTA_EN      |
           PAS_DMA_RXINT_RCMDSTA_DROPS_M |
           PAS_DMA_RXINT_RCMDSTA_BP      |
           PAS_DMA_RXINT_RCMDSTA_OO      |
           PAS_DMA_RXINT_RCMDSTA_DR      |
           PAS_DMA_RXINT_RCMDSTA_BT);

    wr_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL),
           PAS_DMA_RXCHAN_CCMDSTA_EN |
           PAS_DMA_RXCHAN_CCMDSTA_DU |
           PAS_DMA_RXCHAN_CCMDSTA_OD |
           PAS_DMA_RXCHAN_CCMDSTA_FD |
           PAS_DMA_RXCHAN_CCMDSTA_DT);

    /* ---- Post initial buffers ---- */
    for (i = 0; i < num_bufs_alloced; i++) {
        rx_buf_ring[i] = XCT_RXB_LEN(RX_BUF_SIZE) |
                         XCT_RXB_ADDR(rx_bufs_phys[i]);
    }
    pasemi_wmb();

    wr_dma(PAS_DMA_RXINT_INCR(MAC_DMA_INTF), num_bufs_alloced);
    wr_dma(PAS_DMA_RXCHAN_INCR(RX_CHANNEL), RX_RING_SIZE >> 1);

    /* ---- Enable BusMaster on MAC + DMA ---- */
    {
        uint16 cmd = mac_dev->ReadConfigWord(PCI_COMMAND);
        mac_dev->WriteConfigWord(PCI_COMMAND,
            cmd | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
    }
    {
        uint16 cmd = dma_dev->ReadConfigWord(PCI_COMMAND);
        dma_dev->WriteConfigWord(PCI_COMMAND,
            cmd | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
    }

    /* ---- Configure and enable MAC ---- */
    {
        /* Disable while configuring */
        uint32 pcfg = rd_mac(PAS_MAC_CFG_PCFG);
        wr_mac(PAS_MAC_CFG_PCFG, pcfg & ~PAS_MAC_CFG_PCFG_PE);

        /* Max frame size */
        uint32 maccfg = rd_mac(PAS_MAC_CFG_MACCFG);
        maccfg &= ~PAS_MAC_CFG_MACCFG_MAXF_M;
        maccfg |= PAS_MAC_CFG_MACCFG_MAXF(1518);
        wr_mac(PAS_MAC_CFG_MACCFG, maccfg);

        /* TX parameters (needed for MAC even though we don't TX) */
        wr_mac(PAS_MAC_CFG_TXP,
               PAS_MAC_CFG_TXP_FCE     |
               PAS_MAC_CFG_TXP_FPC(3)  |
               PAS_MAC_CFG_TXP_SL(3)   |
               PAS_MAC_CFG_TXP_COB(0xf)|
               PAS_MAC_CFG_TXP_TIFT(8) |
               PAS_MAC_CFG_TXP_TIFG(12));

        /* Zero RMON counters */
        for (i = 0; i < 32; i++)
            wr_mac(PAS_MAC_CFG_RMON(i), 0);

        /* Assign DMA channel to MAC */
        wr_mac(PAS_MAC_IPC_CHNL,
               PAS_MAC_IPC_CHNL_DCHNO(RX_CHANNEL) |
               PAS_MAC_IPC_CHNL_BCH(RX_CHANNEL));

        /* Enable: 1 Gbps full-duplex, promiscuous */
        pcfg = PAS_MAC_CFG_PCFG_S1       |
               PAS_MAC_CFG_PCFG_PR        |
               PAS_MAC_CFG_PCFG_CE        |
               PAS_MAC_CFG_PCFG_TSR_1G    |
               PAS_MAC_CFG_PCFG_SPD_1G    |
               PAS_MAC_CFG_PCFG_PE;
        wr_mac(PAS_MAC_CFG_PCFG, pcfg);
    }

    rx_next_clean = 0;
    rx_next_buf   = num_bufs_alloced;  /* first free buf ring slot */

    IExec->DebugPrintF("[dma_test] MAC enabled, DMA running.\n");

    dump_hw_state("init");

    return 1;
}

/* ---- Main ----------------------------------------------------------------- */

/*
 * Program entry point.
 *
 * Runs init(); on failure calls cleanup() and returns RETURN_FAIL.
 * Otherwise it spins in the BLIND_RECYCLE loop until Ctrl+C is signalled,
 * then prints the hardware state and ring contents, calls cleanup() and
 * returns RETURN_OK.
 *
 * Each loop iteration clears 16 four-slot groups of the descriptor ring,
 * reposts 16 buffers into the buffer ring, issues a write barrier and then
 * writes the two INCR registers.  The descriptor ring is never read inside
 * the loop.  A heartbeat line is printed every 100000 iterations and the
 * Ctrl+C signal is polled every 256 iterations.
 *
 * argc/argv are unused.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    if (!init()) {
        cleanup();
        return RETURN_FAIL;
    }

    /*
     * BLIND_RECYCLE: recycle descriptors and give INCRs but NEVER read O-bit.
     * Tests whether CPU writes to the ring (without reads) cause the lockup.
     * Blindly clears 16 descriptors per batch, reposts buffers, gives INCRs.
     */
    IExec->DebugPrintF("[dma_test] *** BLIND_RECYCLE: write-only, no O-bit read ***\n");
    IExec->DebugPrintF("[dma_test] Start udp_flood.py on PC now.\n");
    IExec->DebugPrintF("[dma_test] Ctrl+C to stop.\n");

    {
        uint32 loops = 0;
        uint32 heartbeat = 0;
        uint32 batch = 16;  /* descriptors per batch */

        /*
         * Index into rx_bufs_phys[] of the next data buffer to repost.
         * It persists across batches and wraps at num_bufs_alloced, so every
         * allocated buffer is reused in the same order init() posted them
         * (deriving it from the per-batch counter would only ever repost the
         * first 'batch' buffers).  init() guarantees num_bufs_alloced > 0.
         */
        uint32 repost_idx = 0;

        for (;;) {
            uint32 n = rx_next_clean;
            uint32 i;

            /* Blindly clear+repost a batch without reading O-bit */
            for (i = 0; i < batch; i++) {
                /* n advances in steps of 4, so idx + 3 stays inside the ring. */
                uint32 idx = n & (RX_RING_SIZE - 1);
                rx_ring[idx]     = 0;
                rx_ring[idx + 1] = 0;
                rx_ring[idx + 2] = 0;
                rx_ring[idx + 3] = 0;

                rx_buf_ring[rx_next_buf & (RX_BUF_RING_SIZE - 1)] =
                    XCT_RXB_LEN(RX_BUF_SIZE) | XCT_RXB_ADDR(rx_bufs_phys[repost_idx]);
                rx_next_buf++;

                repost_idx++;
                if (repost_idx >= num_bufs_alloced)
                    repost_idx = 0;

                n += 4;
            }

            rx_next_clean = n & (RX_RING_SIZE - 1);

            /* Tell DMA about recycled slots */
            pasemi_wmb();
            wr_dma(PAS_DMA_RXINT_INCR(MAC_DMA_INTF), batch);
            wr_dma(PAS_DMA_RXCHAN_INCR(RX_CHANNEL), batch << 1);

            loops++;

            if ((loops % 100000) == 0) {
                heartbeat++;
                IExec->DebugPrintF("[dma_test] #%lu loops=%lu\n",
                                   (ULONG)heartbeat, (ULONG)loops);
            }

            /* SetSignal() also clears the pending Ctrl+C bit it reports. */
            if ((loops & 0xFF) == 0) {
                if (IExec->SetSignal(0, SIGBREAKF_CTRL_C) & SIGBREAKF_CTRL_C)
                    break;
            }
        }
    }

    dump_hw_state("after");
    dump_ring();

    IExec->DebugPrintF("[dma_test] stopping\n");

    cleanup();

    IExec->DebugPrintF("[dma_test] done.\n");
    return RETURN_OK;
}
