/*
 * cfe_dma_test.c -- Bare-metal DMA RX stress test, runs from CFE.
 *
 * No OS, no libraries, no MMU translation.  Direct hardware access only.
 * Tests the PA6T-1682M Ethernet DMA receive path under packet flood.
 * Real-time serial output via UART (LPCR_RMI required for MMIO access).
 *
 * Build:   make
 * Load:    CFE> ifconfig eth0 -addr=192.168.0.150
 *          CFE> load -elf 192.168.0.144:cfe_dma_test.elf
 *          CFE> go
 *   or:    CFE> boot -elf -noints 192.168.0.144:cfe_dma_test.elf
 *
 * Before running:
 *   1. Set up TFTP server on PC (python tftp_simple.py)
 *   2. Load and run as above
 *   3. Start udp_flood.py on PC AFTER the test prints "DMA running"
 *   4. Press any key on serial console to stop, or power cycle if locked
 */

/* ---- Types (no standard library) ------------------------------------------ */

/*
 * Fixed-width integer aliases used throughout this file.  The widths in the
 * names are assumed for the 32-bit PowerPC build this file targets; nothing
 * here checks them at compile time.
 */
typedef unsigned char       uint8;
typedef unsigned short      uint16;
typedef unsigned int        uint32;
typedef unsigned long long  uint64;

/* Null pointer constant, defined locally because no libc headers are used. */
#define NULL ((void *)0)

/* ---- 64-bit address fix --------------------------------------------------- */
/*
 * PA6T is 64-bit, our code is 32-bit.  lis r10, 0xFCFF produces
 * r10 = 0xFFFFFFFFFCFF0000 (sign-extended).  clrldi clears upper 32 bits.
 */
/*
 * Turn a 32-bit physical address into a pointer whose register image has
 * the upper 32 bits cleared.
 *
 * addr:    32-bit physical address (MMIO, ECAM or RAM).
 * returns: the value produced by "clrldi rD, rS, 32", cast to a pointer.
 *
 * The asm is volatile so the compiler does not delete or merge the
 * instruction.
 * NOTE(review): `unsigned long` is presumably 32 bits in this build, so this
 * relies on the compiler using the register as written by clrldi -- confirm
 * against the generated code if the toolchain changes.
 */
static inline void *fixaddr(uint32 addr)
{
    unsigned long result;
    __asm__ volatile ("clrldi %0, %1, 32" : "=r"(result) : "r"(addr));
    return (void *)result;
}

/* ---- LPCR_RMI ------------------------------------------------------------- */
/*
 * LPCR = SPR 318.  Bit 1 = RMI (Real Mode cache Inhibit).
 * In real mode (MSR[DR]=0), data accesses are cacheable by default.
 * MMIO/PCI I/O access with cached reads causes Machine Check (0x0200).
 * Setting RMI makes all real-mode accesses cache-inhibited.
 * Confirmed from CFE firmware binary: CFE wraps every UART access with RMI.
 *
 * For this test we set RMI once at startup and leave it on.
 * All memory (rings, buffers) becomes uncached — slower but correct.
 * dcbi/dcbf become no-ops on uncached memory, which is fine.
 *
 * MUST restore LPCR before returning to CFE (or Alignment exception 0x0600).
 */
/* LPCR value (low 32 bits) as read by lpcr_enable_rmi() before it set RMI. */
static uint32 saved_lpcr_lo;

/*
 * Set the RMI bit (mask 0x2) in LPCR (SPR 318) and remember the previous
 * LPCR value in saved_lpcr_lo so lpcr_restore() can put the bit back.
 *
 * Uses r10 as a fixed scratch register (declared as clobbered).
 */
static inline void lpcr_enable_rmi(void)
{
    uint32 lo;
    __asm__ volatile (
        "mfspr 10, 318   \n"   /* r10 = LPCR                      */
        "mr    %0, 10    \n"   /* keep a copy of the old value    */
        "ori   10, 10, 2 \n"   /* set the RMI bit                 */
        "sync            \n"
        "mtspr 318, 10   \n"   /* write LPCR back                 */
        "isync           \n"
        : "=r"(lo) :: "r10", "memory"
    );
    saved_lpcr_lo = lo;
}

/*
 * Restore the RMI bit of LPCR to the state captured by lpcr_enable_rmi().
 *
 * Only the RMI bit (mask 0x2) is restored: the current LPCR is re-read, the
 * bit is cleared, and the saved bit is OR-ed back in.  All other LPCR bits
 * keep whatever value they have at the time of the call.
 *
 * If lpcr_enable_rmi() was never called, saved_lpcr_lo is still 0 and this
 * simply clears RMI.  Uses r10 and r11 as fixed scratch registers.
 */
static inline void lpcr_restore(void)
{
    uint32 lo = saved_lpcr_lo;
    __asm__ volatile (
        "mfspr 10, 318   \n"   /* r10 = current LPCR              */
        "li    11, 2     \n"   /* r11 = RMI mask                  */
        "andc  10, 10, 11\n"   /* clear RMI                       */
        "or    10, 10, %0\n"   /* OR in the saved RMI bit         */
        "sync            \n"
        "mtspr 318, 10   \n"
        "isync           \n"
        : : "r"(lo & 2) : "r10", "r11", "memory"
    );
}

/* ---- MSR[EE] disable (for clean return to CFE) --------------------------- */
/*
 * boot -elf arms a Decrementer timer.  ~35s after return it fires 0x0900
 * and crashes CFE.  Clearing MSR[EE] prevents this.  CFE polls UART for
 * keyboard, doesn't need interrupts.  Not needed if using "boot -noints".
 */
/*
 * Clear the MSR bit with mask 0x8000 (MSR[EE], external interrupt enable)
 * and leave every other MSR bit unchanged.
 *
 * The mask is built in r11: "lis 11, -1" loads 0xFFFF into the upper
 * halfword (sign-extended), "ori" fills the lower halfword with 0x7FFF, so
 * the AND removes only bit 0x8000.  r10 and r11 are fixed scratch registers.
 */
static inline void msr_disable_ee(void)
{
    __asm__ volatile (
        "mfmsr 10          \n"   /* r10 = MSR                     */
        "lis   11, -1      \n"   /* r11 = ...FFFF0000             */
        "ori   11, 11, 0x7FFF\n" /* r11 = ...FFFF7FFF             */
        "and   10, 10, 11  \n"   /* drop bit 0x8000               */
        "mtmsrd 10         \n"   /* write MSR back                */
        "isync             \n"
        ::: "r10", "r11", "memory"
    );
}

/* ---- UART (NS16550 at 0xFCFF03F8, 1-byte spacing) ------------------------ */

/* Physical address of UART register 0; the others are byte offsets from it. */
#define UART_BASE       0xFCFF03F8
/* Offset written by ser_putc() to transmit a byte. */
#define UART_THR        0
/* Offset of the status register polled by ser_putc()/uart_keypressed(). */
#define UART_LSR        5
/* LSR bit ser_putc() waits for before writing UART_THR. */
#define UART_LSR_THRE   0x20
/* LSR bit uart_keypressed() tests to see whether a byte is waiting. */
#define UART_LSR_DR     0x01
/* Offset read by uart_getc(); same offset as UART_THR. */
#define UART_RBR        0

/*
 * Send one character to the console UART.
 *
 * Spins until the line status register reports UART_LSR_THRE, stores the
 * byte to UART_THR, then issues eieio after the MMIO store.
 * There is no timeout: if the bit never sets, this never returns.
 */
static void ser_putc(char c)
{
    volatile uint8 *regs = (volatile uint8 *)fixaddr(UART_BASE);

    for (;;) {
        if (regs[UART_LSR] & UART_LSR_THRE)
            break;
    }

    regs[UART_THR] = c;
    __asm__ volatile ("eieio" ::: "memory");
}

/*
 * Print a NUL-terminated string on the console.
 * Every '\n' is sent as "\r\n"; all other bytes are sent unchanged.
 */
static void ser_puts(const char *s)
{
    const char *p;

    for (p = s; *p != '\0'; p++) {
        if (*p == '\n')
            ser_putc('\r');
        ser_putc(*p);
    }
}

/* Print one byte as exactly two upper-case hex digits, high nibble first. */
static void ser_puthex8(uint8 v)
{
    static const char digits[] = "0123456789ABCDEF";
    uint8 hi = (v >> 4) & 0xf;
    uint8 lo = v & 0xf;

    ser_putc(digits[hi]);
    ser_putc(digits[lo]);
}

/* Print a 32-bit value as eight hex digits, most significant byte first. */
static void ser_puthex32(uint32 v)
{
    int shift;

    for (shift = 24; shift >= 0; shift -= 8)
        ser_puthex8((uint8)((v >> shift) & 0xff));
}

/*
 * Print an unsigned 32-bit value in decimal with no padding.
 * Digits are generated least-significant first into a small buffer and
 * then emitted in reverse; zero prints as a single '0'.
 */
static void ser_putdec(uint32 v)
{
    char digits[12];
    int len = 0;

    do {
        digits[len++] = (char)('0' + (v % 10));
        v /= 10;
    } while (v != 0);

    while (len > 0)
        ser_putc(digits[--len]);
}

static inline int uart_keypressed(void)
{
    volatile uint8 *uart = (volatile uint8 *)fixaddr(UART_BASE);
    return (uart[UART_LSR] & UART_LSR_DR);
}

/*
 * Read one byte from the UART receive register.
 * Does not wait: callers check uart_keypressed() first.
 */
static inline char uart_getc(void)
{
    volatile uint8 *regs = (volatile uint8 *)fixaddr(UART_BASE);
    char ch = regs[UART_RBR];

    return ch;
}

/* ---- PCI ECAM access (byte-swapped for big-endian CPU) -------------------- */
/*
 * PA6T maps PCI config space at 0xE0000000 (256 MB ECAM region).
 * Address = 0xE0000000 | (bus << 20) | (dev << 15) | (fn << 12) | offset
 *
 * All on-chip devices (IOB, DMA, MAC) are on bus 0.
 * PCI config is little-endian; PA6T is big-endian -> use lwbrx/stwbrx.
 */

#define PCI_ECAM_BASE   0xE0000000UL

/*
 * Build the ECAM address of a PCI config-space register.
 *
 * bus, dev, fn: PCI location; shifted to bit 20, 15 and 12 respectively.
 * off:          byte offset inside the function's config space.
 *
 * The fields are OR-ed together without range checking, so out-of-range
 * arguments spill into neighbouring fields.
 */
static inline uint32 ecam_addr(uint32 bus, uint32 dev, uint32 fn, uint32 off)
{
    uint32 addr = PCI_ECAM_BASE;

    addr |= bus << 20;
    addr |= dev << 15;
    addr |= fn << 12;
    addr |= off;
    return addr;
}

/*
 * Read a 32-bit register at physical address `addr` with a byte-reversed
 * load (lwbrx), so a little-endian register arrives in CPU byte order.
 * The address goes through fixaddr() first.
 */
static inline uint32 ecam_read32(uint32 addr)
{
    uint32 val;
    void *p = fixaddr(addr);
    __asm__ volatile ("lwbrx %0, 0, %1" : "=r"(val) : "r"(p) : "memory");
    return val;
}

/*
 * Write a 32-bit register at physical address `addr` with a byte-reversed
 * store (stwbrx), followed by eieio.
 * The address goes through fixaddr() first.
 */
static inline void ecam_write32(uint32 addr, uint32 val)
{
    void *p = fixaddr(addr);
    __asm__ volatile ("stwbrx %0, 0, %1; eieio" : : "r"(val), "r"(p) : "memory");
}

/*
 * 16-bit counterpart of ecam_read32(): byte-reversed halfword load (lhbrx).
 * Not called anywhere in the code visible in this file.
 */
static inline uint16 ecam_read16(uint32 addr)
{
    uint16 val;
    void *p = fixaddr(addr);
    __asm__ volatile ("lhbrx %0, 0, %1" : "=r"(val) : "r"(p) : "memory");
    return val;
}

/*
 * 16-bit counterpart of ecam_write32(): byte-reversed halfword store
 * (sthbrx) followed by eieio.
 * Not called anywhere in the code visible in this file.
 */
static inline void ecam_write16(uint32 addr, uint16 val)
{
    void *p = fixaddr(addr);
    __asm__ volatile ("sthbrx %0, 0, %1; eieio" : : "r"(val), "r"(p) : "memory");
}

/* ---- Memory barrier ------------------------------------------------------- */

/*
 * Write barrier: issues a "sync" instruction and acts as a compiler memory
 * barrier.  Called after ring entries are written and before the DMA INCR
 * registers are written.
 */
static inline void wmb(void)
{
    __asm__ volatile ("sync" : : : "memory");
}

/*
 * dcbi/dcbf are unnecessary with LPCR_RMI set (all memory is uncached).
 * dcbi is also illegal/privileged on PA6T — causes Program exception 0x0700.
 * Keep as no-ops (compiler barrier only) so call sites don't need changing.
 */
/*
 * Placeholder for a cache-line invalidate.  Emits no instruction; it is a
 * compiler memory barrier only, and `addr` is ignored.
 */
static inline void dcbi_line(volatile void *addr)
{
    (void)addr;
    __asm__ volatile ("" ::: "memory");
}

/*
 * Placeholder for a cache-line flush.  Emits no instruction; it is a
 * compiler memory barrier only, and `addr` is ignored.
 */
static inline void dcbf_line(volatile void *addr)
{
    (void)addr;
    __asm__ volatile ("" ::: "memory");
}

/* ---- PASemi PCI device IDs ------------------------------------------------ */

/*
 * Vendor/device IDs matched against config-space offset 0 by the PCI scan
 * helpers.  PASEMI_DEV_IOB is defined but not used by the visible code; the
 * IOB is addressed directly through IOB_BASE instead.
 */
#define PASEMI_VENDOR_ID    0x1959
#define PASEMI_DEV_IOB      0xa001
#define PASEMI_DEV_GMAC     0xa005
#define PASEMI_DEV_DMA      0xa007

/* ---- PCI standard --------------------------------------------------------- */

/*
 * Config-space offset of the command register and the two command bits that
 * init() sets on the MAC and DMA functions.  init() accesses the register as
 * a 32-bit word, so the access also covers the 16 bits at offset 0x06.
 */
#define PCI_COMMAND         0x04
#define PCI_COMMAND_MEMORY  0x0002
#define PCI_COMMAND_MASTER  0x0004

/* ---- MAC config-space registers ------------------------------------------- */

/*
 * Register offsets, relative to the MAC function's ECAM base (accessed via
 * rd_mac()/wr_mac()).  init() prints the MAC address from ADR1 bits 15..0
 * followed by ADR0 bits 31..0.  RMON(r) is an array of 32-bit counters;
 * init() zeroes entries 0..31.
 */
#define PAS_MAC_CFG_PCFG        0x80
#define PAS_MAC_CFG_MACCFG      0x84
#define PAS_MAC_CFG_ADR0        0x8c
#define PAS_MAC_CFG_ADR1        0x90
#define PAS_MAC_CFG_TXP         0x98
#define PAS_MAC_CFG_RMON(r)     (0x100 + (r)*4)
#define PAS_MAC_IPC_CHNL        0x208

/*
 * PCFG bits.  PE is the bit init() and cleanup() clear to disable the MAC;
 * init() sets all six together for "1G full-duplex promiscuous".
 */
#define PAS_MAC_CFG_PCFG_PE       0x80000000
#define PAS_MAC_CFG_PCFG_CE       0x40000000
#define PAS_MAC_CFG_PCFG_TSR_1G   0x08000000
#define PAS_MAC_CFG_PCFG_PR       0x01000000
#define PAS_MAC_CFG_PCFG_S1       0x00000080
#define PAS_MAC_CFG_PCFG_SPD_1G   0x00000002

/* MACCFG max-frame-size field: bits 23..8. */
#define PAS_MAC_CFG_MACCFG_MAXF_M   0x00ffff00
#define PAS_MAC_CFG_MACCFG_MAXF(x)  (((x) << 8) & PAS_MAC_CFG_MACCFG_MAXF_M)

/* TXP fields: each macro shifts its value into place and masks it. */
#define PAS_MAC_CFG_TXP_FCE      0x00800000
#define PAS_MAC_CFG_TXP_FPC(x)   (((x) << 20) & 0x00300000)
#define PAS_MAC_CFG_TXP_SL(x)    (((x) << 16) & 0x00030000)
#define PAS_MAC_CFG_TXP_COB(x)   (((x) << 12) & 0x0000f000)
#define PAS_MAC_CFG_TXP_TIFT(x)  (((x) << 8)  & 0x00000f00)
#define PAS_MAC_CFG_TXP_TIFG(x)  ((x)         & 0x000000ff)

/* IPC_CHNL fields; init() writes RX_CHANNEL into both. */
#define PAS_MAC_IPC_CHNL_DCHNO(x) (((x) << 16) & 0x003f0000)
#define PAS_MAC_IPC_CHNL_BCH(x)   ((x)         & 0x0000003f)

/* ---- DMA controller registers --------------------------------------------- */

/*
 * Common (whole-controller) registers, as offsets from the DMA function's
 * ECAM base (accessed via rd_dma()/wr_dma()).
 */
#define PAS_DMA_COM_TXCMD      0x100
#define PAS_DMA_COM_TXSTA      0x104
#define PAS_DMA_COM_RXCMD      0x108
#define PAS_DMA_COM_RXSTA      0x10c
#define PAS_DMA_COM_CFG        0x114
#define PAS_DMA_TXF_SFLG0     0x140
#define PAS_DMA_TXF_SFLG1     0x144
#define PAS_DMA_TXF_CFLG0     0x148
#define PAS_DMA_TXF_CFLG1     0x14c

/* Enable bits for the TX and RX sections (written to COM_TXCMD/COM_RXCMD). */
#define PAS_DMA_COM_TXCMD_EN   0x00000001
#define PAS_DMA_COM_RXCMD_EN   0x00000001

/* RX interface: per-interface register block, 0x20 bytes per interface (i). */
#define PAS_DMA_RXINT_RCMDSTA(i)  (0x200 + (i)*0x20)
#define PAS_DMA_RXINT_CFG(i)      (0x204 + (i)*0x20)
#define PAS_DMA_RXINT_INCR(i)     (0x210 + (i)*0x20)
#define PAS_DMA_RXINT_BASEL(i)    (0x218 + (i)*0x20)
#define PAS_DMA_RXINT_BASEU(i)    (0x21c + (i)*0x20)

/* RXINT_RCMDSTA bits written by init() when enabling the interface. */
#define PAS_DMA_RXINT_RCMDSTA_EN      0x00000001
#define PAS_DMA_RXINT_RCMDSTA_DROPS_M 0xfffe0000
#define PAS_DMA_RXINT_RCMDSTA_BP      0x00004000
#define PAS_DMA_RXINT_RCMDSTA_OO      0x00002000
#define PAS_DMA_RXINT_RCMDSTA_BT      0x00000800

/* RXINT_CFG bits/fields written by init(). */
#define PAS_DMA_RXINT_CFG_RBP    0x80000000
#define PAS_DMA_RXINT_CFG_DHL(x) (((x) << 24) & 0x07000000)
#define PAS_DMA_RXINT_CFG_LW     0x00200000
#define PAS_DMA_RXINT_CFG_L2     0x00100000
#define PAS_DMA_RXINT_CFG_HEN    0x00080000

/*
 * Buffer-ring base address split: BRBL keeps the low word with the bottom
 * 6 bits cleared (64-byte alignment), BRBH keeps 12 upper address bits, and
 * SIZ is a 14-bit size field at bits 29..16 (init() passes ring entries / 8).
 */
#define PAS_DMA_RXINT_BASEL_BRBL(x) ((x) & ~0x3f)
#define PAS_DMA_RXINT_BASEU_BRBH(x) ((x) & 0xfff)
#define PAS_DMA_RXINT_BASEU_SIZ(x)  (((x) << 16) & 0x3fff0000)

/* RX channel: per-channel register block, 0x20 bytes per channel (c). */
#define PAS_DMA_RXCHAN_CCMDSTA(c) (0x800 + (c)*0x20)
#define PAS_DMA_RXCHAN_CFG(c)     (0x804 + (c)*0x20)
#define PAS_DMA_RXCHAN_INCR(c)    (0x810 + (c)*0x20)
#define PAS_DMA_RXCHAN_BASEL(c)   (0x818 + (c)*0x20)
#define PAS_DMA_RXCHAN_BASEU(c)   (0x81c + (c)*0x20)

/*
 * RXCHAN_CCMDSTA bits.  cleanup() writes ST and then polls ACT until it
 * clears; init() refuses to run if ACT is already set.
 */
#define PAS_DMA_RXCHAN_CCMDSTA_EN  0x00000001
#define PAS_DMA_RXCHAN_CCMDSTA_ST  0x00000002
#define PAS_DMA_RXCHAN_CCMDSTA_ACT 0x00010000
#define PAS_DMA_RXCHAN_CCMDSTA_DU  0x00020000
#define PAS_DMA_RXCHAN_CCMDSTA_OD  0x00002000
#define PAS_DMA_RXCHAN_CCMDSTA_FD  0x00001000
#define PAS_DMA_RXCHAN_CCMDSTA_DT  0x00000800

/* Descriptor-ring base/size fields; same layout as the RXINT ones above. */
#define PAS_DMA_RXCHAN_CFG_HBU(x) (((x) << 7) & 0x00000380)
#define PAS_DMA_RXCHAN_BASEL_BRBL(x) ((x) & ~0x3f)
#define PAS_DMA_RXCHAN_BASEU_BRBH(x) ((x) & 0xfff)
#define PAS_DMA_RXCHAN_BASEU_SIZ(x)  (((x) << 16) & 0x3fff0000)

/* ---- RX descriptor bits --------------------------------------------------- */

/*
 * Bit in the first 64-bit word of an RX descriptor.  poll_loop() treats a
 * descriptor as completed by hardware when this bit is set, and clears the
 * whole descriptor afterwards.
 */
#define XCT_MACRX_O         0x0400000000000000ULL

/* ---- RX buffer descriptor bits -------------------------------------------- */

/*
 * Layout of one 64-bit buffer-ring entry: buffer length in bits 59..44,
 * buffer physical address in bits 43..0.  The macros mask their argument to
 * the field width.
 */
#define XCT_RXB_LEN_S       44ULL
#define XCT_RXB_LEN_M       0x0ffff00000000000ULL
#define XCT_RXB_LEN(x)      ((((uint64)(x)) << XCT_RXB_LEN_S) & XCT_RXB_LEN_M)
#define XCT_RXB_ADDR_M      0x00000fffffffffffULL
#define XCT_RXB_ADDR(x)     ((((uint64)(x)) << 0) & XCT_RXB_ADDR_M)

/* ---- Ring configuration (match Linux) ------------------------------------- */

/*
 * Ring sizes are counted in uint64 entries.  poll_loop() advances through
 * the descriptor ring four entries at a time, so RX_RING_SIZE entries hold
 * RX_RING_SIZE / 4 descriptors.  Both ring sizes must be powers of two
 * because indices are wrapped with "& (SIZE - 1)".
 */
#define RX_RING_SIZE        2048    /* uint64 entries in descriptor ring */
#define RX_BUF_RING_SIZE    2048    /* uint64 entries in buffer ring */
#define NUM_RX_BUFS         2048    /* number of data buffers */
#define RX_BUF_SIZE         1648    /* Linux: MTU+ETH+FCS+align+headroom */

#define RX_CHANNEL          1       /* RX DMA channel */

/* Access descriptor-ring entry n with wrap-around. */
#define RX_DESC(ring, n)    ((ring)[(n) & (RX_RING_SIZE - 1)])

/* ---- Fixed physical addresses for buffers --------------------------------- */
/*
 * No allocator, no MMU -- just pick addresses in free RAM.
 * Program loaded at 0x00100000 (1MB).
 * Rings at 0x00200000, data buffers at 0x00300000.
 *
 * Ring alignment: 64 bytes (hardware requirement).
 * All these addresses are 64KB-aligned so more than sufficient.
 */

/*
 * Sizes follow from the ring configuration: 2048 entries * 8 bytes = 16KB
 * per ring, 2048 buffers * 2048-byte stride = 4MB of data buffers.
 */
#define RX_RING_PHYS        0x00200000  /* 16KB: desc ring (2MB mark) */
#define RX_BUF_RING_PHYS    0x00210000  /* 16KB: buffer ptr ring */
#define RX_BUFS_PHYS        0x00300000  /* 2048 * 2048-stride = 4MB */
#define RX_BUF_STRIDE       2048        /* stride per buffer (>= RX_BUF_SIZE, pow2) */

/* Physical address of data buffer i (no range check on i). */
#define RX_BUF_ADDR(i)      (RX_BUFS_PHYS + (uint32)(i) * RX_BUF_STRIDE)

/* ---- DMA interface discovery (from Linux mac_to_intf) --------------------- */
/*
 * PAS_DMA_CAP_IFI register at DMA offset 0x4c contains a table that maps
 * PCI devfn -> DMA interface number.  NIN = number of interfaces, IOFF =
 * offset to the mapping registers.  Each mapping register packs 4 devfns
 * (one per byte).  Linux does a linear search through this table.
 */
/* Offset of the capability register read by discover_mac_intf(). */
#define PAS_DMA_CAP_IFI         0x4c
/* NIN: number of interfaces, bits 23..16. */
#define PAS_DMA_CAP_IFI_NIN_M  0x00ff0000
#define PAS_DMA_CAP_IFI_NIN_S  16
/* IOFF: DMA-register offset of the first devfn mapping word, bits 31..24. */
#define PAS_DMA_CAP_IFI_IOFF_M 0xff000000
#define PAS_DMA_CAP_IFI_IOFF_S 24

/* ---- IOB (I/O Bridge) registers ------------------------------------------- */
/*
 * IOB is at PCI bus 0, dev 0, fn 0 (ECAM base 0xE0000000).
 * On PA6T, IOB registers extend beyond 4KB config space -- offsets up to
 * 0x1700+ are accessible.  Same approach as the AmigaOS driver (PA6T_IOB_ECAM).
 * Registers are little-endian; use lwbrx/stwbrx for byte-swap.
 */
/* Same value as PCI_ECAM_BASE: bus 0, dev 0, fn 0, offset 0. */
#define IOB_BASE                 0xE0000000UL

/* Errata 5971: packet header counter (reset to 0 after each RX batch) */
#define PAS_IOB_COM_PKTHDRCNT            0x0120

/* DMA RX channel config (CNTTH = packet count threshold for interrupt) */
/* One 32-bit register per channel; CNTTH is a 12-bit field at bits 11..0. */
#define PAS_IOB_DMA_RXCH_CFG(i)          (0x1100 + (i)*4)
#define PAS_IOB_DMA_RXCH_CFG_CNTTH(x)   ((x) & 0xfff)

/* DMA RX channel reset (clear interrupt flags and packet counter) */
/*
 * One 32-bit register per channel.  The visible code writes PCNTRST together
 * with the four *INTC bits; PCNT(x)/PCNT_M/PCNT_S and DCNTRST are defined
 * but unused here.
 */
#define PAS_IOB_DMA_RXCH_RESET(i)        (0x1500 + (i)*4)
#define PAS_IOB_DMA_RXCH_RESET_PCNT_M    0xffff0000
#define PAS_IOB_DMA_RXCH_RESET_PCNT_S    16
#define PAS_IOB_DMA_RXCH_RESET_PCNT(x)   (((x) << 16) & 0xffff0000)
#define PAS_IOB_DMA_RXCH_RESET_PCNTRST   0x00000020
#define PAS_IOB_DMA_RXCH_RESET_DCNTRST   0x00000010
#define PAS_IOB_DMA_RXCH_RESET_TINTC     0x00000008
#define PAS_IOB_DMA_RXCH_RESET_DINTC     0x00000004
#define PAS_IOB_DMA_RXCH_RESET_SINTC     0x00000002
#define PAS_IOB_DMA_RXCH_RESET_PINTC     0x00000001

/* DMA TX channel config (for completeness -- we don't TX but Linux sets it) */
/* Neither macro is referenced by the visible code. */
#define PAS_IOB_DMA_TXCH_CFG(i)          (0x1200 + (i)*4)
#define PAS_IOB_DMA_TXCH_CFG_CNTTH(x)   ((x) & 0xfff)

/* DMA timeout config */
/* TCNT is a 24-bit field at bits 23..0; init() writes 0x3ff. */
#define PAS_IOB_DMA_COM_TIMEOUTCFG           0x1700
#define PAS_IOB_DMA_COM_TIMEOUTCFG_TCNT(x)   ((x) & 0x00ffffff)

/* ---- IOB register helpers ------------------------------------------------- */

/* Read the 32-bit IOB register at byte offset `off` from IOB_BASE. */
static inline uint32 rd_iob(uint32 off)
{
    uint32 reg_addr = IOB_BASE + off;

    return ecam_read32(reg_addr);
}

/* Write `v` to the 32-bit IOB register at byte offset `off` from IOB_BASE. */
static inline void wr_iob(uint32 off, uint32 v)
{
    uint32 reg_addr = IOB_BASE + off;

    ecam_write32(reg_addr, v);
}

/* ---- Globals -------------------------------------------------------------- */

/* All of these are written by init() and read by the rest of the file. */
static uint32 mac_ecam;    /* ECAM address of MAC device */
static uint32 dma_ecam;    /* ECAM address of DMA device */
static uint32 mac_dma_intf; /* DMA interface for our MAC (from hw table) */

/* Descriptor ring and buffer-pointer ring, at RX_RING_PHYS / RX_BUF_RING_PHYS. */
static volatile uint64 *rx_ring;
static volatile uint64 *rx_buf_ring;

/* Next descriptor-ring entry index to inspect (advances by 4, wrapped). */
static uint32 rx_next_clean;
/* Running count of buffers posted; masked to get the buffer-ring slot. */
static uint32 rx_next_buf;

/* ---- Register access helpers ---------------------------------------------- */

/* Read the 32-bit MAC register at offset `off` from mac_ecam. */
static inline uint32 rd_mac(uint32 off)
{
    return ecam_read32(mac_ecam + off);
}

/* Write `v` to the 32-bit MAC register at offset `off` from mac_ecam. */
static inline void wr_mac(uint32 off, uint32 v)
{
    ecam_write32(mac_ecam + off, v);
}

/* Read the 32-bit DMA register at offset `off` from dma_ecam. */
static inline uint32 rd_dma(uint32 off)
{
    return ecam_read32(dma_ecam + off);
}

/* Write `v` to the 32-bit DMA register at offset `off` from dma_ecam. */
static inline void wr_dma(uint32 off, uint32 v)
{
    ecam_write32(dma_ecam + off, v);
}

/* ---- PCI scan: find device by vendor/device ID on bus 0 ------------------- */

/*
 * Scan bus 0 (devices 0..31, functions 0..7, in that order) for the first
 * function whose ID word matches vendor/device.
 *
 * Returns the ECAM address of that function's config space, or 0 when no
 * function matches.
 */
static uint32 pci_find_device(uint16 vendor, uint16 device)
{
    /* ID word layout: vendor in bits 15..0, device in bits 31..16. */
    const uint32 want = ((uint32)device << 16) | vendor;
    uint32 slot;

    for (slot = 0; slot < 32 * 8; slot++) {
        uint32 cfg = ecam_addr(0, slot >> 3, slot & 7, 0);

        if (ecam_read32(cfg) == want)
            return cfg;
    }
    return 0;
}

/*
 * Find a specific PCI function of a multi-function device.
 * PA6T has multiple GMAC ports with the same vendor/device ID but
 * different function numbers.  CFE's eth0 = fn 3.
 */
/*
 * Like pci_find_device(), but only function number `want_fn` of each device
 * on bus 0 is examined.
 *
 * Returns the ECAM address of the first match, or 0 when none matches.
 */
static uint32 pci_find_device_fn(uint16 vendor, uint16 device, uint32 want_fn)
{
    /* ID word layout: vendor in bits 15..0, device in bits 31..16. */
    const uint32 want = ((uint32)device << 16) | vendor;
    uint32 slot = 0;

    while (slot < 32) {
        uint32 cfg = ecam_addr(0, slot, want_fn, 0);

        if (ecam_read32(cfg) == want)
            return cfg;
        slot++;
    }
    return 0;
}

/*
 * Discover DMA interface number for a given MAC PCI devfn.
 * Reads PAS_DMA_CAP_IFI from the DMA controller to get the hardware
 * mapping table, then does a linear search (same as Linux mac_to_intf).
 *
 * devfn = (dev << 3) | fn  (standard PCI devfn encoding)
 * Returns DMA interface number, or 0xFFFFFFFF on failure.
 */
/*
 * Look up which DMA interface serves the PCI function `devfn`.
 *
 * Reads PAS_DMA_CAP_IFI for the interface count and the offset of the
 * mapping words, then walks the mapping words (four devfn bytes each, lowest
 * byte first) until one equals `devfn`.  The capability word and every
 * mapping word read are logged to the console.
 *
 * Returns the interface number, or 0xFFFFFFFF when no entry matches.
 */
static uint32 discover_mac_intf(uint32 devfn)
{
    uint32 cap      = rd_dma(PAS_DMA_CAP_IFI);
    uint32 n_intf   = (cap & PAS_DMA_CAP_IFI_NIN_M) >> PAS_DMA_CAP_IFI_NIN_S;
    uint32 map_base = (cap & PAS_DMA_CAP_IFI_IOFF_M) >> PAS_DMA_CAP_IFI_IOFF_S;
    uint32 n_words  = (n_intf + 3) / 4;
    uint32 word;

    ser_puts("[cfe_dma] DMA_CAP_IFI=0x");
    ser_puthex32(cap);
    ser_puts(" nintf=");
    ser_putdec(n_intf);
    ser_puts(" off=0x");
    ser_puthex32(map_base);
    ser_puts("\n");

    for (word = 0; word < n_words; word++) {
        uint32 map = rd_dma(map_base + 4 * word);
        uint32 lane;

        ser_puts("[cfe_dma]   intf_map[");
        ser_putdec(word);
        ser_puts("]=0x");
        ser_puthex32(map);
        ser_puts("\n");

        for (lane = 0; lane < 4; lane++) {
            uint32 entry = (map >> (8 * lane)) & 0xff;

            if (entry == devfn)
                return word * 4 + lane;
        }
    }
    return 0xFFFFFFFF;
}

/* ---- Zero memory ---------------------------------------------------------- */

/*
 * Fill `len` bytes starting at `dst` with zero, one byte store at a time,
 * in ascending address order.
 */
static void memzero(volatile void *dst, uint32 len)
{
    volatile uint8 *cur = (volatile uint8 *)dst;

    while (len-- > 0)
        *cur++ = 0;
}

/* ---- Init hardware -------------------------------------------------------- */

/*
 * Bring up the receive path.
 *
 * Steps, in order: locate the MAC (function 3) and DMA PCI functions, look
 * up the MAC's DMA interface, print the MAC address, program the DMA
 * controller, descriptor ring, buffer ring and IOB, post the initial
 * buffers, enable bus mastering, and finally configure and enable the MAC.
 *
 * Writes the globals mac_ecam, dma_ecam, mac_dma_intf, rx_ring, rx_buf_ring,
 * rx_next_clean and rx_next_buf.
 *
 * Returns 1 on success, 0 on failure.  Every failure return happens before
 * any DMA, IOB or MAC register has been written, so the caller does not need
 * to run cleanup() after a failure.
 */
static int init(void)
{
    /* Upper bound on status polls while waiting for a DMA section to idle. */
    enum { IDLE_POLL_LIMIT = 100000 };
    uint32 i, cfg;

    ser_puts("\n[cfe_dma] ==============================\n");
    ser_puts("[cfe_dma] Bare-metal DMA RX stress test\n");
    ser_puts("[cfe_dma] ==============================\n");

    /* ---- Find PCI devices ---- */
    /*
     * PA6T has 4 GMAC ports (fn 0-3), all with device ID 0xa005.
     * CFE's eth0 is fn 3 (ECAM 0xE00A3000).  Our old code found fn 0
     * (0xE00A0000) which is the wrong port -> rx=0.
     *
     * Also discover the DMA interface number from hardware (like Linux
     * mac_to_intf) instead of hardcoding it.
     */
    mac_ecam = pci_find_device_fn(PASEMI_VENDOR_ID, PASEMI_DEV_GMAC, 3);
    dma_ecam = pci_find_device(PASEMI_VENDOR_ID, PASEMI_DEV_DMA);

    if (!mac_ecam || !dma_ecam) {
        ser_puts("[cfe_dma] FAIL: PCI devices not found\n");
        ser_puts("[cfe_dma]   MAC ECAM=0x"); ser_puthex32(mac_ecam);
        ser_puts("  DMA ECAM=0x"); ser_puthex32(dma_ecam); ser_puts("\n");
        return 0;
    }

    ser_puts("[cfe_dma] MAC ECAM=0x"); ser_puthex32(mac_ecam);
    ser_puts(" (fn 3)\n");
    ser_puts("[cfe_dma] DMA ECAM=0x"); ser_puthex32(dma_ecam); ser_puts("\n");

    /*
     * Discover DMA interface for this MAC.
     * The devfn is recovered from the ECAM address (dev = bits 19..15,
     * fn = bits 14..12).  Example: ECAM 0xE00A3000 decodes to dev 20, fn 3,
     * so devfn = (20 << 3) | 3 = 0xA3.
     */
    {
        uint32 mac_dev = (mac_ecam >> 15) & 0x1f;
        uint32 mac_fn  = (mac_ecam >> 12) & 0x07;
        uint32 devfn   = (mac_dev << 3) | mac_fn;
        ser_puts("[cfe_dma] MAC devfn=0x"); ser_puthex32(devfn); ser_puts("\n");

        mac_dma_intf = discover_mac_intf(devfn);
        if (mac_dma_intf == 0xFFFFFFFF) {
            ser_puts("[cfe_dma] FAIL: cannot find DMA intf for devfn 0x");
            ser_puthex32(devfn); ser_puts("\n");
            return 0;
        }
        ser_puts("[cfe_dma] DMA interface="); ser_putdec(mac_dma_intf);
        ser_puts("\n");
    }

    /* ---- Read MAC address ---- */
    {
        uint32 adr0 = rd_mac(PAS_MAC_CFG_ADR0);
        uint32 adr1 = rd_mac(PAS_MAC_CFG_ADR1);
        ser_puts("[cfe_dma] MAC: ");
        ser_puthex8((adr1 >> 8) & 0xff); ser_puts(":");
        ser_puthex8(adr1 & 0xff); ser_puts(":");
        ser_puthex8((adr0 >> 24) & 0xff); ser_puts(":");
        ser_puthex8((adr0 >> 16) & 0xff); ser_puts(":");
        ser_puthex8((adr0 >> 8) & 0xff); ser_puts(":");
        ser_puthex8(adr0 & 0xff); ser_puts("\n");
        ser_puts("[cfe_dma] Set static ARP on PC for this MAC!\n");
    }

    /* ---- Safety: check channel not in use ---- */
    {
        uint32 rxsta = rd_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL));
        if (rxsta & PAS_DMA_RXCHAN_CCMDSTA_ACT) {
            ser_puts("[cfe_dma] FAIL: RX channel active! STA=0x");
            ser_puthex32(rxsta); ser_puts("\n");
            return 0;
        }
    }

    /* ---- DMA controller init (matches Linux pasemi_dma_init) ---- */
    {
        /* Disable TX, wait idle */
        wr_dma(PAS_DMA_COM_TXCMD, 0);
        for (i = 0; i < IDLE_POLL_LIMIT; i++)
            if (!(rd_dma(PAS_DMA_COM_TXSTA) & 0x01)) break;
        /* Best effort: carry on, but make the timeout visible in the log. */
        if (i == IDLE_POLL_LIMIT)
            ser_puts("[cfe_dma] WARN: TX section still busy after disable\n");

        /* COM_CFG: fast FIFO (bits [28:27] = 0b11) */
        cfg = rd_dma(PAS_DMA_COM_CFG);
        ser_puts("[cfe_dma] COM_CFG before=0x"); ser_puthex32(cfg); ser_puts("\n");
        wr_dma(PAS_DMA_COM_CFG, cfg | 0x18000000);
        ser_puts("[cfe_dma] COM_CFG after=0x");
        ser_puthex32(rd_dma(PAS_DMA_COM_CFG)); ser_puts("\n");

        /* Re-enable TX */
        wr_dma(PAS_DMA_COM_TXCMD, PAS_DMA_COM_TXCMD_EN);

        /* Clear TX status flags */
        wr_dma(PAS_DMA_TXF_SFLG0, 0);
        wr_dma(PAS_DMA_TXF_SFLG1, 0);
        wr_dma(PAS_DMA_TXF_CFLG0, 0xffffffff);
        wr_dma(PAS_DMA_TXF_CFLG1, 0xffffffff);

        /* Stop RX */
        wr_dma(PAS_DMA_COM_RXCMD, 0);
        for (i = 0; i < IDLE_POLL_LIMIT; i++)
            if (!(rd_dma(PAS_DMA_COM_RXSTA) & 0x01)) break;
        if (i == IDLE_POLL_LIMIT)
            ser_puts("[cfe_dma] WARN: RX section still busy after disable\n");

        /* Hard-reset channel and interface */
        wr_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL), 0);
        wr_dma(PAS_DMA_RXINT_RCMDSTA(mac_dma_intf), 0);

        ser_puts("[cfe_dma] DMA controller init done\n");
    }

    /* ---- Set up ring pointers (fixaddr to clear upper 32 bits) ---- */
    rx_ring     = (volatile uint64 *)fixaddr(RX_RING_PHYS);
    rx_buf_ring = (volatile uint64 *)fixaddr(RX_BUF_RING_PHYS);

    /* Zero the rings */
    memzero(rx_ring,     RX_RING_SIZE * sizeof(uint64));
    memzero(rx_buf_ring, RX_BUF_RING_SIZE * sizeof(uint64));

    ser_puts("[cfe_dma] RX ring at 0x"); ser_puthex32(RX_RING_PHYS); ser_puts("\n");
    ser_puts("[cfe_dma] buf ring at 0x"); ser_puthex32(RX_BUF_RING_PHYS); ser_puts("\n");
    ser_puts("[cfe_dma] buffers at 0x"); ser_puthex32(RX_BUFS_PHYS); ser_puts("\n");

    /* ---- Configure RX channel (descriptor ring) ---- */
    {
        /* The SIZ field takes the ring size in units of 8 entries. */
        uint32 siz = RX_RING_SIZE >> 3;
        wr_dma(PAS_DMA_RXCHAN_BASEL(RX_CHANNEL),
               PAS_DMA_RXCHAN_BASEL_BRBL(RX_RING_PHYS));
        wr_dma(PAS_DMA_RXCHAN_BASEU(RX_CHANNEL),
               PAS_DMA_RXCHAN_BASEU_BRBH(0) | PAS_DMA_RXCHAN_BASEU_SIZ(siz));
        wr_dma(PAS_DMA_RXCHAN_CFG(RX_CHANNEL), PAS_DMA_RXCHAN_CFG_HBU(2));
    }

    /* ---- Configure RX interface (buffer ring) ---- */
    {
        uint32 siz = RX_BUF_RING_SIZE >> 3;
        wr_dma(PAS_DMA_RXINT_BASEL(mac_dma_intf),
               PAS_DMA_RXINT_BASEL_BRBL(RX_BUF_RING_PHYS));
        wr_dma(PAS_DMA_RXINT_BASEU(mac_dma_intf),
               PAS_DMA_RXINT_BASEU_BRBH(0) | PAS_DMA_RXINT_BASEU_SIZ(siz));

        cfg = PAS_DMA_RXINT_CFG_RBP      |
              PAS_DMA_RXINT_CFG_DHL(2)    |
              PAS_DMA_RXINT_CFG_LW        |
              PAS_DMA_RXINT_CFG_L2        |
              PAS_DMA_RXINT_CFG_HEN;
        wr_dma(PAS_DMA_RXINT_CFG(mac_dma_intf), cfg);
    }

    /* ---- Re-enable RX section ---- */
    wr_dma(PAS_DMA_COM_RXCMD, PAS_DMA_COM_RXCMD_EN);

    /* ---- Enable RX interface + channel (match Linux flags) ---- */
    wr_dma(PAS_DMA_RXINT_RCMDSTA(mac_dma_intf),
           PAS_DMA_RXINT_RCMDSTA_EN      |
           PAS_DMA_RXINT_RCMDSTA_DROPS_M |
           PAS_DMA_RXINT_RCMDSTA_BP      |
           PAS_DMA_RXINT_RCMDSTA_OO      |
           PAS_DMA_RXINT_RCMDSTA_BT);

    wr_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL),
           PAS_DMA_RXCHAN_CCMDSTA_EN |
           PAS_DMA_RXCHAN_CCMDSTA_DU |
           PAS_DMA_RXCHAN_CCMDSTA_OD |
           PAS_DMA_RXCHAN_CCMDSTA_FD |
           PAS_DMA_RXCHAN_CCMDSTA_DT);

    /* ---- IOB configuration (match Linux pasemi_mac_open) ---- */
    /*
     * These IOB registers are CRITICAL and were missing from our first test.
     * Linux configures them in pasemi_mac_open().
     *
     * TIMEOUTCFG: 0x3FF with 33MHz clock is ~31us (from Linux comment).
     * CNTTH: packet count threshold for interrupt generation.
     *        Linux uses 256 for RX, 32 for TX.
     * PKTHDRCNT: Errata 5971 -- reset to 0 here and after each RX batch.
     * RXCH_RESET: clear all pending interrupt flags and counters.
     */
    wr_iob(PAS_IOB_DMA_COM_TIMEOUTCFG,
           PAS_IOB_DMA_COM_TIMEOUTCFG_TCNT(0x3ff));
    wr_iob(PAS_IOB_DMA_RXCH_CFG(RX_CHANNEL),
           PAS_IOB_DMA_RXCH_CFG_CNTTH(256));
    wr_iob(PAS_IOB_COM_PKTHDRCNT, 0);

    /* Clear all interrupt flags and reset packet counter */
    wr_iob(PAS_IOB_DMA_RXCH_RESET(RX_CHANNEL),
           PAS_IOB_DMA_RXCH_RESET_PINTC |
           PAS_IOB_DMA_RXCH_RESET_SINTC |
           PAS_IOB_DMA_RXCH_RESET_DINTC |
           PAS_IOB_DMA_RXCH_RESET_TINTC |
           PAS_IOB_DMA_RXCH_RESET_PCNTRST);

    ser_puts("[cfe_dma] IOB: TIMEOUTCFG=0x3FF, CNTTH=256, PKTHDRCNT reset\n");

    /* ---- Post initial buffers ---- */
    /* Slot i of the buffer ring initially points at data buffer i. */
    for (i = 0; i < NUM_RX_BUFS; i++) {
        rx_buf_ring[i] = XCT_RXB_LEN(RX_BUF_SIZE) |
                         XCT_RXB_ADDR(RX_BUF_ADDR(i));
    }
    wmb();

    wr_dma(PAS_DMA_RXINT_INCR(mac_dma_intf), NUM_RX_BUFS);
    wr_dma(PAS_DMA_RXCHAN_INCR(RX_CHANNEL), RX_RING_SIZE >> 1);

    /* ---- Enable BusMaster on MAC + DMA ---- */
    /*
     * The 32-bit word at PCI_COMMAND also covers the status register in its
     * upper half.  Status error bits are write-1-to-clear per the PCI spec,
     * so the upper half is written as zero: writing back the value just read
     * would silently clear any latched error bits.
     */
    {
        uint32 cmd_sta;

        cmd_sta = ecam_read32(mac_ecam + PCI_COMMAND);
        ecam_write32(mac_ecam + PCI_COMMAND,
                     (cmd_sta & 0xffff) |
                     PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);

        cmd_sta = ecam_read32(dma_ecam + PCI_COMMAND);
        ecam_write32(dma_ecam + PCI_COMMAND,
                     (cmd_sta & 0xffff) |
                     PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
    }

    /* ---- Configure and enable MAC ---- */
    {
        uint32 pcfg;
        uint32 maccfg;

        /* Disable the port while it is being reconfigured. */
        pcfg = rd_mac(PAS_MAC_CFG_PCFG);
        wr_mac(PAS_MAC_CFG_PCFG, pcfg & ~PAS_MAC_CFG_PCFG_PE);

        /* Max frame size */
        maccfg = rd_mac(PAS_MAC_CFG_MACCFG);
        maccfg &= ~PAS_MAC_CFG_MACCFG_MAXF_M;
        maccfg |= PAS_MAC_CFG_MACCFG_MAXF(1518);
        wr_mac(PAS_MAC_CFG_MACCFG, maccfg);

        /* TX parameters */
        wr_mac(PAS_MAC_CFG_TXP,
               PAS_MAC_CFG_TXP_FCE     |
               PAS_MAC_CFG_TXP_FPC(3)  |
               PAS_MAC_CFG_TXP_SL(3)   |
               PAS_MAC_CFG_TXP_COB(0xf)|
               PAS_MAC_CFG_TXP_TIFT(8) |
               PAS_MAC_CFG_TXP_TIFG(12));

        /* Zero RMON counters */
        for (i = 0; i < 32; i++)
            wr_mac(PAS_MAC_CFG_RMON(i), 0);

        /* Assign DMA channel to MAC */
        wr_mac(PAS_MAC_IPC_CHNL,
               PAS_MAC_IPC_CHNL_DCHNO(RX_CHANNEL) |
               PAS_MAC_IPC_CHNL_BCH(RX_CHANNEL));

        /* Enable: 1G full-duplex promiscuous */
        pcfg = PAS_MAC_CFG_PCFG_S1       |
               PAS_MAC_CFG_PCFG_PR        |
               PAS_MAC_CFG_PCFG_CE        |
               PAS_MAC_CFG_PCFG_TSR_1G    |
               PAS_MAC_CFG_PCFG_SPD_1G    |
               PAS_MAC_CFG_PCFG_PE;
        wr_mac(PAS_MAC_CFG_PCFG, pcfg);
    }

    rx_next_clean = 0;
    rx_next_buf   = NUM_RX_BUFS;

    ser_puts("[cfe_dma] MAC enabled, DMA running.\n");

    /* Print HW state */
    {
        uint32 pcfg   = rd_mac(PAS_MAC_CFG_PCFG);
        uint32 rxcsta = rd_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL));
        uint32 rxista = rd_dma(PAS_DMA_RXINT_RCMDSTA(mac_dma_intf));
        ser_puts("[cfe_dma] PCFG=0x"); ser_puthex32(pcfg);
        ser_puts(" RXCSTA=0x"); ser_puthex32(rxcsta);
        ser_puts(" RXISTA=0x"); ser_puthex32(rxista); ser_puts("\n");
    }

    return 1;
}

/* ---- Cleanup: stop DMA and MAC -------------------------------------------- */

/*
 * Shut the receive path down: disable the MAC port, then stop the RX
 * channel and the RX interface.
 *
 * Each stop is requested by writing the stop bit, polled for up to
 * STOP_POLL_LIMIT reads until the active bit clears, and then the
 * command/status register is written to 0 regardless of the outcome.
 * A poll that runs out is reported on the console instead of being
 * silently ignored.
 *
 * Must only be called after a successful init() (it uses mac_ecam,
 * dma_ecam and mac_dma_intf).
 */
static void cleanup(void)
{
    enum {
        STOP_POLL_LIMIT   = 50000,
        /*
         * RX interface command/status bits with no file-level macro.  They
         * sit at the same positions as PAS_DMA_RXCHAN_CCMDSTA_ST/_ACT and
         * are assumed to have the same meaning -- TODO confirm.
         */
        RXINT_RCMDSTA_ST  = 0x00000002,
        RXINT_RCMDSTA_ACT = 0x00010000
    };
    uint32 i, pcfg;

    /* Disable MAC */
    pcfg = rd_mac(PAS_MAC_CFG_PCFG);
    wr_mac(PAS_MAC_CFG_PCFG, pcfg & ~PAS_MAC_CFG_PCFG_PE);

    /* Stop RX channel */
    wr_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL), PAS_DMA_RXCHAN_CCMDSTA_ST);
    for (i = 0; i < STOP_POLL_LIMIT; i++)
        if (!(rd_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL)) & PAS_DMA_RXCHAN_CCMDSTA_ACT))
            break;
    if (i == STOP_POLL_LIMIT)
        ser_puts("[cfe_dma] WARN: RX channel still active after stop\n");
    wr_dma(PAS_DMA_RXCHAN_CCMDSTA(RX_CHANNEL), 0);

    /* Stop RX interface */
    wr_dma(PAS_DMA_RXINT_RCMDSTA(mac_dma_intf), RXINT_RCMDSTA_ST);
    for (i = 0; i < STOP_POLL_LIMIT; i++)
        if (!(rd_dma(PAS_DMA_RXINT_RCMDSTA(mac_dma_intf)) & RXINT_RCMDSTA_ACT))
            break;
    if (i == STOP_POLL_LIMIT)
        ser_puts("[cfe_dma] WARN: RX interface still active after stop\n");
    wr_dma(PAS_DMA_RXINT_RCMDSTA(mac_dma_intf), 0);
}

/* ---- Main polling loop ---------------------------------------------------- */

/*
 * Main receive loop.  Runs until a byte arrives on the UART.
 *
 * Each pass scans the descriptor ring from rx_next_clean for completed
 * descriptors (XCT_MACRX_O set in the first word), clears each one, and
 * reposts one buffer per packet.  After a non-empty batch it resets
 * PKTHDRCNT, hands the buffers and descriptors back through the INCR
 * registers, and resets the channel's packet counter and interrupt flags.
 * Packet contents are never read.
 *
 * A heartbeat line is printed every 100000 passes, and the UART is checked
 * for a keypress every 256 passes.  Totals are printed on exit.
 * Updates the globals rx_next_clean and rx_next_buf.
 */
static void poll_loop(void)
{
    uint32 total_rx = 0;
    uint32 loops = 0;
    uint32 heartbeat = 0;

    ser_puts("[cfe_dma] Polling + recycling. Press any key to stop.\n");

    for (;;) {
        uint32 n = rx_next_clean;
        uint32 count = 0;

        /* Scan for completed descriptors */
        for (;;) {
            /* n advances in steps of 4, so m..m+3 never crosses the ring end */
            uint32 m = n & (RX_RING_SIZE - 1);
            uint64 macrx;

            /*
             * Invalidate cache line before reading, in case CPU
             * cache has stale data and DMA wrote to physical memory.
             */
            dcbi_line(&rx_ring[m]);

            macrx = rx_ring[m];
            if (!(macrx & XCT_MACRX_O))
                break;

            /* Clear all four words of the descriptor */
            rx_ring[m]     = 0;
            rx_ring[m + 1] = 0;
            rx_ring[m + 2] = 0;
            rx_ring[m + 3] = 0;

            /* Repost buffer */
            {
                /*
                 * Recycle buffers in the order they were posted: the
                 * rx_next_buf-th posting reuses buffer
                 * rx_next_buf % NUM_RX_BUFS, which is the oldest one
                 * outstanding.  Indexing by the per-batch `count` instead
                 * would repost buffers 0,1,2,... on every batch while they
                 * may still be queued, so several ring slots would point at
                 * the same memory.
                 * NOTE(review): assumes the hardware consumes posted buffers
                 * in ring order -- confirm.
                 */
                uint32 buf_idx = rx_next_buf % NUM_RX_BUFS;
                uint32 br_idx  = rx_next_buf & (RX_BUF_RING_SIZE - 1);
                rx_buf_ring[br_idx] =
                    XCT_RXB_LEN(RX_BUF_SIZE) | XCT_RXB_ADDR(RX_BUF_ADDR(buf_idx));
                /* Flush buffer ring write so DMA sees it */
                dcbf_line(&rx_buf_ring[br_idx]);
                rx_next_buf++;
            }

            n += 4;
            count++;
            if (count >= NUM_RX_BUFS)
                break;
        }

        if (count > 0) {
            /* Errata 5971: reset PKTHDRCNT after every RX batch
             * (Linux does this when n > RX_RING_SIZE; clean driver
             *  does it after every batch -- we follow the clean driver) */
            wr_iob(PAS_IOB_COM_PKTHDRCNT, 0);

            if (n >= RX_RING_SIZE)
                n &= (RX_RING_SIZE - 1);
            rx_next_clean = n;
            total_rx += count;

            /* Ring writes must be complete before the hardware is told */
            wmb();
            wr_dma(PAS_DMA_RXINT_INCR(mac_dma_intf), count);
            wr_dma(PAS_DMA_RXCHAN_INCR(RX_CHANNEL), count << 1);

            /*
             * Equivalent of Linux restart_rx_intr():
             * Reset packet counter and clear all interrupt flags.
             *
             * Even in polling mode (no IRQ delivery), the DMA
             * controller's internal PCNT counter and interrupt flags
             * accumulate.  If PINTC is never cleared, the DMA may
             * stop delivering descriptors after CNTTH packets.
             * Linux clears this after every NAPI poll cycle.
             *
             * Linux reads the exact PCNT from a 64-bit status register
             * at 0xfd800000+ch*8 and debits it via PCNT(x).
             * We use PCNTRST for a full reset instead -- simpler for
             * polling mode and avoids needing the 64-bit status read.
             */
            wr_iob(PAS_IOB_DMA_RXCH_RESET(RX_CHANNEL),
                   PAS_IOB_DMA_RXCH_RESET_PCNTRST |
                   PAS_IOB_DMA_RXCH_RESET_PINTC    |
                   PAS_IOB_DMA_RXCH_RESET_TINTC    |
                   PAS_IOB_DMA_RXCH_RESET_DINTC    |
                   PAS_IOB_DMA_RXCH_RESET_SINTC);
        }

        loops++;

        /* Heartbeat every 100K loops */
        if ((loops % 100000) == 0) {
            heartbeat++;
            ser_puts("[cfe_dma] #");
            ser_putdec(heartbeat);
            ser_puts(" rx=");
            ser_putdec(total_rx);
            ser_puts(" loops=");
            ser_putdec(loops);
            ser_puts("\n");
        }

        /* Check for keypress every 256 loops */
        if ((loops & 0xFF) == 0) {
            if (uart_keypressed()) {
                uart_getc();  /* consume the key */
                break;
            }
        }
    }

    ser_puts("[cfe_dma] Total rx=");
    ser_putdec(total_rx);
    ser_puts(" loops=");
    ser_putdec(loops);
    ser_puts("\n");
}

/* ---- Entry point (must be first in binary!) ------------------------------- */

/*
 * We use a naked asm stub to guarantee _start is placed first
 * by the linker, then jump to the real C entry point.
 */
/* Forward declaration so the stub below can reference the C entry point. */
void cfe_main(void);

/*
 * _start: global symbol in .text consisting of a single unconditional
 * branch to cfe_main.  The branch does not set the link register, so
 * cfe_main's return goes back to whoever called _start.
 * NOTE(review): whether _start ends up first in the image depends on the
 * compiler's and linker's ordering, which is not visible here -- confirm
 * against the link map.
 */
__asm__(
    "    .section .text\n"
    "    .globl _start\n"
    "_start:\n"
    "    b cfe_main\n"
);

/*
 * C entry point, reached from the _start stub.
 *
 * Enables LPCR RMI before any MMIO access, runs init(), and on success
 * polls until a key is pressed and then shuts the hardware down.  On every
 * path the LPCR RMI bit is restored and MSR[EE] is cleared before returning
 * to the caller.
 */
void cfe_main(void)
{
    int ok;

    /* MUST be first -- enables MMIO access */
    lpcr_enable_rmi();

    ser_puts("\n[cfe_dma] Starting bare-metal DMA test...\n");

    ok = init();
    if (ok) {
        poll_loop();

        ser_puts("[cfe_dma] Stopping hardware...\n");
        cleanup();
        ser_puts("[cfe_dma] Done. Returning to CFE.\n");
    } else {
        ser_puts("[cfe_dma] Init failed. Returning to CFE.\n");
    }

    /* Common exit: restore LPCR, then keep the Decrementer from firing. */
    lpcr_restore();
    msr_disable_ee();
}
