Hello Team,
General motivation:
Our 3rd party provider of 66AK2H12 boards straggling to build boards because of ECC error yield issues.
We want to help with the production of boards, by giving a better solution/answer to 2bit ECC cases.
The original design of 2bit ECC is to reboot the device (panic) when it happens.
I would like to change the design of 2bit ECC. I removed the panic command and I print the 2bit ECC error address.
For testing, I inject 2bit ECC error from user space. And I see that the handler is triggered, but only at the first time.
From the second time and on, the handler doesn't triggered, but I see that the DDR controller registers do change and show ECC error.
How to make the handler triggered again for every 2bit ECC error?
Attached snap shots of DDR control registers states before and after injecting the 2bit ECC error.
changed handler code:
static irqreturn_t ddr3_ecc_err_irq_handler(int irq, void *reg_virt)
{
int ret = IRQ_NONE;
int i;
u32 irq_status;
u32 err_ddr_address_2b = 0;
u32 err_ddr_address_1b = 0;
u32 data;
void __iomem *ddr_reg = (void __iomem *)reg_virt;
irq_status = readl(ddr_reg + DDR3_IRQ_STATUS_SYS);
pr_warn("DDR3 ECC irq status 0x%x \n", irq_status);
if(irq_status > 0)
{
if ((irq_status & DDR3_2B_ECC_ERR) ||
(irq_status & DDR3_WR_ECC_ERR)) {
// panic("Unrecoverable DDR3 ECC error, irq status 0x%x, "
// "rebooting kernel ..\n", irq_status);
pr_warn("Unrecoverable DDR3 2 bits ECC error, irq status 0x%x \n", irq_status);
}
ret = IRQ_HANDLED;
}
return ret;
}
Injecting ECC error from user space:
dmesg | grep -A 2 -B 2 "ECC"
md.l 0x21010000
md.l 0x21010100
mw.l 0x960000000 0xffffffff 80
sleep 1
echo 1 > /proc/sys/vm/drop_caches
sleep 1
md.l 0x960000000
sleep 1
devmem 0x21010110 w 0x0
sleep 1
devmem 0x960000068 w 0xfffffffc
sleep 1
devmem 0x21010134 w 0x01000000
sleep 1
devmem 0x21010114 w 0xCFFFA000
sleep 1
devmem 0x21010110 w 0xF0000001
sleep 1
echo 1 > /proc/sys/vm/drop_caches
sleep 1
md.l 0x960000000
md.l 0x21010000
md.l 0x21010100
dmesg | grep -A 2 -B 2 "ECC"
BEFORE INJECTING 2bit ECC ERROR: # md.l 0x21010000 21010000: 40461c02 40000004 6200ce63 00000000 ..F@...@c..b.... 21010010: 000017cd 00000000 16709885 00001d4a ..........p.J... 21010020: 4461ff53 00000000 543f111f 00000000 S.aD......?T.... 21010030: 00000000 00000000 00000000 00000000 ................ 21010040: 00000000 00000000 00000000 00000000 ................ 21010050: 00000000 00ffffff c0071410 00021c1c ................ 21010060: 00002010 00000000 00000000 00000000 . .............. 21010070: 00000000 00000000 00000000 00000000 ................ 21010080: 1b0ee523 034b724c 00010000 00000000 #...LrK......... 21010090: 14a8c3fe 00000000 00000000 00000000 ................ 210100a0: 00000000 00000000 00000000 00000000 ................ 210100b0: 00000000 00000038 00000000 00000038 ....8.......8... 210100c0: 00000000 00000000 70073200 00000000 .........2.p.... 210100d0: 00000000 00000000 00000000 00000000 ................ 210100e0: 00000000 00000000 00000000 00000000 ................ 210100f0: 00000000 00000000 00000000 00000000 ................ # md.l 0x21010100 21010100: 00000000 00000000 00000000 00000000 ................ 21010110: b0000000 00000000 00000000 00000000 ................ 21010120: 00001f1f 00000000 00000000 00000000 ................ 21010130: 00000000 00000000 00000000 00000000 ................ 21010140: 00000000 00000000 00000000 00000000 ................ 21010150: 00000000 00000000 00000000 00000000 ................ 21010160: 00000000 00000000 00000000 00000000 ................ 21010170: 00000000 00000000 00000000 00000000 ................ 21010180: 00000000 00000000 00000000 00000000 ................ 21010190: 00000000 00000000 00000000 00000000 ................ 210101a0: 00000000 00000000 00000000 00000000 ................ 210101b0: 00000000 00000000 00000000 00000000 ................ 210101c0: 00000000 00000000 00000000 00000000 ................ 210101d0: 00000000 00000000 00000000 00000000 ................ 210101e0: 00000000 00000000 00000000 00000000 ................ 210101f0: 00000000 00000000 00000000 00000000 ................ AFTER INJECTING 2bit ECC ERROR: # md.l 0x21010000 21010000: 40461c02 40000004 6200ce63 00000000 ..F@...@c..b.... 21010010: 000017cd 00000000 16709885 00001d4a ..........p.J... 21010020: 4461ff53 00000000 543f111f 00000000 S.aD......?T.... 21010030: 00000000 00000000 00000000 00000000 ................ 21010040: 00000000 00000000 00000000 00000000 ................ 21010050: 00000000 00ffffff c0071410 00021c1c ................ 21010060: 00002010 00000000 00000000 00000000 . .............. 21010070: 00000000 00000000 00000000 00000000 ................ 21010080: 1b10fc7d 034c7009 00010000 00000000 }....pL......... 21010090: e720bc2c 00000000 00000000 00000000 ,. ............. 210100a0: 00000000 00000010 00000000 00000010 ................ 210100b0: 00000000 00000038 00000000 00000038 ....8.......8... 210100c0: 00000000 00000000 70073200 00000000 .........2.p.... 210100d0: 00000000 00000000 00000000 00000000 ................ 210100e0: 00000000 00000000 00000000 00000000 ................ 210100f0: 00000000 00000000 00000000 00000000 ................ # md.l 0x21010100 21010100: 00000000 00000000 00000000 00000000 ................ 21010110: f0000001 cfffa000 00000000 00000000 ................ 21010120: 00001f1f 00000000 00000000 00000000 ................ 21010130: 00000000 01000000 00000000 00000000 ................ 21010140: b0000020 00000000 00000000 00000000 ............... 21010150: 00000000 00000000 00000000 00000000 ................ 21010160: 00000000 00000000 00000000 00000000 ................ 21010170: 00000000 00000000 00000000 00000000 ................ 21010180: 00000000 00000000 00000000 00000000 ................ 21010190: 00000000 00000000 00000000 00000000 ................ 210101a0: 00000000 00000000 00000000 00000000 ................ 210101b0: 00000000 00000000 00000000 00000000 ................ 210101c0: 00000000 00000000 00000000 00000000 ................ 210101d0: 00000000 00000000 00000000 00000000 ................ 210101e0: 00000000 00000000 00000000 00000000 ................ 210101f0: 00000000 00000000 00000000 00000000 ................
ECC interrupt gets initialized
int keystone_init_ddr3_ecc(struct device_node *node)
{
void __iomem *ddr_reg;
int error_irq = 0;
int ret;
/* ddr3 controller reg is configured in the sysctrl node at index 0 */
ddr_reg = of_iomap(node, 0);
if (!ddr_reg) {
pr_warn("Warning!! DDR3 controller regs not defined\n");
return -ENODEV;
}
/* add DDR3 ECC error handler */
error_irq = irq_of_parse_and_map(node, 1);
if (!error_irq) {
/* No GIC interrupt, need to map CIC2 interupt to GIC */
pr_warn("Warning!! DDR3 ECC irq number not defined\n");
return -ENODEV;
}
ret = request_irq(error_irq, ddr3_ecc_err_irq_handler, 0,
"ddr3-ecc-err-irq", (void *)ddr_reg);
if (ret) {
WARN_ON("request_irq fail for DDR3 ECC error irq\n");
return ret;
}
return 0;
}
DDR configuration:
void ddr3_init_ecc(u32 base)
{
u32 ddr3_size;
if (!ddr3_ecc_support_rmw(base)) {
ddr3_disable_ecc(base);
return;
}
ddr3_ecc_init_range(base);
ddr3_size = ddr3_get_size();
ddr3_reset_data(CONFIG_SYS_SDRAM_BASE, ddr3_size);
ddr3_enable_ecc(base, 0);
}
void ddr3_enable_ecc(u32 base, int test)
{
u32 ecc_val = KS2_DDR3_ECC_ENABLE;
u32 rmw = ddr3_ecc_support_rmw(base);
if (test)
ecc_val |= KS2_DDR3_ECC_ADDR_RNG_1_EN;
if (!rmw) {
if (!test)
/* by default, disable ecc when rmw = 0 and no
ecc test */
ecc_val = 0;
} else {
ecc_val |= KS2_DDR3_ECC_RMW_EN;
}
ddr3_ecc_config(base, ecc_val);
}
static void ddr3_ecc_config(u32 base, u32 value)
{
u32 data;
__raw_writel(value, base + KS2_DDR3_ECC_CTRL_OFFSET);
udelay(100000); /* delay required to synchronize across clock domains */
if (value & KS2_DDR3_ECC_EN) {
/* Clear the 1-bit error count */
data = __raw_readl(base + KS2_DDR3_ONE_BIT_ECC_ERR_CNT_OFFSET);
__raw_writel(data, base + KS2_DDR3_ONE_BIT_ECC_ERR_CNT_OFFSET);
__raw_writel(KS2_DDR3_1B_ECC_ERR_THRESH_VAL(0) | KS2_DDR3_1B_ECC_ERR_WIN_VAL(0),
base + KS2_DDR3_ONE_BIT_ECC_ERR_THRESH);
/* enable the ECC interrupt */
__raw_writel(KS2_DDR3_1B_ECC_ERR_SYS | KS2_DDR3_2B_ECC_ERR_SYS |
KS2_DDR3_WR_ECC_ERR_SYS,
base + KS2_DDR3_ECC_INT_ENABLE_SET_SYS_OFFSET);
/* Clear the ECC error interrupt status */
__raw_writel(KS2_DDR3_1B_ECC_ERR_SYS | KS2_DDR3_2B_ECC_ERR_SYS |
KS2_DDR3_WR_ECC_ERR_SYS,
base + KS2_DDR3_ECC_INT_STATUS_OFFSET);
}
}
picture below is DTS section of ECC interrupt declaration: