Hi experts,
I am using the TDA4VM SDK 8.1, and I am trying to zero a 64 MB non-cached memory region with the C library memset function on mcu3_0,
but it takes about 14 seconds.
When I replace memset with the mem_set8_arm function below, it takes about 500 ms.
/**
 * Fill a memory region with a byte value, using wide stores for the bulk.
 *
 * Same contract as the standard memset(): the low 8 bits of `c` are written
 * to each of the first `n` bytes starting at `dest`.
 *
 * @param dest  Start of the region to fill; any alignment is accepted.
 * @param c     Fill value; only the low 8 bits are used.
 * @param n     Number of bytes to fill.
 * @return      `dest`, unchanged.
 */
static void *mem_set8_arm(void *dest, int c, size_t n)
{
    uint8_t *p = dest;
    const uint8_t fill8 = (uint8_t)c;
    /* Replicate the byte into all four lanes using unsigned arithmetic;
     * shifting a signed int left by 24 can overflow. */
    const uint32_t fill32 = (uint32_t)fill8 * 0x01010101u;

    /* Head: byte stores until the pointer is word aligned, so that the word
     * and store-multiple accesses below are never unaligned. */
    while (n > 0 && ((uintptr_t)p & 3u) != 0) {
        *p++ = fill8;
        n--;
    }

    uint32_t *w = (uint32_t *)(void *)p;

    /* Bulk: 64 bytes per iteration. */
#if defined(__arm__)
    while (n >= 64) {
        __asm__ __volatile__(
            "mov   r4, %[flag]\n\t"
            "mov   r5, r4\n\t"
            "mov   r6, r4\n\t"
            "mov   r7, r4\n\t"
            /* 4 stores x 4 registers x 4 bytes = 64 bytes */
            "stmia %[dst]!, {r4-r7}\n\t"
            "stmia %[dst]!, {r4-r7}\n\t"
            "stmia %[dst]!, {r4-r7}\n\t"
            "stmia %[dst]!, {r4-r7}\n\t"
            /* "+r": stmia's write-back updates the pointer, so it must be a
             * read-write operand and C must not advance it a second time. */
            : [dst] "+r" (w)
            /* Pass the pattern itself, not its address. */
            : [flag] "r" (fill32)
            /* Every scratch register, plus "memory" because the asm writes
             * through the pointer. */
            : "r4", "r5", "r6", "r7", "memory");
        n -= 64;
    }
#else
    /* Portable equivalent for non-ARM builds (e.g. host-side unit tests). */
    while (n >= 64) {
        for (size_t i = 0; i < 16; i++) {
            *w++ = fill32;
        }
        n -= 64;
    }
#endif

    /* Remaining whole words. */
    while (n >= 4) {
        *w++ = fill32;
        n -= 4;
    }

    /* Tail: remaining 0..3 bytes. */
    p = (uint8_t *)w;
    while (n > 0) {
        *p++ = fill8;
        n--;
    }

    return dest;
}
I have two questions:
1. Why are the LLVM C library memset/memcpy functions so slow?
2. Why is the R5 DDR access bandwidth only about 100 MB/s?
Regards,
Li quan