I have the following C64x+ (OMAP4 Tesla) code, which works well, runs in 22 ms, and is optimized well by the compiler:
#pragma CODE_SECTION(colorDetect, ".text:intrinsic"); void colorDetect(const unsigned char *uv, short w, short h) { int p, c, i = 0, pixels = w * h / 2, w2 = w /2; int ar = 0, mX = 0; int p1, p2, p1p2; unsigned vuvumin; _nassert(w % 4 == 0); _nassert(w >= 16); _nassert(w <= 32764); _nassert(h <= 32764); _nassert((int)uv % 8 == 0); _nassert(pixels % 16 == 0); _nassert(pixels >= 16); vuvumin = _pack2(150, 130); vuvumin = _packl4(vuvumin, vuvumin); #pragma MUST_ITERATE(4,,4); #pragma UNROLL(4); for (c = 0; c < pixels; c += 4, i+= 2) { if (i == w2) { i = 0; } p = _cmpgtu4(_amem4_const(&uv[c]), vuvumin); p1 = ((p & 3) == 3); p2 = ((p & 12) == 12); p1p2 = p1 + p2; mX += p1p2 * i + p2; ar += p1p2; } ... }
The compiler seems to do a good job of optimization:
3461 ;*----------------------------------------------------------------------------* 3462 ;* SOFTWARE PIPELINE INFORMATION 3463 ;* 3464 ;* Loop source line : 137 3465 ;* Loop opening brace source line : 137 TMS320C6x Assembler Unix v7.2.5 Wed Aug 27 14:13:35 2014 Tools Copyright (c) 1996-2011 Texas Instruments Incorporated ./package/cfg/ti_platform_omap4430_dsp/release/tesla-dsp/../main_module.se64T PAGE 64 3466 ;* Loop closing brace source line : 145 3467 ;* Loop Unroll Multiple : 4x 3468 ;* Known Minimum Trip Count : 1 3469 ;* Known Max Trip Count Factor : 1 3470 ;* Loop Carried Dependency Bound(^) : 12 3471 ;* Unpartitioned Resource Bound : 19 3472 ;* Partitioned Resource Bound(*) : 19 3473 ;* Searching for software pipeline schedule at ... 3474 ;* ii = 19 Schedule found with 2 iterations in parallel 3475 ;* 3476 ;* Register Usage Table: 3477 ;* +---------------------------------+ 3478 ;* |AAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBB| 3479 ;* |0000000000111111|0000000000111111| 3480 ;* |0123456789012345|0123456789012345| 3481 ;* |----------------+----------------| 3482 ;* 0: |**** ***********|** **** ** * | 3483 ;* 1: | *** ***********|** ***** ** * | 3484 ;* 2: | ***************|** ***** ** * | 3485 ;* 3: |***** **********|*** ***** ** * | 3486 ;* 4: | **** **********|********* ** * | 3487 ;* 5: | **** **********|***** ******* | 3488 ;* 6: |**** **********|************** | 3489 ;* 7: | **** **********|************** | 3490 ;* 8: |****************|************** | 3491 ;* 9: | ***************|************** | 3492 ;* 10: | ***************|*** ********** | 3493 ;* 11: | ***************|************** | 3494 ;* 12: |****************|************** | 3495 ;* 13: | ***************|************* | 3496 ;* 14: | ***************|***** ******* | 3497 ;* 15: | ***************|**** ****** | 3498 ;* 16: |**** ***********|***** ****** | 3499 ;* 17: | *** ***********|** *** **** | 3500 ;* 18: | *** ***********|** *** ** * | 3501 ;* +---------------------------------+ 3502 ;* 3503 ;* 
Done 3504 ;* 3505 ;* Loop is interruptible 3506 ;* Redundant loop generated 3507 ;* Epilog not removed 3508 ;* Collapsed epilog stages : 0 3509 ;* 3510 ;* Prolog not removed 3511 ;* Collapsed prolog stages : 0 3512 ;* 3513 ;* Minimum required memory pad : 0 bytes 3514 ;* 3515 ;* For further improvement on this loop, try option -mh16 3516 ;* 3517 ;* Minimum safe trip count : 2 (after unrolling) 3518 ;* Min. prof. trip count (est.) : 3 (after unrolling) 3519 ;* 3520 ;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.000, max 0.000 } TMS320C6x Assembler Unix v7.2.5 Wed Aug 27 14:13:35 2014 Tools Copyright (c) 1996-2011 Texas Instruments Incorporated ./package/cfg/ti_platform_omap4430_dsp/release/tesla-dsp/../main_module.se64T PAGE 65 3521 ;* Mem bank perf. penalty (est.) : 0.0% 3522 ;* 3523 ;* 3524 ;* Total cycles (est.) : 9 + trip_cnt * 19 3525 ;*----------------------------------------------------------------------------*
Next, I want to calculate the vertical moment as well. That version runs in 50 ms (more than twice as slow as before):
#pragma CODE_SECTION(colorDetect, ".text:intrinsic"); void colorDetect(const unsigned char *uv, short w, short h) { int p, c, i = 0, j = 0, pixels = w * h / 2, w2 = w /2; int ar = 0, mX = 0, mY = 0; int p1, p2, p1p2; unsigned vuvumin; _nassert(w % 4 == 0); _nassert(w >= 16); _nassert(w <= 32764); _nassert(h <= 32764); _nassert((int)uv % 8 == 0); _nassert(pixels % 16 == 0); _nassert(pixels >= 16); vuvumin = _pack2(150, 130); vuvumin = _packl4(vuvumin, vuvumin); #pragma MUST_ITERATE(4,,4); #pragma UNROLL(4); for (c = 0; c < pixels; c += 4, i+= 2) { if (i == w2) { i = 0; j++; } p = _cmpgtu4(_amem4_const(&uv[c]), vuvumin); p1 = ((p & 3) == 3); p2 = ((p & 12) == 12); p1p2 = p1 + p2; mX += p1p2 * i + p2; mY += p1p2 * j; ar += p1p2; } ... }
This time the compiler fails to find a software-pipeline schedule for the loop:
3322 ;*----------------------------------------------------------------------------* 3323 ;* SOFTWARE PIPELINE INFORMATION 3324 ;* 3325 ;* Loop source line : 137 3326 ;* Loop opening brace source line : 137 3327 ;* Loop closing brace source line : 146 3328 ;* Loop Unroll Multiple : 4x 3329 ;* Known Minimum Trip Count : 1 3330 ;* Known Max Trip Count Factor : 1 3331 ;* Loop Carried Dependency Bound(^) : 12 3332 ;* Unpartitioned Resource Bound : 22 3333 ;* Partitioned Resource Bound(*) : 22 3334 ;* Searching for software pipeline schedule at ... 3335 ;* ii = 22 Cannot allocate machine registers 3336 ;* Regs Live Always : 21/21/ 3337 ;* Max Regs Live : 36/36/ 3338 ;* Max Cond Regs Live : 2/ 2/
How can I get this version optimized as well as the first one? What would be the solution to avoid this "Cannot allocate machine registers" problem? Or are there any other intrinsics I could use for the "mX +=" and "mY +=" calculations? I was thinking about _ddot*, but I'm not sure how to apply it.
Note that I would be willing to drop the j variable and use the c variable in the mY calculation instead (dividing by w2 after the loop). I would probably lose a little precision, but probably not that much.
Grégoire