Hi,
I am trying to achieve the fastest possible boot-up with the C6748. My last post, about the overall plan and SYS/BIOS startup delays, is here: http://e2e.ti.com/support/dsp/omap_applications_processors/f/42/p/290700/1015923.aspx#1015923
Anyway, I have now implemented a 4-bit GPIO bit-banged read for the Micron 25Q032 flash memory. The problem is that GPIO bit-banging is not fast enough. The 25Q032 accepts a clock of up to 108 MHz; however, I am very far from that with bit-banging.
From the attached picture it can be seen that toggling a pin takes ca. 40 ns, whereas reading a GPIO pin value takes 260 ns. The question is: why is reading so slow compared to writing? The C6748 GPIO datasheet http://www.ti.com/lit/ug/sprufl8b/sprufl8b.pdf says (page 15) that reading GPIO is synchronized to the GPIO clock, which is SYSCLK4. I am running the DSP with a 300 MHz clock, so SYSCLK4 should be 75 MHz. The C6748 datasheet, page 256, http://www.ti.com/lit/ds/symlink/tms320c6748.pdf shows some GPIO timing diagrams. I read from there that the minimum toggle time is 2C, where C is the GPIO clock period. At 75 MHz (C ≈ 13 ns) that would be 2 × 13 = 26 ns. My logic analyzer does not sample fast enough to get such resolution, so I guess the 40 ns pin toggle that I measured could actually be less. Let's assume the write also needs to be synchronized with the clock; then it is almost according to the datasheet. I expected the read to also happen within something around 30-40 ns, but it takes more like 260 ns, which seems to be out of spec.
I wrote my own GPIO functions based on the StarterWare GPIO API. I did not like that the StarterWare driver calculates pin registers and offsets at runtime. Therefore I created these defines.
// Dereference address x as a plain (non-volatile) 32-bit word.
// NOTE(review): because the access is not volatile, the compiler is free to
// merge, reorder or drop reads made through this macro.
#define NONVOLATILEHWREG(x) (*((unsigned int *)(x)))
// 1-based linear pin number: bank * 16 + pin + 1.
// Uses '+' rather than '|' so that pin 15 of an odd bank does not collide with
// the bank bit (e.g. bank 1, pin 15 -> 32, not 16); arguments are parenthesized
// so that expressions can be passed safely.
#define GPIOPINNUMBER(bank, pin) ((((bank) << 4) + (pin)) + 1)
// Chip-select 0: bank 1, pin 6.
#define CS0PIN GPIOPINNUMBER(1, 6)
// Index of the 32-bit register group that contains the pin (zero-based).
#define CS0REGNUMBER ((CS0PIN - 1) / 32)
// Bit position of the pin inside that 32-bit register.
#define CS0PINOFFSET ((CS0PIN - 1) % 32)
//; |../bl_spi.c:156|
/*
 * Writes the CLK pin's single-bit mask to the GPIO_CLR_DATA register selected
 * by CLKREGNUMBER. Per the function name this is meant to drive the flash
 * clock line low.
 */
inline void CLKPinWrite0() {
 const unsigned int clkBit = (1 << CLKPINOFFSET);

 HWREG(SOC_GPIO_0_REGS + GPIO_CLR_DATA(CLKREGNUMBER)) = clkBit;
}
// ; |../bl_spi.c:160|
/*
 * Writes the CLK pin's single-bit mask to the GPIO_SET_DATA register selected
 * by CLKREGNUMBER. Per the function name this is meant to drive the flash
 * clock line high.
 */
inline void CLKPinWrite1() {
 const unsigned int clkBit = (1 << CLKPINOFFSET);

 HWREG(SOC_GPIO_0_REGS + GPIO_SET_DATA(CLKREGNUMBER)) = clkBit;
}
/*
 * Reads the GPIO_IN_DATA register selected by DQ0REGNUMBER once and repacks
 * the DQ bits from that single sample:
 *   - the DQ3 bit is moved up by one position,
 *   - the DQ2 bit is moved down by two positions,
 *   - the two-bit field starting at DQ0PINOFFSET is moved down by five.
 * Presumably the result is the 4-bit value DQ3..DQ0 in the low nibble; the
 * fixed shift amounts only work for one particular pin assignment, so they
 * must be kept in sync with the DQ*PINOFFSET definitions (not visible here).
 *
 * Returns the repacked bits.
 */
inline unsigned int DQ0DQ3PinRead() {
 /*
  * The input register is hardware state that changes on its own, so it must
  * be read through a volatile lvalue: otherwise the compiler may merge two
  * consecutive reads into one, or move the read ahead of the volatile clock
  * write that is supposed to precede it.
  */
 unsigned int val = *(volatile unsigned int *)(SOC_GPIO_0_REGS + GPIO_IN_DATA(DQ0REGNUMBER));

 /* Unsigned mask constants avoid shifting into the sign bit of an int. */
 return ((val & (1u << DQ3PINOFFSET)) << 1) | ((val & (1u << DQ2PINOFFSET)) >> 2) | ((val & (3u << DQ0PINOFFSET)) >> 5);
}
/*
 * Test routine: clocks in two DQ0DQ3PinRead() samples and stores the combined
 * value in destAddr[0], then spins forever.
 *
 * The first sample is shifted left by four and the second is OR-ed into it;
 * rx_data (declared outside this function) holds the intermediate value.
 * Every sample is preceded by a CLKPinWrite0()/CLKPinWrite1() pair, and one
 * further pair is issued after the second sample.
 *
 * The volatile stores of 0xBEEF bracket the sequence; presumably they serve
 * as markers for locating it in the generated assembly.
 *
 * destAddr: destination; only destAddr[0] is written.
 * Never returns.
 */
void pseudotestread(unsigned char* restrict destAddr) {
    volatile int marker;

    marker = 0xBEEF; // |../bl_spi.c:325|

    /* First sample -> upper bits. */
    CLKPinWrite0();
    CLKPinWrite1();
    rx_data = DQ0DQ3PinRead() << 4; //|../bl_spi.c:329|

    /* Second sample -> lower bits. */
    CLKPinWrite0();
    CLKPinWrite1();
    rx_data |= DQ0DQ3PinRead(); // |../bl_spi.c:333|

    /* Trailing clock pulse after the last sample. */
    CLKPinWrite0();
    CLKPinWrite1();

    destAddr[0] = rx_data;

    marker = 0xBEEF; // |../bl_spi.c:341|

    /* Park here so the measurement can be inspected. */
    for (;;) {
    }
}
Assembly:
; EXCLUSIVE CPU CYCLES: 24
MVKL .S2 0xbeef,B4
|| MVKL .S1 0x1e2601c,A4
MVKH .S2 0xbeef,B4
|| MVKH .S1 0x1e2601c,A4
|| ZERO .L1 A6
ADD .L1 -4,A4,A3
|| STW .D2T2 B4,*+SP(4) ; |../bl_spi.c:325|
|| SET .S1 A6,0x18,0x18,A6
STW .D1T1 A6,*A4 ; |../bl_spi.c:156|
|| MVK .S2 168,B6
ADD .L2X A3,B6,B6
|| STW .D1T1 A6,*A3 ; |../bl_spi.c:160|
LDW .D2T2 *B6,B5 ; |../bl_spi.c:329|
ZERO .L2 B7
SET .S2 B7,0x2,0x1e,B7
MV .L2X A4,B9 ; |../bl_spi.c:329|
STW .D2T1 A6,*B9 ; |../bl_spi.c:156|
SHRU .S2 B5,2,B8 ; |../bl_spi.c:329|
|| AND .L2 B7,B5,B7 ; |../bl_spi.c:329|
AND .L2 4,B8,B8 ; |../bl_spi.c:329|
|| ADD .S2 B7,B7,B7 ; |../bl_spi.c:329|
OR .L2 B8,B7,B7 ; |../bl_spi.c:329|
|| EXTU .S2 B5,25,30,B5 ; |../bl_spi.c:329|
|| STW .D1T1 A6,*A3 ; |../bl_spi.c:160|
LDW .D2T2 *B6,B5 ; |../bl_spi.c:333|
|| OR .L2 B5,B7,B6 ; |../bl_spi.c:329|
EXTU .S2 B6,28,24,B6 ; |../bl_spi.c:329|
STW .D1T1 A6,*A4 ; |../bl_spi.c:156|
STW .D1T1 A6,*A3 ; |../bl_spi.c:160|
STW .D2T2 B4,*+SP(4) ; |../bl_spi.c:341|
AND .L1X 4,B5,A5 ; |../bl_spi.c:333|
|| SHRU .S2 B5,2,B7 ; |../bl_spi.c:333|
ADD .L1 A5,A5,A5 ; |../bl_spi.c:333|
|| AND .L2 4,B7,B7 ; |../bl_spi.c:333|
OR .L1X A5,B6,A5 ; |../bl_spi.c:333|
|| EXTU .S2 B5,25,30,B5 ; |../bl_spi.c:333|
OR .L1X B7,A5,A4 ; |../bl_spi.c:333|
OR .L1X B5,A4,A3 ; |../bl_spi.c:333|
STB .D1T1 A3,*A10 ; |../bl_spi.c:337|
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;* Disqualified loop: Bad loop structure
;*----------------------------------------------------------------------------*
$C$L11:
; EXCLUSIVE CPU CYCLES: 6
BNOP .S1 $C$L11,5 ; |../bl_spi.c:342|
; BRANCH OCCURS {$C$L11} ; |../bl_spi.c:342|
.sect ".text"
.clink
Build options
"C:/ti/ccsv5/tools/compiler/c6000_7.4.2/bin/cl6x" -mv6740 --abi=eabi -O2 --symdebug:none --optimize_with_debug=off --include_path="C:/ti/ccsv5/tools/compiler/c6000_7.4.2/include" --include_path="C:/Program Files/Texas Instruments/pdk_C6748_2_0_0_0/C6748_StarterWare_1_20_03_03/include" --include_path="C:/Program Files/Texas Instruments/pdk_C6748_2_0_0_0/C6748_StarterWare_1_20_03_03/include/hw" --include_path="C:/Program Files/Texas Instruments/pdk_C6748_2_0_0_0/C6748_StarterWare_1_20_03_03/include/c674x/c6748" --program_level_compile --define=c6748 --display_error_number --diag_warning=225 --no_bad_aliases --debug_software_pipeline --opt_for_speed=5 --call_assumptions=3 -k "../bl_copy_rprc.c" "../bl_platform.c" "../bl_spi.c" "../main.c" "../uartConsole.c"
My question is: is this code optimal? In other words, is the delay caused by hardware rather than software, even though my interpretation of the datasheet says that it should be faster from a hardware point of view?
Andres
EDIT: I did not notice this before:
GPIx duration must be extended to allow the device enough time to access the GPIO register through the internal bus.
So the actual read time is 2C plus the bus access time. I guess it is so slow because of the bus time, and there is nothing I can do about it.
 
				 
		 
					 
                          
 
				