Latency of LW

Hi,
I’m measuring the cycle count of LW but getting some strange results.

Here is my test program, adapted from the Hello World sample program.

#include <stdio.h>

int main() {
    __asm("mv a0, sp");
    __asm("csrr	a2, minstret");
    __asm("csrr	a3, mcycle");

    __asm("lw	a0,0(a0)");

    __asm("csrr a4, mcycle");
    __asm("csrr	a5, minstret");
    __asm("sub	a1, a4, a3");
    __asm("sub	a2, a5, a2");
    printf("Load from sp:\tcycles = %d instructions = %d\n");

    // metal_dtim_0_memory_start = 0x1000000
    __asm("lui a0, 4096");
    __asm("csrr	a2, minstret");
    __asm("csrr	a3, mcycle");

    __asm("lw	a0,0(a0)");

    __asm("csrr a4, mcycle");
    __asm("csrr	a5, minstret");
    __asm("sub	a1, a4, a3");
    __asm("sub	a2, a5, a2");
    printf("Load from DTIM:\tcycles = %d instructions = %d\n");
    
    return 0;
}

When running this program I get the following output:

Load from sp:	cycles = 7 instructions = 4
Load from DTIM:	cycles = 1 instructions = 4

But the SiFive U74 Core Complex manual says that LW has a latency of three cycles. Can anyone help me understand why this is happening?

You can’t just use inline assembly like that. You need to tell the compiler about any registers you clobber, and you cannot assume that registers will be untouched by the compiler outside of each inline asm statement (i.e. both between asm statements and from an asm statement to a printf).

As for why you’re seeing that behaviour, I don’t know. Maybe you’re crossing an instruction cache line boundary in the middle of the first sequence and picking up the penalty of that cache miss.

Also note that latency isn’t just “how many cycles the instruction takes”. The U74 is pipelined, so the execution of other instructions can overlap your loads, so long as the loaded value isn’t needed as an input to them. The latency is about what happens when you have instructions that depend on each other, and even then it can be complex based on what bypassing is present in the design.

Thanks @jrtc27 for the feedback.

It’s true that I shouldn’t use inline assembly in this way; I only use it for testing purposes because it’s not easy to control the instructions generated by the compiler using C code. I did this test because I found that the IPC of the CoreMark sample program was only 0.25. I’m wondering if I have the wrong configuration or if it’s for some other reason.

I also checked that no registers are touched by the compiler. Here is the disassembly.

0000000020000a00 <main>:
20000a00: 41 11         addi    sp, sp, -16
20000a02: 06 e4         sd      ra, 8(sp)
20000a04: 22 e0         sd      s0, 0(sp)
20000a06: 00 08         addi    s0, sp, 16
20000a08: 0a 85         mv      a0, sp
20000a0a: 73 26 20 b0   csrr    a2, minstret
20000a0e: f3 26 00 b0   csrr    a3, mcycle
20000a12: 08 41         lw      a0, 0(a0)
20000a14: 73 27 00 b0   csrr    a4, mcycle
20000a18: f3 27 20 b0   csrr    a5, minstret
20000a1c: b3 05 d7 40   sub     a1, a4, a3
20000a20: 33 86 c7 40   sub     a2, a5, a2
20000a24: 17 05 00 00   auipc   a0, 0
20000a28: 13 05 45 9f   addi    a0, a0, -1548
20000a2c: ef 00 a0 03   jal     0x20000a66 <printf>
20000a30: 37 05 00 01   lui     a0, 4096
20000a34: 73 26 20 b0   csrr    a2, minstret
20000a38: f3 26 00 b0   csrr    a3, mcycle
20000a3c: 08 41         lw      a0, 0(a0)
20000a3e: 73 27 00 b0   csrr    a4, mcycle
20000a42: f3 27 20 b0   csrr    a5, minstret
20000a46: b3 05 d7 40   sub     a1, a4, a3
20000a4a: 33 86 c7 40   sub     a2, a5, a2
20000a4e: 17 05 00 00   auipc   a0, 0
20000a52: 13 05 a5 9f   addi    a0, a0, -1542
20000a56: ef 00 00 01   jal     0x20000a66 <printf>
20000a5a: 81 47         li      a5, 0
20000a5c: 3e 85         mv      a0, a5
20000a5e: a2 60         ld      ra, 8(sp)
20000a60: 02 64         ld      s0, 0(sp)
20000a62: 41 01         addi    sp, sp, 16
20000a64: 82 80         ret

I also ran some other tests. The results show that two loads cost 15 cycles, and if both loads target the same register they cost 37 cycles. However, the cycle count is as expected when loading from DTIM.
Here is the test code and its output.

Testing of two loads with different target registers

#include <stdio.h>

int main() {
    __asm("mv a0, sp");
    __asm("csrr	a2, minstret");
    __asm("csrr	a3, mcycle");

    __asm("lw	t0,0(a0)");
    __asm("lw	t1,0(a0)");

    __asm("csrr a4, mcycle");
    __asm("csrr	a5, minstret");
    __asm("sub	a1, a4, a3");
    __asm("sub	a2, a5, a2");
    printf("Load from sp:\tcycles = %d instructions = %d\n");
    return 0;
}

Output:

Load from sp:   cycles = 15 instructions = 5

Disassembly code:

0000000020000a00 <main>:
20000a00: 41 11         addi    sp, sp, -16
20000a02: 06 e4         sd      ra, 8(sp)
20000a04: 22 e0         sd      s0, 0(sp)
20000a06: 00 08         addi    s0, sp, 16
20000a08: 0a 85         mv      a0, sp
20000a0a: 73 26 20 b0   csrr    a2, minstret
20000a0e: f3 26 00 b0   csrr    a3, mcycle
20000a12: 83 22 05 00   lw      t0, 0(a0)
20000a16: 03 23 05 00   lw      t1, 0(a0)
20000a1a: 73 27 00 b0   csrr    a4, mcycle
20000a1e: f3 27 20 b0   csrr    a5, minstret
20000a22: b3 05 d7 40   sub     a1, a4, a3
20000a26: 33 86 c7 40   sub     a2, a5, a2
20000a2a: 17 05 00 00   auipc   a0, 0
20000a2e: 13 05 e5 9e   addi    a0, a0, -1554
20000a32: ef 00 00 01   jal     0x20000a42 <printf>
20000a36: 81 47         li      a5, 0
20000a38: 3e 85         mv      a0, a5
20000a3a: a2 60         ld      ra, 8(sp)
20000a3c: 02 64         ld      s0, 0(sp)
20000a3e: 41 01         addi    sp, sp, 16
20000a40: 82 80         ret

Testing of two loads with the same target register

#include <stdio.h>

int main() {
    __asm("mv a0, sp");
    __asm("csrr	a2, minstret");
    __asm("csrr	a3, mcycle");

    __asm("lw	t0,0(a0)");
    __asm("lw	t0,0(a0)");

    __asm("csrr a4, mcycle");
    __asm("csrr	a5, minstret");
    __asm("sub	a1, a4, a3");
    __asm("sub	a2, a5, a2");
    printf("Load from sp:\tcycles = %d instructions = %d\n");
    return 0;
}

Output:

Load from sp:   cycles = 37 instructions = 5

Disassembly code:

0000000020000a00 <main>:
20000a00: 41 11         addi    sp, sp, -16
20000a02: 06 e4         sd      ra, 8(sp)
20000a04: 22 e0         sd      s0, 0(sp)
20000a06: 00 08         addi    s0, sp, 16
20000a08: 0a 85         mv      a0, sp
20000a0a: 73 26 20 b0   csrr    a2, minstret
20000a0e: f3 26 00 b0   csrr    a3, mcycle
20000a12: 83 22 05 00   lw      t0, 0(a0)
20000a16: 83 22 05 00   lw      t0, 0(a0)
20000a1a: 73 27 00 b0   csrr    a4, mcycle
20000a1e: f3 27 20 b0   csrr    a5, minstret
20000a22: b3 05 d7 40   sub     a1, a4, a3
20000a26: 33 86 c7 40   sub     a2, a5, a2
20000a2a: 17 05 00 00   auipc   a0, 0
20000a2e: 13 05 e5 9e   addi    a0, a0, -1554
20000a32: ef 00 00 01   jal     0x20000a42 <printf>
20000a36: 81 47         li      a5, 0
20000a38: 3e 85         mv      a0, a5
20000a3a: a2 60         ld      ra, 8(sp)
20000a3c: 02 64         ld      s0, 0(sp)
20000a3e: 41 01         addi    sp, sp, 16
20000a40: 82 80         ret