Inaccurate value of performance monitoring counter minstret

Hi,
I’m testing the result of the csrr instructions and I’m getting some strange results.

Here is the test programme adapted from the Hello World sample programme.

 #include <stdio.h>

 /*
  * Baseline test: none of the minstret reads I1..I4 is the first instruction
  * of a new cache line.
  *
  * The 16 nop pairs push the first counter read to 0x20000a48 in the
  * disassembly of this build, just past the boundary at 0x20000a40.
  *
  * All counter values are kept in t1..t6 and printed by a single printf:
  * temporaries are caller-saved, so they would not survive an intervening
  * function call.
  *
  * NOTE: printf is deliberately called with only the format string. The asm
  * statements place the values in a1..a6, the registers from which the
  * variadic arguments are read. This is not valid C (the missing arguments
  * are undefined behaviour) and only works because the compiler emits nothing
  * but the a0 setup between the asm statements and the call.
  */
 int main()
 {
     // Padding instructions
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;");
     __asm("nop; nop;"); // First instruction of a new cache line
     __asm("nop; nop;");

     __asm("csrr t1, minstret"); // I1, ensure that I2 is single issued
     __asm("csrr t2, minstret"); // I2
     __asm("csrr t3, minstret"); // I3

     __asm("csrr t4, mcycle");
     __asm("csrr t5, mcycle"); // Ensure that I4 is single issued

     __asm("csrr t6, minstret"); // I4
     __asm("sub a1, t5, t4"); // Cycle       = second mcycle read - first
     __asm("sub a2, t6, t3"); // Instruction = I4 - I3
     __asm("mv a3, t1");      // I1
     __asm("mv a4, t2");      // I2
     __asm("mv a5, t3");      // I3
     __asm("mv a6, t6");      // I4
     printf("Cycle = %d, Instruction = %d, I1 = %d, I2 = %d, I3 = %d, I4 = %d\n");

     return 0;
 }

When I run this programme I get the following output; the results are as expected.

 Cycle = 1, Instruction = 3, I1 = 7990, I2 = 7992, I3 = 7993, I4 = 7996

The disassembly of main is:

 0000000020000a00 <main>:
 20000a00: 41 11         addi    sp, sp, -16
 20000a02: 06 e4         sd     ra, 8(sp)
 20000a04: 22 e0         sd     s0, 0(sp)
 20000a06: 00 08         addi   s0, sp, 16
 20000a08: 01 00         nop
 20000a0a: 01 00         nop
 ...
 20000a40: 01 00         nop
 20000a42: 01 00         nop
 20000a44: 01 00         nop
 20000a46: 01 00         nop
 20000a48: 73 23 20 b0   csrr   t1, minstret
 20000a4c: f3 23 20 b0   csrr   t2, minstret
 20000a50: 73 2e 20 b0   csrr   t3, minstret
 20000a54: f3 2e 00 b0   csrr   t4, mcycle
 20000a58: 73 2f 00 b0   csrr   t5, mcycle
 20000a5c: f3 2f 20 b0   csrr   t6, minstret
 20000a60: b3 05 df 41   sub     a1, t5, t4
 20000a64: 33 86 cf 41   sub     a2, t6, t3
 20000a68: 9a 86         mv     a3, t1
 20000a6a: 1e 87         mv     a4, t2
 20000a6c: f2 87         mv     a5, t3
 20000a6e: 7e 88         mv     a6, t6
 20000a70: 17 05 00 00   auipc   a0, 0
 20000a74: 13 05 85 9a   addi   a0, a0, -1624
 20000a78: ef 00 00 01   jal     0x20000a88 <printf>
 20000a7c: 81 47         li     a5, 0
 20000a7e: 3e 85         mv     a0, a5
 20000a80: a2 60         ld     ra, 8(sp)
 20000a82: 02 64         ld     s0, 0(sp)
 20000a84: 41 01         addi    sp, sp, 16
 20000a86: 82 80         ret

However, when I place csrr t3, minstret (I3) as the first instruction of a new cache line (address 0x20000a40 in the disassembly below), I3 is greater than I2 by two. I3 should be only one greater than I2, as in the first run.

#include <stdio.h>

/*
 * Variant in which I3 is the first instruction of a new cache line.
 *
 * The 12 nop pairs below position the counter reads so that, in the
 * disassembly of this build, `csrr t3, minstret` (I3) lands at 0x20000a40,
 * which this report treats as the start of a new cache line.
 *
 * NOTE: printf is deliberately called with only the format string. The asm
 * statements place the values in a1..a6, the registers from which the
 * variadic arguments are read. This is not valid C (the missing arguments
 * are undefined behaviour) and only works because the compiler emits nothing
 * but the a0 setup between the asm statements and the call.
 */
int main()
{
    // Padding instructions
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    
    __asm("csrr t1, minstret"); // I1, ensure that I2 is single issued
    __asm("csrr t2, minstret"); // I2
    __asm("csrr t3, minstret"); // I3, first instruction of a new cache line

    __asm("csrr t4, mcycle");
    __asm("csrr t5, mcycle"); // Ensure that I4 is single issued

    __asm("csrr t6, minstret"); // I4
    __asm("sub a1, t5, t4"); // Cycle       = second mcycle read - first
    __asm("sub a2, t6, t3"); // Instruction = I4 - I3
    __asm("mv a3, t1");      // I1
    __asm("mv a4, t2");      // I2
    __asm("mv a5, t3");      // I3
    __asm("mv a6, t6");      // I4
    printf("Cycle = %d, Instruction = %d, I1 = %d, I2 = %d, I3 = %d, I4 = %d\n");

    return 0;
}

When running this program I get the following output:

 Cycle = 1, Instruction = 2, I1 = 7982, I2 = 7984, I3 = 7986, I4 = 7988

The disassembly of main is:

 0000000020000a00 <main>:
 20000a00: 41 11         addi    sp, sp, -16
 20000a02: 06 e4         sd     ra, 8(sp)
 20000a04: 22 e0         sd     s0, 0(sp)
 20000a06: 00 08         addi   s0, sp, 16
 20000a08: 01 00         nop
 20000a0a: 01 00         nop
 ...
 20000a34: 01 00         nop
 20000a36: 01 00         nop
 20000a38: 73 23 20 b0   csrr   t1, minstret
 20000a3c: f3 23 20 b0   csrr   t2, minstret
 20000a40: 73 2e 20 b0   csrr   t3, minstret
 20000a44: f3 2e 00 b0   csrr   t4, mcycle
 20000a48: 73 2f 00 b0   csrr   t5, mcycle
 20000a4c: f3 2f 20 b0   csrr   t6, minstret
 20000a50: b3 05 df 41   sub     a1, t5, t4
 20000a54: 33 86 cf 41   sub     a2, t6, t3
 20000a58: 9a 86         mv     a3, t1
 20000a5a: 1e 87         mv     a4, t2
 20000a5c: f2 87         mv     a5, t3
 20000a5e: 7e 88         mv     a6, t6
 20000a60: 17 05 00 00   auipc   a0, 0
 20000a64: 13 05 85 9b   addi   a0, a0, -1608
 20000a68: ef 00 00 01   jal     0x20000a78 <printf>
 20000a6c: 81 47         li     a5, 0
 20000a6e: 3e 85         mv     a0, a5
 20000a70: a2 60         ld     ra, 8(sp)
 20000a72: 02 64         ld     s0, 0(sp)
 20000a74: 41 01         addi    sp, sp, 16
 20000a76: 82 80         ret

Similarly, when I place csrr t6, minstret (I4) as the first instruction of a new cache line (address 0x20000a40 in the disassembly below), I4 is greater than I3 by four. I4 should be only three greater than I3, as in the first run.

#include <stdio.h>

/*
 * Variant in which I4 is the first instruction of a new cache line.
 *
 * The 9 nop pairs below position the counter reads so that, in the
 * disassembly of this build, `csrr t6, minstret` (I4) lands at 0x20000a40,
 * which this report treats as the start of a new cache line.
 *
 * NOTE: printf is deliberately called with only the format string. The asm
 * statements place the values in a1..a6, the registers from which the
 * variadic arguments are read. This is not valid C (the missing arguments
 * are undefined behaviour) and only works because the compiler emits nothing
 * but the a0 setup between the asm statements and the call.
 */
int main()
{
    // Padding instructions
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");
    __asm("nop; nop;");

    __asm("csrr t1, minstret"); // I1, ensure that I2 is single issued
    __asm("csrr t2, minstret"); // I2
    __asm("csrr t3, minstret"); // I3

    __asm("csrr t4, mcycle");
    __asm("csrr t5, mcycle"); // Ensure that I4 is single issued

    __asm("csrr t6, minstret"); // I4, first instruction of a new cache line
    __asm("sub a1, t5, t4"); // Cycle       = second mcycle read - first
    __asm("sub a2, t6, t3"); // Instruction = I4 - I3
    __asm("mv a3, t1");      // I1
    __asm("mv a4, t2");      // I2
    __asm("mv a5, t3");      // I3
    __asm("mv a6, t6");      // I4
    printf("Cycle = %d, Instruction = %d, I1 = %d, I2 = %d, I3 = %d, I4 = %d\n");

    return 0;
}

When running this program I get the following output:

 Cycle = 1, Instruction = 4, I1 = 7976, I2 = 7978, I3 = 7979, I4 = 7983

The disassembly of main is:

 0000000020000a00 <main>:
 20000a00: 41 11         addi    sp, sp, -16
 20000a02: 06 e4         sd     ra, 8(sp)
 20000a04: 22 e0         sd     s0, 0(sp)
 20000a06: 00 08         addi   s0, sp, 16
 20000a08: 01 00         nop
 20000a0a: 01 00         nop
 ...
 20000a28: 01 00         nop
 20000a2a: 01 00         nop
 20000a2c: 73 23 20 b0   csrr   t1, minstret
 20000a30: f3 23 20 b0   csrr   t2, minstret
 20000a34: 73 2e 20 b0   csrr   t3, minstret
 20000a38: f3 2e 00 b0   csrr   t4, mcycle
 20000a3c: 73 2f 00 b0   csrr   t5, mcycle
 20000a40: f3 2f 20 b0   csrr   t6, minstret
 20000a44: b3 05 df 41   sub     a1, t5, t4
 20000a48: 33 86 cf 41   sub     a2, t6, t3
 20000a4c: 9a 86         mv     a3, t1
 20000a4e: 1e 87         mv     a4, t2
 20000a50: f2 87         mv     a5, t3
 20000a52: 7e 88         mv     a6, t6
 20000a54: 17 05 00 00   auipc   a0, 0
 20000a58: 13 05 45 9c   addi   a0, a0, -1596
 20000a5c: ef 00 00 01   jal     0x20000a6c <printf>
 20000a60: 81 47         li     a5, 0
 20000a62: 3e 85         mv     a0, a5
 20000a64: a2 60         ld     ra, 8(sp)
 20000a66: 02 64         ld     s0, 0(sp)
 20000a68: 41 01         addi    sp, sp, 16
 20000a6a: 82 80         ret