Instruction Buffer

Life is all like this, meaningless, hopeless and

Inst Buffer Structure

Inst buffer包含32个entries,每个entry的structure如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
//==========================================================
//Inst Buffer Entry Fields Description:
//+-----+------+----------+---------+-------+-----------+--------+--------+-------+
//| vld | inst | 32_start | acc_err | pgflt | high_expt | split1 | split0 | fence |
//+-----+------+----------+---------+-------+-----------+--------+--------+-------+
//==========================================================
//vld means entry valid
//inst[15:0] means the half word inst data
//32_start means this half is the start of 32 inst
//acc_err means this half have acc_err expt
//pgflt means this half have pgflt expt
//tinv means this half have tinv expt
//tfatal means this half have tfatal expt
//high_expt means 32 bit inst & expt happen at low half
//split1 means predecode info
//split0 means predecode info
//fence means predecode info

Inst Buffer 关键逻辑

空满逻辑

ibuf的空满逻辑如下,当ibuf满时,需要将ibctrl进行stall

  • Full:当ibuf中不能再保存9条指令时,为满(指针左移8位,并与vld信号比较,如果发现移动后的指针所在位置已经vld,说明ibuf full)
  • Empty:当创建的指令数和退休的指令数相同,且entry invalid,则为空
1
2
3
4
5
6
7
8
9
10
// Full: rotate ibuf_create_pointer left by 8 positions and AND it with the
// per-entry valid bits; any overlap asserts ibuf_full.
assign ibuf_full  = |({ibuf_create_pointer[ENTRY_NUM-9:0],
ibuf_create_pointer[ENTRY_NUM-1:ENTRY_NUM-8]} &
entry_vld[31:0]);
// Empty: the 5-bit create and retire counters are equal AND entry 0 is not
// valid. Equal counters alone are ambiguous (they also match when all 32
// entries are occupied), hence the extra entry_vld[0] check.
assign ibuf_empty = (ibuf_create_num[4:0] ==
ibuf_retire_num[4:0]) &&
!entry_vld[0]; //in case of 32 entry all valid

// Status fan-out: the same empty flag is exported under two names, and the
// full flag is exported as the ibctrl stall signal.
assign ibuf_lbuf_empty = ibuf_empty;
assign ibuf_ibctrl_stall = ibuf_full;
assign ibuf_ibctrl_empty = ibuf_empty;

Empty的判断条件相对比较绕

Time
$T$
$2T$
$3T$
$4T$

Create Logic

ibuf的create逻辑中,我们重点需要解决两个问题:

  • ibuf什么位置保存指令(pointer logic)
  • 指令是否需要保存在ibuf中(vld logic)

Create pointer logic

对于一个FIFO,我们会有入队列和出队列两种逻辑,而在ibuf中,入队列的逻辑是由create pointer logic进行控制的,使用了一个变量ibuf_create_pointer作为FIFO的指针。当CPU进行rst之后,ibuf_create_pointer的值为0x00000001。更新create pointer时,会根据半字的数量,预先生成create pointer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
// Pre-compute the next create pointer: rotate ibuf_create_pointer left by the
// number of half-words given in ibdp_ibuf_half_vld_num (excerpt; the cases
// between 2 and 9 are elided).
always @( ibuf_create_pointer[31:0]
or ibdp_ibuf_half_vld_num[3:0]) // the half-word count selects create_pointer_pre
begin
case(ibdp_ibuf_half_vld_num[3:0])
4'b0001 : create_pointer_pre[ENTRY_NUM-1:0] = {ibuf_create_pointer[ENTRY_NUM-2:0], // rotate left by one
ibuf_create_pointer[ENTRY_NUM-1]};
4'b0010 : create_pointer_pre[ENTRY_NUM-1:0] = {ibuf_create_pointer[ENTRY_NUM-3:0], // rotate left by two
ibuf_create_pointer[ENTRY_NUM-1:ENTRY_NUM-2]};
4'b1001:
// rotate left by nine
...
endcase
// &CombEnd; @187
end

/*
* Register the pre-computed pointer into the actual ibuf create pointer.
* - reset (cpurst_b low) or ibuf_flush: pointer returns to bit 0 set, all
*   other bits cleared
* - ibuf_create_vld: pointer takes create_pointer_pre
* - otherwise: pointer holds its value
* NOTE(review): ibuf_flush shares the branch with the asynchronous reset
* term here, but it is not in the sensitivity list, so it only takes effect
* on a clock edge -- confirm against the original RTL.
*/
always @(posedge ibuf_create_pointer_update_clk or negedge cpurst_b)
begin
if(!cpurst_b || ibuf_flush)
ibuf_create_pointer[ENTRY_NUM-1:0] <= {{(ENTRY_NUM-1){1'b0}}, 1'b1};
else if(ibuf_create_vld)
ibuf_create_pointer[ENTRY_NUM-1:0] <= create_pointer_pre[ENTRY_NUM-1:0];
else
ibuf_create_pointer[ENTRY_NUM-1:0] <= ibuf_create_pointer[ENTRY_NUM-1:0];
end

由于最多有9个半字(half word)需要被保存在ibuf中,因此我们会创建9个pointer(通过左移指向待保存的位置)。这个半字的数量是由ibdp_ibuf_half_vld_num决定的,根据block中是否存在条件分支或转移语句,决定提供给ib阶段的半字有多少个:

1
2
3
4
5
assign ibdp_half_vld_num[3:0]    = (ipdp_ibdp_con_br_num_vld)
? ipdp_ibdp_con_br_num[3:0]
: (ipdp_ibdp_chgflw_num_vld)
? ipdp_ibdp_chgflw_num[3:0]
: ipdp_ibdp_no_chgflw_num[3:0];

所以,此处create_pointer_pre表示ibuf需要预先创建多少个指针用来保存需要存在ibuf中的指令数目$N_{max}$,但是实际保存的指令数目$N \le N_{max}$。那么,实际需要保存至ibuf中的指令数目该如何计算呢,设ibuf中当前保存的指令数目为$N_0$,理论上需要保存的有效指令数最大为$n_v$,则ibuf中预计需要保存的指令数为:

$$N_{max} = N_0 + n_v$$

$N_{max}$表示无bypass的情况。

1
2
3
4
5
6
7
8
9
10
11
12
always @( ibuf_create_num[4:0]
or ibdp_ibuf_half_vld_num[3:0])
begin
case(ibdp_ibuf_half_vld_num[3:0])
4'b0001 : create_num_pre[4:0] = ibuf_create_num[4:0] + 5'd1;
4'b0010 : create_num_pre[4:0] = ibuf_create_num[4:0] + 5'd2;
...
4'b1001 : create_num_pre[4:0] = ibuf_create_num[4:0] + 5'd9;
default : create_num_pre[4:0] = ibuf_create_num[4:0];
endcase
// &CombEnd; @282
end

而实际保存的指令需要减掉bypass的指令数$n_b$,故最终保存在ibuf中有效的指令数目为:

$$N = N_{max} - n_b = N_0 + n_v - n_b$$

而$n_b$在处理bypass的情况时进行计算

1
2
3
4
5
6
7
8
9
10
11
12
13
always @( ibuf_create_num[4:0]
or bypass_way_inst2_valid
or bypass_way_half_num[2:0]
or create_num_pre[4:0])
begin
casez({bypass_way_inst2_valid, bypass_way_half_num[2:0]})
4'b0??? : create_num_pre_bypass[4:0] = ibuf_create_num[4:0];
4'b1011 : create_num_pre_bypass[4:0] = create_num_pre[4:0] - 5'd3;
4'b1100 : create_num_pre_bypass[4:0] = create_num_pre[4:0] - 5'd4;
4'b1101 : create_num_pre_bypass[4:0] = create_num_pre[4:0] - 5'd5;
4'b1110 : create_num_pre_bypass[4:0] = create_num_pre[4:0] - 5'd6;
default : create_num_pre_bypass[4:0] = create_num_pre[4:0];
endcase

实际的指针如下:

1
2
3
4
5
6
7
8
assign ibuf_create_pointer0[ENTRY_NUM-1:0] =  ibuf_create_pointer[ENTRY_NUM-1:0];
assign ibuf_create_pointer1[ENTRY_NUM-1:0] = {ibuf_create_pointer[ENTRY_NUM-2:0],
ibuf_create_pointer[ENTRY_NUM-1]};
assign ibuf_create_pointer2[ENTRY_NUM-1:0] = {ibuf_create_pointer[ENTRY_NUM-3:0],
ibuf_create_pointer[ENTRY_NUM-1:ENTRY_NUM-2]};
...
assign ibuf_create_pointer8[ENTRY_NUM-1:0] = {ibuf_create_pointer[ENTRY_NUM-9:0],
ibuf_create_pointer[ENTRY_NUM-1:ENTRY_NUM-8]};

Create Num Logic

在对FIFO进行入队列操作时,我们还需要对进入FIFO的指令数量进行统计,这个参数会被用于判断bypass是否有效,以及ibuf是否为空,计算方法如下:

$$N_{create}' = N_{create} + V$$

其中,$N_{create}$为当前的create计数(ibuf_create_num),$V$是用于保存至ibuf中的有效半字个数

Merge 情况处理

当ibuf中存在指令,但是指令不足以凑成完整的三条送入idu时,需要和ip送来的指令进行Merge

Merge 指令数量计算

我们需要根据ibuf中能pop出的指令数目,来确认需要bypass提供的merge指令数目

其中,$N_m$为bypass提供的与pop的指令进行merge的指令(32位)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
casez({ibuf_pop_inst1_valid,bypass_way_inst0_32_start,bypass_way_inst1_valid,bypass_way_inst1_32_start})
// if inst1 is valid, bypass only needs merge 1
4'b10?? : merge_way_inst1_num[4:0] = 5'b00001;
4'b11?? : merge_way_inst1_num[4:0] = 5'b00010;
4'b000? : merge_way_inst1_num[4:0] = 5'b00001;
4'b0010 : merge_way_inst1_num[4:0] = 5'b00010;
4'b0011 : merge_way_inst1_num[4:0] = 5'b00011;
4'b010? : merge_way_inst1_num[4:0] = 5'b00010;
4'b0110 : merge_way_inst1_num[4:0] = 5'b00011;
4'b0111 : merge_way_inst1_num[4:0] = 5'b00100;
default : merge_way_inst1_num[4:0] = 5'b00000;
endcase
// &CombEnd; @5278
end

当发生merge时

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
// &CombBeg; @315
// Pre-compute the next retire counter from what ibuf pops this cycle.
// The selector is {ibuf_pop_inst2_valid, ibuf_pop3_half_num[2:0]}.
always @( merge_half_num[4:0]
or ibuf_create_num[4:0]
or ibuf_retire_num[4:0]
or ibuf_pop_inst2_valid
or ibuf_pop3_half_num[2:0])
begin
casez({ibuf_pop_inst2_valid, ibuf_pop3_half_num[2:0]})
// Pop inst2 not valid: the retire count becomes create count + merge_half_num.
// Open question from the author: why create count plus the merged bypass count?
// NOTE(review): presumably, when ibuf cannot supply three instructions, every
// half-word currently in ibuf is popped (so the retire count catches up with
// the create count), and the merge_half_num half-words taken from the incoming
// data for the merge are counted as retired as well -- TODO confirm against RTL.
4'b0??? : retire_num_pre[4:0] = ibuf_create_num[4:0] + merge_half_num[4:0];

// Pop inst2 valid: advance the retire count by the 3..6 popped half-words.
4'b1011 : retire_num_pre[4:0] = ibuf_retire_num[4:0] + 5'd3;
4'b1100 : retire_num_pre[4:0] = ibuf_retire_num[4:0] + 5'd4;
4'b1101 : retire_num_pre[4:0] = ibuf_retire_num[4:0] + 5'd5;
4'b1110 : retire_num_pre[4:0] = ibuf_retire_num[4:0] + 5'd6;
// Any other selector value leaves the retire count unchanged.
default : retire_num_pre[4:0] = ibuf_retire_num[4:0];
endcase
// &CombEnd; @324
end

Merge retire指针计算

1
2
3
4
5
6
7
8
9
casez({ibuf_pop_inst1_valid,bypass_way_inst0_32_start,bypass_way_inst1_valid,bypass_way_inst1_32_start})
4'b10?? : merge_way_retire_pointer[ENTRY_NUM-1:0] = ibuf_create_pointer1[ENTRY_NUM-1:0];
4'b11?? : merge_way_retire_pointer[ENTRY_NUM-1:0] = ibuf_create_pointer2[ENTRY_NUM-1:0];
...
4'b0111 : merge_way_retire_pointer[ENTRY_NUM-1:0] = ibuf_create_pointer4[ENTRY_NUM-1:0];
default : merge_way_retire_pointer[ENTRY_NUM-1:0] = ibuf_create_pointer0[ENTRY_NUM-1:0];
endcase
// &CombEnd; @5292
end

Create Valid Logic

在前面的小节中我们讨论了当向ibuf中保存指令时Create指针移动的逻辑,本节讨论哪些指令需要保存在ibuf中或从ibuf中退休,即entry create & retire vld的逻辑。在Xuantie的代码中,使用了entry_create_*和entry_retire_*两个32位的变量,与ibuf的32个entry一一对应,用于表示哪一个entry需要创建或退休。本文首先讨论entry create的逻辑。

在entry create的逻辑中,Xuantie的代码分为了nopass和bypass两种类型进行讨论,实际nopass还可以分为merge和pop两种情况。对于nopass的情况,我们依据三个条件确定需要create的entry:

  • ibuf create指针所在位置,即ibuf_create_pointer0-8
  • 传递给ibuf的指令的有效性,即ib_hn_create_vld[8:0]
  • merge指令的掩码(当merge无效时为全1,此时无merge指令,否则根据merge指令数量及指令是否为32bit,确定掩码)

对于bypass的情况,则根据ibuf create指针所在位置以及bypass的指令数量及类型,确定create的entry。

1
2
3
4
5
default  : begin //1?1?1
bypass_way_inst0_valid = bypass_way_h0_vld;
...
bypass_way_half_num[2:0] = 3'b110;
ib_hn_create_vld_bypass[8:0] = {6'b0,ib_hn_create_vld[2:0]};

得到32位create vld bits后,会将每一位赋给entry,作为vld信号。注意,vld信号仅控制control signal的更新,保存在entry中的数据不受vld信号控制。control signal包括:

  1. entry是否vld
  2. entry中的指令是否为32位指令前半部分

Create Input Data Generate

在得到ibuf的entry create vld信号后,还需要组织向ibuf中保存的指令的信息,这一部分逻辑较为简单,从3667行到5520行为该段逻辑。

retire logic

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
always @( ibuf_merge_retire_pointer[31:0]
or ibuf_retire_pointer[31:0]
or ibuf_pop_inst2_valid
or ibuf_pop3_half_num[2:0])
begin
casez({ibuf_pop_inst2_valid, ibuf_pop3_half_num[2:0]})
4'b0??? : retire_pointer_pre[ENTRY_NUM-1:0] = ibuf_merge_retire_pointer[ENTRY_NUM-1:0];
4'b1011 : retire_pointer_pre[ENTRY_NUM-1:0] = {ibuf_retire_pointer[ENTRY_NUM-4:0],
ibuf_retire_pointer[ENTRY_NUM-1:ENTRY_NUM-3]};
4'b1100 : retire_pointer_pre[ENTRY_NUM-1:0] = {ibuf_retire_pointer[ENTRY_NUM-5:0],
ibuf_retire_pointer[ENTRY_NUM-1:ENTRY_NUM-4]};
4'b1101 : retire_pointer_pre[ENTRY_NUM-1:0] = {ibuf_retire_pointer[ENTRY_NUM-6:0],
ibuf_retire_pointer[ENTRY_NUM-1:ENTRY_NUM-5]};
4'b1110 : retire_pointer_pre[ENTRY_NUM-1:0] = {ibuf_retire_pointer[ENTRY_NUM-7:0],
ibuf_retire_pointer[ENTRY_NUM-1:ENTRY_NUM-6]};
default : retire_pointer_pre[ENTRY_NUM-1:0] = ibuf_retire_pointer[ENTRY_NUM-1:0];
endcase
// &CombEnd; @399
end

Retire Valid Logic

Retire Valid逻辑相比Create来说较为简单,首先由retire_pointer*确定可能retire的entry,然后和retire_vld_*进行mask,而retire_vld_*根据实际从ibuf中pop的指令数量确定

1
2
3
4
5
6
7
8
9
casez({pop_h0_32_start,pop_h1_32_start,pop_h2_32_start,
pop_h3_32_start,pop_h4_32_start})
5'b000?? : begin
ibuf_pop_inst0_valid = pop_h0_vld;
ibuf_pop_inst0_data[31:0] = {16'b0,pop_h0_data[15:0]};
ibuf_pop_inst0_pc[14:0] = pop_h0_pc[14:0];
// inst0-2's data
ibuf_pop3_half_num[2:0] = 3'b011; // Get poped half number
ibuf_pop3_retire_vld[5:0] = 6'b111000; // Get the retire mask

Bypass 逻辑

何时进行bypass

在ibuf为空时,将会对指令进行bypass,最多bypass3条指令,其余的将会被保存在ibuf中。

1
2
3
assign bypass_vld = (ibuf_create_num[4:0] == 
ibuf_retire_num[4:0]) &&
!ibctrl_ibuf_bypass_not_select;

Inst attr

在IBUF的bypass逻辑中,最多可以输出三条指令(16bits或32bits)。首先,ibuf以16bits为单位,对h0到h5的属性进行了判断

1
2
3
4
5
6
7
assign bypass_way_h0_vld = (ibdp_ibuf_h0_vld)
? ibdp_ibuf_h0_vld
: ibdp_ibuf_hn_vld[7];
...
assign bypass_way_h4_vld = (ibdp_ibuf_h0_vld)
? ibdp_ibuf_hn_vld[4]
: ibdp_ibuf_hn_vld[3];

Inst Gen

指令生成逻辑。在bypass valid的情况下,ibuf最多会提供三条指令(16或32位),此时的逻辑如下:

图片名称

ibuf中会根据bypass_way_hn_32_start的标志位来进行指令的生成(8570-9383行),同时记录bypass了几条half的指令

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
casez({bypass_way_h0_32_start,bypass_way_h1_32_start,bypass_way_h2_32_start,
bypass_way_h3_32_start,bypass_way_h4_32_start})
5'b1?1?0 : begin
bypass_way_inst0_valid = bypass_way_h0_vld;
bypass_way_inst0_data[31:0] = {bypass_way_h1_data[15:0],bypass_way_h0_data[15:0]};
... // inst 0 attrs
bypass_way_inst1_valid = bypass_way_h2_vld;
bypass_way_inst1_data[31:0] = {bypass_way_h3_data[15:0],bypass_way_h2_data[15:0]};
bypass_way_inst1_pc[14:0] = bypass_way_h2_pc[14:0];
...
bypass_way_inst2_data[31:0] = {16'b0,bypass_way_h4_data[15:0]};
bypass_way_inst2_pc[14:0] = bypass_way_h4_pc[14:0];
...
bypass_way_half_num[2:0] = 3'b101; //一共bypass了5条half指令
ib_hn_create_vld_bypass[8:0] = {5'b0,ib_hn_create_vld[3:0]};

指令Merge逻辑

ibuf中,需要根据ibuf中所保存的指令,将Bypass和ibuf中获得的指令进行Merge,并得到最终送给decoder的指令。其代码片段如下,此处容易迷惑的点是这里虽然叫merge_way_inst0,但是实际上指的是用于指令merge的bypass的指令。当需要从ibuf中取出指令时,bypass最多提供两条指令进行merge,所以此处我们只需要考虑inst0和inst1

1
2
3
4
5
6
7
8
assign merge_way_inst0_sel        = !ibuf_pop_inst1_valid;
assign merge_way_inst0_valid = bypass_way_inst0_valid && ibctrl_ibuf_merge_vld;
assign merge_way_inst0[31:0] = bypass_way_inst0_data[31:0];

assign merge_way_inst1_sel = !ibuf_pop_inst1_valid || !ibuf_pop_inst2_valid;
assign merge_way_inst1_valid = (ibuf_pop_inst1_valid) ? bypass_way_inst0_valid && ibctrl_ibuf_merge_vld
: bypass_way_inst1_valid && ibctrl_ibuf_merge_vld;
assign merge_way_inst1[31:0] = (ibuf_pop_inst1_valid) ? bypass_way_inst0_data[31:0]: bypass_way_inst1_data[31:0];

我们考虑如下几种情况:

  • 情况一:ibuf中的3条指令都有效

此时直接使用ibuf中pop出来的指令,无需进行merge

  • 情况二:ibuf中inst1或inst2无效,此时分为如下几种情况讨论:
    • inst1无效
    • inst2无效
    • inst1和inst2均无效

对于第一种子情况,得到的结果如下:

1
2
merge_way_inst0 = bypass_way_inst0_data;
merge_way_inst1 = bypass_way_inst1_data;

对于第二种子情况,得到的结果如下:

1
2
merge_way_inst0 // 不会被选择
merge_way_inst1 = bypass_way_inst0_data; // ibuf_pop_inst1_valid为1时,merge_way_inst1选择bypass的inst0

对于第三种子情况,得到的结果如下:

1
2
merge_way_inst0 = bypass_way_inst0_data;
merge_way_inst1 = bypass_way_inst1_data;

最终,从ibuf提供给ibdp的三条指令如下:

1
2
3
assign ibuf_ibdp_inst0[31:0]      = ibuf_pop_inst0_data[31:0];
assign ibuf_ibdp_inst1[31:0] = (merge_way_inst0_sel) ? merge_way_inst0[31:0] : ibuf_pop_inst1_data[31:0];
assign ibuf_ibdp_inst2[31:0] = (merge_way_inst1_sel) ? merge_way_inst1[31:0] : ibuf_pop_inst2_data[31:0];

这三条指令已经包含了merge bypass的过程,因此可以在ib阶段直接进行mux,选择需要送给decoder的三条指令即可:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
case({bypass_inst_vld,ibuf_inst_vld,lbuf_inst_vld})
// Pure bypass
3'b100: begin
inst0[31:0] = ibuf_ibdp_bypass_inst0[31:0];
inst1[31:0] = ibuf_ibdp_bypass_inst1[31:0];
inst2[31:0] = ibuf_ibdp_bypass_inst2[31:0];
// ibuf merged with bypass
3'b010: begin
inst0[31:0] = ibuf_ibdp_inst0[31:0];
inst1[31:0] = ibuf_ibdp_inst1[31:0];
inst2[31:0] = ibuf_ibdp_inst2[31:0];
// lbuf
3'b001: begin
inst0[31:0] = lbuf_ibdp_inst0[31:0];
inst1[31:0] = lbuf_ibdp_inst1[31:0];
inst2[31:0] = lbuf_ibdp_inst2[31:0];

Design: 4 normal inst bypass

为了增加ibuf的吞吐量,我们为ibuf设计了一个新的bypass路径,当fetch的指令为4条normal类型的指令,且ibuf为空的情况下,我们会将这四条normal指令直接送至IDU进行decoder处理,此时control signal新增一条

1
2
3
4
5
assign bypass_all_inst_vld  = ibctrl_ibdp_bypass_all_inst_vld;

assign bypass_inst_vld = ibctrl_ibdp_bypass_inst_vld & !bypass_all_inst_vld;
assign ibuf_inst_vld = ibctrl_ibdp_ibuf_inst_vld & !bypass_all_inst_vld;
assign lbuf_inst_vld = ibctrl_ibdp_lbuf_inst_vld & !bypass_all_inst_vld;

问题

  1. 在Bypass逻辑的判断过程中,为何是h0-h4而不是h0-h5?

因为根据h0-h4即可知道全部bypass inst的信息,首先如果h4是一条16bits的语句,那么bypass的指令必然没有h5;如果h4是32位指令,那么{h4, h5}可以拼成一条32位指令,根据h4即可获得h5的信息

  2. 在Merge的逻辑判断过程中,为何没有考虑ibuf的inst0无效的逻辑?
1
2
3
assign ibuf_ibdp_inst0[31:0]      = ibuf_pop_inst0_data[31:0];

assign ibuf_ibdp_inst1[31:0] = (merge_way_inst0_sel) ? merge_way_inst0[31:0] : ibuf_pop_inst1_data[31:0];

参考文献

0%