FPGA開発日記

カテゴリ別記事インデックス https://msyksphinz.github.io/github_pages , English Version https://fpgadevdiary.hatenadiary.com/

自作CPUにベクトル命令を追加する実装検討 (36. vadd.viの確認)

ベクトル命令のデバッグで面倒くさいのは、複数のレジスタへの書き込みが発生したときの対処だ。

LMUL>1の時に、1命令で書き込まれる複数のベクトル・レジスタをまとめて検証できるように環境を変更しよう。

void step_spike(long long rtl_time, long long rtl_pc,
                int rtl_priv, long long rtl_mstatus,
                int rtl_exception, int rtl_exception_cause,
                int rtl_cmt_id, int rtl_grp_id,
                int rtl_insn,
                int rtl_wr_valid, int rtl_wr_type, int rtl_wr_gpr_addr,
                int rtl_wr_gpr_rnid, long long rtl_wr_val,
                const uint8_t* rtl_wr_vec_val0,
                const uint8_t* rtl_wr_vec_val1,
                const uint8_t* rtl_wr_vec_val2,
                const uint8_t* rtl_wr_vec_val3,
                const uint8_t* rtl_wr_vec_val4,
                const uint8_t* rtl_wr_vec_val5,
                const uint8_t* rtl_wr_vec_val6,
                const uint8_t* rtl_wr_vec_val7)

LMULの最大値は8なので、1命令で書き込まれるベクトル・レジスタは最大で8本になる。そこで、RTLから常に8本分のレジスタの情報を取得し、そのうちLMULの数だけをISSの結果と比較する。

  } else if (rtl_wr_valid && (iss_wr_type == 2 || rtl_wr_type == 2)) { // VPR write
    const uint8_t* rtl_wr_vec_val[8] = {
      rtl_wr_vec_val0, rtl_wr_vec_val1, rtl_wr_vec_val2, rtl_wr_vec_val3,
      rtl_wr_vec_val4, rtl_wr_vec_val5, rtl_wr_vec_val6, rtl_wr_vec_val7
    };

    for (size_t lmul = 0; lmul < p->VU.vflmul; lmul++) {
      bool diff_found = false;
      for (int b = 0; b < g_rv_vlen / 8; b++) {
        if (rtl_wr_vec_val[lmul][b] != static_cast<uint8_t *>(p->VU.reg_file)[(rtl_wr_gpr_addr + lmul) * (g_rv_vlen/8) + b]) {
          diff_found = true;
        }
      }

RTL側の実装はこう。8つのベクトル・レジスタをまとめて、比較環境に渡していく。

// Byte-wise view of the vector register data for each commit slot:
//   [grp_idx]  : slot within the committed group (0 .. DISP_SIZE-1)
//   [lmul_idx] : offset from the committed destination rnid (0 .. 7)
//   [idx]      : byte index within one vector register (0 .. VLEN_W/8-1)
// Each [grp_idx][lmul_idx] slice is passed to step_spike as one of the eight
// vector value arguments.
byte w_physical_vec_data_rnid[scariv_pkg::DISP_SIZE-1: 0][8][riscv_vec_conf_pkg::VLEN_W/8-1: 0];

// Only elaborated when the vector extension is configured (VLEN_W != 0).
generate if (riscv_vec_conf_pkg::VLEN_W != 0) begin : vpu
  for (genvar grp_idx = 0; grp_idx < scariv_pkg::DISP_SIZE; grp_idx++) begin
    // Always export 8 registers; the compare side decides how many to check.
    for (genvar lmul_idx = 0; lmul_idx < 8; lmul_idx++) begin
      for (genvar idx = 0; idx < riscv_vec_conf_pkg::VLEN_W/8; idx++) begin : array_loop
        // Slice byte `idx` out of the register at (rnid + lmul_idx).
        // NOTE(review): assumes the registers of one group live at consecutive
        // indices starting at wr_reg.rnid, and that rnid + lmul_idx stays within
        // the bounds of w_physical_vec_data -- TODO confirm.
        assign w_physical_vec_data_rnid[grp_idx][lmul_idx][idx] = w_physical_vec_data[committed_rob_entry.inst[grp_idx].wr_reg.rnid + lmul_idx][idx*8 +: 8];
      end
    end
  end
end endgenerate
            step_spike ($time / 4, longint'(committed_rob_entry.inst[grp_idx].pc_addr),
                        int'(u_scariv_subsystem_wrapper.u_scariv_subsystem.u_tile.u_csu.u_scariv_csr.r_priv),
                        u_scariv_subsystem_wrapper.u_scariv_subsystem.u_tile.u_rob.w_sim_mstatus[u_scariv_subsystem_wrapper.u_scariv_subsystem.u_tile.u_rob.w_out_cmt_entry_id][grp_idx],
                        u_scariv_subsystem_wrapper.u_scariv_subsystem.u_tile.u_rob.w_valid_except_grp_id[grp_idx],
                        u_scariv_subsystem_wrapper.u_scariv_subsystem.u_tile.u_rob.w_except_type_selected,
                        u_scariv_subsystem_wrapper.u_scariv_subsystem.u_tile.u_rob.w_out_cmt_id,
                        1 << grp_idx,
                        committed_rob_entry.inst[grp_idx].rvc_inst_valid ? committed_rob_entry.inst[grp_idx].rvc_inst : committed_rob_entry.inst[grp_idx].inst,
                        committed_rob_entry.inst[grp_idx].wr_reg.valid,
                        committed_rob_entry.inst[grp_idx].wr_reg.typ,
                        committed_rob_entry.inst[grp_idx].wr_reg.regidx,
                        committed_rob_entry.inst[grp_idx].wr_reg.rnid,
                        committed_rob_entry.inst[grp_idx].wr_reg.typ == scariv_pkg::GPR ?
                        w_physical_int_data[committed_rob_entry.inst[grp_idx].wr_reg.rnid] :
                        w_physical_fp_data [committed_rob_entry.inst[grp_idx].wr_reg.rnid],
                        w_physical_vec_data_rnid[grp_idx][0], w_physical_vec_data_rnid[grp_idx][1], w_physical_vec_data_rnid[grp_idx][2], w_physical_vec_data_rnid[grp_idx][3],
                        w_physical_vec_data_rnid[grp_idx][4], w_physical_vec_data_rnid[grp_idx][5], w_physical_vec_data_rnid[grp_idx][6], w_physical_vec_data_rnid[grp_idx][7]
                        );

こんな感じで、ログが出力されるようになる。

246541 : 218 : PC=[0000000080000170] (U,33,01) 02050107 vle8.v  v2, (a0)
VPR[02](16) <= f50471f1_efe95ed0_caced9af_287d3124_e3c0d409_47cd3b17_7d115150_b866a93d_8133a51e_228fc5c6_a340fa12_4cb8dc1d_f3dccbfa_ddff6231_3ad64ccb_ddb8bebc_
VPR[03](17) <= 2cb424c2_0332619c_e7dac12c_7f8e087c_d9605c57_e4400392_b0fbd100_4aea0272_b7fb9e95_d8f62420_ea875530_9ccef534_dc6de5fa_4dac412d_766c35e5_cc4055f3_

上のvle8.vでは、v2とv3の両方の書き込み結果が比較されている。一方、以下のvadd.viのログでは、v2は一致しているものの、v3はRTL側がすべて0になっておりISSと一致しない。というわけで、問題はvadd.viの実装らしい。

246551 : 228 : PC=[0000000080000192] (U,39,01) 02403157 vadd.vi v2, v4, 0
VPR[02](20) <= f50471f1_efe95ed0_caced9af_287d3124_e3c0d409_47cd3b17_7d115150_b866a93d_8133a51e_228fc5c6_a340fa12_4cb8dc1d_f3dccbfa_ddff6231_3ad64ccb_ddb8bebc_
==========================================
Wrong VPR[03](20): 
ISS[03] = 2cb424c2_0332619c_e7dac12c_7f8e087c_d9605c57_e4400392_b0fbd100_4aea0272_b7fb9e95_d8f62420_ea875530_9ccef534_dc6de5fa_4dac412d_766c35e5_cc4055f3_
RTL[03] = 00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000_
          ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~   ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ 
===============================