diff --git a/fpga/emu/Makefile b/fpga/emu/Makefile index bf408b6f..e15d8005 100644 --- a/fpga/emu/Makefile +++ b/fpga/emu/Makefile @@ -25,6 +25,7 @@ nohype_dtb = $(build_dir)/c0.dtb nohype_dtb_hex_file = $(build_dir)/c0.dtb.txt # max_cycles = 40000000 max_cycles = 2209069 +#has_max_cycles = -m $(max_cycles) $(emu): $(original_emu) ln -sf $< $@ @@ -52,7 +53,7 @@ DEBUG_ARGS = +jtag_rbb_enable=1 -r 4040 endif run-emu: $(emu) $(emu_bin_hex_file) $(nohype_dtb_hex_file) - cd $(dir $(emu)) && LD_LIBRARY_PATH=$(RISCV)/lib time $< $(DEBUG_ARGS) $(SEED) +verbose -m $(max_cycles) . 3>&1 1>&2 2>&3 \ + cd $(dir $(emu)) && LD_LIBRARY_PATH=$(RISCV)/lib time $< $(DEBUG_ARGS) $(SEED) +verbose $(has_max_cycles) . 3>&1 1>&2 2>&3 \ | spike-dasm > $(dir $(emu))/emu.log dtb-clean: diff --git a/src/main/resources/vsrc/autocat.v b/src/main/resources/vsrc/autocat.v new file mode 100644 index 00000000..c4fc4799 --- /dev/null +++ b/src/main/resources/vsrc/autocat.v @@ -0,0 +1,160 @@ +module autocat +#( + parameter CACHE_ASSOCIATIVITY = 16, // currently only support 16-way fix configuration + parameter COUNTER_WIDTH = 32 +) +( + input clk_in, + input reset_in, + + // 2's power of reset limit + input [6 - 1 : 0] reset_bin_power, + input [32 - 1 : 0] allowed_gap, + input access_valid_in, + input [CACHE_ASSOCIATIVITY - 1 : 0] hit_vec_in, + output reg [CACHE_ASSOCIATIVITY - 1 : 0] suggested_waymask_out +); + +// overall request counter +reg access_valid_pre; +reg [CACHE_ASSOCIATIVITY - 1 : 0] hit_vec_pre; +reg [63 : 0] access_counter; + +wire request_limit = access_counter == 2 ** reset_bin_power; +wire reset_with_request_limit = reset_in | request_limit; + + +always@(posedge clk_in) +begin + if(reset_with_request_limit) + begin + access_counter <= 0; + access_valid_pre <= 0; + hit_vec_pre <= 0; + end + else + begin + access_valid_pre <= access_valid_in; + hit_vec_pre <= hit_vec_in; + + if(~access_valid_pre & access_valid_in) + access_counter <= access_counter + 1'b1; + end +end + +wire 
[CACHE_ASSOCIATIVITY * COUNTER_WIDTH - 1 : 0] counter_flatted; +wire [CACHE_ASSOCIATIVITY * COUNTER_WIDTH - 1 : 0] post_sort_counter_flatted; + +// hit counter array +genvar gen; +generate +for(gen = 0; gen < CACHE_ASSOCIATIVITY; gen = gen + 1) +begin + reg [COUNTER_WIDTH - 1 : 0] hit_counter; + assign counter_flatted[gen * COUNTER_WIDTH +: COUNTER_WIDTH] = hit_counter; + + always@(posedge clk_in) + begin + if(reset_with_request_limit) + begin + $fwrite(32'h80000002, "counter %d: %d\n", gen, hit_counter); + hit_counter <= 0; + end + else if(hit_vec_in[gen:gen]) // Per-way hit counting + begin + hit_counter <= hit_counter + 1'b1; + end + end +end +endgenerate + +// sort all the hit counters +bitonic_sorter +#( + .SINGLE_WAY_WIDTH_IN_BITS(COUNTER_WIDTH), + .NUM_WAY(CACHE_ASSOCIATIVITY) +) +sorter +( + .clk_in(clk_in), + .reset_in(reset_in), + .pre_sort_flatted_in(counter_flatted), + .post_sort_flatted_out(post_sort_counter_flatted) +); + +wire [CACHE_ASSOCIATIVITY * COUNTER_WIDTH - 1 : 0] post_calc_counter_flatted; + +integer index; +generate +for(gen = 0; gen < CACHE_ASSOCIATIVITY; gen = gen + 1) +begin + reg [COUNTER_WIDTH - 1:0] sum; + always@* + begin + sum = 'd0; + for (index = 0; index <= gen; index = index + 1) + begin + sum = sum + post_sort_counter_flatted[index * COUNTER_WIDTH +: COUNTER_WIDTH]; + end + end + assign post_calc_counter_flatted[gen * COUNTER_WIDTH +: COUNTER_WIDTH] = sum >> (reset_bin_power - 4); +end +endgenerate + +integer loop_index; +reg [CACHE_ASSOCIATIVITY - 1 : 0] first_best_partition; + +always@* +begin : Find + first_best_partition = 0; + + for(loop_index = 0; loop_index < CACHE_ASSOCIATIVITY; loop_index = loop_index + 1) + begin + if(post_calc_counter_flatted[loop_index * COUNTER_WIDTH +: COUNTER_WIDTH] + allowed_gap >= + post_calc_counter_flatted[(CACHE_ASSOCIATIVITY - 1) * COUNTER_WIDTH +: COUNTER_WIDTH]) + begin + /* verilator lint_off WIDTH */ + first_best_partition = loop_index; + /* verilator lint_on WIDTH */ + disable Find; 
//TO exit the loop + end + end +end + +always@(posedge clk_in) +begin : FindX + for(loop_index = 0; loop_index < CACHE_ASSOCIATIVITY; loop_index = loop_index + 1) + begin + if (reset_with_request_limit) begin + $fwrite(32'h80000002, "%d origin sorted %d, faltter %d\n", loop_index, + post_sort_counter_flatted[loop_index * COUNTER_WIDTH +: COUNTER_WIDTH], + post_calc_counter_flatted[loop_index * COUNTER_WIDTH +: COUNTER_WIDTH]); + end + end +end + +always@(posedge clk_in) +begin + if(reset_in) + begin + suggested_waymask_out <= 16'b1111_1111_1111_1111; + end + else if(request_limit) + begin + for(loop_index = 0; loop_index < CACHE_ASSOCIATIVITY; loop_index = loop_index + 1) + begin + /* verilator lint_off WIDTH */ + if(first_best_partition >= loop_index) + begin + /* verilator lint_on WIDTH */ + suggested_waymask_out[loop_index] <= 1'b1; + end + else + begin + suggested_waymask_out[loop_index] <= 1'b0; + $fwrite(32'h80000002, "suggest updating, loop_index %d\n", loop_index); + end + end + end +end +endmodule diff --git a/src/main/resources/vsrc/bitonic_sorter.v b/src/main/resources/vsrc/bitonic_sorter.v new file mode 100644 index 00000000..873238b1 --- /dev/null +++ b/src/main/resources/vsrc/bitonic_sorter.v @@ -0,0 +1,258 @@ +module bitonic_sorter +#( + parameter SINGLE_WAY_WIDTH_IN_BITS = 4, + parameter NUM_WAY = 16 // must be 16: the ten-cross network below is hard-wired for 16 ways +) +( + input clk_in, + input reset_in, + input [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] pre_sort_flatted_in, + output [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] post_sort_flatted_out +); + +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_0_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_1_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_2_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_3_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_4_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_5_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * 
NUM_WAY - 1 : 0] cross_6_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_7_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_8_flatted; +wire [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] cross_9_flatted; + +reg [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] stage_1_flatted; +reg [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] stage_2_flatted; +reg [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] stage_3_flatted; + +always@(posedge clk_in) +begin + if(reset_in) + begin + stage_1_flatted <= 0; + stage_2_flatted <= 0; + stage_3_flatted <= 0; + end + + else + begin + stage_1_flatted <= cross_3_flatted; /**/ + stage_2_flatted <= cross_6_flatted; /**/ + stage_3_flatted <= cross_9_flatted; /**/ + end +end + +assign post_sort_flatted_out = stage_3_flatted; + +// cross 0 /**/ +generate +genvar gen; +`define stride 2 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_0 /**/ + + bitonic_sorter_orange /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out( cross_0_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +// cross 1 /**/ +generate +`define stride 4 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_1 /**/ + + bitonic_sorter_orange /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (cross_0_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out(cross_1_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +// cross 2 /**/ +generate +`define stride 2 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_2 /**/ + + 
bitonic_sorter_pink /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (cross_1_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out(cross_2_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +// cross 3 /**/ +generate +`define stride 8 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_3 /**/ + + bitonic_sorter_orange /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (cross_2_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out(cross_3_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +// cross 4 /**/ +generate +`define stride 4 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_4 /**/ + + bitonic_sorter_pink /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (stage_1_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out(cross_4_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +// cross 5 /**/ +generate +`define stride 2 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_5 /**/ + + bitonic_sorter_pink /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (cross_4_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out(cross_5_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride 
+endgenerate + +// cross 6 /**/ +generate +`define stride 16 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_6 /**/ + + bitonic_sorter_orange /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (cross_5_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out(cross_6_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +// cross 7 /**/ +generate +`define stride 8 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_7 /**/ + + bitonic_sorter_pink /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (stage_2_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out(cross_7_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +// cross 8 /**/ +generate +`define stride 4 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_8 /**/ + + bitonic_sorter_pink /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (cross_7_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + .post_sort_flatted_out(cross_8_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +// cross 9 /**/ +generate +`define stride 2 /**/ +for(gen = 0; gen < NUM_WAY; gen = gen + `stride) +begin : cross_9 /**/ + + bitonic_sorter_pink /**/ + #( + .SINGLE_WAY_WIDTH_IN_BITS(SINGLE_WAY_WIDTH_IN_BITS), + .NUM_WAY(`stride) + ) + bitonic_sorter_cell + ( + /**/ + .pre_sort_flatted_in (cross_8_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]), + 
.post_sort_flatted_out(cross_9_flatted[gen * SINGLE_WAY_WIDTH_IN_BITS +: `stride * SINGLE_WAY_WIDTH_IN_BITS]) + ); +end +`undef stride +endgenerate + +endmodule \ No newline at end of file diff --git a/src/main/resources/vsrc/bitonic_sorter_orange.v b/src/main/resources/vsrc/bitonic_sorter_orange.v new file mode 100644 index 00000000..cea1f17a --- /dev/null +++ b/src/main/resources/vsrc/bitonic_sorter_orange.v @@ -0,0 +1,33 @@ +module bitonic_sorter_orange +#( + parameter SINGLE_WAY_WIDTH_IN_BITS = 32, + parameter NUM_WAY = 16 // must be a power of 2 +) +( + input [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] pre_sort_flatted_in, + output [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] post_sort_flatted_out +); + +generate +genvar gen; + +for(gen = 0; gen < NUM_WAY / 2; gen = gen + 1) +begin + assign post_sort_flatted_out[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] = + + (pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] < + pre_sort_flatted_in[(NUM_WAY - gen - 1) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS]) ? + pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] : + pre_sort_flatted_in[(NUM_WAY - gen - 1) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS]; + + assign post_sort_flatted_out[(NUM_WAY - gen - 1) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] = + + (pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] > + pre_sort_flatted_in[(NUM_WAY - gen - 1) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS]) ? 
+ pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] : + pre_sort_flatted_in[(NUM_WAY - gen - 1) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS]; +end + +endgenerate + +endmodule \ No newline at end of file diff --git a/src/main/resources/vsrc/bitonic_sorter_pink.v b/src/main/resources/vsrc/bitonic_sorter_pink.v new file mode 100644 index 00000000..c3795152 --- /dev/null +++ b/src/main/resources/vsrc/bitonic_sorter_pink.v @@ -0,0 +1,35 @@ +module bitonic_sorter_pink +#( + parameter SINGLE_WAY_WIDTH_IN_BITS = 32, + parameter NUM_WAY = 16 // must be a power of 2 +) +( + input [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] pre_sort_flatted_in, + output [SINGLE_WAY_WIDTH_IN_BITS * NUM_WAY - 1 : 0] post_sort_flatted_out +); + +parameter NUM_WAY_HALF = NUM_WAY / 2; + +generate +genvar gen; + +for(gen = 0; gen < NUM_WAY_HALF; gen = gen + 1) +begin + assign post_sort_flatted_out[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] = + + (pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] < + pre_sort_flatted_in[(gen + NUM_WAY_HALF) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS]) ? + pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] : + pre_sort_flatted_in[(gen + NUM_WAY_HALF) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS]; + + assign post_sort_flatted_out[(gen + NUM_WAY_HALF) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] = + + (pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] > + pre_sort_flatted_in[(gen + NUM_WAY_HALF) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS]) ? 
+ pre_sort_flatted_in[gen * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS] : + pre_sort_flatted_in[(gen + NUM_WAY_HALF) * SINGLE_WAY_WIDTH_IN_BITS +: SINGLE_WAY_WIDTH_IN_BITS]; +end + +endgenerate + +endmodule \ No newline at end of file diff --git a/src/main/scala/devices/debug/Debug.scala b/src/main/scala/devices/debug/Debug.scala index 4928d6af..f22a344b 100644 --- a/src/main/scala/devices/debug/Debug.scala +++ b/src/main/scala/devices/debug/Debug.scala @@ -20,7 +20,7 @@ import freechips.rocketchip.subsystem.{NL2CacheWays, NTiles} import freechips.rocketchip.system.UseEmu import freechips.rocketchip.tile.XLen import ila.BoomCSRILABundle -import lvna.{ControlPlaneIO, HasControlPlaneParameters} +import lvna.{AutoCatConstants, ControlPlaneIO, HasControlPlaneParameters} import freechips.rocketchip.diplomaticobjectmodel.DiplomaticObjectModelAddressing import freechips.rocketchip.diplomaticobjectmodel.model._ @@ -1107,6 +1107,18 @@ class TLDebugModuleInner(device: Device, getNComponents: () => Int, beatBytes: I (CORE_CSR_PENDING_INT_HI << 2) -> Seq(RegField.r(32, io.zid(io.cp.hartSel).reg_mip(63, 32), RegFieldDesc("1", "1"))), (CORE_CSR_PENDING_INT_LO << 2) -> Seq(RegField.r(32, io.zid(io.cp.hartSel).reg_mip(31, 0), RegFieldDesc("1", "1"))), + (CP_L2_STAT_RESET<<2) -> Seq(RWNotify(1, WireInit(false.B), io.cp.updateData, WireInit(false.B), io.cp.l2_stat_reset_wen)), + (CP_L2_REQ_EN << 2) -> Seq(RWNotify(1, WireInit(0.U), WireInit(0.U), io.cp.l2_miss_en, WireInit(false.B))), + (CP_L2_REQ_MISS << 2) -> Seq(RegField.r(32, io.cp.l2_req_miss)), + (CP_L2_REQ_TOTAL<< 2) -> Seq(RegField.r(32, io.cp.l2_req_total)), + + (CP_AUTOCAT_EN << 2) -> Seq(RWNotify(1, io.cp.autocat_en, io.cp.updateData, WireInit(false.B), io.cp.autocat_wen)), + (CP_AUTOCAT_RESET_BIN_POWER << 2) -> Seq(RWNotify(AutoCatConstants.resetBinPowerWidth, io.cp.autocat_reset_bin_power, io.cp.updateData, WireInit(false.B), io.cp.autocat_reset_bin_power_wen)), + (CP_AUTOCAT_SUGGEST_WAYMASK << 2) -> 
Seq(RegField.r(AutoCatConstants.nrL2Ways, io.cp.autocat_suggested_waymask)), + (CP_AUTOCAT_WATCHING_DSID << 2) -> Seq(RWNotify(dsidWidth, io.cp.autocat_watching_dsid, io.cp.updateData, WireInit(false.B), io.cp.autocat_watching_dsid_wen)), + (CP_AUTOCAT_SET << 2) -> Seq(RWNotify(32, io.cp.autocat_set, io.cp.updateData, WireInit(false.B), io.cp.autocat_set_wen)), + (CP_AUTOCAT_GAP << 2) -> Seq(RWNotify(32, io.cp.autocat_gap, io.cp.updateData, WireInit(false.B), io.cp.autocat_gap_wen)), + (CP_DSID_SEL << 2) -> Seq(RWNotify(dsidWidth, io.cp.dsidSel, io.cp.updateData, WireInit(false.B), io.cp.dsidSelWen, None)) ) diff --git a/src/main/scala/devices/debug/dm_registers.scala b/src/main/scala/devices/debug/dm_registers.scala index dd679d31..f6ba887e 100644 --- a/src/main/scala/devices/debug/dm_registers.scala +++ b/src/main/scala/devices/debug/dm_registers.scala @@ -386,6 +386,18 @@ object DMI_RegAddrs { def CP_TIMER_HI = 0x57 + def CP_AUTOCAT_EN = 0x58 + + def CP_AUTOCAT_RESET_BIN_POWER = 0x59 + + def CP_AUTOCAT_SUGGEST_WAYMASK = 0x5a + + def CP_AUTOCAT_WATCHING_DSID = 0x5b + + def CP_AUTOCAT_SET = 0x5c + + def CP_AUTOCAT_GAP = 0x5d + def CORE_PC_HI = 0x70 def CORE_PC_LO = 0x71 @@ -410,6 +422,15 @@ object DMI_RegAddrs { def CP_HART_ID = 0x7b + + /* L2 Miss */ + def CP_L2_REQ_MISS = 0x7c + + def CP_L2_REQ_TOTAL = 0x7d + + def CP_L2_STAT_RESET = 0x7e + + def CP_L2_REQ_EN = 0x7f } class DMSTATUSFields extends Bundle { diff --git a/src/main/scala/l2cache/TLSimpleL2.scala b/src/main/scala/l2cache/TLSimpleL2.scala index 26368d20..664d970d 100644 --- a/src/main/scala/l2cache/TLSimpleL2.scala +++ b/src/main/scala/l2cache/TLSimpleL2.scala @@ -8,12 +8,14 @@ package freechips.rocketchip.subsystem import Chisel._ +import chisel3.core.WireInit import chisel3.util.IrrevocableIO import freechips.rocketchip.config.{Field, Parameters} import freechips.rocketchip.diplomacy._ import freechips.rocketchip.util._ import freechips.rocketchip.tilelink._ -import lvna.{HasControlPlaneParameters, 
CPToL2CacheIO} +import lvna.{AutoCatIOInternal, CPToL2CacheIO, HasControlPlaneParameters} + import scala.math._ case class TLL2CacheParams( @@ -43,6 +45,8 @@ with HasControlPlaneParameters val nSets = p(NL2CacheCapacity) * 1024 / 64 / nWays println(s"nSets = $nSets") val cp = IO(new CPToL2CacheIO().flip()) + val autocat = IO(new (AutoCatIOInternal).flip()) + (node.in zip node.out) foreach { case ((in, edgeIn), (out, edgeOut)) => require(isPow2(nSets)) require(isPow2(nWays)) @@ -329,8 +333,23 @@ with HasControlPlaneParameters hit_way := Bits(0) (0 until nWays).foreach(i => when (tag_match_way(i)) { hit_way := Bits(i) }) + val autocat_is_sampling = WireInit((idx & cp.autocat_set) === cp.autocat_set) + autocat.access_valid_in := dsid === cp.autocat_watching_dsid && autocat_is_sampling && state === s_tag_read + autocat.hit_vec_in := (0 until nWays).map(tag_match_way(_)).asUInt + cp.dsid := dsid - val curr_mask = cp.waymask + val curr_mask = Wire(UInt(nWays.W)) + curr_mask := cp.waymask // default + when (cp.autocat_en) { + when (!autocat_is_sampling) { // not sampling, under control + when (dsid === cp.autocat_watching_dsid) { + curr_mask := cp.waymask & autocat.suggested_waymask_out + }.otherwise { + curr_mask := cp.waymask & (~autocat.suggested_waymask_out).asUInt + } + } + } + val repl_way = Mux((curr_state_reg & curr_mask).orR, PriorityEncoder(curr_state_reg & curr_mask), Mux(curr_mask.orR, PriorityEncoder(curr_mask), UInt(0))) val repl_dsid = set_dsids_reg(repl_way) @@ -342,6 +361,11 @@ with HasControlPlaneParameters log("req_dsid %d occ %d repl_dsid %d occ %d way %d", dsid, requester_occupacy, repl_dsid, victim_occupacy, repl_way) } + val pre_state = RegNext(state) + when (state === s_tag_read || pre_state === s_tag_read) { + log("suggested_waymask_out %b\n", autocat.suggested_waymask_out) + } + cp.capacity := dsid_occupacy(cp.capacity_dsid) @@ -357,6 +381,51 @@ with HasControlPlaneParameters val need_data_read = read_hit || write_hit || read_miss_writeback || 
write_miss_writeback + class MissStat() extends Bundle { + val miss = UInt(32.W) + val total = UInt(32.W) + } + + val miss_stat_array = DescribedSRAM( + name = "miss_stat", + desc = "L2 cache miss stat", + size = 1 << dsidWidth, + data = new MissStat() + ) + + val miss_stat_origin = miss_stat_array.read(dsid, state === s_tag_read_resp) + + val miss_stat_update = Wire(new MissStat()) + miss_stat_update.miss := miss_stat_origin.miss + Mux(hit, 0.U, 1.U) + miss_stat_update.total := miss_stat_origin.total + 1.U + + val miss_stat_query = miss_stat_array.read(cp.capacity_dsid, cp.req_miss_en) + cp.req_miss := miss_stat_query.miss + cp.req_total := miss_stat_query.total + when (RegNext(cp.req_miss_en)) { + log("query miss stat dsid %d miss %d total %d", cp.capacity_dsid, miss_stat_query.miss, miss_stat_query.total) + } + + val reset_miss_stat = cp.stat_reset || reset + val miss_stat_iter = Reg(UInt(dsidWidth.W)) + when (reset_miss_stat) { + miss_stat_iter := miss_stat_iter + 1.U + }.otherwise{ + miss_stat_iter := 0.U + } + val miss_stat_init = Wire(new MissStat()) + miss_stat_init.miss := 0.U + miss_stat_init.total := 0.U + val miss_stat_idx = Mux(reset_miss_stat, miss_stat_iter, dsid) + val miss_stat_data = Mux(reset_miss_stat, miss_stat_init, miss_stat_update) + when (state === s_tag_read || reset_miss_stat) { + when (reset_miss_stat) { + log("iter %d, miss_stat_data %x", miss_stat_idx, miss_stat_data.asUInt) + } + miss_stat_array.write(miss_stat_idx, miss_stat_data) + } + + when (state === s_tag_read) { log("hit: %d idx: %x curr_state_reg: %x waymask: %x hit_way: %x repl_way: %x", hit, idx, curr_state_reg, curr_mask, hit_way, repl_way) when (ren) { diff --git a/src/main/scala/lvna/AutoCat.scala b/src/main/scala/lvna/AutoCat.scala new file mode 100644 index 00000000..67622f59 --- /dev/null +++ b/src/main/scala/lvna/AutoCat.scala @@ -0,0 +1,26 @@ +package lvna + +import chisel3._ + +object AutoCatConstants { + val resetBinPowerWidth = 6 // 2^0 ~ 2^63 is enough. 
+ val nrL2Ways = 16 // Currently fixed. +} + +class AutoCatIOInternal extends Bundle { + import AutoCatConstants._ + + val access_valid_in = Input(Bool()) + // Power-of-two exponent of the reset limit, i.e. the suggested waymask is updated every 2^reset_bin_power sampled accesses. + val reset_bin_power = Input(UInt(resetBinPowerWidth.W)) + val allowed_gap = Input(UInt(32.W)) + val hit_vec_in = Input(UInt(nrL2Ways.W)) + val suggested_waymask_out = Output(UInt(nrL2Ways.W)) +} + +class autocat extends BlackBox { + val io = IO(new AutoCatIOInternal { + val clk_in = Input(Clock()) + val reset_in = Input(Bool()) + }) +} diff --git a/src/main/scala/lvna/LvNAConfigs.scala b/src/main/scala/lvna/LvNAConfigs.scala index b5be7ce9..bf3f56b6 100644 --- a/src/main/scala/lvna/LvNAConfigs.scala +++ b/src/main/scala/lvna/LvNAConfigs.scala @@ -35,7 +35,7 @@ class LvNABoomConfig extends Config( ++ new WithEmu ++ new WithBoom ++ new WithRationalRocketTiles - ++ new WithExtMemSize(0x8000000L) // 32MB + ++ new WithExtMemSize(0x8000000L * 2) // 256MB ++ new WithNoMMIOPort ++ new WithJtagDTM ++ new WithDebugSBA @@ -119,7 +119,7 @@ class LvNAConfigemu extends Config( new WithoutFPU ++ new WithNonblockingL1(8) ++ new WithNL2CacheCapacity(256) - ++ new WithNBigCores(1) + ++ new WithNBigCores(2) ++ new WithEmu ++ new WithRationalRocketTiles ++ new WithExtMemSize(0x8000000L) // 32MB diff --git a/src/main/scala/lvna/controlplane/ControlPlane.scala b/src/main/scala/lvna/controlplane/ControlPlane.scala index 7eafd6e3..fe2f9881 100644 --- a/src/main/scala/lvna/controlplane/ControlPlane.scala +++ b/src/main/scala/lvna/controlplane/ControlPlane.scala @@ -3,6 +3,7 @@ package lvna import Chisel._ import boom.system.{HasBoomTiles, HasBoomTilesModuleImp} import chisel3.core.{IO, Input, Output, WireInit} +import chisel3.experimental.chiselName import freechips.rocketchip.config.{Field, Parameters} import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp, SimpleDevice} import freechips.rocketchip.subsystem._ @@ -13,6 +14,7 
@@ import freechips.rocketchip.util.{AsyncQueue, GTimer} import freechips.rocketchip.devices.debug.DMI_RegAddrs._ import freechips.rocketchip.devices.debug.RWNotify import freechips.rocketchip.regmapper.{RegField, RegReadFn, RegWriteFn} +import chisel3.util.Fill object log2Safe { def apply(n: BigInt): Int = { @@ -90,6 +92,24 @@ class ControlPlaneIO(implicit val p: Parameters) extends Bundle with HasControlP val PC = UInt(OUTPUT, p(XLen)) val assertDebugInt = Bool(INPUT) + + val l2_miss_en = Bool(INPUT) + val l2_req_miss = UInt(OUTPUT, 32) + val l2_req_total = UInt(OUTPUT, 32) + val l2_stat_reset_wen = Bool(INPUT) + + import AutoCatConstants._ + val autocat_en = Bool(OUTPUT) + val autocat_wen = Bool(INPUT) + val autocat_reset_bin_power = UInt(OUTPUT, resetBinPowerWidth) + val autocat_reset_bin_power_wen = Bool(INPUT) + val autocat_suggested_waymask = UInt(OUTPUT, nrL2Ways) + val autocat_watching_dsid = UInt(OUTPUT, dsidWidth) + val autocat_watching_dsid_wen = Bool(INPUT) + val autocat_set = UInt(OUTPUT, 32) + val autocat_set_wen = Bool(INPUT) + val autocat_gap = UInt(OUTPUT, 32) + val autocat_gap_wen = Bool(INPUT) } /* From ControlPlane's View */ @@ -98,6 +118,15 @@ class CPToL2CacheIO(implicit val p: Parameters) extends Bundle with HasControlPl val dsid = Input(UInt(dsidWidth.W)) // DSID from requests L2 cache received val capacity = Input(UInt(cacheCapacityWidth.W)) // Count on way numbers val capacity_dsid = Output(UInt(dsidWidth.W)) // Capacity query dsid + val req_miss = Input(UInt(32.W)) + val req_miss_en = Output(Bool()) + val req_total = Input(UInt(32.W)) + val stat_reset = Output(Bool()) + + val autocat_watching_dsid = Output(UInt(dsidWidth.W)) + val autocat_suggested_waymask = Input(UInt(p(NL2CacheWays).W)) + val autocat_set = Output(UInt(32.W)) + val autocat_en = Output(Bool()) } class BucketState(implicit val p: Parameters) extends Bundle with HasControlPlaneParameters with HasTokenBucketParameters { @@ -197,6 +226,7 @@ with HasTokenBucketParameters val 
mem_part_en = Bool().asInput + val distinct_hart_dsid_en = Bool().asInput + val progHartIds = Vec(nTiles, UInt(log2Safe(nTiles).W)).asOutput + val autocat_watching_change = Bool().asOutput }) val hartSel = RegInit(0.U(ldomDSidWidth.W)) @@ -281,6 +311,65 @@ with HasTokenBucketParameters waymasks(currDsid) := io.cp.updateData } + // Miss + val l2_stat_reset = RegInit(false.B) + val sbus_l2_miss_en = Wire(Bool()) + val miss_en = sbus_l2_miss_en || io.cp.l2_miss_en + + io.l2.req_miss_en := miss_en + io.l2.stat_reset := l2_stat_reset + + when (io.cp.l2_stat_reset_wen) { + l2_stat_reset := io.cp.updateData + } + + io.cp.l2_req_miss := RegEnable(io.l2.req_miss, RegNext(RegNext(miss_en))) // Wait miss_stat sram addr changes + io.cp.l2_req_total := RegEnable(io.l2.req_total, RegNext(RegNext(miss_en))) // Wait miss_stat sram addr changes + + + // Autocat + import AutoCatConstants._ + + @chiselName + def cpRegTmpl(init: UInt, enable: Bool): UInt = RegEnable(io.cp.updateData, init, enable) + + io.cp.autocat_suggested_waymask := io.l2.autocat_suggested_waymask + + /** + * AutoCat Enable + */ + val autocat_en_reg = cpRegTmpl(false.B, io.cp.autocat_wen) + io.cp.autocat_en := autocat_en_reg + io.l2.autocat_en := autocat_en_reg + + /** + * Autocat refresh interval: the waymask is re-evaluated every 2**(value) sampled accesses. + */ + val autocat_reset_bin_power_reg = cpRegTmpl(10.U(resetBinPowerWidth.W), io.cp.autocat_reset_bin_power_wen) + io.cp.autocat_reset_bin_power := autocat_reset_bin_power_reg + + /** + * The label (DSID) autocat is serving. + * Updating this field will refresh autocat's way hit counters. + */ + val autocat_watching_dsid = cpRegTmpl(0.U(dsidWidth.W), io.cp.autocat_watching_dsid_wen) + io.cp.autocat_watching_dsid := autocat_watching_dsid + io.l2.autocat_watching_dsid := autocat_watching_dsid + val watch_change = WireInit(false.B) + io.autocat_watching_change := watch_change || io.cp.autocat_watching_dsid_wen + + /** + * The allowed hit-count gap between the currently allocated ways and full occupancy. 
+ */ + val autocat_gap = cpRegTmpl(500.U(32.W), io.cp.autocat_gap_wen) + io.cp.autocat_gap := autocat_gap + + private val l2SetCnt: BigInt = p(NL2CacheCapacity) * 1024 / p(NL2CacheWays) / p(CacheBlockBytes) + private val l2SetBits = l2SetCnt.bitLength + println(s"cp: l2SetCnt $l2SetCnt, l2SetBits $l2SetBits") + val autocat_set_sampling_mask = cpRegTmpl(0xf.U(l2SetBits.W), io.cp.autocat_set_wen) + io.cp.autocat_set := autocat_set_sampling_mask + io.l2.autocat_set := autocat_set_sampling_mask // TL node def offset(addr: Int): Int = { (addr - CP_HART_DSID) << 2 } @@ -320,6 +409,16 @@ with HasTokenBucketParameters offset(CP_HART_ID) -> Seq(RegField(32, progHartIds(hartSel))), offset(CP_TIMER_LO) -> Seq(RegField(32, timestamp_buffered(31, 0), ())), offset(CP_TIMER_HI) -> Seq(RegField(32, timestamp_buffered(63, 32), ())), + offset(CP_L2_REQ_EN) -> Seq(RWNotify(1, WireInit(false.B), WireInit(false.B), sbus_l2_miss_en, Wire(Bool()))), + offset(CP_L2_REQ_MISS) -> Seq(RegField.r(32, io.l2.req_miss)), + offset(CP_L2_REQ_TOTAL)-> Seq(RegField.r(32, io.l2.req_total)), + offset(CP_L2_STAT_RESET)->Seq(RWNotify(1, WireInit(false.B), l2_stat_reset, Wire(Bool()), WireInit(false.B))), + offset(CP_AUTOCAT_EN) -> Seq(RegField(1, autocat_en_reg)), + offset(CP_AUTOCAT_RESET_BIN_POWER) -> Seq(RegField(resetBinPowerWidth, autocat_reset_bin_power_reg)), + offset(CP_AUTOCAT_SUGGEST_WAYMASK) -> Seq(RegField.r(nrL2Ways, io.l2.autocat_suggested_waymask)), + offset(CP_AUTOCAT_WATCHING_DSID) -> Seq(RWNotify(dsidWidth, autocat_watching_dsid, autocat_watching_dsid, WireInit(false.B), watch_change)), + offset(CP_AUTOCAT_SET) -> Seq(RegField(32, autocat_set_sampling_mask)), + offset(CP_AUTOCAT_GAP) -> Seq(RegField(32, autocat_gap)), ) @@ -606,8 +705,29 @@ trait BindL2WayMask extends HasRocketTiles { trait BindL2WayMaskModuleImp extends HasRocketTilesModuleImp { val outer: BindL2WayMask - if (p(NL2CacheCapacity) != 0) { outer._l2.module.cp <> outer._cp.module.io.l2 + val cat = Module(new autocat) + 
val cpio = outer._cp.module.io + val l2 = outer._l2.module + + cat.io.clk_in := clock + cat.io.reset_in := reset || cpio.autocat_watching_change + cat.io.access_valid_in := cpio.cp.autocat_en && l2.autocat.access_valid_in + cat.io.reset_bin_power := cpio.cp.autocat_reset_bin_power + cat.io.allowed_gap := cpio.cp.autocat_gap + cat.io.hit_vec_in := l2.autocat.hit_vec_in + cpio.l2.autocat_suggested_waymask := cat.io.suggested_waymask_out + l2.autocat.suggested_waymask_out := + Mux(cpio.cp.autocat_en, cat.io.suggested_waymask_out, Fill(p(NL2CacheWays), 1.U)) + + when (cat.io.access_valid_in) { + printf("hit_vec_in %b\n", cat.io.hit_vec_in) + } + + val pre_way = RegNext(cat.io.suggested_waymask_out) + when (pre_way =/= cat.io.suggested_waymask_out) { + printf("suggested waymask %b\n", cat.io.suggested_waymask_out) + } } } diff --git a/src/main/scala/lvna/controlplane/TokenBucketNode.scala b/src/main/scala/lvna/controlplane/TokenBucketNode.scala index ffa42399..1b33d238 100644 --- a/src/main/scala/lvna/controlplane/TokenBucketNode.scala +++ b/src/main/scala/lvna/controlplane/TokenBucketNode.scala @@ -27,10 +27,8 @@ class TokenBucketNodeImp(outer: TokenBucketNode) extends LazyModuleImp(outer) { bucketIO.fire := out.a.ready && out.a.valid && !phy bucketIO.size := (1.U << in.a.bits.size) >> 6 -// out.a.valid := in.a.valid && (phy || bucketIO.enable) -// in.a.ready := out.a.ready && (phy || bucketIO.enable) - out.a.valid := in.a.valid - in.a.ready := out.a.ready + out.a.valid := in.a.valid && (phy || bucketIO.enable) + in.a.ready := out.a.ready && (phy || bucketIO.enable) if (DEBUG_TB_FETCH) { when(in.a.valid && !out.a.valid) { printf(p"request blocked by token bucket: 0x${Hexadecimal(in.a.bits.address)}\n") diff --git a/src/main/scala/system/TestHarness.scala b/src/main/scala/system/TestHarness.scala index 36120033..e309242d 100644 --- a/src/main/scala/system/TestHarness.scala +++ b/src/main/scala/system/TestHarness.scala @@ -101,6 +101,9 @@ class TestHarness()(implicit p: 
Parameters) extends Module { dut.coreclk := dut.clock } + dut.mem_part_en := true.B + dut.distinct_hart_dsid_en := true.B + dut.dontTouchPorts() dut.tieOffInterrupts() dut.connectSimAXIMem()