From 442f51d5a46884821d3176e176e90c53279da34d Mon Sep 17 00:00:00 2001 From: Liphen Date: Sat, 23 Dec 2023 11:47:35 +0800 Subject: [PATCH] =?UTF-8?q?fix(icache):=20=E4=BF=AE=E5=A4=8D=E4=B9=8B?= =?UTF-8?q?=E5=89=8Dicache=E9=81=97=E7=95=99=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chisel/playground/src/CpuConfig.scala | 15 +- chisel/playground/src/cache/Cache.scala | 4 +- chisel/playground/src/cache/DCache.scala | 22 +-- chisel/playground/src/cache/ICache.scala | 174 +++++++++++------- .../src/cache/memory/PortDefinitions.scala | 4 +- .../src/cache/memory/SimpleDualPortRam.scala | 25 ++- chisel/playground/src/defines/Const.scala | 2 +- 7 files changed, 142 insertions(+), 104 deletions(-) diff --git a/chisel/playground/src/CpuConfig.scala b/chisel/playground/src/CpuConfig.scala index c430d51..8a3eec3 100644 --- a/chisel/playground/src/CpuConfig.scala +++ b/chisel/playground/src/CpuConfig.scala @@ -28,23 +28,22 @@ case class BranchPredictorConfig( val phtDepth: Int = 6) case class CacheConfig( - nway: Int = 2, // 路数 - nbank: Int = 8, // 每个项目中的bank数 - nindex: Int, // 每路的项目数 - bankWidth: Int // 每个bank中的字节数 + nway: Int = 2, // 路数 + nbank: Int, // 每个项目中的bank数 + nindex: Int, // 每路的项目数 + bytesPerBank: Int // 每个bank中的字节数 ) { val config = CpuConfig() val indexWidth = log2Ceil(nindex) // index的位宽 val bankIndexWidth = log2Ceil(nbank) - val bankOffsetWidth = log2Ceil(bankWidth) + val bankOffsetWidth = log2Ceil(bytesPerBank) val offsetWidth = bankIndexWidth + bankOffsetWidth // offset的位宽 val tagWidth = 32 - indexWidth - offsetWidth // tag的位宽 - val bankWidthBits = bankWidth * 8 - val burstSize = 16 + val bitsPerBank = bytesPerBank * 8 require(isPow2(nindex)) require(isPow2(nway)) require(isPow2(nbank)) - require(isPow2(bankWidth)) + require(isPow2(bytesPerBank)) require( tagWidth + indexWidth + bankIndexWidth + bankOffsetWidth == 32, "basic request calculation" diff --git a/chisel/playground/src/cache/Cache.scala b/chisel/playground/src/cache/Cache.scala index c50a199..76e6d96 100644 --- a/chisel/playground/src/cache/Cache.scala +++ b/chisel/playground/src/cache/Cache.scala @@ -15,9 +15,9 @@ class Cache(implicit config: CpuConfig) extends Module { }) implicit val iCacheConfig = - CacheConfig(nindex = 64, nbank = 4, bankWidth = (32 / 8) * 4) // 每个 bank 存 4 条 32 bit 指令 + CacheConfig(nindex = 64, nbank = 4, bytesPerBank = (32 / 8) * config.instFetchNum) // 每个 bank 存 2 条 32 bit 指令 implicit val dCacheConfig = - CacheConfig(nindex = 128, bankWidth = XLEN / 8) // 每个 bank 存 1 条 XLEN bit 数据 + CacheConfig(nindex = 128, nbank = 8, bytesPerBank = XLEN / 8) // 每个 bank 存 1 条 XLEN bit 数据 val icache = Module(new ICache(iCacheConfig)) val dcache = Module(new DCache(dCacheConfig)) diff --git a/chisel/playground/src/cache/DCache.scala b/chisel/playground/src/cache/DCache.scala index f683904..e86bffd 100644 --- a/chisel/playground/src/cache/DCache.scala +++ b/chisel/playground/src/cache/DCache.scala @@ -16,12 +16,12 @@ class WriteBufferUnit extends Bundle { } class DCache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Module { - val nway: Int = cacheConfig.nway - val nset: Int = cacheConfig.nindex - val nbank: Int = cacheConfig.nbank - val bankWidthBits: Int = cacheConfig.bankWidthBits - val tagWidth: Int = cacheConfig.tagWidth - val burstSize: Int = cacheConfig.burstSize + val nway: Int = cacheConfig.nway + val nindex: Int = cacheConfig.nindex + val nbank: Int = cacheConfig.nbank + val bitsPerBank: Int = cacheConfig.bitsPerBank + val tagWidth: Int = cacheConfig.tagWidth + val burstSize: Int = 16 val io = IO(new Bundle { val cpu = Flipped(new Cache_DCache()) @@ -37,9 +37,9 @@ class DCache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Modul io.cpu.tlb.dcache_is_idle := state === s_idle // * valid dirty * // - val valid = RegInit(VecInit(Seq.fill(nset)(VecInit(Seq.fill(nway)(false.B))))) - val dirty = RegInit(VecInit(Seq.fill(nset)(VecInit(Seq.fill(nway)(false.B))))) - val lru = RegInit(VecInit(Seq.fill(nset)(0.U(1.W)))) + val valid = RegInit(VecInit(Seq.fill(nindex)(VecInit(Seq.fill(nway)(false.B))))) + val dirty = RegInit(VecInit(Seq.fill(nindex)(VecInit(Seq.fill(nway)(false.B))))) + val lru = RegInit(VecInit(Seq.fill(nindex)(0.U(1.W)))) val write_fifo = Module(new Queue(new WriteBufferUnit(), 4)) @@ -115,7 +115,7 @@ class DCache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Modul // bank tagv ram for { i <- 0 until nway } { - val bank_ram = Module(new SimpleDualPortRam(nset * nbank, bankWidthBits, byteAddressable = true)) + val bank_ram = Module(new SimpleDualPortRam(nindex * nbank, bitsPerBank, byteAddressable = true)) bank_ram.io.ren := true.B bank_ram.io.raddr := data_raddr data(i) := bank_ram.io.rdata @@ -125,7 +125,7 @@ class DCache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Modul bank_ram.io.wdata := data_wdata bank_ram.io.wstrb := data_wstrb(i) - val tag_ram = Module(new LUTRam(nset, tagWidth)) + val tag_ram = Module(new LUTRam(nindex, tagWidth)) tag_ram.io.raddr := tag_raddr tag(i) := tag_ram.io.rdata diff --git a/chisel/playground/src/cache/ICache.scala b/chisel/playground/src/cache/ICache.scala index 5739d06..da24e57 100644 --- a/chisel/playground/src/cache/ICache.scala +++ b/chisel/playground/src/cache/ICache.scala @@ -10,35 +10,37 @@ import cpu.defines.Const._ class ICache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Module { val nway: Int = cacheConfig.nway - val nset: Int = cacheConfig.nindex + val nindex: Int = cacheConfig.nindex val nbank: Int = cacheConfig.nbank - val ninst: Int = config.instFetchNum + val instFetchNum: Int = config.instFetchNum val bankOffsetWidth: Int = cacheConfig.bankOffsetWidth - val bankWidth: Int = cacheConfig.bankWidth + val bankIndexWidth: Int = cacheConfig.offsetWidth - bankOffsetWidth + val bytesPerBank: Int = cacheConfig.bytesPerBank val tagWidth: Int = cacheConfig.tagWidth val indexWidth: Int = cacheConfig.indexWidth val offsetWidth: Int = cacheConfig.offsetWidth + val bitsPerBank: Int = cacheConfig.bitsPerBank val io = IO(new Bundle { val cpu = Flipped(new Cache_ICache()) val axi = new ICache_AXIInterface() }) - require(isPow2(ninst), "ninst must be power of 2") - // * addr organization * // - // ====================================== - // | tag | index |offset| - // |31 12|11 6|5 0| - // ====================================== - // | offset | - // | bank index | bank offset | - // | 5 4 | 3 2 | - // ============================ + require(isPow2(instFetchNum), "ninst must be power of 2") + + // 整个宽度为PADDR_WID的地址 + // ========================================================== + // | tag | index | offset | + // | | | bank index | bank offset | + // ========================================================== + + val bank_index = io.cpu.addr(0)(offsetWidth - 1, bankOffsetWidth) + val bank_offset = io.cpu.addr(0)(bankOffsetWidth - 1, 2) // PC低2位必定是0 val tlb_fill = RegInit(false.B) // * fsm * // val s_idle :: s_uncached :: s_replace :: s_save :: Nil = Enum(4) val state = RegInit(s_idle) - // * nway * nset * // + // * nway * nindex * // // * 128 bit for 4 inst * // // ========================================================= // | valid | tag | bank 0 | bank 1 | bank 2 | bank 3 | @@ -48,24 +50,24 @@ class ICache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Modul // | inst 0 | inst 1 | inst 2 | inst 3 | // | 32 | 32 | 32 | 32 | // ===================================== - val instperbank = bankWidth / 4 // 每个bank存储的指令数 - val valid = RegInit(VecInit(Seq.fill(nset * nbank)(VecInit(Seq.fill(instperbank)(false.B))))) + require(instFetchNum == bytesPerBank / 4, "instFetchNum must equal to instperbank") + val valid = RegInit(VecInit(Seq.fill(nindex)(VecInit(Seq.fill(nbank)(false.B))))) - val data = Wire(Vec(nway, Vec(instperbank, UInt(XLEN.W)))) + val data = Wire(Vec(nway, Vec(nbank, UInt(XLEN.W)))) val tag = RegInit(VecInit(Seq.fill(nway)(0.U(tagWidth.W)))) // * should choose next addr * // val should_next_addr = (state === s_idle && !tlb_fill) || (state === s_save) - val data_raddr = io.cpu.addr(should_next_addr)(indexWidth + offsetWidth - 1, bankOffsetWidth) - val data_wstrb = RegInit(VecInit(Seq.fill(nway)(VecInit(Seq.fill(instperbank)(0.U(4.W)))))) + val data_raddr = io.cpu.addr(should_next_addr)(indexWidth + offsetWidth - 1, offsetWidth) + val data_wstrb = RegInit(VecInit(Seq.fill(nway)(VecInit(Seq.fill(nbank)(false.B))))) val tag_raddr = io.cpu.addr(should_next_addr)(indexWidth + offsetWidth - 1, offsetWidth) val tag_wstrb = RegInit(VecInit(Seq.fill(nway)(false.B))) val tag_wdata = RegInit(0.U(tagWidth.W)) // * lru * // - val lru = RegInit(VecInit(Seq.fill(nset * nbank)(false.B))) + val lru = RegInit(VecInit(Seq.fill(nindex * nbank)(false.B))) // * itlb * // when(tlb_fill) { tlb_fill := false.B } @@ -73,64 +75,85 @@ class ICache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Modul io.cpu.tlb.icache_is_save := (state === s_save) // * fence * // + // fence指令时清空cache,即将所有valid位置0 when(io.cpu.fence && !io.cpu.icache_stall && io.cpu.cpu_ready) { - valid.map(_ := VecInit(Seq.fill(instperbank)(false.B))) + valid := 0.U.asTypeOf(valid) } - // * replace set * // - val rset = RegInit(0.U(6.W)) + // * replace index * // + val rindex = RegInit(0.U(indexWidth.W)) - // * virtual set * // - val vset = io.cpu.addr(0)(indexWidth + offsetWidth - 1, offsetWidth) + // * virtual index * // + val vindex = io.cpu.addr(0)(indexWidth + offsetWidth - 1, offsetWidth) // * cache hit * // - val tag_compare_valid = VecInit(Seq.tabulate(nway)(i => tag(i) === io.cpu.tlb.tag && valid(vset)(i))) + val tag_compare_valid = VecInit(Seq.tabulate(nway)(i => tag(i) === io.cpu.tlb.tag && valid(vindex)(i))) val cache_hit = tag_compare_valid.contains(true.B) val cache_hit_available = cache_hit && io.cpu.tlb.translation_ok && !io.cpu.tlb.uncached val sel = tag_compare_valid(1) - val bank_offset = io.cpu.addr(0)(log2Ceil(instperbank) + 1, 2) - val inst = VecInit( - Seq.tabulate(instperbank)(i => Mux(i.U <= (3.U - bank_offset), data(sel)(i.U + bank_offset), 0.U)) + // 将一个 bank 中的指令分成 instFetchNum 份,每份 INST_WID bit + val inst_in_bank = VecInit( + Seq.tabulate(instFetchNum)(i => data(sel)(bank_index)((i + 1) * INST_WID - 1, i * INST_WID)) ) - val inst_valid = VecInit(Seq.tabulate(instperbank)(i => cache_hit_available && i.U <= (3.U - bank_offset))) - val saved = RegInit(VecInit(Seq.fill(instperbank)(0.U.asTypeOf(new Bundle { + // 将 inst_in_bank 中的指令按照 bank_offset 位偏移量重新排列 + // 处理偏移导致的跨 bank 读取 + // 当offset为0时,不需要重新排列 + // 当offset为1时,此时发送到cpu的inst0应该是inst1,inst1应该无数据 + // | bank | + // | inst 0 | inst 1 | + // | 32 | 32 | + val inst = VecInit( + Seq.tabulate(instFetchNum)(i => + Mux( + i.U <= ((instFetchNum - 1).U - bank_offset), + inst_in_bank(i.U + bank_offset), + 0.U + ) + ) + ) + val inst_valid = VecInit( + Seq.tabulate(instFetchNum)(i => cache_hit_available && i.U <= ((instFetchNum - 1).U - bank_offset)) + ) + + val saved = RegInit(VecInit(Seq.fill(instFetchNum)(0.U.asTypeOf(new Bundle { val inst = UInt(INST_WID.W) val valid = Bool() })))) - val axi_cnt = Counter(cacheConfig.burstSize) + val rlen = nbank + val rsize = log2Ceil(bytesPerBank) // bank tag ram - for { i <- 0 until nway; j <- 0 until instperbank } { - val bank = Module(new SimpleDualPortRam(nset * nbank, INST_WID, byteAddressable = true)) - bank.io.ren := true.B - bank.io.raddr := data_raddr - data(i)(j) := bank.io.rdata + for { i <- 0 until nway } { + // 每一个条目中有nbank个bank,每个bank存储instFetchNum个指令 + val bank = + Seq.fill(nbank)(Module(new SimpleDualPortRam(depth = nindex, width = bitsPerBank, byteAddressable = false))) + for { j <- 0 until nbank } { + bank(j).io.ren := true.B + bank(j).io.raddr := data_raddr + data(i)(j) := bank(j).io.rdata - bank.io.wen := data_wstrb(i)(j).orR - bank.io.waddr := Cat(rset, axi_cnt.value(log2Ceil(cacheConfig.burstSize) - 1, log2Ceil(instperbank))) - bank.io.wdata := Mux( - j.U === axi_cnt.value(log2Ceil(instperbank) - 1, 0), - Mux(axi_cnt.value(0) === 0.U, io.axi.r.bits.data(31, 0), io.axi.r.bits.data(63, 32)), - 0.U - ) - bank.io.wstrb := data_wstrb(i)(j) + bank(j).io.wen := data_wstrb(i)(j) + bank(j).io.waddr := rindex + bank(j).io.wdata := io.axi.r.bits.data + bank(j).io.wstrb := data_wstrb(i)(j) + } } - for { i <- 0 until ninst } { + for { i <- 0 until instFetchNum } { io.cpu.inst_valid(i) := Mux(state === s_idle && !tlb_fill, inst_valid(i), saved(i).valid) && io.cpu.req io.cpu.inst(i) := Mux(state === s_idle && !tlb_fill, inst(i), saved(i).inst) } for { i <- 0 until nway } { - val tag_bram = Module(new LUTRam(nset, tagWidth)) + val tag_bram = Module(new LUTRam(nindex, tagWidth)) tag_bram.io.raddr := tag_raddr tag(i) := tag_bram.io.rdata tag_bram.io.wen := tag_wstrb(i) - tag_bram.io.waddr := rset + tag_bram.io.waddr := rindex tag_bram.io.wdata := tag_wdata } @@ -171,28 +194,29 @@ class ICache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Modul }.elsewhen(io.cpu.tlb.uncached) { state := s_uncached ar.addr := io.cpu.tlb.pa - ar.len := 0.U(log2Ceil((nbank * bankWidth) / 4).W) - ar.size := 2.U(bankOffsetWidth.W) + ar.len := 0.U + ar.size := rsize.U arvalid := true.B }.elsewhen(!cache_hit) { - state := s_replace - ar.addr := Cat(io.cpu.tlb.pa(31, 6), 0.U(6.W)) - ar.len := 15.U(log2Ceil((nbank * bankWidth) / 4).W) - ar.size := 2.U(bankOffsetWidth.W) + state := s_replace + // 取指时按bank块取指 + ar.addr := Cat(io.cpu.tlb.pa(PADDR_WID - 1, offsetWidth), 0.U(offsetWidth.W)) + ar.len := (rlen - 1).U + ar.size := rsize.U arvalid := true.B - rset := vset - (0 until instperbank).foreach(i => data_wstrb(lru(vset))(i) := Mux(i.U === 0.U, 0xf.U, 0x0.U)) - tag_wstrb(lru(vset)) := true.B - tag_wdata := io.cpu.tlb.tag - valid(vset)(lru(vset)) := true.B - axi_cnt.reset() + rindex := vindex + data_wstrb(lru(vindex)).map(_ := false.B) + data_wstrb(lru(vindex))(0) := true.B // 从第一个bank开始写入 + tag_wstrb(lru(vindex)) := true.B + tag_wdata := io.cpu.tlb.tag + valid(vindex)(lru(vindex)) := true.B }.elsewhen(!io.cpu.icache_stall) { - lru(vset) := ~sel + lru(vindex) := ~sel when(!io.cpu.cpu_ready) { state := s_save - (1 until instperbank).foreach(i => saved(i).inst := data(sel)(i)) - (0 until instperbank).foreach(i => saved(i).valid := inst_valid(i)) + (1 until instFetchNum).foreach(i => saved(i).inst := data(sel)(i)) + (0 until instFetchNum).foreach(i => saved(i).valid := inst_valid(i)) } } } @@ -221,13 +245,12 @@ class ICache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Modul }.elsewhen(io.axi.r.fire) { // * burst transport * // when(!io.axi.r.bits.last) { - axi_cnt.inc() - data_wstrb(lru(vset))(0) := data_wstrb(lru(vset))(instperbank - 1) - (1 until instperbank).foreach(i => data_wstrb(lru(vset))(i) := data_wstrb(lru(vset))(i - 1)) + // 左移写掩码,写入下一个bank + data_wstrb(lru(vindex)) := ((data_wstrb(lru(vindex)).asUInt << 1)(nbank - 1, 0)).asBools }.otherwise { - rready := false.B - data_wstrb(lru(vset)) := 0.U.asTypeOf(Vec(instperbank, UInt(4.W))) - tag_wstrb(lru(vset)) := false.B + rready := false.B + data_wstrb(lru(vindex)).map(_ := false.B) + tag_wstrb(lru(vindex)) := false.B } }.elsewhen(!io.axi.r.ready) { state := s_idle @@ -236,8 +259,19 @@ class ICache(cacheConfig: CacheConfig)(implicit config: CpuConfig) extends Modul is(s_save) { when(io.cpu.cpu_ready && !io.cpu.icache_stall) { state := s_idle - (0 until instperbank).foreach(i => saved(i).valid := false.B) + (0 until instFetchNum).foreach(i => saved(i).valid := false.B) } } } + + println("ICache: ") + println("nindex: " + nindex) + println("nbank: " + nbank) + println("bankOffsetWidth: " + bankOffsetWidth) + println("bytesPerBank: " + bytesPerBank) + println("tagWidth: " + tagWidth) + println("indexWidth: " + indexWidth) + println("offsetWidth: " + offsetWidth) + println("size: " + rsize) + println("len: " + rlen) } diff --git a/chisel/playground/src/cache/memory/PortDefinitions.scala b/chisel/playground/src/cache/memory/PortDefinitions.scala index b918f5e..0c587ab 100644 --- a/chisel/playground/src/cache/memory/PortDefinitions.scala +++ b/chisel/playground/src/cache/memory/PortDefinitions.scala @@ -17,7 +17,7 @@ class WriteOnlyPort[+T <: Data](gen: T)(implicit cacheConfig: CacheConfig) exten class WriteOnlyMaskPort[+T <: Data](gen: T)(implicit cacheConfig: CacheConfig) extends Bundle { val addr = Input(UInt(log2Ceil(cacheConfig.nindex * cacheConfig.nbank).W)) - val en = Input(UInt(cacheConfig.bankWidth.W)) + val en = Input(UInt(cacheConfig.bytesPerBank.W)) val data = Input(gen) } @@ -31,7 +31,7 @@ class ReadWritePort[+T <: Data](gen: T)(implicit cacheConfig: CacheConfig) exten class MaskedReadWritePort[+T <: Data](gen: T)(implicit cacheConfig: CacheConfig) extends Bundle { val addr = Input(UInt(log2Ceil(cacheConfig.nindex * cacheConfig.nbank).W)) - val writeMask = Input(UInt(cacheConfig.bankWidth.W)) + val writeMask = Input(UInt(cacheConfig.bytesPerBank.W)) val wdata = Input(gen) val rdata = Output(gen) } diff --git a/chisel/playground/src/cache/memory/SimpleDualPortRam.scala b/chisel/playground/src/cache/memory/SimpleDualPortRam.scala index 44ef47a..576d6e4 100644 --- a/chisel/playground/src/cache/memory/SimpleDualPortRam.scala +++ b/chisel/playground/src/cache/memory/SimpleDualPortRam.scala @@ -16,13 +16,18 @@ import cpu.CpuConfig * @param cpuCfg * the implicit configuration for simulation and elaboration */ -class SimpleDualPortRam(depth: Int, width: Int, byteAddressable: Boolean)(implicit - val config: CpuConfig, -) extends Module { +class SimpleDualPortRam( + depth: Int, + width: Int, + byteAddressable: Boolean +)( + implicit + val config: CpuConfig) + extends Module { require(isPow2(depth)) require( width % 8 == 0 || !byteAddressable, - "if memory is byte addressable, then the adderss width must be a multiple of 8", + "if memory is byte addressable, then the adderss width must be a multiple of 8" ) val waddridth = log2Ceil(depth) @@ -40,11 +45,11 @@ class SimpleDualPortRam(depth: Int, width: Int, byteAddressable: Boolean)(implic if (config.build) { val memory = Module( new SimpleDualPortRamIP( - wdataidth = width, + wdataidth = width, byteWriteWidth = if (byteAddressable) 8 else width, - numberOfLines = depth, - waddridth = waddridth, - ), + numberOfLines = depth, + waddridth = waddridth + ) ) memory.io.clka := clock memory.io.clkb := clock @@ -62,12 +67,12 @@ class SimpleDualPortRam(depth: Int, width: Int, byteAddressable: Boolean)(implic } else { assert( io.wstrb.orR || !io.wen, - "when write port enable is high, write vector cannot be all 0", + "when write port enable is high, write vector cannot be all 0" ) if (byteAddressable) { val bank = SyncReadMem(depth, Vec(width / 8, UInt(8.W))) when(io.ren) { - io.rdata := bank(io.raddr).asTypeOf(io.rdata) + io.rdata := bank.read(io.raddr).asTypeOf(UInt(width.W)) }.otherwise { io.rdata := DontCare } diff --git a/chisel/playground/src/defines/Const.scala b/chisel/playground/src/defines/Const.scala index f28185f..b2b567f 100644 --- a/chisel/playground/src/defines/Const.scala +++ b/chisel/playground/src/defines/Const.scala @@ -20,7 +20,7 @@ trait Constants extends CoreParameter { val EXC_WID = 16 // inst rom - val INST_WID = XLEN + val INST_WID = 32 val INST_ADDR_WID = XLEN // data ram