Skip to content

Instantly share code, notes, and snippets.

@sylefeb
Created December 16, 2025 14:24
Show Gist options
  • Select an option

  • Save sylefeb/7c20ce7508c1e7f04bf496d5d29abf79 to your computer and use it in GitHub Desktop.

Select an option

Save sylefeb/7c20ce7508c1e7f04bf496d5d29abf79 to your computer and use it in GitHub Desktop.
ice-v CPU fix
// SL 2020-06-12 @sylefeb
//
// Fun with RISC-V! RV32I cpu, see README.md
//
// https://github.com/sylefeb/Silice
// MIT license, see LICENSE_MIT in Silice repo root
// Preprocessor lines start with $$. When ICEV_FAST_SHIFT is set, print a
// notice. Further down, the same flag selects between the one-bit-per-clock
// shifter (flag not set) and direct use of the shift operators (flag set).
$$if ICEV_FAST_SHIFT then
$$print("Ice-V configured for fast shift (barrel shifter)")
$$end
// --------------------------------------------------
// Processor
// --------------------------------------------------
// bitfield for easier decoding of instructions
// The field widths sum to 32 bits (1+1+5+5+5+3+5+7). It is used below as
// Rtype(instr).<field> to read op, rd, rs1, rs2 and sign from an instruction.
// The ICEV_RTYPE guard makes sure the bitfield is declared only once
// (presumably so this file can be included several times in one design).
$$if not ICEV_RTYPE then
bitfield Rtype { uint1 unused1, uint1 sign, uint5 unused2, uint5 rs2,
uint5 rs1, uint3 op, uint5 rd, uint7 opcode}
$$ICEV_RTYPE = 1
$$end
// --------------------------------------------------
// execute: decoder + ALU
// - decodes instructions
// - performs all integer computations
// - evaluates the branch condition and drives the jump output
// - computes the next address n (pc or register, plus an immediate)
// All of the logic lives in a single always block.
unit execute(
// instruction, program counter and registers
input uint32 instr, input uint$addrW$ pc, input int32 xa, input int32 xb,
// trigger: pulsed high when the decoder + ALU should start
input uint1 trigger,
// outputs all information the processor needs to decide what to do next
output uint3 op, output uint5 write_rd, output uint1 no_rd,
output uint1 jump, output uint1 load, output uint1 store,
output int32 val, output uint1 storeVal, output uint1 working(0),
output uint32 n, output uint1 storeAddr, // next address adder
output uint1 intop, output int32 r, // integer operations
) {
uint5 shamt(0); uint32 cycle(0); // shifter status and cycle counter
always {
// ==== decode immediates
// (one per instruction format; all but imm_u replicate instr[31,1] into
// the upper bits, i.e. they are sign-extended)
int32 imm_u = {instr[12,20],12b0};
int32 imm_j = {{12{instr[31,1]}},instr[12,8],instr[20,1],instr[21,10],1b0};
int32 imm_i = {{20{instr[31,1]}},instr[20,12]};
int32 imm_b = {{20{instr[31,1]}},instr[7,1],instr[25,6],instr[8,4],1b0};
int32 imm_s = {{20{instr[31,1]}},instr[25,7],instr[7,5]};
// ==== decode opcode // load-store op
// opcode takes 5 bits starting at bit 2: the two lowest instruction bits
// are not examined
uint5 opcode = instr[ 2, 5]; op = Rtype(instr).op;
uint1 AUIPC = opcode == 5b00101; uint1 LUI = opcode == 5b01101;
uint1 JAL = opcode == 5b11011; uint1 JALR = opcode == 5b11001;
uint1 IntImm = opcode == 5b00100; uint1 IntReg = opcode == 5b01100;
// Cycles: instructions with this opcode return the cycle counter (see val)
uint1 Cycles = opcode == 5b11100; uint1 branch = opcode == 5b11000;
uint1 regOrImm = IntReg | branch; // reg or imm in ALU?
uint1 pcOrReg = AUIPC | JAL | branch; // pc or reg in addr?
uint1 sub = IntReg & Rtype(instr).sign; // subtract
uint1 aluShift = (IntImm | IntReg) & op[0,2] == 2b01; // shift requested
// ==== select next address adder first input
// pc is extended with two low zero bits (and a leading zero bit so the
// signed conversion stays non-negative) before being used in the adder
int32 addr_a = pcOrReg ? __signed({1b0,pc,2b0}) : xa;
// ==== select ALU second input
int32 b = regOrImm ? (xb) : imm_i;
// ==== allows to do subtraction and all comparisons with a single adder
// trick from femtorv32/swapforth/J1
// 33-bit sum of xa, ~b and 1; the low 32 bits are xa - b, bit 32 is used
// as the unsigned 'less than' flag
int33 a_minus_b = {1b1,~b} + {1b0,xa} + 33b1;
// signed compare: if the sign bits differ, xa < b exactly when xa is
// negative; otherwise fall back to bit 32 of the subtraction
uint1 a_lt_b = (xa[31,1] ^ b[31,1]) ? xa[31,1] : a_minus_b[32,1];
uint1 a_lt_b_u = a_minus_b[32,1];
uint1 a_eq_b = a_minus_b[0,32] == 0;
// ==== set decoder outputs depending on incoming instructions
// register to write to?
write_rd = Rtype(instr).rd;
// load/store?
load = opcode == 5b00000; store = opcode == 5b01000;
// do we have to write a result to a register?
// (no write for branches, stores, or when the destination is register 0)
no_rd = branch | store | (Rtype(instr).rd == 5b0);
// integer operations // store next address?
intop = (IntImm | IntReg); storeAddr = AUIPC;
// value to store directly // store value?
// val is imm_u for LUI and the cycle counter otherwise; it is only meant
// to be used when storeVal is high
val = LUI ? imm_u : cycle; storeVal = LUI | Cycles;
// ==== select immediate for the next address computation
// 'or trick' from femtorv32
// (each term is zero unless its instruction class is active, so the OR
// acts as a multiplexer)
int32 addr_imm = (AUIPC ? imm_u : 32b0) | (JAL ? imm_j : 32b0)
| (branch ? imm_b : 32b0) | ((JALR|load) ? imm_i : 32b0)
| (store ? imm_s : 32b0);
// ==== increment cycle counter
cycle = cycle + 1;
int32 shift(0); uint1 j(0); // temp variables for shifter and comparator
// ====================== ALU
$$if not ICEV_FAST_SHIFT then
// shift (one bit per clock)
// NOTE(review): r is read here before the switch below reassigns it, so
// each step shifts the value r held on entry; this relies on r keeping
// its value from the previous cycle
if (working) {
// decrease shift size
shamt = shamt - 1;
// shift one bit
// op[2,1] set: right shift, keeping the top bit if 'sign' is set
// (arithmetic) or inserting 0 otherwise (logical); op[2,1] clear: left
shift = op[2,1] ? (Rtype(instr).sign ? {r[31,1],r[1,31]}
: {__signed(1b0),r[1,31]}) : {r[0,31],__signed(1b0)};
} else {
// start shifting?
// (the shift amount is the low 5 bits of b, loaded only when a shift
// instruction is triggered; otherwise shamt is cleared)
shamt = ((aluShift & trigger) ? __unsigned(b[0,5]) : 0);
// store value to be shifted
shift = xa;
}
// are we still shifting?
working = (shamt != 0);
$$end
// all ALU operations
switch (op) {
case 3b000: { r = sub ? a_minus_b : xa + b; } // ADD / SUB
case 3b010: { r = a_lt_b; } case 3b011: { r = a_lt_b_u; }// SLT(I) / SLTU(I)
case 3b100: { r = xa ^ b; } case 3b110: { r = xa | b; }// XOR / OR
$$if not ICEV_FAST_SHIFT then
case 3b001: { r = shift; } case 3b101: { r = shift; }// SLLI/SRLI/SRAI
$$else
// fast shift: apply the shift operators directly on xa in one step
case 3b001: { r = (xa <<< b[0,5]); }
case 3b101: { r = Rtype(instr).sign ? (xa >>> b[0,5]) : (xa >> b[0,5]); }
$$end
case 3b111: { r = xa & b; } // AND
default: { r = {32{1bx}}; } // don't care
}
// ====================== Comparator for branching
// op[1,2] selects the comparison, op[0,1] (used below) inverts it
switch (op[1,2]) {
case 2b00: { j = a_eq_b; } /*BEQ */ case 2b10: { j=a_lt_b;} /*BLT*/
case 2b11: { j = a_lt_b_u;} /*BLTU*/ default: { j = 1bx; }
}
// always jump on JAL/JALR; on a branch, jump when the (possibly negated)
// comparison holds
jump = (JAL | JALR) | (branch & (j ^ op[0,1]));
// ^^^^^^^ negates comparator result
// ====================== Next address adder
n = addr_a + addr_imm;
}
}
// --------------------------------------------------
// The Risc-V RV32I CPU itself
// Fetches instructions through 'mem', reads two registers per instruction
// from a pair of BRAMs, lets 'execute' decode and compute, then performs the
// load/store and register write-back.
unit rv32i_cpu(bram_port mem) {
// register file, uses two BRAMs to fetch two registers at once
// (both always receive the same writes, see always_after)
bram int32 xregsA[32] = {pad(0)}; bram int32 xregsB[32] = {pad(0)};
// current instruction
uint32 instr(0);
// program counter
// (pc is taken from mem.addr, and exec.n is shifted right by 2 before being
// used as mem.addr, so pc counts 32-bit words rather than bytes)
uint$addrW$ pc = uninitialized;
uint$addrW$ next_pc <:: pc + 1; // next_pc tracks the expression 'pc + 1'
// value that has been loaded from memory
int32 loaded = uninitialized;
// decoder + ALU, executes the instruction and tells processor what to do
execute exec(
instr <:: instr, pc <:: pc, xa <: xregsA.rdata, xb <: xregsB.rdata
);
// 32-bit version of exec.n for write-back: keeps the low addrW+2 bits and
// fills all upper bits with bit addrW+2 of exec.n (sign extension)
int32 sext_n <: {{$32-(addrW+2)${exec.n[$addrW+2$,1]}},exec.n[0,$addrW+2$]};
// what do we write in register? (pc, alu or val, load is handled separately)
// Each term is zero unless its selector is high, so the OR acts as a
// multiplexer.
// NOTE(review): assumes at most one selector is high whenever a register is
// actually written - confirm against the decoder
int32 write_back <: (exec.jump ? (next_pc<<2) : 32b0)
| (exec.storeAddr ? sext_n : 32b0)
| (exec.storeVal ? exec.val : 32b0)
| (exec.load ? loaded : 32b0)
| (exec.intop ? exec.r : 32b0);
// The 'always_before' block is applied at the start of every cycle.
// This is a good place to set default values, which also indicates
// to Silice that some variables (e.g. xregsA.wdata) are fully set
// every cycle, enabling further optimizations.
// Default values are overridden from within the algorithm loop.
always_before {
// decodes values loaded from memory (used when exec.load == 1)
// the two low address bits select the byte lane: shift right by 0, 8, 16
// or 24 bits so the addressed byte/half-word lands in the low bits
uint32 aligned = mem.rdata >> {exec.n[0,2],3b000};
// exec.op[2,1] set means unsigned load (upper bits zero), otherwise the
// top bit of the loaded byte/half-word is replicated (sign extension)
switch ( exec.op[0,2] ) { // LB / LBU, LH / LHU, LW
case 2b00:{ loaded = {{24{(~exec.op[2,1])&aligned[ 7,1]}},aligned[ 0,8]}; }
case 2b01:{ loaded = {{16{(~exec.op[2,1])&aligned[15,1]}},aligned[ 0,16]};}
case 2b10:{ loaded = aligned; }
default: { loaded = {32{1bx}}; } // don't care
}
// what to write on a store (used when exec.store == 1)
// (the register value is shifted left to the addressed byte lane)
mem.wdata = xregsB.rdata << {exec.n[0,2],3b000};
// maintain write enable low (pulses high when needed)
mem.wenable = 4b0000;
// maintain alu trigger low
exec.trigger = 0;
// maintain register wenable low
// (pulsed when necessary)
xregsA.wenable = 0;
}
algorithm <onehot,autorun> {
// =========== CPU runs forever
while (1) {
// data is now available
instr = mem.rdata;
pc = mem.addr;
++: // wait for register read (BRAM takes one cycle)
exec.trigger = 1;
while (1) { // decode + ALU refresh during the cycle entering the loop
// this operations loop allows to wait for ALU when needed
// it is built such that no cycles are wasted
// load/store?
if (exec.load | exec.store) {
// memory address from which to load/store
mem.addr = exec.n >> 2;
// == Store (enabled if exec.store == 1)
// build write mask depending on SB, SH, SW
// base mask is 0001 (SB), 0011 (SH) or 1111 (SW), then shifted to the
// addressed byte lane; assumes aligned accesses, e.g. SW needs
// exec.n[0,2] == 0 for all four bytes to be written
mem.wenable = ({4{exec.store}} & { { 2{exec.op[0,2]==2b10} },
exec.op[0,1] | exec.op[1,1], 1b1
} ) << exec.n[0,2];
++: // wait for data transaction
// == Load (enabled if exec.load == 1)
// commit result
xregsA.wenable = ~exec.no_rd;
// restore address to program counter
mem.addr = next_pc;
// exit the operations loop
$$if ICEV_VERILATOR_TRACE then
// this is used by SOCs/ice-v-cmp, to track retired instr. and compare CPUs
// (arguments: pc with two low zero bits appended, the instruction, the
// destination register or 0 when nothing is written, the written value)
__verilog("$c32(\"cpu_retires(1,\",%,\",\",%,\",\",%,\",\",%,\");\");",
{pc,2b00},instr,xregsA.wenable?exec.write_rd:0,write_back);
$$end
break;
// instruction read from BRAM and write to register
// occurs as we jump back to loop start
} else {
// commit result
xregsA.wenable = ~exec.no_rd;
// next instruction address
mem.addr = exec.jump ? (exec.n >> 2) : next_pc;
// ALU done?
// (exec.working stays high while the multi-cycle shifter is running; in
// that case the loop iterates again and the assignments above are redone)
if (exec.working == 0) {
// yes: all is correct, stop here
$$if ICEV_VERILATOR_TRACE then
// this is used by SOCs/ice-v-cmp, to track retired instr. and compare CPUs
// (same arguments as the trace call in the load/store branch)
__verilog("$c32(\"cpu_retires(1,\",%,\",\",%,\",\",%,\",\",%,\");\");",
{pc,2b00},instr,xregsA.wenable?exec.write_rd:0,write_back);
$$end
break;
// instruction read from BRAM and write to register
// occurs as we jump back to loop start
}
}
}
}
}
// the 'always_after' block is executed at the end of every cycle
always_after {
// write back data to both register BRAMs
xregsA.wdata = write_back; xregsB.wdata = write_back;
// xregsB written when xregsA is
xregsB.wenable = xregsA.wenable;
// write to write_rd, else track instruction register
// (when not writing, A is addressed with rs1 and B with rs2, so both source
// registers of the current instruction are read)
xregsA.addr = xregsA.wenable ? exec.write_rd : Rtype(instr).rs1;
xregsB.addr = xregsA.wenable ? exec.write_rd : Rtype(instr).rs2;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment