In an earlier blog post, I was lamenting how one-ninth of an FPGA block RAM was wasted when storing 8-bit ROM data, because there’s no simple way to make use of the 9th parity bit in each word of a block RAM. Horrors! To fight this injustice, I’ve developed a solution that I call packed ROM. It stores nine 8-bit bytes in eight 9-bit words of block RAM, and provides an interface to read the data as if it were an 8-bit memory with a larger depth. Using this method, I’m able to store 1152 bytes of read-only data per block RAM instead of only 1024. The solution relies on the fact that the block RAMs are dual port – you can read from two different addresses simultaneously. Compared with using the same number of block RAMs as a standard 8-bit wide ROM, this solution consumes an extra 54 LUT4s in a MachXO2-1200 FPGA – about 4 percent of the total. It increases the MachXO2-1200’s effective capacity for this type of 8-bit ROM data from 7168 to 8064 bytes.
Here’s the Verilog code, as well as a Python program that reads a plain binary file and writes a “packed” file in .mem format. The code assumes 7 block RAMs, but should be easily adaptable to other numbers.
module packedROM #(parameter NUM_BLOCK_RAMS = 7) (
    input [12:0] addr,
    input clk,
    output reg [7:0] Q
);

    // Presents a byte-wide read-only memory of 1152*NUM_BLOCK_RAMS bytes that is
    // physically stored as 1024*NUM_BLOCK_RAMS 9-bit block RAM words.
    // Uses 54 LUT4s of the MachXO2.
    // The addr width may need to change depending on NUM_BLOCK_RAMS. Use $clog2()?
    //
    // Nine bytes A-I are packed into eight 9-bit words as follows:
    //   0: I3 I2 I1 I0 A4 A3 A2 A1 A0
    //   1: I7 I6 I5 I4 B4 B3 B2 B1 B0
    //   2: F7 F6 E7 E6 C4 C3 C2 C1 C0
    //   3: H7 H6 G7 G6 D4 D3 D2 D1 D0
    //   4: A7 A6 A5 E5 E4 E3 E2 E1 E0
    //   5: B7 B6 B5 F5 F4 F3 F2 F1 F0
    //   6: C7 C6 C5 G5 G4 G3 G2 G1 G0
    //   7: D7 D6 D5 H5 H4 H3 H2 H1 H0
    //
    // Bytes A-H are sequential in the byte-oriented address space below
    // addr 1024*NUM_BLOCK_RAMS.
    // Byte I is one of the "extra" bytes, in the byte-oriented address space at or
    // beyond addr 1024*NUM_BLOCK_RAMS.
    //
    // Every read uses both RAM ports: port A fetches the word holding the low
    // bits of the byte, port B fetches the word holding its high bits.

    reg [12:0] wordAddressA;
    reg [12:0] wordAddressB;
    wire [8:0] QA;
    wire [8:0] QB;

    // dualPortROM is a wrapper for the MachXO2 block RAMs, created by the Lattice
    // IP Express tool. It is actually a dual port RAM with the write input unused.
    dualPortROM myDualPortROM(
        .DataInA(9'b000000000),
        .DataInB(9'b000000000),
        .AddressA(wordAddressA),
        .AddressB(wordAddressB),
        .ClockA(clk),
        .ClockB(clk),
        .ClockEnA(1'b1),
        .ClockEnB(1'b1),
        .WrA(1'b0),
        .WrB(1'b0),
        .ResetA(1'b0),
        .ResetB(1'b0),
        .QA(QA),
        .QB(QB)
    );

    // Index of the requested byte within the "extra" (byte I) region.
    wire [12:0] overflowAddr = addr - (NUM_BLOCK_RAMS * 1024);

    // NOTE(review): the RAM ports are clocked, so QA/QB presumably correspond to
    // the address sampled at the previous clk edge, while the bit-lane selection
    // below uses the current addr. This looks correct only if addr is held stable
    // until Q is consumed - confirm against the dualPortROM configuration and the
    // timing of whatever drives addr.

    // Purely combinational address steering and bit-lane selection. Blocking
    // assignments are used because this block describes combinational logic;
    // non-blocking assignments here can make simulation disagree with synthesis.
    always @* begin
        if (addr < NUM_BLOCK_RAMS * 1024) begin
            // packed area, bytes A-H
            // the low bits of byte N always live in word N
            wordAddressA = addr;
            // word address for the upper bits depends on low three bits of the byte address
            case (addr[2:0])
                0: begin // A: upper three bits are in word 4
                    wordAddressB = { addr[12:3], 3'b100 };
                    Q = { QB[8:6], QA[4:0] };
                end
                1: begin // B: upper three bits are in word 5
                    wordAddressB = { addr[12:3], 3'b101 };
                    Q = { QB[8:6], QA[4:0] };
                end
                2: begin // C: upper three bits are in word 6
                    wordAddressB = { addr[12:3], 3'b110 };
                    Q = { QB[8:6], QA[4:0] };
                end
                3: begin // D: upper three bits are in word 7
                    wordAddressB = { addr[12:3], 3'b111 };
                    Q = { QB[8:6], QA[4:0] };
                end
                4: begin // E: upper two bits are in word 2, bits 6:5
                    wordAddressB = { addr[12:3], 3'b010 };
                    Q = { QB[6:5], QA[5:0] };
                end
                5: begin // F: upper two bits are in word 2, bits 8:7
                    wordAddressB = { addr[12:3], 3'b010 };
                    Q = { QB[8:7], QA[5:0] };
                end
                6: begin // G: upper two bits are in word 3, bits 6:5
                    wordAddressB = { addr[12:3], 3'b011 };
                    Q = { QB[6:5], QA[5:0] };
                end
                7: begin // H: upper two bits are in word 3, bits 8:7
                    wordAddressB = { addr[12:3], 3'b011 };
                    Q = { QB[8:7], QA[5:0] };
                end
            endcase
        end
        else begin
            // overflow area, byte I
            // word address is byte overflow address times 8 for the lower bits,
            // and times 8 plus 1 for the upper bits
            wordAddressA = { overflowAddr[9:0], 3'b000 };
            wordAddressB = { overflowAddr[9:0], 3'b001 };
            Q = { QB[8:5], QA[8:5] };
        end
    end
endmodule

# Packs a plain binary ROM image into the 9-bit word layout read by the packedROM
# Verilog module, and writes it as a .mem text file with one hex word per line.
import os
import sys
from array import array

infile = "coderom.bin"
outfile = "coderom.mem"
num_block_rams = 7

# Number of 9-bit words to emit (1024 per block RAM).
outsize = 1024 * num_block_rams
# Number of input bytes those words can hold: eight "regular" bytes plus one
# "extra" byte per group of eight words.
capacity = outsize + outsize // 8

inputData = array('B')
insize = os.path.getsize(infile)
with open(infile, 'rb') as f:
    inputData.fromfile(f, insize)

if insize > capacity:
    # Bytes beyond the capacity are never referenced below, so they are dropped.
    sys.stderr.write('warning: {} is {} bytes but only {} fit; extra bytes are ignored\n'.format(
        infile, insize, capacity))
elif insize < capacity:
    # Pad a short image so every index used below exists. The fill value 0x00 is
    # a choice made here, not something the hardware requires.
    inputData.extend([0] * (capacity - insize))

with open(outfile, "w") as out:
    for baseAddr in range(0, outsize, 8):
        # Bytes A-H are eight consecutive input bytes; byte I comes from the
        # region of the input that starts at offset outsize.
        byteA, byteB, byteC, byteD, byteE, byteF, byteG, byteH = inputData[baseAddr:baseAddr + 8]
        byteI = inputData[outsize + baseAddr // 8]

        words = (
            ((byteI & 0x0F) << 5) | (byteA & 0x1F),                           # 0: I3-I0, A4-A0
            ((byteI & 0xF0) << 1) | (byteB & 0x1F),                           # 1: I7-I4, B4-B0
            ((byteF & 0xC0) << 1) | ((byteE & 0xC0) >> 1) | (byteC & 0x1F),   # 2: F7-F6, E7-E6, C4-C0
            ((byteH & 0xC0) << 1) | ((byteG & 0xC0) >> 1) | (byteD & 0x1F),   # 3: H7-H6, G7-G6, D4-D0
            ((byteA & 0xE0) << 1) | (byteE & 0x3F),                           # 4: A7-A5, E5-E0
            ((byteB & 0xE0) << 1) | (byteF & 0x3F),                           # 5: B7-B5, F5-F0
            ((byteC & 0xE0) << 1) | (byteG & 0x3F),                           # 6: C7-C5, G5-G0
            ((byteD & 0xE0) << 1) | (byteH & 0x3F),                           # 7: D7-D5, H5-H0
        )
        for word in words:
            out.write('{:02X}\n'.format(word))