// Minispec FIFOs

// None of the code in this file has been tested. Just be aware.

// AXI-style ready/valid handshake
// Key rule: data_downs must not depend on stall_downs. That is, if we are ready to send data,
// we should make it available regardless of whether downstream says it is ready to take.
// Of course, this means we can assume the upstream module tries its best to make data available
// even if we say we aren't taking the data yet. This rule avoids deadlocks and livelocks.
// Heuristic: Data transfer happens on the cycle when data is valid and stall is false.
// Interface sketch for a ready/valid (stall-based) pipeline element that turns a stream of In into a stream of Out.
// Only the ports are declared here; the methods have no bodies.
module AXIModule#(type In, type Out);
  // Data from upstream. Guaranteed to remain valid as long as we tell upstream to keep stalling.
  // (Not stalling means consuming.)
  input Maybe#(In) data_ups;
  // The stall signal from downstream. If !stall_downs and we're providing data,
  // we must consider the data consumed.
  input Bool stall_downs;
  // Data to provide to downstream. Once valid, must remain valid until consumed.
  method Maybe#(Out) data_downs;
  // Stall signal to return to upstream.
  method Bool stall_ups;
endmodule

// A register between pipeline stages, with stall signal.
// If downstream isn't taking data and we're full, then we tell upstream to stall.
// If downstream is taking data and we have data to provide from the buffer, we do.
// In the same cycle, we can also tell upstream to send data (since the buffer is being consumed),
// which will get put into the buffer (as long as downstream is taking data).
// That is, there is a combinational path from stall_downs to stall_ups.
// Not the other way around: Data from upstream will always take at least one cycle to arrive at downstream,
// because it needs to go through the buffer first. This is exactly the same behavior as pipeline registers we've seen in 6.191.
// Single-entry pipeline register with a stall signal.
// data_downs comes straight from the register (no combinational path from upstream data),
// while stall_ups is a combinational function of the downstream stall.
module PipelineFIFO1#(type T);
  // The one storage slot; Invalid means the stage is empty.
  Reg#(Maybe#(T)) buffer(Invalid);

  input Maybe#(T) data_ups;   // data offered by upstream this cycle
  input Bool stall_downs;     // downstream's stall signal this cycle

  // Always make data available without waiting for !stall_downs.
  method Maybe#(T) data_downs = buffer;

  // Tell upstream to stall only if we hold data and downstream is not taking it.
  // The downstream stall is passed in as an argument; it is expected to be the same
  // value that is driven on the stall_downs input.
  method Bool stall_ups(Bool stall_downs) = stall_downs && isValid(buffer);

  rule update;
    if (!stall_downs) begin
      // Downstream is not stalling. If the buffer is valid, its data is being consumed this cycle,
      // so the slot is free again; if it is empty, the slot is free anyway.
      // Either way we latch whatever upstream offers (which may itself be Invalid).
      buffer <= data_ups;
    end else begin
      // Downstream IS stalling.
      // If the buffer is empty we can still accept data; indeed we must,
      // because stall_ups told upstream not to stall in this case.
      if (!isValid(buffer)) buffer <= data_ups;
      // Otherwise the buffer is full and cannot be drained, so it keeps its value.
      // stall_ups returns True in exactly this case.
    end
  endrule
endmodule

// An example where we chain a bunch of pipeline registers. This essentially creates a pipelined FIFO
// with an unfortunate latency of n. The FIFO is "pipelined" in the sense that
// even when the FIFO is full, enqueue and dequeue can happen in the same cycle.
// Base case of the chained pipelined FIFO: a chain of length 1 is just one PipelineFIFO1
// with every port forwarded unchanged.
// NOTE(review): confirm that the `Integer 1` specialisation syntax in the header is accepted by the compiler.
module PipelinedFIFO#(Integer 1, type T);
  PipelineFIFO1#(T) stage;

  input Maybe#(T) data_ups;
  input Bool stall_downs;

  // Outputs are taken directly from the single wrapped stage.
  method Maybe#(T) data_downs = stage.data_downs;
  method Bool stall_ups(Bool stall_downs) = stage.stall_ups(stall_downs);

  // Forward both inputs to the wrapped stage every cycle.
  rule update;
    stage.stall_downs = stall_downs;
    stage.data_ups = data_ups;
  endrule
endmodule
// Recursive case of the chained pipelined FIFO: one PipelineFIFO1 stage (`first`)
// feeding a chain of n-1 further stages (`rest`).
module PipelinedFIFO#(Integer n, type T);
  PipelineFIFO1#(T) first;
  PipelinedFIFO#(n-1, T) rest; // the remaining n-1 stages, built recursively

  input Maybe#(T) data_ups;
  input Bool stall_downs;

  // The element visible to downstream is the one at the end of the chain.
  method Maybe#(T) data_downs = rest.data_downs;

  // The stall seen by `first` is the stall that `rest` reports given the downstream stall,
  // so upstream's stall must be computed through the same chain. This has to agree with what
  // the rule below drives on first.stall_downs; otherwise upstream could be told to stall
  // while `first` latches its data anyway, duplicating an element.
  method Bool stall_ups(Bool stall_downs) = first.stall_ups(rest.stall_ups(stall_downs));

  rule update;
    // Data flows first -> rest.
    first.data_ups = data_ups;
    rest.data_ups = first.data_downs;
    // Stall flows downstream -> rest -> first.
    rest.stall_downs = stall_downs;
    first.stall_downs = rest.stall_ups(stall_downs);
  endrule
endmodule

// A fast path between stages. Data from upstream goes to downstream in the same cycle
// if downstream is willing to take it. Otherwise, it gets put into the buffer.
// That is, there is a combinational path from upstream to downstream.
// To avoid having a combinational stall signal from downstream to upstream,
// we only tell upstream whether to stall or not based on the state of the buffer,
// i.e. essentially registering the stall signal.
// If the buffer is full, we need to clear out the buffer first. Only once cleared,
// we can tell upstream that we're consuming.
// This obviously does not provide pipelining in a traditional sense. (The main path is wholly combinational.)
// It's usually only useful for a path that goes back up against the pipeline (e.g. annulment, bypassing in pipelined processors),
// where the buffer's "downstream" is actually up the pipeline and "upstream" is later in the pipeline,
// and that up-the-pipeline may not be ready to act on the bypassed information.
// (This has never really been useful in most 6.191 processors. Writeback always completes in one cycle.
// So, whenever execute completes and has data to bypass to decode, execute can move on to writeback.
// If execute moves on, then our implementation is such that decode also moves on, i.e. it can use the bypassed information
// immediately in this cycle. That's why we never needed a FIFO to store the bypass information.
// When writeback bypasses to execute and execute isn't ready to move on, meaning decode isn't ready to move on,
// it doesn't matter anyway because writeback ends up writing to the register file, also eliminating the need for the FIFO.
// It is fine if we did have the FIFO, though, since Bypass FIFO does not add a combinational path down the processor pipeline.)
// Single-entry bypass FIFO: upstream data is forwarded combinationally to downstream when
// nothing is held; the register is only used to park data that downstream refused.
module BypassFIFO1#(type T);
  // Parked element, if any.
  Reg#(Maybe#(T)) held(Invalid);

  input Maybe#(T) data_ups;
  input Bool stall_downs;

  // Pass upstream data straight through unless an older element is parked; the parked one goes first.
  method Maybe#(T) data_downs(Maybe#(T) data_ups) = !isValid(held) ? data_ups : held;

  // The stall towards upstream depends only on register state, never on stall_downs.
  method Bool stall_ups = isValid(held);

  rule update;
    if (stall_downs) begin
      // Downstream refuses data this cycle. With nothing parked we told upstream we were
      // consuming, so whatever it offered has to be parked now.
      // With something already parked, it simply stays where it is.
      if (!isValid(held)) held <= data_ups;
    end else begin
      // Downstream accepts this cycle: either the parked element left (so the slot empties),
      // or upstream data was bypassed directly (so there is nothing to keep).
      held <= Invalid;
    end
  endrule
endmodule

// Pipeline buffer has a combinational path from downstream to upstream due to stall signal.
// Bypass buffer has a combinational path from upstream to downstream due to data, defeating the point of pipelining.
// Can we get the best of both worlds? Let's try a FIFO of size 1.
// When FIFO is empty:
//   Input gets registered for outputting. Let's call this the "output register."
//   This may or may not be actually used in the next cycle depending on whether downstream stalls.
//   The input _has_ to get registered regardless of whether downstream stalls. If we could decide not to register
//   the output based on whether downstream stalls, then that means we had a combinational path for the stall,
//   same problem as the pipeline buffer!
// When FIFO is full:
//   This corresponds to the case where we did register something and it wasn't consumed by downstream.
//   We must be continually trying to put out the content of this FIFO.
//   Furthermore, since we avoid the combinational path from downstream to upstream,
//   that means we have no choice but to tell the upstream to stall. (Otherwise, where would we put the data?)
// This design means we do not have any combinational path between the upstream and downstream,
// but we halved the throughput! Why? Because when the buffer is full, upstream could never put anything
// in the buffer, so it has to wait a cycle until the buffer is emptied again.
// So, this is actually kinda the worst of both worlds.
// Single-entry FIFO with no combinational path in either direction:
// data_downs and stall_ups both depend only on the register.
// The price is that a full FIFO cannot accept new data in the cycle it is drained.
module FIFO1#(type T);
  // The one storage slot; Invalid means the FIFO is empty.
  Reg#(Maybe#(T)) buffer(Invalid);

  input Maybe#(T) data_ups;
  input Bool stall_downs;

  method Maybe#(T) data_downs = buffer;
  // Upstream is stalled whenever the slot is occupied, regardless of what downstream does.
  method Bool stall_ups = isValid(buffer);

  rule update;
    if (isValid(buffer)) begin
      // We are providing data and have told upstream to stall, so upstream will not change its output.
      // If downstream consumes, just clear the slot: latching the upstream value now would
      // duplicate it in the next cycle. If downstream stalls, the slot keeps its value.
      if (!stall_downs) buffer <= Invalid;
    end else begin
      // Slot is empty, so upstream was told it is not stalled: we must take its data
      // (which may itself be Invalid), whether or not downstream is stalling.
      buffer <= data_ups;
    end
  endrule
endmodule

// Can we fix this issue by putting a BypassFIFO1 in front of FIFO1?
// It seems when FIFO1 is empty, BypassFIFO1 helps us quickly get upstream data to the front.
// On the next cycle, BypassFIFO1 takes an input in the same cycle that FIFO1 is being dequeued.
// Since FIFO1 cannot immediately tell BypassFIFO1, that means BypassFIFO1 will have to put data in its buffer, and FIFO1 becomes empty.
// Next cycle, BypassFIFO1 needs to put data in FIFO1. It can't tell upstream in the same cycle that it is putting data into FIFO1.
// Putting a PipelineFIFO1 in front of FIFO1 does not work for a similar reason.
// First two cycles are fine, but then since FIFO1 stalls when dequeuing, PipelineFIFO1 needs to stall, causing upstream to stall.
// Ultimately, the throughput is halved whenever FIFO1 is involved.

// We need something different altogether. We need a "skid buffer," which is equivalent to a conflict-free FIFO of size 2.
// Here, conflict-free means, if the FIFO is neither empty nor full, it should be able to both dequeue and enqueue data
// in the same cycle, where that data should be immediately available in the next cycle.
// A skid buffer, intuitively, is like a pipeline register where the stall signal is also registered.
// But, since the stall signal is registered, it means we can't immediately tell the upstream module we're not consuming its data.
// The upstream module believes we are consuming its data, so we have no choice but to put that data in a new, "internal register".
// We can summarize the operation of the skid buffer as follows.
// When FIFO is empty:
//   We tell upstream we're not stalling.
//   If there's upstream data, it gets put into the "output register."
// When FIFO has size 1:
//   We tell upstream we're not stalling (i.e. we're consuming). The output register is shown to downstream.
//   If downstream is consuming, then input data gets put into the output register, replacing the current data.
//   If downstream is not consuming, input data gets put into an "internal register."
// When FIFO has size 2:
//   We tell upstream we are stalling (i.e. not consuming).
//   If downstream is consuming, we move data from internal register to output register.
//   Unfortunately, input can't go into the internal register yet, to avoid accidentally duplicating the data.
// Now we fixed the throughput issue!
// If the upstream steadily provides input data and downstream steadily consumes it, each input goes right into
// the output register, which means the internal register remains empty, allowing upstream to keep inputting data.
// When upstream puts data in again, it once more goes right into the output register rather than the internal register.
// So, the throughput is maintained!
// Note, however, skid buffer does not smooth out irregularities in data rates. If downstream stalls, upstream will see a stall.
// Use a longer FIFO instead to help smooth out these irregularities.
// (Obviously, it's a lost cause if upstream data rate is consistently higher than downstream. We need unbounded FIFOs in that case.)
// Two-entry skid buffer: an output register shown to downstream plus an internal ("skid")
// register that catches the one element upstream sends before it sees our registered stall.
// Neither data_downs nor stall_ups depends combinationally on an input.
module SkidBuffer#(type T);
  // Named outReg rather than `output`: `output` is a SystemVerilog keyword,
  // which the BSV backend treats as reserved.
  Reg#(Maybe#(T)) outReg(Invalid);    // element currently offered to downstream
  Reg#(Maybe#(T)) internal(Invalid);  // overflow element, only valid when outReg is valid too

  input Maybe#(T) data_ups;
  input Bool stall_downs;

  method Maybe#(T) data_downs = outReg;
  // Upstream is stalled only when both registers are occupied.
  method Bool stall_ups = isValid(internal);

  rule update;
    // NOTE(review): confirm that $assert is a supported system task in the target toolchain.
    if (!isValid(outReg)) begin // FIFO is empty
      // Invariant: the internal register is never occupied while the output register is empty.
      $assert(!isValid(internal));
      // Nothing to offer downstream yet, so upstream data goes straight to the output register.
      outReg <= data_ups;
    end else if (!isValid(internal)) begin // FIFO has size 1
      $assert(isValid(outReg));
      if (!stall_downs) begin
        // Output element is consumed this cycle; replace it with the incoming one (may be Invalid).
        outReg <= data_ups;
      end else begin
        // Output element stays; upstream was told we are consuming, so park its data internally.
        internal <= data_ups;
      end
    end else begin // FIFO is full
      if (!stall_downs) begin
        // Output element is consumed; shift the parked element forward.
        // Upstream was stalled this cycle, so its data must not be latched (it would be duplicated).
        outReg <= internal;
        internal <= Invalid;
      end
      // If downstream stalls while full, both registers hold their values.
    end
  endrule
endmodule

// Actually, now that I think about it, can we simulate a skid buffer by putting a
// bypass FIFO (internal) followed by a pipelined FIFO (output)?
// When overall FIFO is empty, upstream input automatically goes to the output FIFO.
// When the overall FIFO has an element, it is always at the output FIFO,
// and bypass FIFO is still willing to take an element, so upstream is not stalled.
// When there are two elements, then upstream is indeed stalled.
// When an element is taken out, the pipeline FIFO tells bypass FIFO to also dequeue,
// so it's essentially the same behavior as skid buffer.
// I'm convinced to say they are equivalent, but I'll have to prove this later.