Assignment 6 - Static Control Flow

Problem 1

In this part, you will be implementing the rest of the SM213 ISA, consisting of instructions that control the flow of execution. You may use the version you implemented yourself for Assignment 2 as a starting point.

The six instructions you must implement are branch, branch if equal, branch if greater, jump, get PC, and indirect jump. Their format and semantics are listed on the slides, as well as in the table below. Note that the indirect-jump offset is unsigned and so for indirect jump, pp ranges from 0 to 255. On the other hand, the branch pc-relative value is signed and so for branches, pp ranges from -128 to 127.

// This code contains the solution to CPSC 213 Assignment 2.
// Do not distribute this file or any portion of it to anyone.
// Do not remove this comment.

package arch.sm213.machine.student;

import arch.sm213.machine.AbstractSM213CPU;
import machine.AbstractMainMemory;
import machine.RegisterSet;
import util.UnsignedByte;


/**
 * The Simple Machine CPU.
 *
 * Simulate the execution of a single cycle of the Simple Machine SM213 CPU. 
 */

public class CPU extends AbstractSM213CPU {
  
  /**
   * Create a new CPU.
   *
   * @param name   fully-qualified name of CPU implementation.
   * @param memory main memory used by CPU.
   */
  public CPU (String name, AbstractMainMemory memory) {
    super (name, memory);
  }
  
  /**
   * Fetch Stage of CPU Cycle.
   * Fetch instruction at address stored in "pc" register from memory into instruction register
   * and set "pc" to point to the next instruction to execute.
   *
   * Input register:   pc.
   * Output registers: pc, instruction, insOpCode, insOp0, insOp1, insOp2, insOpImm, insOpExt
   * @see AbstractSM213CPU for pc, instruction, insOpCode, insOp0, insOp1, insOp2, insOpImm, insOpExt
   *
   * @throws MainMemory.InvalidAddressException when program counter contains an invalid memory address
   */
  @Override protected void fetch() throws MainMemory.InvalidAddressException {
    int            pcVal  = pc.get();
    UnsignedByte[] ins    = mem.read (pcVal, 2);
    byte           opCode = (byte) (ins[0].value() >>> 4);
    insOpCode.set (opCode);
    insOp0.set    (ins[0].value() & 0x0f);
    insOp1.set    (ins[1].value() >>> 4);
    insOp2.set    (ins[1].value() & 0x0f);
    insOpImm.set  (ins[1].value());
    pcVal += 2;
    switch (opCode) {
      case 0x0:
      case 0xb:
        long opExt = mem.readIntegerUnaligned (pcVal) & 0xffffffffL;
	pcVal += 4;
	insOpExt.set    (opExt);
	instruction.set (ins[0].value() << 40 | ins[1].value() << 32 | opExt);
	break;
      default:
	insOpExt.set    (0);
	instruction.set (ins[0].value() << 40 | ins[1].value() << 32);
    }
    pc.set (pcVal);
  }

  
  /**
   * Execution Stage of CPU Cycle.
   * Execute instruction that was fetched by Fetch stage.
   *
   * Input state: pc, instruction, insOpCode, insOp0, insOp1, insOp2, insOpImm, insOpExt, reg, mem
   * Ouput state: pc, reg, mem
   * @see AbstractSM213CPU for pc, instruction, insOpCode, insOp0, insOp1, insOp2, insOpImm, insOpExt
   * @see MainMemory       for mem
   * @see machine.AbstractCPU      for reg
   *
   * @throws InvalidInstructionException                when instruction format is invalid.
   * @throws MachineHaltException                       when instruction is the HALT instruction.
   * @throws RegisterSet.InvalidRegisterNumberException when instruction references an invalid register (i.e, not 0-7).
   * @throws MainMemory.InvalidAddressException         when instruction references an invalid memory address.
   */
  @Override protected void execute () throws InvalidInstructionException, MachineHaltException, RegisterSet.InvalidRegisterNumberException, MainMemory.InvalidAddressException
  {
    switch (insOpCode.get()) {

      case 0x0: // ld $v, d .............. 0d-- vvvv vvvv
        reg.set (insOp0.get(), insOpExt.get());
        break;

      case 0x1: // ld o(rs), rd .......... 1psd  (p = o / 4)
        reg.set (insOp2.get(), mem.readInteger ((insOp0.get() << 2) + reg.get (insOp1.get())));
        break;

      case 0x2: // ld (rs, ri, 2), rd .... 2sid
        reg.set (insOp2.get(), mem.readInteger (reg.get (insOp0.get()) + (reg.get (insOp1.get())<<2)));
        break;

      case 0x3: // st rs, o(rd) .......... 3spd  (p = o / 4)
        mem.writeInteger ((insOp1.get() << 2) + reg.get (insOp2.get()), reg.get (insOp0.get()));
        break;

      case 0x4: // st rs, (rd, ri, 4) .... 4sdi
        mem.writeInteger (reg.get (insOp1.get()) + (reg.get (insOp2.get())<<2), reg.get (insOp0.get()));
        break;

      case 0x6: // ALU ................... 6-sd
        switch (insOp0.get()) {

          case 0x0: // mov rs, rd ........ 60sd
            reg.set (insOp2.get(), reg.get (insOp1.get()));
            break;

          case 0x1: // add rs, rd ........ 61sd
            reg.set (insOp2.get(), reg.get (insOp1.get()) + reg.get (insOp2.get()));
            break;

          case 0x2: // and rs, rd ........ 62sd
            reg.set (insOp2.get(), reg.get (insOp1.get()) & reg.get (insOp2.get()));
            break;

          case 0x3: // inc rr ............ 63-r
            reg.set (insOp2.get(), reg.get (insOp2.get()) + 1);
            break;

          case 0x4: // inca rr ........... 64-r
            reg.set (insOp2.get(), reg.get (insOp2.get()) + 4);
            break;

          case 0x5: // dec rr ............ 65-r
            reg.set (insOp2.get(), reg.get (insOp2.get()) - 1);
            break;

          case 0x6: // deca rr ........... 66-r
            reg.set (insOp2.get(), reg.get (insOp2.get()) - 4);
            break;

          case 0x7: // not ............... 67-r
            reg.set (insOp2.get(), ~reg.get (insOp2.get()));
            break;

          case 0xf: // gpc ............... 6fpr
            reg.set(insOp2.get(), pc.get() + (insOp1.get() << 1));
            break;

          default:
            throw new InvalidInstructionException();
        }
        break;

      case 0x7: // sh? $i,rd ............. 7dii
        if (insOpImm.get() > 0)
          reg.set (insOp0.get(), reg.get (insOp0.get()) << insOpImm.get());
        else
          reg.set (insOp0.get(), reg.get (insOp0.get()) >> -insOpImm.get());
        break;

      case 0x8: // br a .................. 8-pp  (a = pc + pp * 2)
        pc.set(pc.get() + ((byte) insOpImm.get()) * 2);
        break;

      case 0x9: // beq rs, a ............. 9rpp  (a = pc + pp * 2)
        if (reg.get(insOp0.get()) != 0) break;

        pc.set(pc.get() + ((byte) insOpImm.get()) * 2);
        break;

      case 0xa: // bg rs, a .............. arpp  (a = pc + pp * 2)
        if (reg.get(insOp0.get()) <= 0) break;

        pc.set(pc.get() + ((byte) insOpImm.get()) * 2);
        break;

      case 0xb: // j i ................... b--- iiii iiii
        pc.set((int) insOpExt.get());
        break;

      case 0xc: // j o(rr) ............... crpp  (pp = o / 2)
        pc.set(reg.get(insOp0.get()) + (insOpImm.getUnsigned() * 2));
        break;

      case 0xf: // halt or nop ............. f?--
        if (insOp0.get() == 0)
          // halt .......................... f0--
          throw new MachineHaltException();
        else if (insOp0.getUnsigned() == 0xf)
          // nop ........................... ff--
          break;
        break;
        
      default:
        throw new InvalidInstructionException();
    }
  }
}

Problem 2

Run each of these snippets through the simulator. Carefully observe what happens as they execute so that you could explain this code to someone if they asked. These files are for your reference only.

Now use the files above as a reference to translate q2.c into commented assembly code, placing your code in a file named q2.s. Labels for global variables should be the name of the variable. Run your code in the simulator and examine every variable to be sure they have the correct value when execution halts.

.pos 0x1000
# === Loop Initialization ===
ld $0, r0 # r0 = 0
ld $i, r1 # r1 = &i
st r0, 0(r1) # i = 0

ld $n, r2 # r2 = &n
ld (r2), r2 # r2 = n
not r2 # r2 = ~n
inc r2 # r2 = -n

loop:
# === Loop Conditional Check ===
ld (r1), r3 # r3 = i
add r2, r3 # r3 = i - n

# i >= 0; goto end_loop

# i - n > 0 
# i > n
bgt r3, end_loop
# i - n == 0 
# i == n
beq r3, end_loop

# === Loop Body ===
ld (r1), r3 # r3 = i
ld $a, r4 # r4 = &a
ld (r4, r3, 4), r4 # r4 = a[i]
ld $b, r5 # r5 = &b
ld (r5, r3, 4), r5 # r5 = b[i]

# === Conditional Check ===
not r5 # r5 = ~b[i]
inc r5 # r5 = -b[i]
add r5, r4 # r4 = a[i] - b[i]

# a[i] - b[i] > 0
# a[i] > b[i]
bgt r4, if
br end_if

if:
	# === Conditional Body ===
	ld $c, r4 # r4 = &c
	ld (r4), r5 # r5 = c
	inc r5 # r5 = c + 1
	st r5, (r4) # c = c+1

end_if:

# === Index Variable Incrementation ===
ld (r1), r3 # r3 = i
inc r3 # r3 = i + 1
st r3, (r1) # i = i + 1

br loop
end_loop:
halt

.pos 0x2000
	i:  .long -1
	n:  .long 5
	a:  .long 10
		.long 20
		.long 30
		.long 40
		.long 50
	b:  .long 11
		.long 20
		.long 28
		.long 44
		.long 48
	c:  .long 0

Problem 3

Implement an assembly-language program that examines a list of student grades and finds the student with the median average grade; this student’s id should be placed in the variable m.

Your program must correspond to a C program where the input list of students, and the output student id are given as follows.

struct Student {
    int sid;
    int grade[4];
    int average;      // this is computed by your program
};

int n;              // number of students
int m;              // you store the median student's id here
struct Student* s;  // a dynamic array of n students

Your assembly file must format student records exactly as C would; i.e., a struct with 6 integers. You must use the following assembly declarations for this input and output data.

Note: this example is for an array with one student. You should obviously test your code on larger arrays. The locations for each variable are also not predetermined. Your code should work no matter where in memory each of the variables are placed.

n:      .long 1       # just one student
m:      .long 0       # put the answer here
s:      .long base    # address of the array
base:   .long 1234    # student ID
        .long 80      # grade 0
        .long 60      # grade 1
        .long 78      # grade 2
        .long 90      # grade 3
        .long 0       # computed average

Note that the code above uses the label base as the base of the array to simplify the assignment of values to s; you may assign this label to s, but your code must not use this label in its implementation. You must use s as the pointer variable that contains the base address of the array. The autograder assigns ("allocates") a different base address to s, so your code must be ready to handle that.

Put your solution in a file named q3.s.

Work Incrementally – Do Each of These Steps Separately

This is a very challenging programming problem. It will stretch (and strengthen) your ability to manage a large number of programming details, as is required when writing assembly. To be successful with this you must be very disciplined to program and test incrementally.

Observe that the program breaks down into several subproblems. You should tackle them one at a time and thoroughly test each step before moving to the next.

Here’s a list of the key subproblems. You might want to further sub-divide things, but do not try to combine steps.

Compute the average grade for a single student and store it in the struct. For simplicity, you can ignore the fractional part of the average; i.e., you do not need to round.
Iterate through the list of students and compute their average grades and store them.
Swap the position of two adjacent students in the list.
Compare the average grades of two adjacent students and swap their position conditionally, using your code from Step 3.
Now, optionally, consider creating a procedure to encapsulate either Step 3 or Step 4 as described in the Use Procedures section below.
Sort the list by average grade in ascending order. You are free to use any sorting algorithm you like, but Bubble Sort is the simplest. Here’s a version of bubble (sinking) sort in C that you might consider.

void sort (int* a, int n) {
  for (int i=n-1; i>0; i--)
    for (int j=1; j<=i; j++)
      if (a[j-1] > a[j]) {
        int t = a[j];
        a[j] = a[j-1];
        a[j-1] = t;
      }
}

Take this step by step and incorporate your work from Step 3-5 above. For Bubble Sort, here are two subproblems.

First, write a loop that iterates through the list once, bubbling the student with the lowest average to the top (or sinking the highest student to the bottom). If you created a procedure in Step 5, then the inner part of this loop is a call to this procedure; otherwise it is the code from Step 4.
Then, add the outer loop that repeats the inner loop on the unsorted sublist repeatedly until the entire list is sorted.

Find the median entry in the sorted list and store that student’s sid in m. For simplicity you can assume the list contains an odd number of students.

Ordering and Combining Steps

Notice that it is not necessary to do these steps in the order listed above. Steps 1 and 2, for example, are completely independent from Steps 3-7. Step 7 is independent from the other steps. You could do Step 6 before Steps 3-5 and then incorporate the conditional swap once the other parts of Step 6 are working. Similarly, you can do Step 5 before Steps 3 or 4 and incorporate them into the procedure later. The key thing here is to work on each piece independently and then carefully combine steps to eventually produce the program. The program itself will be on the order of 70 to 100 assembly-language instructions. That sounds like a lot ... and it is. But if you think of this as seven steps each with 15 or so instructions (some will have less), it's not so bad. Or maybe it will be bad, but at least not impossible.

Be sure to comment every line of your code and to separate subproblems with comments and labels so that it is easy for you to see what each part of the code does without having to read the code.

Multiplying and Dividing by 24

You will have noticed that the variable s is a dynamic array of type struct Student and that sizeof(struct Student) is 24. And so you might find that you’d like to multiply or divide by 24 (i.e., to convert between an array index and a byte-offset), but you’ll recall that you can’t do this with a single instruction in our ISA. Then you’ll notice that x*24 = x*16 + x*8 and you’ll see that you can multiply by 24 (or divide) using 3 instructions (or 4 depending on how you count).

Register Allocation

Keeping track of registers is going to be a challenge. Note that this program is too big for you to use a distinct register for every value in the program. You are going to have to re-use registers to store different things in different parts of the program.

This will add complexity, for example, when combining two steps or when adding a step to the rest of the program. For example, if your code for Step 4 uses register r0 and one of the loops you write in Step 6 also uses r0, you’re going to have to change one of them to use a different register (or use a procedure as described in the next section). So, for parts of the code that connect like this, some pre-planning will help.

Another useful strategy is to group sections of code (one or maybe a few steps) and treat them as independent stages with respect to registers. At the beginning of a stage you assume nothing about the current value of registers and you are free to use any registers you like within that stage. At the end of the stage, any register values that are needed by subsequent stages should be written to memory. You may want to create some additional variables to store these temporary values.

Using Procedures

One way to divide code into stages is to use procedures. Step 5 suggests that you do this to encapsulate the code that conditionally swaps two adjacent elements of the list (i.e., Steps 3 and 4). You can use this approach to simplify register management by having the procedure save registers to memory before it starts and restore them from memory before it returns (e.g., using temporary variables). Any register the procedure saves in this way is a register it is free to use without interfering with the registers used in the two loops in Step 6 that call it.

This swap procedure needs one argument / parameter: i.e., the index of one of the array elements to swap. The index of the other element is this value plus or minus one, depending on how you do it. The caller should pass this value in a register; for example if the loop has the index in r4 then the procedure should just use r4 to get this value.

Use gpc and j to call the procedure and use the indirect jump to return. Do not worry about creating a stack.

All of this is optional, and you can do this for other parts of your program as well.

Partial Marks - Work Incrementally

Finally, partial marks (if you need them ... probably not!) will be awarded based on the number of the steps listed above that you’ve written correctly. Submitting a big blob of assembly that doesn’t do any of the steps right won’t be worth anything. So, work incrementally. Test each step separately. Once you have a step working, save it in an auxiliary file just in case.

.pos 0x1000

# Jump to main program
br main

main:

# for (i = n; i > 0; i--)
#   sum = s[i-1].grade[0] + ... + s[i-1].grade[3]
#   s[i-1].average = sum / 4

    ld $n, r0                   # r0 = &n
    ld (r0), r0                 # r0 = n (number of students)
    ld $i, r7                   # r7 = &i
    st r0, (r7)                 # i = n

avg_loop:
    ld $i, r7                   # r7 = &i
    ld (r7), r7                 # r7 = i
    beq r7, avg_done            # if i == 0, all averages computed
    dec r7                      # r7 = i - 1 (0-based student index)

    # Compute byte offset = (i-1)*24 = (i-1)*16 + (i-1)*8
    mov r7, r0                  # r0 = i - 1
    shl $4, r7                  # r7 = (i-1) * 16
    shl $3, r0                  # r0 = (i-1) * 8
    add r0, r7                  # r7 = (i-1) * 24

    # Get address of student s[i-1]
    ld $s, r1                   # r1 = &s
    ld (r1), r1                 # r1 = s (pointer to student array)
    add r7, r1                  # r1 = &s[i-1]

    # Sum the 4 grades at offsets 4, 8, 12, 16
    ld 4(r1), r2                # r2 = s[i-1].grade[0]
    ld 8(r1), r3                # r3 = s[i-1].grade[1]
    add r3, r2                  # r2 = grade[0] + grade[1]
    ld 12(r1), r3               # r3 = s[i-1].grade[2]
    add r3, r2                  # r2 += grade[2]
    ld 16(r1), r3               # r3 = s[i-1].grade[3]
    add r3, r2                  # r2 = sum of all 4 grades

    # Divide sum by 4 (truncating integer division)
    shr $2, r2                  # r2 = sum / 4

    # Store computed average into the student struct
    st r2, 20(r1)               # s[i-1].average = r2

    # Decrement i and repeat
    ld $i, r7                   # r7 = &i
    ld (r7), r0                 # r0 = i
    dec r0                      # r0 = i - 1
    st r0, (r7)                 # i = i - 1

    br avg_loop                 # continue loop

avg_done:

# for (i = n-1; i > 0; i--)
#   for (j = 0; j < i; j++)
#     if (s[j].average > s[j+1].average)
#       swap(s[j], s[j+1])

    ld $n, r0                   # r0 = &n
    ld (r0), r0                 # r0 = n
    dec r0                      # r0 = n - 1
    ld $i, r7                   # r7 = &i
    st r0, (r7)                 # i = n - 1

sort_outer:
    ld $i, r7                   # r7 = &i
    ld (r7), r7                 # r7 = i
    bgt r7, sort_inner_init     # if i > 0, proceed to inner loop
    br sort_done                # else sorting is complete

sort_inner_init:
    ld $0, r0                   # r0 = 0
    ld $j, r7                   # r7 = &j
    st r0, (r7)                 # j = 0

sort_inner:
    # Load j and i for loop bound check
    ld $j, r7                   # r7 = &j
    ld (r7), r3                 # r3 = j
    ld $i, r7                   # r7 = &i
    ld (r7), r4                 # r4 = i

    # Check j < i by computing j - i
    mov r3, r7                  # r7 = j
    not r4                      # r4 = ~i
    inc r4                      # r4 = -i
    add r4, r7                  # r7 = j - i
    bgt r7, sort_inner_end      # if j > i, exit inner loop
    beq r7, sort_inner_end      # if j == i, exit inner loop

    # Compute &s[j] = s + j * 24
    ld $s, r1                   # r1 = &s
    ld (r1), r1                 # r1 = s (base address)
    mov r3, r5                  # r5 = j
    mov r3, r6                  # r6 = j
    shl $4, r5                  # r5 = j * 16
    shl $3, r6                  # r6 = j * 8
    add r6, r5                  # r5 = j * 24
    add r5, r1                  # r1 = &s[j]
    mov r1, r5                  # r5 = &s[j] (save for swap)

    # Compare s[j].average with s[j+1].average
    ld 20(r5), r1               # r1 = s[j].average
    ld 44(r5), r2               # r2 = s[j+1].average (offset 20+24=44)

    # Compute difference: s[j].avg - s[j+1].avg
    not r2                      # r2 = ~s[j+1].average
    inc r2                      # r2 = -s[j+1].average
    add r2, r1                  # r1 = s[j].avg - s[j+1].avg
    bgt r1, do_swap             # if positive, s[j] has higher avg -> swap
    br no_swap                  # otherwise skip swap

# r5 = &s[j]; s[j+1] starts at r5 + 24
do_swap:
    # Swap field 0: sid (offsets 0 and 24)
    ld 0(r5), r1               # r1 = s[j].sid
    ld 24(r5), r2              # r2 = s[j+1].sid
    st r2, 0(r5)               # s[j].sid = s[j+1].sid
    st r1, 24(r5)              # s[j+1].sid = old s[j].sid

    # Swap field 1: grade[0] (offsets 4 and 28)
    ld 4(r5), r1               # r1 = s[j].grade[0]
    ld 28(r5), r2              # r2 = s[j+1].grade[0]
    st r2, 4(r5)               # s[j].grade[0] = s[j+1].grade[0]
    st r1, 28(r5)              # s[j+1].grade[0] = old s[j].grade[0]

    # Swap field 2: grade[1] (offsets 8 and 32)
    ld 8(r5), r1               # r1 = s[j].grade[1]
    ld 32(r5), r2              # r2 = s[j+1].grade[1]
    st r2, 8(r5)               # s[j].grade[1] = s[j+1].grade[1]
    st r1, 32(r5)              # s[j+1].grade[1] = old s[j].grade[1]

    # Swap field 3: grade[2] (offsets 12 and 36)
    ld 12(r5), r1              # r1 = s[j].grade[2]
    ld 36(r5), r2              # r2 = s[j+1].grade[2]
    st r2, 12(r5)              # s[j].grade[2] = s[j+1].grade[2]
    st r1, 36(r5)              # s[j+1].grade[2] = old s[j].grade[2]

    # Swap field 4: grade[3] (offsets 16 and 40)
    ld 16(r5), r1              # r1 = s[j].grade[3]
    ld 40(r5), r2              # r2 = s[j+1].grade[3]
    st r2, 16(r5)              # s[j].grade[3] = s[j+1].grade[3]
    st r1, 40(r5)              # s[j+1].grade[3] = old s[j].grade[3]

    # Swap field 5: average (offsets 20 and 44)
    ld 20(r5), r1              # r1 = s[j].average
    ld 44(r5), r2              # r2 = s[j+1].average
    st r2, 20(r5)              # s[j].average = s[j+1].average
    st r1, 44(r5)              # s[j+1].average = old s[j].average

no_swap:
    # Increment j and continue inner loop
    ld $j, r7                   # r7 = &j
    ld (r7), r0                 # r0 = j
    inc r0                      # r0 = j + 1
    st r0, (r7)                 # j = j + 1
    br sort_inner               # repeat inner loop

sort_inner_end:
    # Decrement i and continue outer loop
    ld $i, r7                   # r7 = &i
    ld (r7), r0                 # r0 = i
    dec r0                      # r0 = i - 1
    st r0, (r7)                 # i = i - 1
    br sort_outer               # repeat outer loop

sort_done:

# For odd n, median is at index n/2 (0-based) in sorted array

    ld $n, r0                   # r0 = &n
    ld (r0), r0                 # r0 = n
    shr $1, r0                  # r0 = n / 2 (median index, 0-based)

    # Compute &s[median] = s + median_index * 24
    ld $s, r1                   # r1 = &s
    ld (r1), r1                 # r1 = s (base address of array)
    mov r0, r2                  # r2 = median_index
    shl $4, r0                  # r0 = median_index * 16
    shl $3, r2                  # r2 = median_index * 8
    add r2, r0                  # r0 = median_index * 24
    add r0, r1                  # r1 = &s[median_index]

    # Store median student's sid in m
    ld (r1), r0                 # r0 = s[median_index].sid
    ld $m, r1                   # r1 = &m
    st r0, (r1)                 # m = median student's sid

    halt                        # done

.pos 0x2000
    n:      .long 3             # number of students
    m:      .long 0             # output: median student's sid goes here
    s:      .long base          # pointer to student array
    i:      .long 0             # loop variable i
    j:      .long 0             # loop variable j

    base:   .long 1             # student 0: sid = 1
            .long 80            # grade[0]
            .long 60            # grade[1]
            .long 78            # grade[2]
            .long 90            # grade[3]
            .long 0             # average (computed by program)

            .long 2             # student 1: sid = 2
            .long 90            # grade[0]
            .long 95            # grade[1]
            .long 88            # grade[2]
            .long 87            # grade[3]
            .long 0             # average (computed by program)

            .long 3             # student 2: sid = 3
            .long 50            # grade[0]
            .long 60            # grade[1]
            .long 70            # grade[2]
            .long 40            # grade[3]
            .long 0             # average (computed by program)