<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta http-equiv="x-ua-compatible" content="ie=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>gem5</title>
<!-- SITE FAVICON -->
<link rel="shortcut icon" type="image/gif" href="/assets/img/gem5ColorVert.gif"/>
<link rel="canonical" href="http://localhost:4000/MSIdebugging/">
<link href='https://fonts.googleapis.com/css?family=Open+Sans:400,300,700,800,600' rel='stylesheet' type='text/css'>
<link href='https://fonts.googleapis.com/css?family=Muli:400,300' rel='stylesheet' type='text/css'>
<!-- FAVICON -->
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/font-awesome/4.3.0/css/font-awesome.min.css">
<!-- BOOTSTRAP -->
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
<!-- CUSTOM CSS -->
<link rel="stylesheet" href="/css/main.css">
</head>
<body>
<nav class="navbar navbar-expand-md navbar-light bg-light">
<a class="navbar-brand" href="/">
<img src="/assets/img/gem5ColorLong.gif" alt="gem5" height=45px>
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNavDropdown" aria-controls="navbarNavDropdown" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarNavDropdown">
<ul class="navbar-nav ml-auto">
<li class="nav-item ">
<a class="nav-link" href="/">Home</a>
</li>
<li class="nav-item dropdown ">
<a class="nav-link dropdown-toggle" href="/about" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
About
</a>
<div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink">
<a class="dropdown-item" href="/about">About</a>
<a class="dropdown-item" href="/publications">Publications</a>
<a class="dropdown-item" href="/governance">Governance</a>
</div>
</li>
<li class="nav-item dropdown active">
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
Documentation
</a>
<div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink">
<!-- Pull navigation from _data/documentation.yml -->
<a class="dropdown-item" href="/introduction">Introduction</a>
<a class="dropdown-item" href="/building">Getting Started</a>
<a class="dropdown-item" href="/environment">Modifying/Extending</a>
<a class="dropdown-item" href="/MSIintro">Modeling Cache Coherence with Ruby</a>
</div>
</li>
<li class="nav-item ">
<a class="nav-link" href="/contributing">Contributing</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="/search">Search</a>
</li>
</ul>
</div>
</nav>
<main>
<div class="sidenav-top">
<img src="/assets/img/gem5ColorLong.gif" height="80">
<div class="search">
<form action="/search" method="get">
<!-- <label for="search-box"><i class="fa fa-search"></i></label> -->
<input type="text" name="query">
<button type="submit" name="submit"><i class="fa fa-search"></i></button>
</form>
</div>
</div>
<div class="sidenav">
<!-- Pull navigation from _data/documentation.yml -->
<a class="item" href="/introduction" role="button" aria-expanded="false" aria-controls="collapseExample">
Introduction
</a>
<div class="collapse " id="introdution">
</div>
<a class="item" data-toggle="collapse" href="#pt1" role="button" aria-expanded="false" aria-controls="collapseExample">
Getting Started
</a>
<div class="collapse " id="pt1">
<a class="subitem " href="/building">Building gem5</a>
<a class="subitem " href="/simple_config">Creating a simple configuration script</a>
<a class="subitem " href="/cache_config">Adding cache to configuration script</a>
<a class="subitem " href="/gem5_stats">Understanding gem5 statistics and output</a>
<a class="subitem " href="/example_configs">Using the default configuration scripts</a>
</div>
<a class="item" data-toggle="collapse" href="#pt2" role="button" aria-expanded="false" aria-controls="collapseExample">
Modifying/Extending
</a>
<div class="collapse " id="pt2">
<a class="subitem " href="/environment">Setting up your development environment</a>
<a class="subitem " href="/helloobject">Creating a very simple SimObject</a>
<a class="subitem " href="/debugging">Debugging gem5</a>
<a class="subitem " href="/events">Event-driven programming</a>
<a class="subitem " href="/parameters">Adding parameters to SimObjects and more events</a>
<a class="subitem " href="/memoryobject">Creating SimObjects in the memory system</a>
<a class="subitem " href="/simplecache">Creating a simple cache object</a>
</div>
<a class="item" data-toggle="collapse" href="#pt3" role="button" aria-expanded="false" aria-controls="collapseExample">
Modeling Cache Coherence with Ruby
</a>
<div class="collapse show" id="pt3">
<a class="subitem " href="/MSIintro">Introduction to Ruby</a>
<a class="subitem " href="/cache-intro">MSI example cache protocol</a>
<a class="subitem " href="/cache-declarations">Declaring a state machine</a>
<a class="subitem " href="/cache-in-ports">In port code blocks</a>
<a class="subitem " href="/cache-actions">Action code blocks</a>
<a class="subitem " href="/cache-transitions">Transition code blocks</a>
<a class="subitem " href="/directory">MSI Directory implementation</a>
<a class="subitem " href="/MSIbuilding">Compiling a SLICC protocol</a>
<a class="subitem " href="/configuration">Configuring a simple Ruby system</a>
<a class="subitem " href="/running">Running the simple Ruby system</a>
<a class="subitem active" href="/MSIdebugging">Debugging SLICC Protocols</a>
<a class="subitem " href="/simple-MI_example">Configuring for a standard protocol</a>
</div>
</div>
<div class="container" id="doc-container">
<!-- <h1>Debugging SLICC Protocols</h1> -->
<dl>
<dt>authors</dt>
<dd>Jason Lowe-Power</dd>
</dl>
<h1 id="debugging-slicc-protocols">Debugging SLICC Protocols</h1>
<p>In this section, I present the steps that I took while debugging the MSI
protocol implemented earlier in this chapter. Learning to debug
coherence protocols is a challenge. The best way is by working with
others who have written SLICC protocols in the past. However, since you,
the reader, cannot look over my shoulder while I am debugging a
protocol, I am trying to present the next-best thing.</p>
<p>Here, I first present some high-level suggestions for tackling protocol
errors. Next, I discuss some details about deadlocks and how to
understand the protocol traces that can be used to fix them. Then, I present
my experience debugging the MSI protocol in this chapter in a
stream-of-consciousness style. I will show the error that was generated,
then the solution to the error, sometimes with commentary on the
different tactics I tried while solving it.</p>
<h2 id="general-debugging-tips">General debugging tips</h2>
<p>Ruby has many useful debug flags. However, the most useful, by far, is
<code class="highlighter-rouge">ProtocolTrace</code>. Below, you will see several examples of using the
protocol trace to debug a protocol. The protocol trace prints every
transition for all controllers. Thus, you can simply trace the entire
execution of the cache system.</p>
<p>Other useful debug flags include:</p>
<dl>
<dt>RubyGenerated</dt>
<dd>Prints debug output from the code that SLICC generates for the protocol.</dd>
<dt>RubyPort/RubySequencer</dt>
<dd>Show the details of messages sent into and out of Ruby.</dd>
<dt>RubyNetwork</dt>
<dd>Prints entire network messages including the sender/receiver and the
data within the message for all messages. This flag is useful when
there is a data mismatch.</dd>
</dl>
<p>The first step to debugging a Ruby protocol is to run it with the Ruby
random tester. The random tester issues semi-random requests into the
Ruby system and checks to make sure the returned data is correct. To
make debugging faster, the random tester issues read requests from one
controller for a block and a write request for the same cache block (but
a different byte) from a different controller. Thus, the Ruby random
tester does a good job exercising the transient states and race
conditions in the protocol.</p>
<p>Unfortunately, the random tester’s configuration is slightly different
from the one used with normal CPUs. Thus, we need to use a different
<code class="highlighter-rouge">MyCacheSystem</code> than before. You can download this different cache
system file here:
<code class="highlighter-rouge">../../_static/scripts/part3/configs/test_caches.py</code>, and you
can download the modified run script here:
<code class="highlighter-rouge">../../_static/scripts/part3/configs/ruby_test.py</code>. The test
run script is mostly the same as the simple run script, but it creates a
<code class="highlighter-rouge">RubyRandomTester</code> instead of CPUs.</p>
<p>It is often a good idea to first run the random tester with a single
“CPU”. Then, increase the number of loads from the default of 100 to
something that takes a few minutes to execute on your host system. Next,
if there are no errors, then increase the number of “CPUs” to two and
reduce the number of loads to 100 again. Then, start increasing the
number of loads. Finally, you can increase the number of CPUs to
something reasonable for the system you are trying to simulate. If you
can run the random tester for 10-15 minutes, you can be slightly
confident that the random tester isn’t going to find any other bugs.</p>
<p>Once you have your protocol working with the random tester, you can move
on to using real applications. It is likely that real applications will
expose even more bugs in the protocol. If at all possible, it is much
easier to debug your protocol with the random tester than with real
applications!</p>
<h2 id="understanding-protocol-traces">Understanding Protocol Traces</h2>
<p>Unfortunately, despite extensive effort to catch bugs, coherence
protocols (even heavily tested ones) will have them. Sometimes these
bugs are relatively simple fixes, while other times they are
very insidious and difficult to track down. In the worst case, a bug
manifests as a deadlock: the application simply stops making
progress. A similar problem is a livelock,
where the program runs forever due to a cycle somewhere in the system.
Whenever livelocks or deadlocks occur, the next step is to generate
a protocol trace. Traces print a running list of every transition that
happens in the memory system: memory requests starting and
completing, L1 and directory transitions, etc. You can then use these
traces to identify why the deadlock is occurring. However, as we will
discuss in more detail below, debugging deadlocks with protocol traces is
often extremely challenging.</p>
<p>Here, we discuss what appears in the protocol trace to help explain what
is happening. To start with, let’s look at a small snippet of a protocol
trace (we will discuss the details of this trace further below):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>...
4541 0 L1Cache Replacement MI_A&gt;MI_A [0x4ac0, line 0x4ac0]
4542 0 L1Cache PutAck MI_A&gt;I [0x4ac0, line 0x4ac0]
4549 0 Directory MemAck MI_M&gt;I [0x4ac0, line 0x4ac0]
4641 0 Seq Begin &gt; [0x4aec, line 0x4ac0] LD
4652 0 L1Cache Load I&gt;IS_D [0x4ac0, line 0x4ac0]
4657 0 Directory GetS I&gt;S_M [0x4ac0, line 0x4ac0]
4669 0 Directory MemData S_M&gt;S [0x4ac0, line 0x4ac0]
4674 0 Seq Done &gt; [0x4aec, line 0x4ac0] 33 cycles
4674 0 L1Cache DataDirNoAcks IS_D&gt;S [0x4ac0, line 0x4ac0]
5321 0 Seq Begin &gt; [0x4aec, line 0x4ac0] ST
5322 0 L1Cache Store S&gt;SM_AD [0x4ac0, line 0x4ac0]
5327 0 Directory GetM S&gt;M_M [0x4ac0, line 0x4ac0]
</code></pre></div></div>
<p>Every line in this trace has a set pattern in terms of what information
appears on that line. Specifically, the fields are:</p>
<ol>
<li>Current Tick: the tick in which the print occurs</li>
<li>Machine Version: The number of the machine where this request is
coming from. For example, if there are 4 L1 caches, then the numbers
would be 0-3. Assuming you have 1 L1 Cache per core, you can think
of this as representing the core the request is coming from.</li>
<li>Component: which part of the system is doing the print. Generally,
<code class="highlighter-rouge">Seq</code> is shorthand for Sequencer, <code class="highlighter-rouge">L1Cache</code> represents the L1 Cache,
“Directory” represents the directory, and so on. For L1 caches and
the directory, this represents the name of the machine type (i.e.,
what is after “MachineType:” in the <code class="highlighter-rouge">machine()</code> definition).</li>
<li>Action: what the component is doing. For example, “Begin” means the
Sequencer has received a new request, “Done” means that the
Sequencer is completing a previous request, and “DataDirNoAcks”
means that our DataDirNoAcks event is being triggered.</li>
<li>Transition (e.g., MI_A&gt;MI_A): what state transition this action
is doing (format: “currentState&gt;nextState”). If no transition is
happening, this is denoted with “&gt;”.</li>
<li>Address (e.g., [0x4ac0, line 0x4ac0]): the physical address of the
request (format: [wordAddress, lineAddress]). This address will
always be cache-block aligned except for requests from the
<code class="highlighter-rouge">Sequencer</code> and <code class="highlighter-rouge">mandatoryQueue</code>.</li>
<li>(Optional) Comments: optionally, there is one additional field for
comments. For example, the “LD”, “ST”, and “33 cycles” entries above
use this extra field to pass additional information to the trace,
such as identifying the request as a load or store. For SLICC
transitions, <code class="highlighter-rouge">APPEND_TRANSITION_COMMENT</code> is often used to fill this
field, as we discussed previously in the <a href="/cache-actions">action code blocks</a> section.</li>
</ol>
<p>Generally, spaces are used to separate each of these fields (the spaces
between the fields are added implicitly; you do not need to add them).
However, if a field is very long, there may be no space between fields, or
the line may be shifted relative to other lines.</p>
<p>Using this information, let’s analyze the above snippet. The first
(tick) field tells us that this trace snippet is showing what was
happening in the memory system between ticks 4541 and 5327. In this
snippet, all of the requests are coming from L1Cache-0 (core 0) and
going to Directory-0 (the first bank of the directory). During this
time, we see several memory requests and state transitions for the cache
line 0x4ac0, both at the L1 caches and the directory. For example, in
tick 5322, the core executes a store to 0x4ac0. However, it currently
does not have that line in Modified in its cache (it is in Shared after
the core loaded it from ticks 4641-4674), so it needs to request
ownership for that line from the directory (which receives this request
in tick 5327). While waiting for ownership, L1Cache-0 transitions from S
(Shared) to SM_AD (a transient state – was in S, going to M, waiting
for Ack and Data).</p>
<p>To add your own print to the protocol trace, add a print with
these fields using the ProtocolTrace debug flag. For example, if you look at
<code class="highlighter-rouge">src/mem/ruby/system/Sequencer.cc</code>, you can see where the
<code class="highlighter-rouge">Seq Begin</code> and <code class="highlighter-rouge">Seq Done</code> trace prints
come from (search for ProtocolTrace).</p>
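<p>For example, a print in this style might look roughly like the following. This is
only a sketch: the exact format string and arguments in <code class="highlighter-rouge">Sequencer.cc</code> differ
slightly, and <code class="highlighter-rouge">addr</code> here simply stands in for the request’s line address.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Sketch of a ProtocolTrace print in the Sequencer's style (not the exact
// code in Sequencer.cc). The columns mirror the fields described above:
// tick, machine version, component, action, currentState&gt;nextState,
// address, and an optional comment.
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s&gt;%-6s %#x %s\n",
         curTick(), m_version, "Seq", "Begin", "", "", addr, "LD");
</code></pre></div></div>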
<h2 id="errors-i-ran-into-debugging-msi">Errors I ran into debugging MSI</h2>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>gem5.opt: build/MSI/mem/ruby/system/Sequencer.cc:423: void Sequencer::readCallback(Addr, DataBlock&amp;, bool, MachineType, Cycles, Cycles, Cycles): Assertion `m_readRequestTable.count(makeLineAddress(address))' failed.
</code></pre></div></div>
<p>I’m an idiot: I called readCallback in externalStoreHit
instead of writeCallback. It’s good to start simple!</p>
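<p>For reference, the corrected action ends up looking something like the sketch
below (the exact argument list of writeCallback in the protocol may differ slightly;
the point is simply that a store hit must use writeCallback):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>action(externalStoreHit, "xsh", desc="External store hit (was a forward)") {
    assert(is_valid(cache_entry));
    // This is a store completing, so it must call writeCallback,
    // not readCallback.
    sequencer.writeCallback(address, cache_entry.DataBlk, true);
}
</code></pre></div></div>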
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>gem5.opt: build/MSI/mem/ruby/network/MessageBuffer.cc:220: Tick MessageBuffer::dequeue(Tick, bool): Assertion `isReady(current_time)' failed.
</code></pre></div></div>
<p>I ran gem5 in GDB to get more information and looked at
L1Cache_Controller::doTransitionWorker. The current transition is
event=L1Cache_Event_PutAck, state=L1Cache_State_MI_A,
next_state=L1Cache_State_FIRST; more simply, this is
MI_A-&gt;I on a PutAck. The failing dequeue is in popResponseQueue.</p>
<p>The problem is that the PutAck is on the forward network, not the
response network.</p>
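<p>In other words, the MI_A/PutAck transition has to pop the forward request queue
rather than the response queue. A sketch of the fixed transition (action names
follow the ones used earlier in this chapter):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>transition(MI_A, PutAck, I) {
    deallocateCacheBlock;
    // The PutAck arrives on the forward network, so pop that queue.
    popForwardQueue;
}
</code></pre></div></div>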
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>panic: Invalid transition
system.caches.controllers0 time: 3594 addr: 3264 event: DataDirAcks state: IS_D
</code></pre></div></div>
<p>Hmm. I think this shouldn’t have happened. The needed acks should always
be 0, or you get data from the owner. Ah. So I implemented sendDataToReq
at the directory to always send the number of sharers. If we get this
response in IS_D, we don’t care whether or not there are sharers. Thus,
to make things simpler, I’m just going to transition to S on
DataDirAcks. This is a slight difference from the original
implementation in Sorin et al.</p>
<p>Well, actually, I think it’s that we send the request after we add
ourselves to the sharer list. The above is <em>incorrect</em>. Sorin et al.
were not wrong! Let’s try not doing that!</p>
<p>So, I fixed this by checking to see if the requestor is the <em>owner</em>
before sending the data to the requestor at the directory. Only if the
requestor is the owner do we include the number of sharers. Otherwise,
it doesn’t matter at all and we just set the sharers to 0.</p>
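<p>A sketch of what sendDataToReq ends up looking like at the directory (the port
and field names here follow the conventions used earlier in this chapter; your
version may differ slightly):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>action(sendDataToReq, "d", desc="Send data from memory to the requestor") {
    peek(memQueue_in, MemoryMsg) {
        enqueue(response_out, ResponseMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceResponseType:Data;
            out_msg.Sender := machineID;
            out_msg.Destination.add(in_msg.OriginalRequestorMachId);
            out_msg.DataBlk := in_msg.DataBlk;
            out_msg.MessageSize := MessageSizeType:Data;
            // Only the new owner (a GetM requestor) cares about the ack
            // count; for a GetS requestor the sharer count is irrelevant.
            if (getDirectoryEntry(address).Owner.isElement(in_msg.OriginalRequestorMachId)) {
                out_msg.Acks := getDirectoryEntry(address).Sharers.count();
            } else {
                out_msg.Acks := 0;
            }
        }
    }
}
</code></pre></div></div>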
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>panic: Invalid transition
system.caches.controllers0 time: 5332 addr: 0x4ac0 event: Inv state: SM_AD
</code></pre></div></div>
<p>First, let’s look at where Inv is triggered: it is only triggered when we receive
an invalidate from the directory. Maybe it’s that we are on the sharer list and
shouldn’t be?</p>
<p>We can use protocol trace and grep to find what’s going on.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>build/MSI/gem5.opt --debug-flags=ProtocolTrace configs/learning_gem5/part6/ruby_test.py | grep 0x4ac0
</code></pre></div></div>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>...
4541 0 L1Cache Replacement MI_A&gt;MI_A [0x4ac0, line 0x4ac0]
4542 0 L1Cache PutAck MI_A&gt;I [0x4ac0, line 0x4ac0]
4549 0 Directory MemAck MI_M&gt;I [0x4ac0, line 0x4ac0]
4641 0 Seq Begin &gt; [0x4aec, line 0x4ac0] LD
4652 0 L1Cache Load I&gt;IS_D [0x4ac0, line 0x4ac0]
4657 0 Directory GetS I&gt;S_M [0x4ac0, line 0x4ac0]
4669 0 Directory MemData S_M&gt;S [0x4ac0, line 0x4ac0]
4674 0 Seq Done &gt; [0x4aec, line 0x4ac0] 33 cycles
4674 0 L1Cache DataDirNoAcks IS_D&gt;S [0x4ac0, line 0x4ac0]
5321 0 Seq Begin &gt; [0x4aec, line 0x4ac0] ST
5322 0 L1Cache Store S&gt;SM_AD [0x4ac0, line 0x4ac0]
5327 0 Directory GetM S&gt;M_M [0x4ac0, line 0x4ac0]
</code></pre></div></div>
<p>Maybe there is a sharer in the sharers list when there shouldn’t be? We
can add a defensive assert in clearOwner and setOwner.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>action(setOwner, "sO", desc="Set the owner") {
    assert(getDirectoryEntry(address).Sharers.count() == 0);
    peek(request_in, RequestMsg) {
        getDirectoryEntry(address).Owner.add(in_msg.Requestor);
    }
}

action(clearOwner, "cO", desc="Clear the owner") {
    assert(getDirectoryEntry(address).Sharers.count() == 0);
    getDirectoryEntry(address).Owner.clear();
}
</code></pre></div></div>
<p>Now, I get the following error:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>panic: Runtime Error at MSI-dir.sm:301: assert failure.
</code></pre></div></div>
<p>This is in setOwner. Well, actually this is OK since we need to have the
sharers still set until we count them to send the ack count to the
requestor. Let’s remove that assert and see what happens. Nothing. That
didn’t help anything.</p>
<p>When are invalidations sent from the directory? Only on S-&gt;M_M. So,
here, we need to remove ourselves from the invalidation list. I think we
need to keep ourselves in the sharer list since we subtract one when
sending the number of acks.</p>
<p>Note: I’m coming back to this a little later. It turns out that both of
these asserts are wrong. I found this out when running with more than
one CPU below. The sharers are set before clearing the Owner in M-&gt;S_D
on a GetS.</p>
<p>So, onto the next problem!</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>panic: Deadlock detected: current_time: 56091 last_progress_time: 6090 difference: 50001 processor: 0
</code></pre></div></div>
<p>Deadlocks are the worst kind of error. Whatever caused the deadlock is
ancient history (i.e., likely happened many cycles earlier), and often
very hard to track down.</p>
<p>Looking at the tail of the protocol trace (note: sometimes you must put
the protocol trace into a file because it grows <em>very</em> big) I see that
there is an address that is trying to be replaced. Let’s start there.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
56091 0 L1Cache Replacement SM_A&gt;SM_A [0x5ac0, line 0x5ac0]
</code></pre></div></div>
<p>Before this replacement got stuck I see the following in the protocol
trace. Note: this is 50000 cycles in the past!</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>...
5592 0 L1Cache Store S&gt;SM_AD [0x5ac0, line 0x5ac0]
5597 0 Directory GetM S&gt;M_M [0x5ac0, line 0x5ac0]
...
5641 0 Directory MemData M_M&gt;M [0x5ac0, line 0x5ac0]
...
5646 0 L1Cache DataDirAcks SM_AD&gt;SM_A [0x5ac0, line 0x5ac0]
</code></pre></div></div>
<p>Ah! This clearly should not be DataDirAcks since we only have a single
CPU! So, we seem to not be subtracting properly. Going back to the
previous error, I was wrong about needing to keep ourselves in the list.
I forgot that we no longer had the -1 thing. So, let’s remove ourselves
from the sharing list before sending the invalidations when we
originally get the S-&gt;M request.</p>
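<p>Concretely, the directory gets a small action along these lines, used when handling
the GetM in state S before the invalidations are sent (a sketch; the action name is
just what I call it here):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>action(removeReqFromSharers, "rR", desc="Remove the requestor from the sharers") {
    peek(request_in, RequestMsg) {
        // The requestor must not receive (or be counted for) an invalidation.
        getDirectoryEntry(address).Sharers.remove(in_msg.Requestor);
    }
}
</code></pre></div></div>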
<p>So! With those changes the Ruby tester completes with a single core.
Now, to make it harder we need to increase the number of loads we do and
then the number of cores.</p>
<p>And, of course, when I increase it to 10,000 loads there is a deadlock.
Fun!</p>
<p>What I’m seeing at the end of the protocol trace is the following.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>144684 0 L1Cache Replacement MI_A&gt;MI_A [0x5bc0, line 0x5bc0]
...
144685 0 Directory GetM MI_M&gt;MI_M [0x54c0, line 0x54c0]
...
144685 0 L1Cache Replacement MI_A&gt;MI_A [0x5bc0, line 0x5bc0]
...
144686 0 Directory GetM MI_M&gt;MI_M [0x54c0, line 0x54c0]
...
144686 0 L1Cache Replacement MI_A&gt;MI_A [0x5bc0, line 0x5bc0]
...
144687 0 Directory GetM MI_M&gt;MI_M [0x54c0, line 0x54c0]
...
</code></pre></div></div>
<p>This is repeated for a long time.</p>
<p>It seems that there is a circular dependence or something like that
causing this deadlock.</p>
<p>Well, it seems that I was correct. The order of the in_ports really
matters! In the directory, I previously had the order: request,
response, memory. However, there was a memory packet that was blocked
because the request queue was blocked, which caused the circular
dependence and the deadlock. The order <em>should</em> be memory, response, and
request. I believe the memory/response order doesn’t matter since no
responses depend on memory and vice versa.</p>
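<p>Since SLICC polls in_ports in the order they are declared, the directory’s port
declarations should end up roughly in the order sketched below (the port and buffer
names follow the conventions used earlier in this chapter, and the bodies are elided):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Checked first: memory responses never depend on the other queues.
in_port(memQueue_in, MemoryMsg, responseFromMemory) { ... }

// Checked second: responses (e.g., writeback data) from the caches.
in_port(response_in, ResponseMsg, responseToDir) { ... }

// Checked last: new requests, which may block behind the queues above.
in_port(request_in, RequestMsg, requestToDir) { ... }
</code></pre></div></div>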
<p>Now, let’s try with two CPUs. The first thing I run into is an assert
failure: I’m seeing the first assert in setState fail.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>void setState(Addr addr, State state) {
    if (directory.isPresent(addr)) {
        if (state == State:M) {
            assert(getDirectoryEntry(addr).Owner.count() == 1);
            assert(getDirectoryEntry(addr).Sharers.count() == 0);
        }
        getDirectoryEntry(addr).DirState := state;
        if (state == State:I) {
            assert(getDirectoryEntry(addr).Owner.count() == 0);
            assert(getDirectoryEntry(addr).Sharers.count() == 0);
        }
    }
}
</code></pre></div></div>
<p>To track this problem down, let’s add a debug statement (DPRINTF) and
run with the protocol trace. First, I added the following line just before
the assert. Note that you are required to use the RubySlicc debug flag:
it is the only debug flag included in the generated SLICC files.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>DPRINTF(RubySlicc, "Owner %s\n", getDirectoryEntry(addr).Owner);
</code></pre></div></div>
<p>Then, I see the following output when running with ProtocolTrace and
RubySlicc.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>118 0 Directory MemData M_M&gt;M [0x400, line 0x400]
118: system.caches.controllers2: MSI-dir.sm:160: Owner [NetDest (16) 1 0 - - - 0 - - - - - - - - - - - - - ]
118 0 Directory GetM M&gt;M [0x400, line 0x400]
118: system.caches.controllers2: MSI-dir.sm:160: Owner [NetDest (16) 1 1 - - - 0 - - - - - - - - - - - - - ]
</code></pre></div></div>
<p>It looks like when we process the GetM in state M we need to first
clear the owner before adding the new owner. The other option is that in
setOwner we could set the owner explicitly (replacing any previous value) instead of
adding it to the NetDest.</p>
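<p>One way to fix this is to make sure clearOwner runs before setOwner when the
directory handles a GetM in state M, roughly as sketched below (the other actions
in the transition may differ in your protocol):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>transition(M, GetM) {
    sendFwdGetM;
    // Clear the stale owner before recording the new one; otherwise the
    // Owner NetDest ends up with two machines set.
    clearOwner;
    setOwner;
    popRequestQueue;
}
</code></pre></div></div>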
<p>Oooo! This is a new error!</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>panic: Runtime Error at MSI-dir.sm:229: Unexpected message type..
</code></pre></div></div>
<p>What is this message that fails? Let’s use the RubyNetwork debug flag to
try to track down what message is causing this error. A few lines above
the error I see the following message whose destination is the
directory.</p>
<p>The destination is a NetDest, which is a bitvector of MachineIDs. These
are split into multiple sections. I know I’m running with two CPUs, so
the first two 0’s are for the CPUs, and the other 1 must be for the
directory.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>2285: PerfectSwitch-2: Message: [ResponseMsg: addr = [0x8c0, line 0x8c0] Type = InvAck Sender = L1Cache-1 Destination = [NetDest (16) 0 0 - - - 1 - - - - - - - - - - - - - ] DataBlk = [ 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0xb1 0xb2 0xb3 0xb4 0xca 0xcb 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 ] MessageSize = Control Acks = 0 ]
</code></pre></div></div>
<p>This message has the type InvAck, which is clearly wrong! It seems that
we are setting the requestor wrong when we send the invalidate (Inv)
message to the L1 caches from the directory.</p>
<p>Yes. This is the problem. We need to make the requestor the original
requestor. This was already correct for FwdGetS/FwdGetM, but I missed the
invalidate somehow. The fix is sketched below.</p>
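<p>The corrected invalidation action at the directory looks something like the sketch
below; the key point is that out_msg.Requestor comes from the incoming GetM, not
from the directory itself (other names follow the conventions used earlier in this
chapter):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>action(sendInvToSharers, "i", desc="Send an invalidate to all sharers") {
    peek(request_in, RequestMsg) {
        enqueue(forward_out, RequestMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceRequestType:Inv;
            // Use the original requestor so the sharers send their InvAcks
            // to the requesting L1, not back to the directory.
            out_msg.Requestor := in_msg.Requestor;
            out_msg.Destination := getDirectoryEntry(address).Sharers;
            out_msg.MessageSize := MessageSizeType:Control;
        }
    }
}
</code></pre></div></div>
<p>On to the next error!</p>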
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>panic: Invalid transition
system.caches.controllers0 time: 2287 addr: 0x8c0 event: LastInvAck state: SM_AD
</code></pre></div></div>
<p>This seems to be that I am not counting the acks correctly. It could
also be that the directory is much slower than the other caches at
responding since it has to get the data from memory.</p>
<p>If it’s the latter (which I should be sure to verify), what we could do
is include an ack requirement for the directory, too. Then, when the
directory sends the data (and the owner, too) decrement the needed acks
and trigger the event based on the new ack count.</p>
<p>Actually, that first hypothesis was not quite right. I printed out the
number of acks whenever we receive an InvAck and what’s happening is
that the other cache is responding with an InvAck before the directory
has told it how many acks to expect.</p>
<p>So, what we need to do is something like what I was talking about above.
First of all, we will need to let the acks drop below 0 and add the
total acks to it from the directory message. Then, we are going to have
to complicate the logic for triggering last ack, etc.</p>
<p>Ok. So now we’re letting the tbe.Acks drop below 0 and then adding the
directory acks whenever they show up.</p>
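<p>The relevant piece of the L1’s response in_port then looks roughly like the
fragment below: when the directory’s data arrives, its ack count is added to the
(possibly negative) running count in the TBE before choosing the event. This is only
a sketch of that one branch, with names following the ones used in this chapter.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Fragment of the response in_port trigger logic (not the whole port).
if (machineIDToMachineType(in_msg.Sender) == MachineType:Directory) {
    // tbe.Acks may already be negative from InvAcks that arrived early.
    if (in_msg.Acks + tbe.Acks == 0) {
        trigger(Event:DataDirNoAcks, in_msg.addr, cache_entry, tbe);
    } else {
        trigger(Event:DataDirAcks, in_msg.addr, cache_entry, tbe);
    }
}
</code></pre></div></div>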
<p>Next error: This is a tough one. The error is now that the data doesn’t
match as it should. Kind of like the deadlock, the data could have been
corrupted in the ancient past. I believe the address is the last one in
the protocol trace.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>panic: Action/check failure: proc: 0 address: 19688 data: 0x779e6d0 byte_number: 0 m_value+byte_number: 53 byte: 0 [19688, value: 53, status: Check_Pending, initiating node: 0, store_count: 4]Time: 5843
</code></pre></div></div>
<p>So, it could be something to do with ack counts, though I don’t think
this is the issue. Either way, it’s a good idea to annotate the protocol
trace with the ack information. To do this, we can add comments to the
transition with APPEND_TRANSITION_COMMENT.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>action(decrAcks, "da", desc="Decrement the number of acks") {
    assert(is_valid(tbe));
    tbe.Acks := tbe.Acks - 1;
    APPEND_TRANSITION_COMMENT("Acks: ");
    APPEND_TRANSITION_COMMENT(tbe.Acks);
}
</code></pre></div></div>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>5737 1 L1Cache InvAck SM_AD&gt;SM_AD [0x400, line 0x400] Acks: -1
</code></pre></div></div>
<p>For these data issues, the RubyNetwork debug flag is useful because it
prints the value of the data block at every point it passes through the network.
For instance, for the address in question above, it looks like the data
block is all 0’s after loading from main memory. I believe this should
have valid data. In fact, if we go back in time a bit, we see that there
were some non-zero elements.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>5382 1 L1Cache Inv S&gt;I [0x4cc0, line 0x4cc0]
</code></pre></div></div>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>5383: PerfectSwitch-1: Message: [ResponseMsg: addr = [0x4cc0, line 0x4cc0] Type = InvAck Sender = L1Cache-1 Destination = [NetDest (16) 1 0 - - - 0 - - - - - - - - - - - - - ] DataBlk = [ 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x35 0x36 0x37 0x61 0x6d 0x6e 0x6f 0x70 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 ] MessageSize = Control Acks = 0 ]
...
5389 0 Directory MemData M_M&gt;M [0x4cc0, line 0x4cc0]
5390: PerfectSwitch-2: incoming: 0
5390: PerfectSwitch-2: Message: [ResponseMsg: addr = [0x4cc0, line 0x4cc0] Type = Data Sender = Directory-0 Destination = [NetDest (16) 1 0 - - - 0 - - - - - - - - - - - - - ] DataBlk = [ 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 ] MessageSize = Data Acks = 1 ]
</code></pre></div></div>
<p>It seems that memory is not being updated correctly on the M-&gt;S
transition. After lots of digging, and using the MemoryAccess debug flag
to see exactly what was being read and written to main memory, I found
that in sendDataToMem I was using request_in. This is right for a
PutM, but not right for Data. We need another action that sends
data to memory from the response queue!</p>
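<p>The new action peeks at the response queue instead of the request queue, roughly
as sketched below (the memory-write helper and latency parameter follow the style
used earlier in this chapter):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>action(sendRespDataToMem, "rdm", desc="Write data from a response to memory") {
    peek(response_in, ResponseMsg) {
        // On an M-&gt;S downgrade the dirty data arrives on the response
        // network, so we must not use the (stale) request queue data.
        queueMemoryWrite(in_msg.Sender, address, to_memory_controller_latency,
                         in_msg.DataBlk);
    }
}
</code></pre></div></div>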
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>panic: Invalid transition
system.caches.controllers0 time: 44381 addr: 0x7c0 event: Inv state: SM_AD
</code></pre></div></div>
<p>Invalid transition is my personal favorite kind of SLICC error. For this
error, you know exactly what address caused it, and it’s very easy to
trace through the protocol trace to find what went wrong. However, in
this case, nothing went wrong, I just forgot to put this transition in!
Easy fix!</p>
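<p>For completeness, the missing transition looks something like the sketch below. Per
Sorin et al., an S-&gt;M requestor that is invalidated before its data arrives simply
acks the invalidation and falls back to the I-&gt;M path:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>transition(SM_AD, Inv, IM_AD) {
    sendInvAcktoReq;
    popForwardQueue;
}
</code></pre></div></div>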
</div>
</main>
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/js/bootstrap.min.js" integrity="sha384-ChfqqxuZUCnJSK3+MXmPNIyE6ZbWh2IMqE241rYiqJxyMiZ6OW/JmZQ5stwEULTy" crossorigin="anonymous"></script>
<script>
// When the user scrolls down 20px from the top of the document, show the button
window.onscroll = function() {scrollFunction()};
function scrollFunction() {
if (document.body.scrollTop > 100 || document.documentElement.scrollTop > 20) {
document.getElementById("myBtn").style.display = "block";
} else {
document.getElementById("myBtn").style.display = "none";
}
}
// When the user clicks on the button, scroll to the top of the document
function topFunction() {
document.body.scrollTop = 0;
document.documentElement.scrollTop = 0;
}
</script>
</body>
<footer class="page-footer">
<div class="container">
<div class="row">
<div class="col-12 col-sm-4">
<p><a href="/about">About</a></p>
<p><a href="/publications">Publications</a></p>
<p><a href="/contributing">Contributing</a></p>
</div><br>
<div class="col-12 col-sm-4">
<p><a href="/gettingstarted">Documentation</a></p>
<p><a href="#">Source</a></p>
</div><br>
<div class="col-12 col-sm-4">
<p><a href="/help">Help</a></p>
<p><a href="/search">Search</a></p>
<p><a href="#">Mailing Lists</a></p>
</div>
</div>
</div>
</footer>
</html>