| <!DOCTYPE html> |
<html lang="en">
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="x-ua-compatible" content="ie=edge"> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| |
| <title>gem5</title> |
| |
| <!-- SITE FAVICON --> |
| <link rel="shortcut icon" type="image/gif" href="/assets/img/gem5ColorVert.gif"/> |
| |
| <link rel="canonical" href="http://localhost:4000/search/"> |
| <link href='https://fonts.googleapis.com/css?family=Open+Sans:400,300,700,800,600' rel='stylesheet' type='text/css'> |
| <link href='https://fonts.googleapis.com/css?family=Muli:400,300' rel='stylesheet' type='text/css'> |
| |
<!-- FONT AWESOME -->
| <link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/font-awesome/4.3.0/css/font-awesome.min.css"> |
| |
| <!-- BOOTSTRAP --> |
| <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous"> |
| |
| <!-- CUSTOM CSS --> |
| <link rel="stylesheet" href="/css/main.css"> |
| </head> |
| |
| |
| <body> |
| <nav class="navbar navbar-expand-md navbar-light bg-light"> |
| <a class="navbar-brand" href="/"> |
| <img src="/assets/img/gem5ColorLong.gif" alt="gem5" height=45px> |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNavDropdown" aria-controls="navbarNavDropdown" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| <div class="collapse navbar-collapse" id="navbarNavDropdown"> |
| <ul class="navbar-nav ml-auto"> |
| <li class="nav-item "> |
| <a class="nav-link" href="/">Home</a> |
| </li> |
| |
| <li class="nav-item dropdown "> |
| <a class="nav-link dropdown-toggle" href="/about" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false"> |
| About |
| </a> |
| <div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink"> |
| <a class="dropdown-item" href="/about">About</a> |
| <a class="dropdown-item" href="/publications">Publications</a> |
| <a class="dropdown-item" href="/governance">Governance</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown "> |
| <a class="nav-link dropdown-toggle" href="#" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false"> |
| Documentation |
| </a> |
| <div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink"> |
| <!-- Pull navigation from _data/documentation.yml --> |
| |
| <a class="dropdown-item" href="/introduction">Introduction</a> |
| |
| <a class="dropdown-item" href="/building">Getting Started</a> |
| |
| <a class="dropdown-item" href="/environment">Modifying/Extending</a> |
| |
| <a class="dropdown-item" href="/MSIintro">Modeling Cache Coherence with Ruby</a> |
| |
| </div> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="/contributing">Contributing</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="/blog">Blog</a> |
| </li> |
| |
| <li class="nav-item active"> |
| <a class="nav-link" href="/search">Search</a> |
| </li> |
| </ul> |
| </div> |
| </nav> |
| |
| <main> |
| <br><br> |
| <div class="container"> |
| |
| <h1 class="title">Search</h1> |
| <br> |
| <br> |
| <div class="search"> |
| <form action="/search" method="get"> |
| <label for="search-box"><i class="fa fa-search"></i></label> |
| <input type="text" id="search-box" name="query" placeholder="search"> |
| <button type="submit" value="search" class="btn-outline-primary">Search</button> |
| </form> |
| </div> |
| <br><br> |
| |
| |
| <ul id="search-results"></ul> |
| |
| <script> |
| window.store = { |
| |
| "about": { |
| "title": "About", |
| "content": "About content goes here. A list item Another list item", |
| "url": "/about/" |
| } |
| , |
| |
| "contact": { |
| "title": "Contact", |
| "content": "Contact content goes here.My e-mail is email@something.com.", |
| "url": "/contact/" |
| } |
| , |
| |
| "contributing": { |
| "title": "Contributing", |
| "content": " High-level flow Cloning the repo to contribute Other gem5 repositories Other gem5 branches Making changes to gem5 Requirements for change descriptions Posting a review Setting up an account Submitting a change Push change to gerrit review Pushing your first change Push change to gerrit as a draft Push change bypassing gerrit Other gerrit push options Reviewing patches Committing Governance Overview Philosophy gem5 Roadmap Roles And Responsibilities Support Contribution Process Decision Making Process If you’ve made changes to gem5 that might benefit others, we strongly encourage you to contribute those changes to the public gem5 repository. There are several reasons to do this: Share your work with others, so that they can benefit from new functionality. Support the scientific principle by enabling others to evaluate your suggestions without having to guess what you did. Once your changes are part of the main repo, you no longer have to merge them back in every time you update your local repo. This can be a huge time saving! Once your code is in the main repo, other people have to make their changes work with your code, and not the other way around. Others may build on your contributions to make them even better, or extend them in ways you did not have time to do. You will have the satisfaction of contributing back to the community.The main method for contributing code to gem5 is via our code review website: https://gem5-review.googlesource.com/. This documents describes the details of how to create code changes, upload your changes, have your changes reviewed, and finally push your changes to gem5. More information can be found from the following sources: http://gem5.org/Submitting_Contributions https://gerrit-review.googlesource.com/Documentation/index.html https://git-scm.com/bookHigh-level flow +-------------+ | Make change | +------+------+ | | v +------+------+ | Post review | +------+------+ | v +--------+---------+ | Wait for reviews | <--------+ +--------+---------+ | | | | | v | +----+----+ No +------+------+ |Reviewers+--------->+ Update code | |happy? | +------+------+ +----+----+ ^ | | | Yes | v | +----+-----+ No | |Maintainer+----------------+ |happy? | +----+-----+ | | Yes v +------+------+ | Submit code | +-------------+After creating your change to gem5, you can post a review on our Gerrit code-review site: https://gem5-review.googlesource.com. Before being able to submit your code to the mainline of gem5, the code is reviewed by others in the community. Additionally, the maintainer for that part of the code must sign off on it.Cloning the repo to contributeIf you plan on contributing, it is strongly encouraged for you to clone the repository directly from our gerrit instance at https://gem5.googlesource.com/.To clone the master gem5 repository: git clone https://gem5.googlesource.com/public/gem5Other gem5 repositoriesThere are a few repositories other than the main gem5 development repository. public/m5threads: The code for a pthreads implementation that works with gem5’s syscall emulation mode.Other gem5 branchesNone right now.Making changes to gem5It is strongly encouraged to use git branches when making changes to gem5. Additionally, keeping changes small and concise and only have a single logical change per commit.Unlike our previous flow with Mercurial and patch queues, when using git, you will be committing changes to your local branch. 
By using separate branches in git, you will be able to pull in and merge changes from mainline and simply keep up with upstream changes. Requirements for change descriptions To help reviewers and future contributors more easily understand and track changes, we require all change descriptions be strictly formatted. A canonical commit message consists of three parts: A short summary line describing the change. This line starts with one or more keywords (found in the MAINTAINERS file) separated by commas followed by a colon and a description of the change. This line should be no more than 65 characters long since version control systems usually add a prefix that causes line-wrapping for longer lines. (Optional, but highly recommended) A detailed description. This describes what you have done and why. If the change isn’t obvious, you might want to motivate why it is needed. Lines need to be wrapped to 75 characters or less. Tags describing patch metadata. We highly recommend using tags to acknowledge reviewers for their work. Gerrit will automatically add most tags. Tags are an optional mechanism to store additional metadata about a patch and acknowledge people who reported a bug or reviewed that patch. Tags are generally appended to the end of the commit message in the order they happen. We currently use the following tags: Signed-off-by: Added by the author and the submitter (if different). This tag is a statement saying that you believe the patch to be correct and have the right to submit the patch according to the license in the affected files. Similarly, if you commit someone else’s patch, this tells the rest of the world that you have the right to forward it to the main repository. If you need to make any changes at all to submit the change, these should be described within hard brackets just before your Signed-off-by tag. By adding this line, the contributor certifies the contribution is made under the terms of the Developer Certificate of Origin (DCO) [https://developercertificate.org/]. Reviewed-by: Used to acknowledge patch reviewers. It’s generally considered good form to add these. Added automatically. Reported-by: Used to acknowledge someone for finding and reporting a bug. Reviewed-on: Link to the review request corresponding to this patch. Added automatically. Change-Id: Used by Gerrit to track changes across rebases. Added automatically with a commit hook by git. Tested-by: Used to acknowledge people who tested a patch. Sometimes added automatically by review systems that integrate with CI systems. Other than the “Signed-off-by”, “Reported-by”, and “Tested-by” tags, you generally don’t need to add these manually as they are added automatically by Gerrit. It is encouraged for the author of the patch and the submitter to add a Signed-off-by tag to the commit message. By adding this line, the contributor certifies the contribution is made under the terms of the Developer Certificate of Origin (DCO) [https://developercertificate.org/]. It is imperative that you use your real name and your real email address in both tags and in the author field of the changeset. For significant changes, authors are encouraged to add copyright information and their names at the beginning of the file. The main purpose of the author names on the file is to track who is most knowledgeable about the file (e.g., who has contributed a significant amount of code to the file). Note: If you do not follow these guidelines, the gerrit review site will automatically reject your patch. 
If this happens, update your changeset descriptions to match the required style and resubmit. The following is a useful git command to update the most recent commit (HEAD). git commit --amend Posting a review If you have not signed up for an account on the Gerrit review site (https://gem5-review.googlesource.com), you first have to create an account. Setting up an account Go to https://gem5.googlesource.com/ Click “Sign In” in the upper right corner. Note: You will need a Google account to contribute. After signing in, click “Generate Password” and follow the instructions. Submitting a change In gerrit, to submit a review request, you can simply push your git commits to a specially named branch. For more information on git push see https://git-scm.com/docs/git-push. There are three ways to push your changes to gerrit. Push change to gerrit review git push origin HEAD:refs/for/master Assuming origin is https://gem5.googlesource.com/public/gem5 and you want to push the changeset at HEAD, this will create a new review request on top of the master branch. More generally, git push <gem5 gerrit instance> <changeset>:refs/for/<branch> See https://gerrit-review.googlesource.com/Documentation/user-upload.html for more information. Pushing your first change The first time you push a change you may get the following error: remote: ERROR: [fb1366b] missing Change-Id in commit message footer... Within the error message, there is a command line you should run. For every new clone of the git repo, you need to run the following command to automatically insert the change id in the commit (all on one line). curl -Lo `git rev-parse --git-dir`/hooks/commit-msg https://gerrit-review.googlesource.com/tools/hooks/commit-msg ; chmod +x `git rev-parse --git-dir`/hooks/commit-msg If you receive the above error, simply run this command and then amend your changeset. git commit --amend Push change to gerrit as a draft git push origin HEAD:refs/drafts/master Push change bypassing gerrit Only maintainers can bypass gerrit review. This should very rarely be used. git push origin HEAD:refs/heads/master Other gerrit push options There are a number of options you can specify when uploading your changes to gerrit (e.g., reviewers, labels). The gerrit documentation has more information. https://gerrit-review.googlesource.com/Documentation/user-upload.html Reviewing patches Reviewing patches is done on our gerrit instance at https://gem5-review.googlesource.com/. After logging in with your Google account, you will be able to comment, review, and push your own patches as well as review others’ patches. All gem5 users are encouraged to review patches. The only requirement to review patches is to be polite and respectful of others. There are multiple labels in Gerrit that can be applied to each review, detailed below. Code-review: This is used by any gem5 user to review patches. When reviewing a patch you can give it a score of -2 to +2 with the following semantics. -2: This blocks the patch. You believe that this patch should never be committed. This label should be very rarely used. -1: You would prefer this is not merged as is. 0: No score. +1: This patch seems good, but you aren’t 100% confident that it should be pushed. +2: This is a good patch and should be pushed as is. Maintainer: Currently only PMC members are maintainers. At least one maintainer must review your patch and give it a +1 before it can be merged. Verified: This is automatically generated from the continuous integration (CI) tests. 
Each patch must receive at least a +1 from the CI tests before the patch can be merged. The patch will receive a +1 if gem5 builds and runs, and it will receive a +2 if the stats match. Style-Check: This is automatically generated and tests the patch against the gem5 code style (http://www.gem5.org/Coding_Style). The patch must receive a +1 from the style checker to be pushed.Note: Whenever the patch creator updates the patch all reviewers must re-review the patch. There is no longer a “Fix it, then Ship It” option.Once you have received reviews for your patch, you will likely need to make changes. To do this, you should update the original git changeset. Then, you can simply push the changeset again to the same Gerrit branch to update the review request.Please see governance and reviewing patches. git push origin HEAD:refs/for/masterNote: If you have posted a patch and don’t receive any reviews, you may need to prod the reviewers. You can do this by adding a reply to your changeset review on gerrit. It is expected that at least the maintainer will supply a review for your patch.CommittingEach patch must meet the following criteria to be merged: At least one review with +2 At least one maintainer with +1 At least +1 from the CI tests (gem5 must build and run) At least +1 from the style checkerOnce a patch meets the above criteria, the submitter of the patch will be able to merge the patch by pressing the “Submit” button on Gerrit. When the patch is submitted, it is merged into the public gem5 branch.GovernanceOverviewgem5 is a meritocratic, consensus-based community project. Anyone with an interest in the project can join the community, contribute to the project design and participate in the decision-making process. Historically, gem5 development has been carried out both in industry and in academia. This document describes how that participation takes place and how to set about earning merit within the project community.The document is broken into a number of sections. Philosophy describes the ideas behind the gem5 community. The Roadmap section points to the roadmap document for gem5’s development. Users and Responsibilities describes the classes of users that use gem5, the types of gem5 contributors, and their responsibilities. Support describes how the community supports users and the Contribution process describes how to contribute. Finally, the Decision Process describes how decisions are made and then we conclude.PhilosophyThe goal of gem5 is to provide a tool to further the state of the art in computer architecture. gem5 can be used for (but is not limited to) computer-architecture research, advanced development, system-level performance analysis and design-space exploration, hardware-software co-design, and low-level software performance analysis. Another goal of gem5 is to be a common framework for computer architecture. A common framework in the academic community makes it easier for other researchers to share workloads as well as models and to compare and contrast with other architectural techniques.The gem5 community strives to balance the needs of its three user types (academic researchers, industry researchers, and students, detailed below). For instance, gem5 strives to balance adding new features (important to researchers) and a stable code base (important for students). 
Specific user needs important to the community are enumerated below: Effectively and efficiently emulate the behavior of modern processors in a way that balances simulation performance and accuracy Serve as a malleable baseline infrastructure that can easily be adapted to emulate the desired behaviors Provide a core set of APIs and features that remain relatively stable Incorporate features that make it easy for companies and research groups to stay up to date with the tip and continue contributing to the projectAdditionally, the gem5 community is committed to openness, transparency, and inclusiveness. Participants in the gem5 community of all backgrounds should feel welcome and encouraged to contribute.gem5 RoadmapThe roadmap for gem5 can be found on Roadmap page. The roadmap document details the short and long term goals for the gem5 software. Users of all types are encouraged to contribute to this document and shape the future of gem5. Users are especially encouraged to update the roadmap (and get consensus) before submitting large changes to gem5.Roles And ResponsibilitiesUsersUsers are community members who have a need for the project. They are the most important members of the community and without them the project would have no purpose. Anyone can be a user; there are no special requirements. There are currently three main categories of gem5 users: academic researchers, industry researchers, and students. Individuals may transition between categories, e.g., when a graduate student takes an industry internship, then returns to school; or when a student graduates and takes a job in industry. These three users are described below.Academic ResearchersThis type of user primarily encompasses individuals that use gem5 in academic research. Examples include, but are not limited to, graduate students, research scientists, and post-graduates. This user often uses gem5 as a tool to discover and invent new computer architecture mechanisms. Academic Researchers often are first exposed to gem5 as Students (see below) and transition from Students to Academic Researchers over time.Because of these users’ goals, they primarily add new features to gem5. It is important to the gem5 community to encourage these users to contribute their work to the mainline gem5 repository. By encouraging these users to commit their research contributions, gem5 will make it much easier for other researchers to compare and contrast with other architectural techniques (see Philosophy section).Industry ResearchersThis type of user primarily encompasses individuals working for companies that use gem5. These users are distinguished from academic researchers in two ways. First, industry researchers are often part of a larger team, rather than working individually on gem5. Second, industry researchers often want to incorporate proprietary information into private branches of gem5. Therefore, industry researchers tend to have rather sophisticated software infrastructures built around gem5. For these users, the stability of gem5 features and baseline source code is important. Another key consideration is the fidelity of the models, and their ability to accurately reflect realistic implementations. To enable industry participation, it is critical to maintain licensing terms that do not restrict or burden the use of gem5 in conjunction with proprietary IP.StudentsThis type of user primarily encompasses individuals that are using gem5 in a classroom setting. 
These users typically have some foundation in computer architecture, but they have little or no background using simulation tools. Additionally, these users may not use gem5 for an extended period of time, after finishing their short-term goals (e.g., a semester-long class).The project asks its users to participate in the project and community as much as possible. User contributions enable the project team to ensure that they are satisfying the needs of those users. Common user contributions include (but are not limited to): evangelising about the project (e.g., a link on a website and word-of-mouth awareness raising) informing developers of strengths and weaknesses from a new user perspective providing moral support (a ‘thank you’ goes a long way) providing financial support (the software is open source, but its developers need to eat)Users who continue to engage with the project and its community will often become more and more involved. Such users may find themselves becoming contributors, as described in the next section.ContributorsContributors are community members who contribute in concrete ways to the project. Anyone can become a contributor, and contributions can take many forms. There are no specific skill requirements and no selection process. There is only one expectation of commitment to the project: contributors must be respectful to each other during the review process and work together to reach compromises. See the “Reviewing Patches” section for more on the process of contributing.In addition to their actions as users, contributors may also find themselves doing one or more of the following: answering questions on the mailing lists, particularly the “easy” questions from new users (existing users are often the best people to support new users), or those that relate to the particular contributor’s experiences reporting bugs identifying requirements providing graphics and web design programming assisting with project infrastructure writing documentation fixing bugs adding features acting as an ambassador and helping to promote the projectContributors engage with the project through the Review Board and mailing list, or by writing or editing documentation. They submit changes to the project source code via patches submitted to Review Board, which will be considered for inclusion in the project by existing committers (see next section). The developer mailing list is the most appropriate place to ask for help when making that first contribution.As contributors gain experience and familiarity with the project, their profile within, and commitment to, the community will increase. At some stage, they may find themselves being nominated for committership.CommittersCommitters are community members who have shown that they are committed to the continued development of the project through ongoing engagement with the community. Committership allows contributors to more easily carry on with their project related activities by giving them direct access to the project’s resources. That is, they can make changes directly to project outputs, although they still have to submit code changes via Review Board. Additionally, committers are expected to have an ongoing record of contributions in terms of code, reviews, and/or discussion.Committers have no more authority over the project than contributors. While committership indicates a valued member of the community who has demonstrated a healthy respect for the project’s aims and objectives, their work continues to be reviewed by the community. 
The key difference between a committer and a contributor is that committers have the extra responsibility of pushing patches to the mainline. Additionally, committers are expected to contribute to discussions on the gem5-dev list and review patches. Anyone can become a committer. The only expectation is that a committer has demonstrated an ability to participate in the project as a team player. Specifically, refer to the 2nd paragraph of the Contributors section. Typically, a potential committer will need to show that they have an understanding of the project, its objectives and its strategy (see Philosophy section). They will also have provided valuable contributions to the project over a period of time. New committers can be nominated by any existing committer. Once they have been nominated, there will be a vote by the project management committee (PMC; see below). Committer nomination and voting is one of the few activities that takes place on the project’s private management list. This is to allow PMC members to freely express their opinions about a nominee without causing embarrassment. Once the vote has been held, the nominee is notified of the result. The nominee is entitled to request an explanation of any ‘no’ votes against them, regardless of the outcome of the vote. This explanation will be provided by the PMC Chair (see below) and will be anonymous and constructive in nature. Nominees may decline their appointment as a committer. However, this is unusual, as the project does not expect any specific time or resource commitment from its community members. The intention behind the role of committer is to allow people to contribute to the project more easily, not to tie them into the project in any formal way. It is important to recognise that committership is a privilege, not a right. That privilege must be earned and once earned it can be removed by the PMC (see next section) in extreme circumstances. However, under normal circumstances committership exists for as long as the committer wishes to continue engaging with the project. A committer who shows an above-average level of contribution to the project, particularly with respect to its strategic direction and long-term health, may be nominated to become a member of the PMC. This role is described below. Project management committee The project management committee consists of those individuals identified as ‘project owners’ on the development site. The PMC has additional responsibilities over and above those of a committer. These responsibilities ensure the smooth running of the project. PMC members are expected to review code contributions, participate in strategic planning, approve changes to the governance model and manage how the software is distributed and licensed. Some PMC members are responsible for specific components of the gem5 project. This includes gem5 source modules (e.g., classic caches, O3CPU model, etc.) and project assets (e.g., the website). A list of the current components and the responsible members can be found on Module owners. Members of the PMC do not have significant authority over other members of the community, although it is the PMC that votes on new committers. It also makes decisions when community consensus cannot be reached. In addition, the PMC has access to the project’s private mailing list. This list is used for sensitive issues, such as votes for new committers and legal matters that cannot be discussed in public. 
It is never used for project management or planning. Membership of the PMC is by invitation from the existing PMC members. A nomination will result in discussion and then a vote by the existing PMC members. PMC membership votes are subject to consensus approval of the current PMC members. Additions to the PMC require unanimous agreement of the PMC members. Removing someone from the PMC requires N-1 positive votes, where N is the number of PMC members not including the individual who is being voted out. Members Ali Saidi Andreas Hansson Andreas Sandberg Anthony Gutierrez Brad Beckmann Jason Lowe-Power Nathan Binkert Steve Reinhardt PMC Chair The PMC Chair is a single individual, voted for by the PMC members. Once someone has been appointed Chair, they remain in that role until they choose to retire, or the PMC casts a two-thirds majority vote to remove them. The PMC Chair has no additional authority over other members of the PMC: the role is one of coordinator and facilitator. The Chair is also expected to ensure that all governance processes are adhered to, and has the casting vote when any project decision fails to reach consensus. Support All participants in the community are encouraged to provide support for new users within the project management infrastructure. This support is provided as a way of growing the community. Those seeking support should recognise that all support activity within the project is voluntary and is therefore provided as and when time allows. Contribution Process Anyone capable of showing respect to others can contribute to the project, regardless of their skills, as there are many ways to contribute. For instance, a contributor might be active on the project mailing list and issue tracker, or might supply patches. The various ways of contributing are described in more detail in a separate document Submitting Contributions. The developer mailing list is the most appropriate place for a contributor to ask for help when making their first contribution. See the Submitting Contributions page on the gem5 wiki for details of the gem5 contribution process. Each new contribution should be submitted as a patch to our Review Board site. Then, other gem5 developers will review your patch, possibly asking for minor changes. After the patch has received consensus (see Decision Making Process), the patch is ready to be committed to the gem5 tree. For committers, this is as simple as pushing the changeset. For contributors, a committer should push the changeset for you. If a committer does not push the changeset within a reasonable window (a couple of days), send a friendly reminder email to the gem5-dev list. Before a patch is committed to gem5, it must receive at least 2 “Ship its” from reviewboard. If there are no reviews on a patch, users should send follow-up emails to the gem5-dev list asking for reviews. Reviewing Patches An important part of the contribution process is providing feedback on patches that other developers submit. The purpose of reviewing patches is to weed out obvious bugs and to ensure that the code in gem5 is of sufficient quality. All users are encouraged to review the contributions that are posted on Review Board. If you are an active gem5 user, it’s a good idea to keep your eye on the contributions that are posted there (typically by subscribing to the gem5-dev mailing list) so you can speak up when you see a contribution that could impact your use of gem5. 
It is far more effective to contribute your opinion in a review before a patch gets committed than to complain after the patch is committed, you update your repository, and you find that your simulations no longer work.We greatly value the efforts of reviewers to maintain gem5’s code quality and consistency. However, it is important that reviews balance the desire to maintain the quality of the code in gem5 with the need to be open to accepting contributions from a broader community. People will base their desire to contribute (or continue contributing) on how they and other contributors are received. With that in mind, here are some guidelines for reviewers: Remember that submitting a contribution is a generous act, and is very rarely a requirement for the person submitting it. It’s always a good idea to start a review with something like “thank you for submitting this contribution”. A thank-you is particularly important for new or occasional submitters. Overall, the attitude of a reviewer should be “how can we take this contribution and put it to good use”, not “what shortcomings in this work must the submitter address before the contribution can be considered worthy”. As the saying goes, “the perfect is the enemy of the good”. While we don’t want gem5 to deteriorate, we also don’t want to bypass useful functionality or improvements simply because they are not optimal. If the optimal solution is not likely to happen, then accepting a suboptimal solution may be preferable to having no solution. A suboptimal solution can always be replaced by the optimal solution later. Perhaps the suboptimal solution can be incrementally improved to reach that point. When asking a submitter for additional changes, consider the cost-benefit ratio of those changes. In particular, reviewers should not discount the costs of requested changes just because the cost to the reviewer is near zero. Asking for extensive changes, particularly from someone who is not a long-time gem5 developer, may be imposing a significant burden on someone who is just trying to be helpful by submitting their code. If you as a reviewer really feel that some extensive reworking of a patch is necessary, consider volunteering to make the changes yourself. Not everyone uses gem5 in the same way or has the same needs. It’s easy to reject a solution due to its flaws when it solves a problem you don’t have—so there’s no loss to you if we end up with no solution. That’s probably not an acceptable result for the person submitting the patch though. Another way to look at this point is as the flip side of the previous item: just as your cost-benefit analysis should not discount the costs to the submitter of making changes, just because the costs to you are low, it should also not discount the benefits to the submitter of accepting the submission, just because the benefits to you are low. Be independent and unbiased while commenting on review requests. Do not support a patch just because you or your organization will benefit from it or oppose it because you will need to do more work. Whether you are an individual or someone working with an organization, think about the patch from community’s perspective. Try to keep the arguments technical and the language simple. If you make some claim about a patch, substantiate it.Decision Making ProcessDecisions about the future of the project are made through discussion with all members of the community, from the newest user to the most experienced PMC member. 
All non-sensitive project management discussion takes place on the gem5-dev mailing list. Occasionally, sensitive discussion occurs on a private list.In order to ensure that the project is not bogged down by endless discussion and continual voting, the project operates a policy of lazy consensus. This allows the majority of decisions to be made without resorting to a formal vote.Lazy consensusDecision making typically involves the following steps: Proposal Discussion Vote (if consensus is not reached through discussion) DecisionAny community member can make a proposal for consideration by the community. In order to initiate a discussion about a new idea, they should send an email to the gem5-dev list or submit a patch implementing the idea to Review Board. This will prompt a review and, if necessary, a discussion of the idea. The goal of this review and discussion is to gain approval for the contribution. Since most people in the project community have a shared vision, there is often little need for discussion in order to reach consensus.In general, as long as nobody explicitly opposes a proposal, it is recognised as having the support of the community. This is called lazy consensus—that is, those who have not stated their opinion explicitly have implicitly agreed to the implementation of the proposal.Lazy consensus is a very important concept within the project. It is this process that allows a large group of people to efficiently reach consensus, as someone with no objections to a proposal need not spend time stating their position, and others need not spend time reading such mails.For lazy consensus to be effective, it is necessary to allow at least two weeks before assuming that there are no objections to the proposal. This requirement ensures that everyone is given enough time to read, digest and respond to the proposal. This time period is chosen so as to be as inclusive as possible of all participants, regardless of their location and time commitments. For Review Board requests, if there are no reviews after two weeks, the submitter should send a reminder email to the mailing list. Reviewers may ask patch submitters to delay submitting a patch when they have a desire to review a patch and need more time to do so. As discussed in the Contributing Section, each patch should have at least two “Ship its” before it is committed.VotingNot all decisions can be made using lazy consensus. Issues such as those affecting the strategic direction or legal standing of the project must gain explicit approval in the form of a vote. Every member of the community is encouraged to express their opinions in all discussion and all votes. However, only project committers and/or PMC members (as defined above) have binding votes for the purposes of decision making. A separate document on the voting within a meritocratic governance model (http://oss-watch.ac.uk/resources/meritocraticgovernancevoting) describes in more detail how voting is conducted in projects following the practice established within the Apache Software Foundation.This document is based on the example (http://oss-watch.ac.uk/resources/meritocraticgovernancemodel) by Ross Gardler and Gabriel Hanganu and is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License", |
| "url": "/contributing/" |
| } |
| , |
| |
| "introduction": { |
| "title": "Introduction", |
| "content": " authors Jason Lowe-PowerIntroductionThis is an intro to this tutorial. It says lots of interesting things.The goal of this document is to give you, the reader, a thoroughintroduction on how to use gem5 and the gem5 codebase. The purpose ofthis document is not to provide a detailed description of every featurein gem5. After reading this document, you should feel comfortable usinggem5 in the classroom and for computer architecture research.Additionally, you should be able to modify and extend gem5 and thencontribute your improvements to the main gem5 repository.This document is colored by my personal experiences with gem5 over thepast six years as a graduate student at the University ofWisconsin-Madison. The examples presented are just one way to do it.Unlike Python, whose mantra is “There should be one– and preferablyonly one –obvious way to do it.” (from The Zen of Python. Seeimport this), in gem5 there are a number of different ways toaccomplish the same thing. Thus, many of the examples presented in thisbook are my opinion of the best way to do things.One important lesson I have learned (the hard way) is when using complextools like gem5, it is important to actually understand how it worksbefore using it.Finish the previous paragraph about how it is a good idea to understandwhat your tools are actually doing.should add a list of terms. Things like “simulated system” vs “hostsystem”, etc.You can find the source for this book on github athttps://github.com/powerjg/learning_gem5.", |
| "url": "/introduction/" |
| } |
| , |
| |
| "building": { |
| "title": "Building gem5", |
| "content": " authors Jason Lowe-PowerBuilding gem5This chapter covers the details of how to set up a gem5 developmmentenvironment and build gem5.Requirements for gem5See gem5 requirementsfor more details.On Ubuntu, you can install all of the required dependencies with thefollowing command. The requirements are detailed below.sudo apt install build-essential git m4 scons zlib1g zlib1g-dev libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev python-dev python git (Git): The gem5 project uses Git for versioncontrol. Git is a distributed versioncontrol system. More information aboutGit can be found by following the link.Git should be installed by default on most platforms. However,to install Git in Ubuntu use sudo apt-get install git gcc 4.8+ You may need to use environment variables to point to anon-default version of gcc. On Ubuntu, you can install a development environment with sudo apt-get install build-essential SCons gem5 uses SCons as its build environment. SCons is like make onsteroids and uses Python scripts for all aspects of the buildprocess. This allows for a very flexible (if slow) build system. To get SCons on Ubuntu use sudo apt-get install scons Python 2.7+ gem5 relies on the Python development libraries. To installthese on Ubuntu use sudo apt-get install python-dev protobuf 2.1+ “Protocol buffers are a language-neutral, platform-neutralextensible mechanism for serializing structured data.” In gem5,the protobuflibrary is used for trace generation and playback.protobuf isnot a required package, unless you plan on using it for tracegeneration and playback. sudo apt-get install libprotobuf-dev python-protobuf protobuf-compiler libgoogle-perftools-dev Getting the codeChange directories to where you want to download the gem5 source. Then,to clone the repository, use the git clone command.git clone https://gem5.googlesource.com/public/gem5You can now change directories to gem5 which contains all of the gem5code.Your first gem5 buildLet’s start by building a basic x86 system. Currently, you must compilegem5 separately for every ISA that you want to simulate. Additionally,if using ruby-intro-chapter, you have to have separate compilations forevery cache coherence protocol.To build gem5, we will use SCons. SCons uses the SConstruct file(gem5/SConstruct) to set up a number of variables and then uses theSConscript file in every subdirectory to find and compile all of thegem5 source.SCons automatically creates a gem5/build directory when firstexecuted. In this directory you’ll find the files generated by SCons,the compiler, etc. There will be a separate directory for each set ofoptions (ISA and cache coherence protocol) that you use to compile gem5.There are a number of default compilations options in the build_optsdirectory. These files specify the parameters passed to SCons wheninitially building gem5. We’ll use the X86 defaults and specify that wewant to compile all of the CPU models. You can look at the filebuild_opts/X86 to see the default values for the Scons options. Youcan also specify these options on the command line to override anydefault.scons build/X86/gem5.opt -j9 gem5 binary types The SCons scripts in gem5 currently have 5 different binaries you canbuild for gem5: debug, opt, fast, prof, and perf. These names aremostly self-explanatory, but detailed below. debug Built with no optimizations and debug symbols. This binary isuseful when using a debugger to debug if the variables you need toview are optimized out in the opt version of gem5. 
Running with debug is slow compared to the other binaries. opt This binary is built with most optimizations on (e.g., -O3), but with debug symbols included. This binary is much faster than debug, but still contains enough debug information to be able to debug most problems. fast Built with all optimizations on (including link-time optimizations on supported platforms) and with no debug symbols. Additionally, any asserts are removed, but panics and fatals are still included. fast is the highest performing binary, and is much smaller than opt. However, fast is only appropriate when you feel that it is unlikely your code has major bugs. prof and perf These two binaries are built for profiling gem5. prof includes profiling information for the GNU profiler (gprof), and perf includes profiling information for the Google performance tools (gperftools). The main argument passed to SCons is what you want to build, build/X86/gem5.opt. In this case, we are building gem5.opt (an optimized binary with debug symbols). We want to build gem5 in the directory build/X86. Since this directory currently doesn’t exist, SCons will look in build_opts to find the default parameters for X86. (Note: I’m using -j9 here to execute the build on 9 of my 8 cores on my machine. You should choose an appropriate number for your machine, usually cores+1.) The output should look something like below: Checking for C header file Python.h... yes Checking for C library pthread... yes Checking for C library dl... yes Checking for C library util... yes Checking for C library m... yes Checking for C library python2.7... yes Checking for accept(0,0,0) in C++ library None... yes Checking for zlibVersion() in C++ library z... yes Checking for GOOGLE_PROTOBUF_VERIFY_VERSION in C++ library protobuf... yes Checking for clock_nanosleep(0,0,NULL,NULL) in C library None... yes Checking for timer_create(CLOCK_MONOTONIC, NULL, NULL) in C library None... no Checking for timer_create(CLOCK_MONOTONIC, NULL, NULL) in C library rt... yes Checking for C library tcmalloc... yes Checking for backtrace_symbols_fd((void*)0, 0, 0) in C library None... yes Checking for C header file fenv.h... yes Checking for C header file linux/kvm.h... yes Checking size of struct kvm_xsave ... yes Checking for member exclude_host in struct perf_event_attr...yes Building in /local.chinook/gem5/gem5-tutorial/gem5/build/X86 Variables file /local.chinook/gem5/gem5-tutorial/gem5/build/variables/X86 not found, using defaults in /local.chinook/gem5/gem5-tutorial/gem5/build_opts/X86 scons: done reading SConscript files. scons: Building targets ... [ISA DESC] X86/arch/x86/isa/main.isa -> generated/inc.d [NEW DEPS] X86/arch/x86/generated/inc.d -> x86-deps [ENVIRONS] x86-deps -> x86-environs [ CXX] X86/sim/main.cc -> .o .... .... <lots of output> .... [ SHCXX] nomali/lib/mali_midgard.cc -> .os [ SHCXX] nomali/lib/mali_t6xx.cc -> .os [ SHCXX] nomali/lib/mali_t7xx.cc -> .os [ AR] -> drampower/libdrampower.a [ SHCXX] nomali/lib/addrspace.cc -> .os [ SHCXX] nomali/lib/mmu.cc -> .os [ RANLIB] -> drampower/libdrampower.a [ SHCXX] nomali/lib/nomali_api.cc -> .os [ AR] -> nomali/libnomali.a [ RANLIB] -> nomali/libnomali.a [ CXX] X86/base/date.cc -> .o [ LINK] -> X86/gem5.opt scons: done building targets. When compilation is finished you should have a working gem5 executable at build/X86/gem5.opt. The compilation can take a very long time, often 15 minutes or more, especially if you are compiling on a remote file system like AFS or NFS. Common errors Wrong gcc version Error: gcc version 4.8 or newer required. 
Installed version: 4.4.7 Update your environment variables to point to the right gcc version, or install a more up to date version of gcc. See building-requirements-section. Python in a non-default location If you use a non-default version of Python, (e.g., version 2.7 when 2.5 is your default), there may be problems when using SCons to build gem5. RHEL6 version of SCons uses a hardcoded location for Python, which causes the issue. gem5 often builds successfully in this case, but may not be able to run. Below is one possible error you may see when you run gem5. Traceback (most recent call last): File \"........../gem5-stable/src/python/importer.py\", line 93, in <module> sys.meta_path.append(importer) TypeError: 'dict' object is not callable To fix this, you can force SCons to use your environment’s Python version by running python `which scons` build/X86/gem5.opt instead of scons build/X86/gem5.opt. More information on this can be found on the gem5 wiki about non-default Python locations: Using a non-default Python installation. M4 macro processor not installed If the M4 macro processor isn’t installed you’ll see an error similar to this: ...Checking for member exclude_host in struct perf_event_attr...yes Error: Can't find version of M4 macro processor. Please install M4 and try again. Just installing the M4 macro package may not solve this issue. You may need to also install all of the autoconf tools. On Ubuntu, you can use the following command. sudo apt-get install automake",
| "url": "/building/" |
| } |
| , |
| |
| "cache-config": { |
| "title": "Adding cache to configuration script", |
| "content": " authors Jason Lowe-PowerAdding cache to the configuration scriptUsing the previous configuration script as a starting point,this chapter will walk through a more complex configuration. We will adda cache hierarchy to the system as shown inthe figure below. Additionally, this chapterwill cover understanding the gem5 statistics output and adding commandline parameters to your scripts.Creating cache objectsWe are going to use the classic caches, instead of ruby-intro-chapter,since we are modeling a single CPU system and we don’t care aboutmodeling cache coherence. We will extend the Cache SimObject andconfigure it for our system. First, we must understand the parametersthat are used to configure Cache objects. Classic caches and Ruby gem5 currently has two completely distinct subsystems to model theon-chip caches in a system, the “Classic caches” and “Ruby”. Thehistorical reason for this is that gem5 is a combination of m5 fromMichigan and GEMS from Wisconsin. GEMS used Ruby as its cache model,whereas the classic caches came from the m5 codebase (hence“classic”). The difference between these two models is that Ruby isdesigned to model cache coherence in detail. Part of Ruby is SLICC, alanguage for defining cache coherence protocols. On the other hand,the classic caches implement a simplified and inflexible MOESIcoherence protocol. To choose which model to use, you should ask yourself what you aretrying to model. If you are modeling changes to the cache coherenceprotocol or the coherence protocol could have a first-order impact onyour results, use Ruby. Otherwise, if the coherence protocol isn’timportant to you, use the classic caches. A long-term goal of gem5 is to unify these to cache models into asingle holistic model.CacheThe Cache SimObject declaration can be found in src/mem/cache/Cache.py.This Python file defines the parameters which you can set of theSimObject. Under the hood, when the SimObject is instantiated theseparameters are passed to the C++ implementation of the object. TheCache SimObject inherits from the BaseCache object shown below.Within the BaseCache class, there are a number of parameters. Forinstance, assoc is an integer parameter. Some parameters, likewrite_buffers have a default value, 8 in this case. The defaultparameter is the first argument to Param.*, unless the first argumentis a string. The string argument of each of the parameters is adescription of what the parameter is (e.g.,tag_latency = Param.Cycles(\"Tag lookup latency\") means that the`tag_latency controls “The hit latency for this cache”).Many of these parameters do not have defaults, so we are required to setthese parameters before calling m5.instantiate().Now, to create caches with specific parameters, we are first going tocreate a new file, caches.py, in the same directory as simple.py,configs/tutorial. The first step is to import the SimObject(s) we aregoing to extend in this file.from m5.objects import CacheNext, we can treat the BaseCache object just like any other Python classand extend it. We can name the new cache anything we want. Let’s startby making an L1 cache.class L1Cache(Cache): assoc = 2 tag_latency = 2 data_latency = 2 response_latency = 2 mshrs = 4 tgts_per_mshr = 20Here, we are setting some of the parameters of the BaseCache that do nothave default values. To see all of the possible configuration options,and to find which are required and which are optional, you have to lookat the source code of the SimObject. 
In this case, we are using BaseCache. We have extended BaseCache and set most of the parameters that do not have default values in the BaseCache SimObject. Next, let’s make two more sub-classes of L1Cache, an L1ICache and an L1DCache. class L1ICache(L1Cache): size = '16kB' class L1DCache(L1Cache): size = '64kB' Let’s also create an L2 cache with some reasonable parameters. class L2Cache(Cache): size = '256kB' assoc = 8 tag_latency = 20 data_latency = 20 response_latency = 20 mshrs = 20 tgts_per_mshr = 12 Now that we have specified all of the necessary parameters required for BaseCache, all we have to do is instantiate our sub-classes and connect the caches to the interconnect. However, connecting lots of objects up to complex interconnects can make configuration files quickly grow and become unreadable. Therefore, let’s first add some helper functions to our sub-classes of Cache. Remember, these are just Python classes, so we can do anything with them that you can do with a Python class. To the L1 cache let’s add two functions, connectCPU to connect a CPU to the cache and connectBus to connect the cache to a bus. We need to add the following code to the L1Cache class. def connectCPU(self, cpu): # need to define this in a base class! raise NotImplementedError def connectBus(self, bus): self.mem_side = bus.slave Next, we have to define a separate connectCPU function for the instruction and data caches, since the I-cache and D-cache ports have different names. Our L1ICache and L1DCache classes now become: class L1ICache(L1Cache): size = '16kB' def connectCPU(self, cpu): self.cpu_side = cpu.icache_port class L1DCache(L1Cache): size = '64kB' def connectCPU(self, cpu): self.cpu_side = cpu.dcache_port Finally, let’s add functions to the L2Cache to connect to the CPU-side and memory-side bus, respectively. def connectCPUSideBus(self, bus): self.cpu_side = bus.master def connectMemSideBus(self, bus): self.mem_side = bus.slave The full file can be found in the gem5 source at gem5/configs/learning_gem5/part1/caches.py. Adding caches to the simple config file Now, let’s add the caches we just created to the configuration script we created in the last chapter <simple-config-chapter>. First, let’s copy the script to a new name. cp simple.py two_level.py Next, we need to import the names from the caches.py file into the namespace. We can add the following to the top of the file (after the m5.objects import), as you would with any Python source. from caches import * Now, after creating the CPU, let’s create the L1 caches: system.cpu.icache = L1ICache() system.cpu.dcache = L1DCache() And connect the caches to the CPU ports with the helper function we created. system.cpu.icache.connectCPU(system.cpu) system.cpu.dcache.connectCPU(system.cpu) Also, you need to remove the previous lines which connected the cache ports directly to the memory bus. -system.cpu.icache_port = system.membus.slave -system.cpu.dcache_port = system.membus.slave We can’t directly connect the L1 caches to the L2 cache since the L2 cache only expects a single port to connect to it. Therefore, we need to create an L2 bus to connect our L1 caches to the L2 cache. Then, we can use our helper function to connect the L1 caches to the L2 bus. system.l2bus = L2XBar() system.cpu.icache.connectBus(system.l2bus) system.cpu.dcache.connectBus(system.l2bus) Next, we can create our L2 cache and connect it to the L2 bus and the memory bus. system.l2cache = L2Cache() system.l2cache.connectCPUSideBus(system.l2bus) system.l2cache.connectMemSideBus(system.membus) Everything else in the file stays the same! 
Now we have a complete configuration with a two-level cache hierarchy. If you run the current file, hello should now finish in 58513000 ticks. The full script can be found in the gem5 source at gem5/configs/learning_gem5/part1/two_level.py. Adding parameters to your script When performing experiments with gem5, you don’t want to edit your configuration script every time you want to test the system with different parameters. To get around this, you can add command-line parameters to your gem5 configuration script. Again, because the configuration script is just Python, you can use the Python libraries that support argument parsing. Although optparse is officially deprecated, many of the configuration scripts that ship with gem5 use it instead of argparse since gem5’s minimum Python version used to be 2.5. The minimum Python version is now 2.7, so argparse is a better option when writing new scripts that don’t need to interact with the current gem5 scripts. To get started using optparse, you can consult the online Python documentation. To add options to our two-level cache configuration, after importing our caches, let’s add some options. from optparse import OptionParser parser = OptionParser() parser.add_option('--l1i_size', help=\"L1 instruction cache size\") parser.add_option('--l1d_size', help=\"L1 data cache size\") parser.add_option('--l2_size', help=\"Unified L2 cache size\") (options, args) = parser.parse_args() Now, you can run build/X86/gem5.opt configs/tutorial/two_level_opts.py --help which will display the options you just added. Next, we need to pass these options onto the caches that we create in the configuration script. To do this, we’ll simply change two_level.py to pass the options into the caches as a parameter to their constructor and add an appropriate constructor, next. system.cpu.icache = L1ICache(options) system.cpu.dcache = L1DCache(options) ... system.l2cache = L2Cache(options) In caches.py, we need to add constructors (__init__ functions in Python) to each of our classes. Starting with our base L1 cache, we’ll just add an empty constructor since we don’t have any parameters which apply to the base L1 cache. However, we can’t forget to call the super class’s constructor in this case. If the call to the super class constructor is skipped, gem5’s SimObject attribute finding function will fail and the result will be “RuntimeError: maximum recursion depth exceeded” when you try to instantiate the cache object. So, in L1Cache we need to add the following after the static class members. def __init__(self, options=None): super(L1Cache, self).__init__() pass Next, in the L1ICache, we need to use the option that we created (l1i_size) to set the size. In the following code, there are guards for the cases where options is not passed to the L1ICache constructor and where no option was specified on the command line. 
In these cases, we'll just use the default we've already specified for the size.

``` {.sourceCode .python}
def __init__(self, options=None):
    super(L1ICache, self).__init__(options)
    if not options or not options.l1i_size:
        return
    self.size = options.l1i_size
```

We can use the same code for the L1DCache:

``` {.sourceCode .python}
def __init__(self, options=None):
    super(L1DCache, self).__init__(options)
    if not options or not options.l1d_size:
        return
    self.size = options.l1d_size
```

And the unified L2Cache:

``` {.sourceCode .python}
def __init__(self, options=None):
    super(L2Cache, self).__init__()
    if not options or not options.l2_size:
        return
    self.size = options.l2_size
```

With these changes, you can now pass the cache sizes into your script from the command line like below.

``` {.sourceCode .sh}
build/X86/gem5.opt configs/tutorial/two_level_opts.py --l2_size='1MB' --l1d_size='128kB'
```

```
gem5 Simulator System.  http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Sep  6 2015 14:17:02
gem5 started Sep  6 2015 15:06:51
gem5 executing on galapagos-09.cs.wisc.edu
command line: build/X86/gem5.opt ../tutorial/_static/scripts/part1/two_level_opts.py --l2_size=1MB --l1d_size=128kB

Global frequency set at 1000000000000 ticks per second
warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
Beginning simulation!
info: Entering event queue @ 0.  Starting simulation...
Hello world!
Exiting @ tick 56742000 because target called exit()
```

The full scripts can be found in the gem5 source at gem5/configs/learning_gem5/part1/caches.py and gem5/configs/learning_gem5/part1/two_level.py.", |
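Since the text above notes that argparse is the better choice for new scripts, a rough equivalent of the same three options using argparse might look like the following (a sketch; the option names simply mirror the optparse version):

``` {.sourceCode .python}
# A sketch of the same options using argparse, which is recommended for
# new scripts that don't need to interact with the shipped
# optparse-based configs.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--l1i_size', help="L1 instruction cache size")
parser.add_argument('--l1d_size', help="L1 data cache size")
parser.add_argument('--l2_size', help="Unified L2 cache size")

options = parser.parse_args()
# options.l1i_size, etc. can then be passed to the cache constructors
# exactly as with the optparse version above.
```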
| "url": "/cache_config/" |
| } |
| , |
| |
| "example-configs": { |
| "title": "Using the default configuration scripts", |
| "content": " authors Jason Lowe-PowerUsing the default configuration scriptsIn this chapter, we’ll explore using the default configuration scriptsthat come with gem5. gem5 ships with many configuration scripts thatallow you to use gem5 very quickly. However, a common pitfall is to usethese scripts without fully understanding what is being simulated. It isimportant when doing computer architecture research with gem5 to fullyunderstand the system you are simulating. This chapter will walk youthrough some important options and parts of the default configurationscripts.In the last few chapters you have created your own configuration scriptsfrom scratch. This is very powerful, as it allows you to specify everysingle system parameter. However, some systems are very complex to setup (e.g., a full-system ARM or x86 machine). Luckily, the gem5developers have provided many scripts to bootstrap the process ofbuilding systems.A tour of the directory structureAll of gem5’s configuration files can be found in configs/. Thedirectory structure is shown below:configs/boot:ammp.rcS halt.sh micro_tlblat2.rcS netperf-stream-udp-local.rcS...configs/common:Benchmarks.py cpu2000.py Options.pyCaches.py FSConfig.py O3_ARM_v7a.py SysPaths.pyCacheConfig.py CpuConfig.py MemConfig.py Simulation.pyconfigs/dram:sweep.pyconfigs/example:fs.py read_config.py ruby_mem_test.py ruby_random_test.pymemtest.py ruby_direct_test.py ruby_network_test.py se.pyconfigs/ruby:MESI_Three_Level.py MI_example.py MOESI_CMP_token.py Network_test.pyMESI_Two_Level.py MOESI_CMP_directory.py MOESI_hammer.py Ruby.pyconfigs/splash2:cluster.py run.pyconfigs/topologies:BaseTopology.py Cluster.py Crossbar.py MeshDirCorners.py Mesh.py Pt2Pt.py Torus.pyEach directory is briefly described below: boot/ These are rcS files which are used in full-system mode. These filesare loaded by the simulator after Linux boots and are executed bythe shell. Most of these are used to control benchmarks when runningin full-system mode. Some are utility functions, likehack_back_ckpt.rcS. These files are covered in more depth in thechapter on full-system simulation. common/ This directory contains a number of helper scripts and functions tocreate simulated systems. For instance, Caches.py is similar tothe caches.py and caches_opts.py files created in previouschapters. Options.py contains a variety of options that can be set on thecommand line. Like the number of CPUs, system clock, and many, manymore. This is a good place to look to see if the option you want tochange already has a command line parameter. CacheConfig.py contains the options and functions for settingcache parameters for the classic memory system. MemConfig.py provides some helper functions for setting the memorysystem. FSConfig.py contains the necessary functions to set up full-systemsimulation for many different kinds of systems. Full-systemsimulation is discussed further in it’s own chapter. Simulation.py contains many helper functions to set up and rungem5. A lot of the code contained in this file manages saving andrestoring checkpoints. The example configuration files below inexamples/ use the functions in this file to execute the gem5simulation. This file is quite complicated, but it also allows a lotof flexibility in how the simulation is run. dram/ Contains scripts to test DRAM. example/ This directory contains some example gem5 configuration scripts thatcan be used out-of-the-box to run gem5. Specifically, se.py andfs.py are quite useful. More on these files can be found in thenext section. 
There are also some other utility configuration scripts in this directory.

ruby/: This directory contains the configuration scripts for Ruby and its included cache coherence protocols. More details can be found in the chapter on Ruby.

splash2/: This directory contains scripts to run the splash2 benchmark suite with a few options to configure the simulated system.

topologies/: This directory contains the implementation of the topologies that can be used when creating the Ruby cache hierarchy. More details can be found in the chapter on Ruby.

Using se.py and fs.py
---------------------

In this section, I'll discuss some of the common options that can be passed on the command line to se.py and fs.py. More details on how to run full-system simulation can be found in the full-system simulation chapter. Here I'll discuss the options that are common to the two files.

Most of the options discussed in this section are found in Options.py and are registered in the function addCommonOptions. This section does not detail all of the options. To see all of the options, run the configuration script with --help, or read the script's source code.

First, let's simply run the hello world program without any parameters:

``` {.sourceCode .sh}
build/X86/gem5.opt configs/example/se.py --cmd=tests/test-progs/hello/bin/x86/linux/hello
```

And we get the following as output:

```
gem5 Simulator System.  http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Jan 14 2015 16:11:34
gem5 started Feb  2 2015 15:22:24
gem5 executing on mustardseed.cs.wisc.edu
command line: build/X86/gem5.opt configs/example/se.py --cmd=tests/test-progs/hello/bin/x86/linux/hello

Global frequency set at 1000000000000 ticks per second
warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
**** REAL SIMULATION ****
info: Entering event queue @ 0.  Starting simulation...
Hello world!
Exiting @ tick 5942000 because target called exit()
```

However, this isn't a very interesting simulation at all! By default, gem5 uses the atomic CPU and uses atomic memory accesses, so there's no real timing data reported! To confirm this, you can look at m5out/config.ini. The CPU is shown on line 46:

```
[system.cpu]
type=AtomicSimpleCPU
children=apic_clk_domain dtb interrupts isa itb tracer workload
branchPred=Null
checker=Null
clk_domain=system.cpu_clk_domain
cpu_id=0
do_checkpoint_insts=true
do_quiesce=true
do_statistics_insts=true
```

To actually run gem5 in timing mode, let's specify a CPU type. While we're at it, we can also specify sizes for the L1 caches.

``` {.sourceCode .sh}
build/X86/gem5.opt configs/example/se.py --cmd=tests/test-progs/hello/bin/x86/linux/hello --cpu-type=TimingSimpleCPU --l1d_size=64kB --l1i_size=16kB
```

```
gem5 Simulator System.  http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Jan 14 2015 16:11:34
gem5 started Feb  2 2015 15:26:57
gem5 executing on mustardseed.cs.wisc.edu
command line: build/X86/gem5.opt configs/example/se.py --cmd=tests/test-progs/hello/bin/x86/linux/hello --cpu-type=TimingSimpleCPU --l1d_size=64kB --l1i_size=16kB

Global frequency set at 1000000000000 ticks per second
warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
**** REAL SIMULATION ****
info: Entering event queue @ 0.  Starting simulation...
Hello world!
Exiting @ tick 344986500 because target called exit()
```

Now, let's check the config.ini file and make sure that these options propagated correctly to the final system. If you search m5out/config.ini for "cache", you'll find that no caches were created! Even though we specified the size of the caches, we didn't specify that the system should use caches, so they weren't created. The correct command line should be:

``` {.sourceCode .sh}
build/X86/gem5.opt configs/example/se.py --cmd=tests/test-progs/hello/bin/x86/linux/hello --cpu-type=TimingSimpleCPU --l1d_size=64kB --l1i_size=16kB --caches
```

```
gem5 Simulator System.  http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Jan 14 2015 16:11:34
gem5 started Feb  2 2015 15:29:20
gem5 executing on mustardseed.cs.wisc.edu
command line: build/X86/gem5.opt configs/example/se.py --cmd=tests/test-progs/hello/bin/x86/linux/hello --cpu-type=TimingSimpleCPU --l1d_size=64kB --l1i_size=16kB --caches

Global frequency set at 1000000000000 ticks per second
warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
**** REAL SIMULATION ****
info: Entering event queue @ 0.  Starting simulation...
Hello world!
Exiting @ tick 29480500 because target called exit()
```

On the last line, we see that the total time went from 344986500 ticks to 29480500, much faster! Looks like caches are probably enabled now. But, it's always a good idea to double check the config.ini file.

```
[system.cpu.dcache]
type=BaseCache
children=tags
addr_ranges=0:18446744073709551615
assoc=2
clk_domain=system.cpu_clk_domain
demand_mshr_reserve=1
eventq_index=0
forward_snoops=true
hit_latency=2
is_top_level=true
max_miss_count=0
mshrs=4
prefetch_on_access=false
prefetcher=Null
response_latency=2
sequential_access=false
size=65536
system=system
tags=system.cpu.dcache.tags
tgts_per_mshr=20
two_queue=false
write_buffers=8
cpu_side=system.cpu.dcache_port
mem_side=system.membus.slave[2]
```

Some common options for se.py and fs.py
---------------------------------------

All of the possible options are printed when you run:

``` {.sourceCode .sh}
build/X86/gem5.opt configs/example/se.py --help
```

Below are a few important options from that list.", |
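Since it is easy to forget a flag like --caches, a quick way to perform this check from a script rather than by searching m5out/config.ini by hand is sketched below. It assumes the default m5out output directory and that config.ini parses as standard INI (which it appears to; interpolation is disabled defensively):

``` {.sourceCode .python}
# A minimal sketch: verify that cache SimObjects actually appear in
# m5out/config.ini. Assumes the default m5out output directory.
from configparser import ConfigParser

config = ConfigParser(interpolation=None, strict=False)
config.read('m5out/config.ini')

cache_sections = [s for s in config.sections() if 'cache' in s]
if not cache_sections:
    print('No caches were created -- did you pass --caches?')
else:
    for name in cache_sections:
        # 'size' is in bytes in config.ini (e.g., 65536 for 64kB).
        print(name, 'size =', config.get(name, 'size', fallback='?'))
```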
| "url": "/example_configs/" |
| } |
| , |
| |
| "gem5-stats": { |
| "title": "Understanding gem5 statistics and output", |
| "content": " authors Jason Lowe-PowerUnderstanding gem5 statistics and outputIn addition to any information which your simulation script prints out,after running gem5, there are three files generated in a directorycalled m5out: config.ini Contains a list of every SimObject created for the simulation andthe values for its parameters. config.json The same as config.ini, but in json format. stats.txt A text representation of all of the gem5 statistics registered forthe simulation.Where these files are created can be controlled byconfig.iniThis file is the definitive version of what was simulated. All of theparameters for each SimObject that is simulated, whether they were setin the configuration scripts or the defaults were used, are shown inthis file.Below is pulled from the config.ini generated when the simple.pyconfiguration file from simple-config-chapter is run.[root]type=Rootchildren=systemeventq_index=0full_system=falsesim_quantum=0time_sync_enable=falsetime_sync_period=100000000000time_sync_spin_threshold=100000000[system]type=Systemchildren=clk_domain cpu dvfs_handler mem_ctrl membusboot_osflags=acache_line_size=64clk_domain=system.clk_domaindefault_p_state=UNDEFINEDeventq_index=0exit_on_work_items=falseinit_param=0kernel=kernel_addr_check=truekernel_extras=kvm_vm=Nullload_addr_mask=18446744073709551615load_offset=0mem_mode=timing...[system.membus]type=CoherentXBarchildren=snoop_filterclk_domain=system.clk_domaindefault_p_state=UNDEFINEDeventq_index=0forward_latency=4frontend_latency=3p_state_clk_gate_bins=20p_state_clk_gate_max=1000000000000p_state_clk_gate_min=1000point_of_coherency=truepoint_of_unification=truepower_model=response_latency=2snoop_filter=system.membus.snoop_filtersnoop_response_latency=4system=systemuse_default_range=falsewidth=16master=system.cpu.interrupts.pio system.cpu.interrupts.int_slave system.mem_ctrl.portslave=system.cpu.icache_port system.cpu.dcache_port system.cpu.interrupts.int_master system.system_port[system.membus.snoop_filter]type=SnoopFiltereventq_index=0lookup_latency=1max_capacity=8388608system=systemHere we see that at the beginning of the description of each SimObjectis first it’s name as created in the configuration file surrounded bysquare brackets (e.g., [system.membus]).Next, every parameter of the SimObject is shown with it’s value,including parameters not explicitly set in the configuration file. Forinstance, the configuration file sets the clock domain to be 1 GHz (1000ticks in this case). However, it did not set the cache line size (whichis 64 in the system) object.The config.ini file is a valuable tool for ensuring that you aresimulating what you think you’re simulating. There are many possibleways to set default values, and to override default values, in gem5. Itis a “best-practice” to always check the config.ini as a sanity checkthat values set in the configuration file are propagated to the actualSimObject instantiation.stats.txtgem5 has a flexible statistics generating system. gem5 statistics iscovered in some detail on the gem5 wikisite. Each instantiation of a SimObjecthas it’s own statistics. 
At the end of simulation, or when special statistic-dumping commands are issued, the current state of the statistics for all SimObjects is dumped to a file.

First, the statistics file contains general statistics about the execution:

```
---------- Begin Simulation Statistics ----------
sim_seconds      0.000508       # Number of seconds simulated
sim_ticks        507841000      # Number of ticks simulated
final_tick       507841000      # Number of ticks from beginning of simulation (restored from checkpoints and never reset)
sim_freq         1000000000000  # Frequency of simulated ticks
host_inst_rate   157744         # Simulator instruction rate (inst/s)
host_op_rate     284736         # Simulator op (including micro ops) rate (op/s)
host_tick_rate   14017997125    # Simulator tick rate (ticks/s)
host_mem_usage   642808         # Number of bytes of host memory used
host_seconds     0.04           # Real time elapsed on the host
sim_insts        5712           # Number of instructions simulated
sim_ops          10313          # Number of ops (including micro ops) simulated
```

The statistic dump begins with ---------- Begin Simulation Statistics ----------. There may be multiple of these in a single file if there are multiple statistic dumps during the gem5 execution. This is common for long-running applications, or when restoring from checkpoints.

Each statistic has a name (first column), a value (second column), and a description (last column preceded by #).

Most of the statistics are self-explanatory from their descriptions. A couple of important statistics are sim_seconds, which is the total simulated time for the simulation, sim_insts, which is the number of instructions committed by the CPU, and host_inst_rate, which tells you the performance of gem5.

Next, the SimObjects' statistics are printed. For instance, the memory controller statistics.
This has information like the bytes read by each component and the average bandwidth used by those components.

```
system.clk_domain.voltage_domain.voltage     1          # Voltage in Volts
system.clk_domain.clock                      1000       # Clock period in ticks
system.mem_ctrl.pwrStateResidencyTicks::UNDEFINED 507841000 # Cumulative time (in ticks) in various power states
system.mem_ctrl.bytes_read::cpu.inst         58264      # Number of bytes read from this memory
system.mem_ctrl.bytes_read::cpu.data         7167       # Number of bytes read from this memory
system.mem_ctrl.bytes_read::total            65431      # Number of bytes read from this memory
system.mem_ctrl.bytes_inst_read::cpu.inst    58264      # Number of instructions bytes read from this memory
system.mem_ctrl.bytes_inst_read::total       58264      # Number of instructions bytes read from this memory
system.mem_ctrl.bytes_written::cpu.data      7160       # Number of bytes written to this memory
system.mem_ctrl.bytes_written::total         7160       # Number of bytes written to this memory
system.mem_ctrl.num_reads::cpu.inst          7283       # Number of read requests responded to by this memory
system.mem_ctrl.num_reads::cpu.data          1084       # Number of read requests responded to by this memory
system.mem_ctrl.num_reads::total             8367       # Number of read requests responded to by this memory
system.mem_ctrl.num_writes::cpu.data         941        # Number of write requests responded to by this memory
system.mem_ctrl.num_writes::total            941        # Number of write requests responded to by this memory
system.mem_ctrl.bw_read::cpu.inst            114728823  # Total read bandwidth from this memory (bytes/s)
system.mem_ctrl.bw_read::cpu.data            14112685   # Total read bandwidth from this memory (bytes/s)
system.mem_ctrl.bw_read::total               128841507  # Total read bandwidth from this memory (bytes/s)
system.mem_ctrl.bw_inst_read::cpu.inst       114728823  # Instruction read bandwidth from this memory (bytes/s)
system.mem_ctrl.bw_inst_read::total          114728823  # Instruction read bandwidth from this memory (bytes/s)
system.mem_ctrl.bw_write::cpu.data           14098901   # Write bandwidth from this memory (bytes/s)
system.mem_ctrl.bw_write::total              14098901   # Write bandwidth from this memory (bytes/s)
system.mem_ctrl.bw_total::cpu.inst           114728823  # Total bandwidth to/from this memory (bytes/s)
system.mem_ctrl.bw_total::cpu.data           28211586   # Total bandwidth to/from this memory (bytes/s)
system.mem_ctrl.bw_total::total              142940409  # Total bandwidth to/from this memory (bytes/s)
```

Later in the file are the CPU statistics, which contain information on the number of syscalls, the number of branches, total committed instructions, etc.

```
system.cpu.dtb.walker.pwrStateResidencyTicks::UNDEFINED 507841000 # Cumulative time (in ticks) in various power states
system.cpu.dtb.rdAccesses                    1084       # TLB accesses on read requests
system.cpu.dtb.wrAccesses                    941        # TLB accesses on write requests
system.cpu.dtb.rdMisses                      9          # TLB misses on read requests
system.cpu.dtb.wrMisses                      7          # TLB misses on write requests
system.cpu.apic_clk_domain.clock             16000      # Clock period in ticks
system.cpu.interrupts.pwrStateResidencyTicks::UNDEFINED 507841000 # Cumulative time (in ticks) in various power states
system.cpu.itb.walker.pwrStateResidencyTicks::UNDEFINED 507841000 # Cumulative time (in ticks) in various power states
system.cpu.itb.rdAccesses                    0          # TLB accesses on read requests
system.cpu.itb.wrAccesses                    7284       # TLB accesses on write requests
system.cpu.itb.rdMisses                      0          # TLB misses on read requests
system.cpu.itb.wrMisses                      31         # TLB misses on write requests
system.cpu.workload.numSyscalls              11         # Number of system calls
system.cpu.pwrStateResidencyTicks::ON        507841000  # Cumulative time (in ticks) in various power states
system.cpu.numCycles                         507841     # number of cpu cycles simulated
system.cpu.numWorkItemsStarted               0          # number of work items this cpu started
system.cpu.numWorkItemsCompleted             0          # number of work items this cpu completed
system.cpu.committedInsts                    5712       # Number of instructions committed
system.cpu.committedOps                      10313      # Number of ops (including micro ops) committed
system.cpu.num_int_alu_accesses              10204      # Number of integer alu accesses
system.cpu.num_fp_alu_accesses               0          # Number of float alu accesses
system.cpu.num_vec_alu_accesses              0          # Number of vector alu accesses
system.cpu.num_func_calls                    221        # number of times a function call or return occured
system.cpu.num_conditional_control_insts     986        # number of instructions that are conditional controls
system.cpu.num_int_insts                     10204      # number of integer instructions
system.cpu.num_fp_insts                      0          # number of float instructions
system.cpu.num_vec_insts                     0          # number of vector instructions
system.cpu.num_int_register_reads            19293      # number of times the integer registers were read
system.cpu.num_int_register_writes           7976       # number of times the integer registers were written
system.cpu.num_fp_register_reads             0          # number of times the floating registers were read
system.cpu.num_fp_register_writes            0          # number of times the floating registers were written
system.cpu.num_vec_register_reads            0          # number of times the vector registers were read
system.cpu.num_vec_register_writes           0          # number of times the vector registers were written
system.cpu.num_cc_register_reads             7020       # number of times the CC registers were read
system.cpu.num_cc_register_writes            3825       # number of times the CC registers were written
system.cpu.num_mem_refs                      2025       # number of memory refs
system.cpu.num_load_insts                    1084       # Number of load instructions
system.cpu.num_store_insts                   941        # Number of store instructions
system.cpu.num_idle_cycles                   0          # Number of idle cycles
system.cpu.num_busy_cycles                   507841     # Number of busy cycles
system.cpu.not_idle_fraction                 1          # Percentage of non-idle cycles
system.cpu.idle_fraction                     0          # Percentage of idle cycles
system.cpu.Branches                          1306       # Number of branches fetched
```
", |
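Because the stats.txt layout is so regular (name, value, then a # description), it is easy to pull individual statistics out programmatically. A minimal sketch, based only on the column layout described above (it keeps the first value column, which is all scalar stats have):

``` {.sourceCode .python}
# A minimal sketch of parsing m5out/stats.txt into a dictionary,
# following the name/value/description layout described above.
def parse_stats(path='m5out/stats.txt'):
    stats = {}
    with open(path) as f:
        for line in f:
            line = line.split('#')[0].strip()  # drop the description
            parts = line.split()
            if len(parts) >= 2:
                stats[parts[0]] = parts[1]
    return stats

stats = parse_stats()
print('Simulated seconds:', stats.get('sim_seconds'))
print('Committed instructions:', stats.get('sim_insts'))
```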
| "url": "/gem5_stats/" |
| } |
| , |
| |
| "simple-config": { |
| "title": "Creating a simple configuration script", |
| "content": " authors Jason Lowe-PowerCreating a simple configuration scriptThis chapter of the tutorial will walk you through how to set up asimple simulation script for gem5 and to run gem5 for the first time.It’s assumed that you’ve completed the first chapter of the tutorial andhave successfully built gem5 with an executable build/X86/gem5.opt.Our configuration script is going to model a very simple system. We’llhave just one simple CPU core. This CPU core will be connected to asystem-wide memory bus. And we’ll have a single DDR3 memory channel,also connected to the memory bus.gem5 configuration scriptsThe gem5 binary takes, as a parameter, a python script which sets up andexecutes the simulation. In this script, you create a system tosimulate, create all of the components of the system, and specify all ofthe parameters for the system components. Then, from the script, you canbegin the simulation.This script is completely user-defined. You can choose to use any validPython code in the configuration scripts. This book provides on exampleof a style that relies heavily classes and inheritance in Python. As agem5 user, it’s up to you how simple or complicated to make yourconfiguration scripts.There are a number of example configuration scripts that ship with gem5in configs/examples. Most of these scripts are all-encompassing andallow users to specify almost all options on the command line. Insteadof starting with these complex script, in this book we are going tostart with the most simple script that can run gem5 and build fromthere. Hopefully, by the end of this section you’ll have a good idea ofhow simulation scripts work. An aside on SimObjects gem5’s modular design is built around the SimObject type. Most ofthe components in the simulated system are SimObjects: CPUs, caches,memory controllers, buses, etc. gem5 exports all of these objects fromtheir C++ implementation to python. Thus, from the pythonconfiguration script you can create any SimObject, set its parameters,and specify the interactions between SimObjects. See http://www.gem5.org/SimObjects for more information.Creating a config fileLet’s start by creating a new config file and opening it:mkdir configs/tutorialtouch configs/tutorial/simple.pyThis is just a normal python file that will be executed by the embeddedpython in the gem5 executable. Therefore, you can use any features andlibraries available in python.The first thing we’ll do in this file is import the m5 library and allSimObjects that we’ve compiled.import m5from m5.objects import *Next, we’ll create the first SimObject: the system that we are going tosimulate. The System object will be the parent of all the otherobjects in our simulated system. The System object contains a lot offunctional (not timing-level) information, like the physical memoryranges, the root clock domain, the root voltage domain, the kernel (infull-system simulation), etc. To create the system SimObject, we simplyinstantiate it like a normal python class:system = System()Now that we have a reference to the system we are going to simulate,let’s set the clock on the system. We first have to create a clockdomain. Then we can set the clock frequency on that domain. 
Setting parameters on a SimObject is exactly the same as setting members of an object in python, so we can simply set the clock to 1 GHz, for instance. Finally, we have to specify a voltage domain for this clock domain. Since we don't care about system power right now, we'll just use the default options for the voltage domain.

``` {.sourceCode .python}
system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '1GHz'
system.clk_domain.voltage_domain = VoltageDomain()
```

Once we have a system, let's set up how the memory will be simulated. We are going to use timing mode for the memory simulation. You will almost always use timing mode for the memory simulation, except in special cases like fast-forwarding and restoring from a checkpoint. We will also set up a single memory range of size 512 MB, a very small system. Note that in the python configuration scripts, whenever a size is required you can specify that size in common vernacular and units like '512MB'. Similarly, with time you can use time units (e.g., '5ns'). These will automatically be converted to a common representation, respectively.

``` {.sourceCode .python}
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('512MB')]
```

Now, we can create a CPU. We'll start with the most simple timing-based CPU in gem5, TimingSimpleCPU. This CPU model executes each instruction in a single clock cycle, except for memory requests, which flow through the memory system. To create the CPU you can simply instantiate the object:

``` {.sourceCode .python}
system.cpu = TimingSimpleCPU()
```

Next, we're going to create the system-wide memory bus:

``` {.sourceCode .python}
system.membus = SystemXBar()
```

Now that we have a memory bus, let's connect the cache ports on the CPU to it. In this case, since the system we want to simulate doesn't have any caches, we will connect the I-cache and D-cache ports directly to the membus.

``` {.sourceCode .python}
system.cpu.icache_port = system.membus.slave
system.cpu.dcache_port = system.membus.slave
```

An aside on gem5 ports: To connect memory system components together, gem5 uses a port abstraction. Each memory object can have two kinds of ports, master ports and slave ports. Requests are sent from a master port to a slave port, and responses are sent from a slave port to a master port. When connecting ports, you must connect a master port to a slave port.

Connecting ports together is easy to do from the python configuration files. You can simply set the master port = to the slave port and they will be connected. For instance:

``` {.sourceCode .python}
memobject1.master = memobject2.slave
```

The master and slave can be on either side of the = and the same connection will be made. After making the connection, the master can send requests to the slave port. There is a lot of magic going on behind the scenes to set up the connection, the details of which are unimportant for most users. We will discuss ports and MemObject in more detail in the memory-object chapter.

Next, we need to connect up a few other ports to make sure that our system will function correctly. We need to create an I/O controller on the CPU and connect it to the memory bus. Also, we need to connect a special port in the system up to the membus. This port is a functional-only port to allow the system to read and write memory. Connecting the PIO and interrupt ports to the memory bus is an x86-specific requirement.
Other ISAs (e.g., ARM) do not require these 3 extra lines.

``` {.sourceCode .python}
system.cpu.createInterruptController()
system.cpu.interrupts[0].pio = system.membus.master
system.cpu.interrupts[0].int_master = system.membus.slave
system.cpu.interrupts[0].int_slave = system.membus.master

system.system_port = system.membus.slave
```

Next, we need to create a memory controller and connect it to the membus. For this system, we'll use a simple DDR3 controller and it will be responsible for the entire memory range of our system.

``` {.sourceCode .python}
system.mem_ctrl = DDR3_1600_8x8()
system.mem_ctrl.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master
```

After those final connections, we've finished instantiating our simulated system!

Next, we need to set up the process we want the CPU to execute. Since we are executing in syscall emulation mode (SE mode), we will just point the CPU at the compiled executable. We'll execute a simple "Hello world" program. There's already one that is compiled that ships with gem5, so we'll use that. You can specify any application built for x86 that's been statically compiled.

Full system vs syscall emulation: gem5 can run in two different modes called "syscall emulation" and "full system", or SE and FS modes. In full system mode (covered later in the full-system part), gem5 emulates the entire hardware system and runs an unmodified kernel. Full system mode is similar to running a virtual machine.

Syscall emulation mode, on the other hand, does not emulate all of the devices in a system and focuses on simulating the CPU and memory system. Syscall emulation is much easier to configure since you are not required to instantiate all of the hardware devices required in a real system. However, syscall emulation only emulates Linux system calls, and thus only models user-mode code.

If you do not need to model the operating system for your research questions, and you want extra performance, you should use SE mode. However, if you need high-fidelity modeling of the system, or OS interaction like page table walks is important, then you should use FS mode.

First, we have to create the process (another SimObject). Then we set the process's command to the command we want to run. This is a list similar to argv, with the executable in the first position and the arguments to the executable in the rest of the list. Then we set the CPU to use the process as its workload, and finally create the functional execution contexts in the CPU.

``` {.sourceCode .python}
process = Process()
process.cmd = ['tests/test-progs/hello/bin/x86/linux/hello']
system.cpu.workload = process
system.cpu.createThreads()
```

The final thing we need to do is instantiate the system and begin execution. First, we create the Root object. Then we instantiate the simulation. The instantiation process goes through all of the SimObjects we've created in python and creates the C++ equivalents.

As a note, you don't have to instantiate the python class then specify the parameters explicitly as member variables. You can also pass the parameters as named arguments, like the Root object below.

``` {.sourceCode .python}
root = Root(full_system = False, system = system)
m5.instantiate()
```

Finally, we can kick off the actual simulation!
As a side note, gem5 now uses Python 3-style print functions, so print is no longer a statement and must be called as a function.

``` {.sourceCode .python}
print("Beginning simulation!")
exit_event = m5.simulate()
```

And once simulation finishes, we can inspect the state of the system.

``` {.sourceCode .python}
print('Exiting @ tick {} because {}'
      .format(m5.curTick(), exit_event.getCause()))
```

Running gem5
------------

Now that we've created a simple simulation script (the full version of which can be found at gem5/configs/learning_gem5/part1/simple.py) we're ready to run gem5. gem5 can take many parameters, but requires just one positional argument, the simulation script. So, we can simply run gem5 from the root gem5 directory as:

``` {.sourceCode .sh}
build/X86/gem5.opt configs/tutorial/simple.py
```

The output should be:

```
gem5 Simulator System.  http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Mar 16 2018 10:24:24
gem5 started Mar 16 2018 15:53:27
gem5 executing on amarillo, pid 41697
command line: build/X86/gem5.opt configs/tutorial/simple.py

Global frequency set at 1000000000000 ticks per second
warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
0: system.remote_gdb: listening for remote gdb on port 7000
Beginning simulation!
info: Entering event queue @ 0.  Starting simulation...
Hello world!
Exiting @ tick 507841000 because exiting with last active thread context
```

Parameters in the configuration file can be changed and the results should be different. For instance, if you double the system clock, the simulation should finish faster. Or, if you change the DDR controller to DDR4, the performance should be better.

Additionally, you can change the CPU model to MinorCPU to model an in-order CPU, or DerivO3CPU to model an out-of-order CPU. However, note that DerivO3CPU currently does not work with simple.py, because DerivO3CPU requires a system with separate instruction and data caches (DerivO3CPU does work with the configuration in the next section).

Next, we will add caches to our configuration file to model a more complex system.", |
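Putting the pieces of this chapter together, the whole script looks roughly like the following (a sketch assembled from the snippets above; the version shipped as configs/learning_gem5/part1/simple.py is the authoritative one):

``` {.sourceCode .python}
# simple.py, assembled from the snippets in this chapter (a sketch).
import m5
from m5.objects import *

system = System()

system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '1GHz'
system.clk_domain.voltage_domain = VoltageDomain()

system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('512MB')]

system.cpu = TimingSimpleCPU()
system.membus = SystemXBar()

# No caches in this system: connect the CPU ports straight to the membus.
system.cpu.icache_port = system.membus.slave
system.cpu.dcache_port = system.membus.slave

# x86-specific interrupt/PIO plumbing, plus the functional system port.
system.cpu.createInterruptController()
system.cpu.interrupts[0].pio = system.membus.master
system.cpu.interrupts[0].int_master = system.membus.slave
system.cpu.interrupts[0].int_slave = system.membus.master
system.system_port = system.membus.slave

system.mem_ctrl = DDR3_1600_8x8()
system.mem_ctrl.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

process = Process()
process.cmd = ['tests/test-progs/hello/bin/x86/linux/hello']
system.cpu.workload = process
system.cpu.createThreads()

root = Root(full_system=False, system=system)
m5.instantiate()

print("Beginning simulation!")
exit_event = m5.simulate()
print('Exiting @ tick {} because {}'
      .format(m5.curTick(), exit_event.getCause()))
```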
| "url": "/simple_config/" |
| } |
| , |
| |
| "debugging": { |
| "title": "Debugging gem5", |
| "content": " authors Jason Lowe-PowerDebugging gem5In the previous chapters <hello-simobject-chapter> we covered how tocreate a very simple SimObject. In this chapter, we will replace thesimple print to stdout with gem5’s debugging support.gem5 provides support for printf-style tracing/debugging of your codevia debug flags. These flags allow every component to have manydebug-print statements, without all of them enabled at the same time.When running gem5, you can specify which debug flags to enable from thecommand line.Using debug flagsFor instance, when running the first simple.py script fromsimple-config-chapter, if you enable the DRAM debug flag, you get thefollowing output. Note that this generates a lot of output to theconsole (about 7 MB).``` {.sourceCode .sh}build/X86/gem5.opt –debug-flags=DRAM configs/learning_gem5/part1/simple.py | head -n 50 gem5 Simulator System. http://gem5.org DRAM device capacity (gem5 is copyrighted software; use the --copyright option for details. gem5 compiled Jan 3 2017 16:03:38 gem5 started Jan 3 2017 16:09:53 gem5 executing on chinook, pid 19223 command line: build/X86/gem5.opt --debug-flags=DRAM configs/learning_gem5/part1/simple.py Global frequency set at 1000000000000 ticks per second 0: system.mem_ctrl: Memory capacity 536870912 (536870912) bytes 0: system.mem_ctrl: Row buffer size 8192 bytes with 128 columns per row buffer 0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000 Beginning simulation! info: Entering event queue @ 0. Starting simulation... 0: system.mem_ctrl: recvTimingReq: request ReadReq addr 400 size 8 0: system.mem_ctrl: Read queue limit 32, current size 0, entries needed 1 0: system.mem_ctrl: Address: 400 Rank 0 Bank 0 Row 0 0: system.mem_ctrl: Read queue limit 32, current size 0, entries needed 1 0: system.mem_ctrl: Adding to read queue 0: system.mem_ctrl: Request scheduled immediately 0: system.mem_ctrl: Single request, going to a free rank 0: system.mem_ctrl: Timing access to addr 400, rank/bank/row 0 0 0 0: system.mem_ctrl: Activate at tick 0 0: system.mem_ctrl: Activate bank 0, rank 0 at tick 0, now got 1 active 0: system.mem_ctrl: Access to 400, ready at 46250 bus busy until 46250. 46250: system.mem_ctrl: processRespondEvent(): Some req has reached its readyTime 46250: system.mem_ctrl: number of read entries for rank 0 is 0 46250: system.mem_ctrl: Responding to Address 400.. 46250: system.mem_ctrl: Done 77000: system.mem_ctrl: recvTimingReq: request ReadReq addr 400 size 8 77000: system.mem_ctrl: Read queue limit 32, current size 0, entries needed 1 77000: system.mem_ctrl: Address: 400 Rank 0 Bank 0 Row 0 77000: system.mem_ctrl: Read queue limit 32, current size 0, entries needed 1 77000: system.mem_ctrl: Adding to read queue 77000: system.mem_ctrl: Request scheduled immediately 77000: system.mem_ctrl: Single request, going to a free rank 77000: system.mem_ctrl: Timing access to addr 400, rank/bank/row 0 0 0 77000: system.mem_ctrl: Access to 400, ready at 101750 bus busy until 101750. 101750: system.mem_ctrl: processRespondEvent(): Some req has reached its readyTime 101750: system.mem_ctrl: number of read entries for rank 0 is 0 101750: system.mem_ctrl: Responding to Address 400.. 
46250: system.mem_ctrl: Done
77000: system.mem_ctrl: recvTimingReq: request ReadReq addr 400 size 8
77000: system.mem_ctrl: Read queue limit 32, current size 0, entries needed 1
77000: system.mem_ctrl: Address: 400 Rank 0 Bank 0 Row 0
77000: system.mem_ctrl: Read queue limit 32, current size 0, entries needed 1
77000: system.mem_ctrl: Adding to read queue
77000: system.mem_ctrl: Request scheduled immediately
77000: system.mem_ctrl: Single request, going to a free rank
77000: system.mem_ctrl: Timing access to addr 400, rank/bank/row 0 0 0
77000: system.mem_ctrl: Access to 400, ready at 101750 bus busy until 101750.
101750: system.mem_ctrl: processRespondEvent(): Some req has reached its readyTime
101750: system.mem_ctrl: number of read entries for rank 0 is 0
101750: system.mem_ctrl: Responding to Address 400..
101750: system.mem_ctrl: Done
132000: system.mem_ctrl: recvTimingReq: request ReadReq addr 400 size 8
132000: system.mem_ctrl: Read queue limit 32, current size 0, entries needed 1
132000: system.mem_ctrl: Address: 400 Rank 0 Bank 0 Row 0
132000: system.mem_ctrl: Read queue limit 32, current size 0, entries needed 1
132000: system.mem_ctrl: Adding to read queue
132000: system.mem_ctrl: Request scheduled immediately
132000: system.mem_ctrl: Single request, going to a free rank
132000: system.mem_ctrl: Timing access to addr 400, rank/bank/row 0 0 0
132000: system.mem_ctrl: Access to 400, ready at 156750 bus busy until 156750.
156750: system.mem_ctrl: processRespondEvent(): Some req has reached its readyTime
156750: system.mem_ctrl: number of read entries for rank 0 is 0
```

Or, you may want to debug based on the exact instruction the CPU is executing. For this, the `Exec` debug flag may be useful. This debug flag shows details of how each instruction is executed by the simulated CPU.

``` {.sourceCode .sh}
build/X86/gem5.opt --debug-flags=Exec configs/learning_gem5/part1/simple.py | head -n 50
gem5 Simulator System.  http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Jan  3 2017 16:03:38
gem5 started Jan  3 2017 16:11:47
gem5 executing on chinook, pid 19234
command line: build/X86/gem5.opt --debug-flags=Exec configs/learning_gem5/part1/simple.py

Global frequency set at 1000000000000 ticks per second
0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
warn: ClockedObject: More than one power state change request encountered within the same simulation tick
Beginning simulation!
info: Entering event queue @ 0.  Starting simulation...
77000: system.cpu T0 : @_start : xor rbp, rbp
77000: system.cpu T0 : @_start.0 : XOR_R_R : xor rbp, rbp, rbp : IntAlu : D=0x0000000000000000
132000: system.cpu T0 : @_start+3 : mov r9, rdx
132000: system.cpu T0 : @_start+3.0 : MOV_R_R : mov r9, r9, rdx : IntAlu : D=0x0000000000000000
187000: system.cpu T0 : @_start+6 : pop rsi
187000: system.cpu T0 : @_start+6.0 : POP_R : ld t1, SS:[rsp] : MemRead : D=0x0000000000000001 A=0x7fffffffee30
250000: system.cpu T0 : @_start+6.1 : POP_R : addi rsp, rsp, 0x8 : IntAlu : D=0x00007fffffffee38
250000: system.cpu T0 : @_start+6.2 : POP_R : mov rsi, rsi, t1 : IntAlu : D=0x0000000000000001
360000: system.cpu T0 : @_start+7 : mov rdx, rsp
360000: system.cpu T0 : @_start+7.0 : MOV_R_R : mov rdx, rdx, rsp : IntAlu : D=0x00007fffffffee38
415000: system.cpu T0 : @_start+10 : and rax, 0xfffffffffffffff0
415000: system.cpu T0 : @_start+10.0 : AND_R_I : limm t1, 0xfffffffffffffff0 : IntAlu : D=0xfffffffffffffff0
415000: system.cpu T0 : @_start+10.1 : AND_R_I : and rsp, rsp, t1 : IntAlu : D=0x0000000000000000
470000: system.cpu T0 : @_start+14 : push rax
470000: system.cpu T0 : @_start+14.0 : PUSH_R : st rax, SS:[rsp + 0xfffffffffffffff8] : MemWrite : D=0x0000000000000000 A=0x7fffffffee28
491000: system.cpu T0 : @_start+14.1 : PUSH_R : subi rsp, rsp, 0x8 : IntAlu : D=0x00007fffffffee28
546000: system.cpu T0 : @_start+15 : push rsp
546000: system.cpu T0 : @_start+15.0 : PUSH_R : st rsp, SS:[rsp + 0xfffffffffffffff8] : MemWrite : D=0x00007fffffffee28 A=0x7fffffffee20
567000: system.cpu T0 : @_start+15.1 : PUSH_R : subi rsp, rsp, 0x8 : IntAlu : D=0x00007fffffffee20
622000: system.cpu T0 : @_start+16 : mov r15, 0x40a060
622000: system.cpu T0 : @_start+16.0 : MOV_R_I : limm r8, 0x40a060 : IntAlu : D=0x000000000040a060
732000: system.cpu T0 : @_start+23 : mov rdi, 0x409ff0
732000: system.cpu T0 : @_start+23.0 : MOV_R_I : limm rcx, 0x409ff0 : IntAlu : D=0x0000000000409ff0
842000: system.cpu T0 : @_start+30 : mov rdi, 0x400274
842000: system.cpu T0 : @_start+30.0 : MOV_R_I : limm rdi, 0x400274 : IntAlu : D=0x0000000000400274
952000: system.cpu T0 : @_start+37 : call 0x9846
952000: system.cpu T0 : @_start+37.0 : CALL_NEAR_I : limm t1, 0x9846 : IntAlu : D=0x0000000000009846
952000: system.cpu T0 : @_start+37.1 : CALL_NEAR_I : rdip t7, %ctrl153, : IntAlu : D=0x00000000004001ba
952000: system.cpu T0 : @_start+37.2 : CALL_NEAR_I : st t7, SS:[rsp + 0xfffffffffffffff8] : MemWrite : D=0x00000000004001ba A=0x7fffffffee18
973000: system.cpu T0 : @_start+37.3 : CALL_NEAR_I : subi rsp, rsp, 0x8 : IntAlu : D=0x00007fffffffee18
973000: system.cpu T0 : @_start+37.4 : CALL_NEAR_I : wrip , t7, t1 : IntAlu :
1042000: system.cpu T0 : @__libc_start_main : push r15
1042000: system.cpu T0 : @__libc_start_main.0 : PUSH_R : st r15, SS:[rsp + 0xfffffffffffffff8] : MemWrite : D=0x0000000000000000 A=0x7fffffffee10
1063000: system.cpu T0 : @__libc_start_main.1 : PUSH_R : subi rsp, rsp, 0x8 : IntAlu : D=0x00007fffffffee10
1118000: system.cpu T0 : @__libc_start_main+2 : movsxd rax, rsi
1118000: system.cpu T0 : @__libc_start_main+2.0 : MOVSXD_R_R : sexti rax, rsi, 0x1f : IntAlu : D=0x0000000000000001
1173000: system.cpu T0 : @__libc_start_main+5 : mov r15, r9
1173000: system.cpu T0 : @__libc_start_main+5.0 : MOV_R_R : mov r15, r15, r9 : IntAlu : D=0x0000000000000000
1228000: system.cpu T0 : @__libc_start_main+8 : push r14
```

In fact, the Exec flag is actually an agglomeration of multiple debug flags. You can see this, and all of the available debug flags, by running gem5 with the --debug-help parameter.

``` {.sourceCode .sh}
build/X86/gem5.opt --debug-help
Base Flags:
    Activity: None
    AddrRanges: None
    Annotate: State machine annotation debugging
    AnnotateQ: State machine annotation queue debugging
    AnnotateVerbose: Dump all state machine annotation details
    BaseXBar: None
    Branch: None
    Bridge: None
    CCRegs: None
    CMOS: Accesses to CMOS devices
    Cache: None
    CachePort: None
    CacheRepl: None
    CacheTags: None
    CacheVerbose: None
    Checker: None
    Checkpoint: None
    ClockDomain: None
    ...
Compound Flags:
    AnnotateAll: All Annotation flags
        Annotate, AnnotateQ, AnnotateVerbose
    CacheAll: None
        Cache, CachePort, CacheRepl, CacheVerbose, HWPrefetch
    DiskImageAll: None
        DiskImageRead, DiskImageWrite
    ...
    XBar: None
        BaseXBar, CoherentXBar, NoncoherentXBar, SnoopFilter
```

Adding a new debug flag
-----------------------

In the previous chapters, we used a simple `std::cout` to print from our SimObject. While it is possible to use the normal C/C++ I/O in gem5, it is highly discouraged. So, we are now going to replace this and use gem5's debugging facilities instead.

When creating a new debug flag, we first have to declare it in a SConscript file. Add the following to the SConscript file in the directory with your hello object code (src/learning_gem5/).

``` {.sourceCode .python}
DebugFlag('Hello')
```

This declares a debug flag of "Hello". Now, we can use this in debug statements in our SimObject.

By declaring the flag in the SConscript file, a debug header is automatically generated that allows us to use the debug flag. The header file is in the debug directory and has the same name (and capitalization) as what we declare in the SConscript file.
Therefore, we need to include the automatically generated header file in any files where we plan to use the debug flag.

In the hello_object.cc file, we need to include the header file.

``` {.sourceCode .c++}
#include "debug/Hello.hh"
```

Now that we have included the necessary header file, let's replace the `std::cout` call with a debug statement like so.

``` {.sourceCode .c++}
DPRINTF(Hello, "Created the hello object\n");
```

DPRINTF is a C++ macro. The first parameter is a debug flag that has been declared in a SConscript file. We can use the flag Hello since we declared it in the src/learning_gem5/SConscript file. The rest of the arguments are variable and can be anything you would pass to a printf statement.

Now, if you recompile gem5 and run it with the "Hello" debug flag, you get the following result.

``` {.sourceCode .sh}
build/X86/gem5.opt --debug-flags=Hello configs/learning_gem5/part2/run_hello.py
gem5 Simulator System.  http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Jan  4 2017 09:40:10
gem5 started Jan  4 2017 09:41:01
gem5 executing on chinook, pid 29078
command line: build/X86/gem5.opt --debug-flags=Hello configs/learning_gem5/part2/run_hello.py

Global frequency set at 1000000000000 ticks per second
      0: hello: Created the hello object
Beginning simulation!
info: Entering event queue @ 0.  Starting simulation...
Exiting @ tick 18446744073709551615 because simulate() limit reached
```

You can find the updated SConscript file here (../_static/scripts/part2/debugging/SConscript) and the updated hello object code here (../_static/scripts/part2/debugging/hello_object.cc).

Debug output
------------

For each dynamic DPRINTF execution, three things are printed to stdout. First, the current tick when the DPRINTF is executed. Second, the name of the SimObject that called DPRINTF. This name is usually the Python variable name from the Python config file. However, the name is whatever the SimObject name() function returns. Finally, you see whatever format string you passed to the DPRINTF function.

You can control where the debug output goes with the --debug-file parameter. By default, all of the debugging output is printed to stdout. However, you can redirect the output to any file. The file is stored relative to the main gem5 output directory, not the current working directory.

Using functions other than DPRINTF
----------------------------------

DPRINTF is the most commonly used debugging function in gem5. However, gem5 provides a number of other functions that are useful in specific circumstances, such as DDUMP and DPRINTFR. There are also variants of these functions that do not take a flag as a parameter; those statements will always print whenever debugging is enabled.

All of these functions are only enabled if you compile gem5 in "opt" or "debug" mode. All other modes use empty placeholder macros for the above functions. Therefore, if you want to use debug flags, you must use either "gem5.opt" or "gem5.debug".", |
| "url": "/debugging/" |
| } |
| , |
| |
| "environment": { |
| "title": "Setting up your development environment", |
| "content": " authors Jason Lowe-PowerSetting up your development environmentThis is going to talk about getting started developing gem5.gem5-style guidelinesWhen modifying any open source project, it is important to follow theproject’s style guidelines. Details on gem5 style can be found on thegem5 wiki page.To help you conform to the style guidelines, gem5 includes a scriptwhich runs whenever you commit a changeset in git. This script should beautomatically added to your .git/config file by SCons the first time youbuild gem5. Please do not ignore these warnings/errors. However, in therare case where you are trying to commit a file that doesn’t conform tothe gem5 style guidelines (e.g., something from outside the gem5 sourcetree) you can use the git option --no-verify to skip running the stylechecker.The key takeaways from the style guide are: Use 4 spaces, not tabs Sort the includes Use capitalized camel case for class names, camel case for membervariables, and underscores for local variables. Document your codegit branchesMost people developing with gem5 use the branch feature of git to tracktheir changes. This makes it quite simple to commit your changes back togem5. Additionally, using branches can make it easier to update gem5with new changes that other people make while keeping your own changesseparate. The Git book has a greatchapterdescribing the details of how to use branches.", |
| "url": "/environment/" |
| } |
| , |
| |
| "events": { |
| "title": "Event-driven programming", |
| "content": " authors Jason Lowe-PowerEvent-driven programminggem5 is an event-driven simulator. In this chapter, we will explore howto create and schedule events. We will be building from the simpleHelloObject from hello-simobject-chapter.Creating a simple event callbackIn gem5’s event-driven model, each event has a callback function inwhich the event is processed. Generally, this is a class that inheritsfrom :cppEvent. However, gem5 provides a wrapper function for creatingsimple events.In the header file for our HelloObject, we simply need to declare anew function that we want to execute every time the event fires(processEvent()). This function must take no parameters and returnnothing.Next, we add an Event instance. In this case, we will use anEventFunctionWrapper which allows us to execute any function.We also add a startup() function that will be explained below.``` {.sourceCode .c++}class HelloObject : public SimObject{ private: void processEvent();EventFunctionWrapper event;public: HelloObject(HelloObjectParams *p);void startup(); }; ```Next, we must construct this event in the constructor of HelloObject.The EventFuntionWrapper takes two parameters, a function to executeand a name. The name is usually the name of the SimObject that owns theevent. When printing the name, there will be an automatic“.wrapped_function_event” appended to the end of the name.The first parameter is simply a function that takes no parameters andhas no return value (std::function<void(void)>). Usually, this is asimple lambda function that calls a member function. However, it can beany function you want. Below, we captute this in the lambda ([this])so we can call member functions of the instance of the class.``` {.sourceCode .c++}HelloObject::HelloObject(HelloObjectParams *params) : SimObject(params), event([this]{processEvent();}, name()){ DPRINTF(Hello, “Created the hello object\\n”);}We also must define the implementation of the process function. In thiscase, we'll simply print something if we are debugging.``` {.sourceCode .c++}voidHelloObject::processEvent(){ DPRINTF(Hello, \"Hello world! Processing the event!\\n\");}Scheduling eventsFinally, for the event to be processed, we first have to schedule theevent. For this we use the :cppschedule function. This functionschedules some instance of an Event for some time in the future(event-driven simulation does not allow events to execute in the past).We will initially schedule the event in the startup() function weadded to the HelloObject class. The startup() function is whereSimObjects are allowed to schedule internal events. It does not getexecuted until the simulation begins for the first time (i.e. thesimulate() function is called from a Python config file).``` {.sourceCode .c++}voidHelloObject::startup(){ schedule(event, 100);}Here, we simply schedule the event to execute at tick 100. Normally, youwould use some offset from `curTick()`, but since we know the startup()function is called when the time is currently 0, we can use an explicittick value.The output when you run gem5 with the \"Hello\" debug flag is now gem5 Simulator System. http://gem5.org gem5 is copyrighted software; use the --copyright option for details. gem5 compiled Jan 4 2017 11:01:46 gem5 started Jan 4 2017 13:41:38 gem5 executing on chinook, pid 1834 command line: build/X86/gem5.opt --debug-flags=Hello configs/learning_gem5/part2/run_hello.py Global frequency set at 1000000000000 ticks per second 0: hello: Created the hello object Beginning simulation! info: Entering event queue @ 0. 
    100: hello: Hello world! Processing the event!
Exiting @ tick 18446744073709551615 because simulate() limit reached
```

More event scheduling
---------------------

We can also schedule new events within an event process action. For instance, we are going to add a latency parameter to the HelloObject and a parameter for how many times to fire the event. In the next chapter we will make these parameters accessible from the Python config files.

To the HelloObject class declaration, add a member variable for the latency and number of times to fire.

``` {.sourceCode .c++}
class HelloObject : public SimObject
{
  private:
    void processEvent();

    EventFunctionWrapper event;

    Tick latency;

    int timesLeft;

  public:
    HelloObject(HelloObjectParams *p);

    void startup();
};
```

Then, in the constructor add default values for the latency and timesLeft.

``` {.sourceCode .c++}
HelloObject::HelloObject(HelloObjectParams *params) :
    SimObject(params), event([this]{processEvent();}, name()),
    latency(100), timesLeft(10)
{
    DPRINTF(Hello, "Created the hello object\n");
}
```

Finally, update startup() and processEvent().

``` {.sourceCode .c++}
void
HelloObject::startup()
{
    schedule(event, latency);
}

void
HelloObject::processEvent()
{
    timesLeft--;
    DPRINTF(Hello, "Hello world! Processing the event! %d left\n", timesLeft);

    if (timesLeft <= 0) {
        DPRINTF(Hello, "Done firing!\n");
    } else {
        schedule(event, curTick() + latency);
    }
}
```

Now, when we run gem5, the event should fire 10 times, and the simulation will end after 1000 ticks. The output should now look like the following.

```
gem5 Simulator System.  http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Jan  4 2017 13:53:35
gem5 started Jan  4 2017 13:54:11
gem5 executing on chinook, pid 2326
command line: build/X86/gem5.opt --debug-flags=Hello configs/learning_gem5/part2/run_hello.py

Global frequency set at 1000000000000 ticks per second
      0: hello: Created the hello object
Beginning simulation!
info: Entering event queue @ 0.  Starting simulation...
    100: hello: Hello world! Processing the event! 9 left
    200: hello: Hello world! Processing the event! 8 left
    300: hello: Hello world! Processing the event! 7 left
    400: hello: Hello world! Processing the event! 6 left
    500: hello: Hello world! Processing the event! 5 left
    600: hello: Hello world! Processing the event! 4 left
    700: hello: Hello world! Processing the event! 3 left
    800: hello: Hello world! Processing the event! 2 left
    900: hello: Hello world! Processing the event! 1 left
   1000: hello: Hello world! Processing the event! 0 left
   1000: hello: Done firing!
Exiting @ tick 18446744073709551615 because simulate() limit reached
```

You can find the updated header file here (../_static/scripts/part2/events/hello_object.hh) and the implementation file here (../_static/scripts/part2/events/hello_object.cc).", |
| "url": "/events/" |
| } |
| , |
| |
| "helloobject": { |
| "title": "Creating a very simple SimObject", |
| "content": " authors Jason Lowe-PowerCreating a very simple SimObjectAlmost all objects in gem5 inherit from the base SimObject type.SimObjects export the main interfaces to all objects in gem5. SimObjectsare wrapped C++ objects that are accessible from the Pythonconfiguration scripts.SimObjects can have many parameters, which are set via the Pythonconfiguration files. In addition to simple parameters like integers andfloating point numbers, they can also have other SimObjects asparameters. This allows you to create complex system hierarchies, likereal machines.In this chapter, we will walk through creating a simple “HelloWorld”SimObject. The goal is to introduce you to how SimObjects are createdand the required boilerplate code for all SimObjects. We will alsocreate a simple Python configuration script which instantiates ourSimObject.In the next few chapters, we will take this simple SimObject and expandon it to include debugging support, dynamicevents, and parameters. Using git branches It is common to use a new git branch for each new feature you add togem5. The first step when adding a new feature or modifying something ingem5, is to create a new branch to store your changes. Details on gitbranches can be found in the Git book_. {.sourceCode .sh}git checkout -b hello-simobjectStep 1: Create a Python class for your new SimObjectEach SimObject has a Python class which is associated with it. ThisPython class describes the parameters of your SimObject that can becontrolled from the Python configuration files. For our simpleSimObject, we are just going to start out with no parameters. Thus, wesimply need to declare a new class for our SimObject and set it’s nameand the C++ header that will define the C++ class for the SimObject.We can create a file, HelloObject.py, in src/learning_gem5``` {.sourceCode .python}from m5.params import *from m5.SimObject import SimObjectclass HelloObject(SimObject): type = ‘HelloObject’ cxx_header = “learning_gem5/hello_object.hh”You can find the complete filehere \\<../\\_static/scripts/part2/helloobject/HelloObject.py\\>.It is not required that the `type` be the same as the name of the class,but it is convention. The `type` is the C++ class that you are wrappingwith this Python SimObject. Only in special circumstances should the`type` and the class name be different.The `cxx_header` is the file that contains the declaration of the classused as the `type` parameter. Again, the convention is to use the nameof the SimObject with all lowercase and underscores, but this is onlyconvention. You can specify any header file here.Step 2: Implement your SimObject in C++---------------------------------------Next, we need to create `hello_object.hh` and `hello_object.cc` whichwill implement the hello object.We'll start with the header file for our `C++` object. By convention,gem5 wraps all header files in `#ifndef/#endif` with the name of thefile and the directory its in so there are no circular includes.The only thing we need to do in the file is to declare our class. Since`HelloObject` is a SimObject, it must inherit from the C++ SimObjectclass. Most of the time, your SimObject's parent will be a subclass ofSimObject, not SimObject itself.The SimObject class specifies many virtual functions. However, none ofthese functions are pure virtual, so in the simplest case, there is noneed to implement any functions except for the constructor.The constructor for all SimObjects assumes it will take a parameterobject. 
The code required for our simple header file is listed below.
``` {.sourceCode .c++}
#ifndef __LEARNING_GEM5_HELLO_OBJECT_HH__
#define __LEARNING_GEM5_HELLO_OBJECT_HH__

#include \"params/HelloObject.hh\"
#include \"sim/sim_object.hh\"

class HelloObject : public SimObject
{
  public:
    HelloObject(HelloObjectParams *p);
};

#endif // __LEARNING_GEM5_HELLO_OBJECT_HH__
```
You can find the complete file here <../_static/scripts/part2/helloobject/hello_object.hh>.
Next, we need to implement two functions in the .cc file, not just one. The first function is the constructor for the HelloObject. Here we simply pass the parameter object to the SimObject parent and print \"Hello world!\"
Normally, you would never use std::cout in gem5. Instead, you should use debug flags. In the next chapter, we will modify this to use debug flags instead. However, for now, we'll simply use std::cout because it is simple.
``` {.sourceCode .c++}
#include \"learning_gem5/hello_object.hh\"

#include <iostream>

HelloObject::HelloObject(HelloObjectParams *params) :
    SimObject(params)
{
    std::cout << \"Hello World! From a SimObject!\" << std::endl;
}
```
There is another function that we have to implement as well for the SimObject to be complete. We must implement one function for the parameter type that is implicitly created from the SimObject Python declaration, namely, the `create` function. This function simply returns a new instantiation of the SimObject. Usually this function is very simple (as below).
``` {.sourceCode .c++}
HelloObject*
HelloObjectParams::create()
{
    return new HelloObject(this);
}
```
You can find the complete file here <../_static/scripts/part2/helloobject/hello_object.cc>.
If you forget to add the create function for your SimObject, you will get a linker error when you compile. It will look something like the following.
build/X86/python/m5/internal/param_HelloObject_wrap.o: In function `_wrap_HelloObjectParams_create':
/local.chinook/gem5/gem5-tutorial/gem5/build/X86/python/m5/internal/param_HelloObject_wrap.cc:3096: undefined reference to `HelloObjectParams::create()'
collect2: error: ld returned 1 exit status
scons: *** [build/X86/gem5.opt] Error 1
scons: building terminated because of errors.
This undefined reference to `HelloObjectParams::create()' means you need to implement the create function for your SimObject.
Step 3: Register the SimObject and C++ file
In order for the C++ file to be compiled and the Python file to be parsed, we need to tell the build system about these files. gem5 uses SCons as the build system, so you simply have to create a SConscript file in the directory with the code for the SimObject. If there is already a SConscript file for that directory, simply add the following declarations to that file.
This file is simply a normal Python file, so you can write any Python code you want in it. Some of the scripting can become quite complicated; gem5 leverages this to automatically create code for SimObjects and to compile domain-specific languages like SLICC and the ISA language.
In the SConscript file, there are a number of functions automatically defined after you import them. See the section on that…
To get your new SimObject to compile, you simply need to create a new file with the name \"SConscript\" in the src/learning_gem5 directory.
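At this point the directory should look roughly like this (the SConscript file is the one you are about to write):
```
src/learning_gem5/
    HelloObject.py
    SConscript
    hello_object.cc
    hello_object.hh
```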
In this SConscript file, you have to declare the SimObject and the .cc file. Below is the required code.
``` {.sourceCode .python}
Import('*')

SimObject('HelloObject.py')
Source('hello_object.cc')
```
You can find the complete file here <../_static/scripts/part2/helloobject/SConscript>.
Step 4: (Re)-build gem5
To compile and link your new files you simply need to recompile gem5. The example below assumes you are using the x86 ISA, but nothing in our object requires an ISA, so this will work with any of gem5's ISAs.
``` {.sourceCode .sh}
scons build/X86/gem5.opt
```
Step 5: Create the config scripts to use your new SimObject
Now that you have implemented a SimObject and it has been compiled into gem5, you need to create or modify a Python config file to instantiate your object. Since your object is very simple, a full system is not required: no CPUs, no caches, nothing except a Root object. All gem5 instances require a Root object.
Walking through creating a very simple configuration script: first, import m5 and all of the objects you have compiled.
``` {.sourceCode .python}
import m5
from m5.objects import *
```
Next, you have to instantiate the `Root` object, as required by all gem5 instances.
``` {.sourceCode .python}
root = Root(full_system = False)
```
Now, you can instantiate the HelloObject you created. All you need to do is call the Python \"constructor\". Later, we will look at how to specify parameters via the Python constructor. In addition to creating an instantiation of your object, you need to make sure that it is a child of the root object. Only SimObjects that are children of the Root object are instantiated in C++.
``` {.sourceCode .python}
root.hello = HelloObject()
```
Finally, you need to call `instantiate` on the `m5` module and actually run the simulation!
``` {.sourceCode .python}
m5.instantiate()

print(\"Beginning simulation!\")
exit_event = m5.simulate()
print('Exiting @ tick {} because {}'
      .format(m5.curTick(), exit_event.getCause()))
```
You can find the complete file here <../_static/scripts/part2/helloobject/run_hello.py>.
The output should look something like the following.
gem5 Simulator System. http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.
gem5 compiled May 4 2016 11:37:41
gem5 started May 4 2016 11:44:28
gem5 executing on mustardseed.cs.wisc.edu, pid 22480
command line: build/X86/gem5.opt configs/learning_gem5/run_hello.py
Global frequency set at 1000000000000 ticks per second
Hello World! From a SimObject!
Beginning simulation!
info: Entering event queue @ 0. Starting simulation...
Exiting @ tick 18446744073709551615 because simulate() limit reached
Congrats! You have written your first SimObject. In the next chapters, we will extend this SimObject and explore what you can do with SimObjects.", |
| "url": "/helloobject/" |
| } |
| , |
| |
| "memoryobject": { |
| "title": "Creating SimObjects in the memory system", |
| "content": " authors Jason Lowe-PowerCreating SimObjects in the memory systemIn this chapter, we will create a simple memory object that sits betweenthe CPU and the memory bus. In the next chapter <simplecache-chapter>we will take this simple memory object and add some logic to it to makeit a very simple blocking uniprocessor cache.gem5 master and slave portsBefore diving into the implementation of a memory object, we shouldfirst understand gem5’s master and slave port interface. As previouslydiscussed in simple-config-chapter, all memory objects are connectedtogether via ports. These ports provide a rigid interface between thesememory objects.These ports implement three different memory system modes: timing,atomic, and functional. The most important mode is timing mode. Timingmode is the only mode that produces correct simulation results. Theother modes are only used in special circumstances.Atomic mode is useful for fastforwarding simulation to a region ofinterest and warming up the simulator. This mode assumes that no eventswill be generated in the memory system. Instead, all of the memoryrequests execute through a single long callchain. It is not required toimplement atomic accesses for a memory object unless it will be usedduring fastforward or during simulator warmup.Functional mode is better described as debugging mode. Functionalmode is used for things like reading data from the host into thesimulator memory. It is used heavily in syscall emulation mode. Forinstance, functional mode is used to load the binary in theprocess.cmd from the host into the simulated system’s memory so thesimulated system can access it. Functional accesses should return themost up-to-date data on a read, no matter where the data is, and shouldupdate all possible valid data on a write (e.g., in a system with cachesthere may be multiple valid cache blocks with the same address).PacketsIn gem5, Packets are sent across ports. A Packet is made up of aMemReq which is the memory request object. The MemReq holdsinformation about the original request that initiated the packet such asthe requestor, the address, and the type of request (read, write, etc.).Packets also have a MemCmd, which is the current command of thepacket. This command can change throughout the life of the packet (e.g.,requests turn into responses once the memory command is satisfied). Themost common MemCmd are ReadReq (read request), ReadResp (readresponse), WriteReq (write request), WriteResp (write response).There are also writeback requests (WritebackDirty, WritebackClean)for caches and many other command types.Packets also either keep the data for the request, or a pointer to thedata. There are options when creating the packet whether the data isdynamic (explicitly allocated and deallocated), or static (allocated anddeallocated by the packet object).Finally, packets are used in the classic caches as the unit to trackcoherency. Therefore, much of the packet code is specific to the classiccache coherence protocol. However, packets are used for allcommunication between memory objects in gem5, even if they are notdirectly involved in coherence (e.g., DRAM controllers and the CPUmodels).All of the port interface functions accept a Packet pointer as aparameter. Since this pointer is so common, gem5 includes a typedef forit: PacketPtr.Port interfaceThere are two types of ports in gem5: master ports and slave ports.Whenever you implement a memory object, you will implement at least oneof these types of ports. 
To do this, you create a new class that inherits from either MasterPort or SlavePort for master and slave ports, respectively. Master ports send requests (and receive responses), and slave ports receive requests (and send responses).
master-slave-1-fig outlines the simplest interaction between a master and slave port. This figure shows the interaction in timing mode. The other modes are much simpler and use a simple callchain between the master and the slave.
As mentioned above, all of the port interfaces require a PacketPtr as a parameter. Each of these functions (sendTimingReq, recvTimingReq, etc.) accepts a single parameter, a PacketPtr. This packet is the request or response to send or receive.
To send a request packet, the master calls sendTimingReq. In turn (and in the same callchain), the function recvTimingReq is called on the slave with the same PacketPtr as its sole parameter.
The recvTimingReq has a return type of bool. This boolean return value is directly returned to the calling master. A return value of true signifies that the packet was accepted by the slave. A return value of false, on the other hand, means that the slave was unable to accept and the request must be retried sometime in the future.
In master-slave-1-fig, first, the master sends a timing request by calling sendTimingReq, which in turn calls recvTimingReq. The slave returns true from recvTimingReq, which is returned from the call to sendTimingReq. The master continues executing, and the slave does whatever is necessary to complete the request (e.g., if it is a cache, it looks up the tags to see if there is a match to the address in the request).
Once the slave completes the request, it can send a response to the master. The slave calls sendTimingResp with the response packet (this should be the same PacketPtr as the request, but it should now be a response packet). In turn, the master function recvTimingResp is called. The master's recvTimingResp function returns true, which is the return value of sendTimingResp in the slave. Thus, the interaction for that request is complete.
Later in master-slave-example-section we will show the example code for these functions.
It is possible that the master or slave is busy when they receive a request or a response. master-slave-2-fig shows the case where the slave is busy when the original request was sent.
In this case, the slave returns false from the recvTimingReq function. When a master receives false after calling sendTimingReq, it must wait until its function recvReqRetry is executed. Only when this function is called is the master allowed to retry calling sendTimingReq. The above figure shows the timing request failing once, but it could fail any number of times. Note: it is up to the master to track the packet that fails, not the slave. The slave does not keep the pointer to the packet that fails.
Similarly, master-slave-3-fig shows the case when the master is busy at the time the slave tries to send a response. In this case, the slave cannot call sendTimingResp until it receives a recvRespRetry.
Importantly, in both of these cases, the retry codepath can be a single call stack. For instance, when the master calls sendRetryResp, recvTimingReq can also be called in the same call stack. Therefore, it is easy to incorrectly create an infinite recursion bug, or other bugs. It is important that, before a memory object sends a retry, it is ready at that instant to accept another packet.
Simple memory object example
In this section, we will build a simple memory object.
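Before diving in, here is the shape of the request-side retry handshake just described, from the sender's point of view (a sketch; MyMaster, port, and retryPkt are illustrative names, not gem5 classes). The SimpleMemobj below follows exactly this pattern.
``` {.sourceCode .c++}
// Try to send; if the receiver returns false it is busy, and it is the
// *sender's* job to hold on to the packet until a retry arrives.
void
MyMaster::trySend(PacketPtr pkt)
{
    if (!port.sendTimingReq(pkt)) {
        retryPkt = pkt;   // wait for recvReqRetry before trying again
    }
}

// Called (via the slave's sendRetryReq) once the slave can accept packets.
void
MyMaster::recvReqRetry()
{
    PacketPtr pkt = retryPkt;
    retryPkt = nullptr;
    trySend(pkt);         // may fail and stash the packet again
}
```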
Initially, it will simply pass requests through from the CPU-side (a simple CPU) to the memory-side (a simple memory bus). See Figure simple-memobj-figure. It will have a single master port, to send requests to the memory bus, and two CPU-side ports for the instruction and data cache ports of the CPU. In the next chapter <simplecache-chapter>, we will add the logic to make this object a cache.
Declare the SimObject
Just like when we were creating the simple SimObject in hello-simobject-chapter, the first step is to create a SimObject Python file. We will call this simple memory object SimpleMemobj and create the SimObject Python file in src/learning_gem5/simple_memobj.
``` {.sourceCode .python}
from m5.params import *
from m5.proxy import *

from MemObject import MemObject

class SimpleMemobj(MemObject):
    type = 'SimpleMemobj'
    cxx_header = \"learning_gem5/simple_memobj/simple_memobj.hh\"

    inst_port = SlavePort(\"CPU side port, receives requests\")
    data_port = SlavePort(\"CPU side port, receives requests\")
    mem_side = MasterPort(\"Memory side port, sends requests\")
```
For this object, we inherit from MemObject, not SimObject, since we are creating an object that will interact with the memory system. The MemObject class has two pure virtual functions that we will have to define in our C++ implementation, getMasterPort and getSlavePort.
This object's parameters are three ports: two ports for the CPU to connect the instruction and data ports, and a port to connect to the memory bus. These ports do not have a default value, and they have a simple description.
It is important to remember the names of these ports. We will explicitly use these names when implementing SimpleMemobj and defining the getMasterPort and getSlavePort functions.
You can download the SimObject file here <../_static/scripts/part2/memoryobject/SimpleMemobj.py>.
Of course, you also need to create a SConscript file in the new directory as well that declares the SimObject Python file. You can download the SConscript file here <../_static/scripts/part2/memoryobject/SConscript>.
Define the SimpleMemobj class
Now, we create a header file for SimpleMemobj.
``` {.sourceCode .c++}
class SimpleMemobj : public MemObject
{
  private:

  public:

    /** constructor */
    SimpleMemobj(SimpleMemobjParams *params);
};
```
Define a slave port type
Now, we need to define classes for our two kinds of ports: the CPU-side and the memory-side ports. For this, we will declare these classes inside the SimpleMemobj class since no other object will ever use these classes.
Let's start with the slave port, or the CPU-side port. We are going to inherit from the SlavePort class. The following is the required code to override all of the pure virtual functions in the SlavePort class.
``` {.sourceCode .c++}
class CPUSidePort : public SlavePort
{
  private:
    SimpleMemobj *owner;

  public:
    CPUSidePort(const std::string& name, SimpleMemobj *owner) :
        SlavePort(name, owner), owner(owner)
    { }

    AddrRangeList getAddrRanges() const override;

  protected:
    Tick recvAtomic(PacketPtr pkt) override { panic(\"recvAtomic unimpl.\"); }
    void recvFunctional(PacketPtr pkt) override;
    bool recvTimingReq(PacketPtr pkt) override;
    void recvRespRetry() override;
};
```
This object requires five functions to be defined. It also has a single member variable, its owner, so it can call functions on that object.
Define a master port type
Next, we need to define a master port type.
This will be the memory-side port, which will forward requests from the CPU-side to the rest of the memory system.
``` {.sourceCode .c++}
class MemSidePort : public MasterPort
{
  private:
    SimpleMemobj *owner;

  public:
    MemSidePort(const std::string& name, SimpleMemobj *owner) :
        MasterPort(name, owner), owner(owner)
    { }

  protected:
    bool recvTimingResp(PacketPtr pkt) override;
    void recvReqRetry() override;
    void recvRangeChange() override;
};
```
This class only has three pure virtual functions that we must override.
Defining the MemObject interface
Now that we have defined these two new types, CPUSidePort and MemSidePort, we can declare our three ports as part of SimpleMemobj. We also need to declare the two pure virtual functions in the MemObject class, getMasterPort and getSlavePort. These two functions are used by gem5 during the initialization phase to connect memory objects together via ports.
``` {.sourceCode .c++}
class SimpleMemobj : public MemObject
{
  private:

    <CPUSidePort declaration>
    <MemSidePort declaration>

    CPUSidePort instPort;
    CPUSidePort dataPort;

    MemSidePort memPort;

  public:
    SimpleMemobj(SimpleMemobjParams *params);

    BaseMasterPort& getMasterPort(const std::string& if_name,
                                  PortID idx = InvalidPortID) override;

    BaseSlavePort& getSlavePort(const std::string& if_name,
                                PortID idx = InvalidPortID) override;
};
```
You can download the header file for the `SimpleMemobj` here <../_static/scripts/part2/memoryobject/simple_memobj.hh>.
Implementing basic MemObject functions
For the constructor of `SimpleMemobj`, we will simply call the `MemObject` constructor. We also need to initialize all of the ports. Each port's constructor takes two parameters: the name and a pointer to its owner, as we defined in the header file. The name can be any string, but by convention, it is the same name as in the Python SimObject file.
``` {.sourceCode .c++}
SimpleMemobj::SimpleMemobj(SimpleMemobjParams *params) :
    MemObject(params),
    instPort(params->name + \".inst_port\", this),
    dataPort(params->name + \".data_port\", this),
    memPort(params->name + \".mem_side\", this)
{
}
```
Next, we need to implement the interfaces to get the ports. This interface is made of two functions, getMasterPort and getSlavePort. These functions take two parameters. The if_name is the Python variable name of the interface for this object. In the case of the master port, it will be mem_side since this is what we declared as a MasterPort in the Python SimObject file.
To implement getMasterPort, we compare the if_name and check to see if it is mem_side, as specified in our Python SimObject file. If it is, then we return the memPort object. If not, then we pass the request name to our parent. However, it will be an error if we try to connect to any other named port, since the parent class has no ports defined.
``` {.sourceCode .c++}
BaseMasterPort&
SimpleMemobj::getMasterPort(const std::string& if_name, PortID idx)
{
    if (if_name == \"mem_side\") {
        return memPort;
    } else {
        return MemObject::getMasterPort(if_name, idx);
    }
}
```
To implement `getSlavePort`, we similarly check if the `if_name` matches either of the names we defined for our slave ports in the Python SimObject file.
If the name is \"inst_port\", then we return the instPort, and if the name is \"data_port\", we return the data port.
``` {.sourceCode .c++}
BaseSlavePort&
SimpleMemobj::getSlavePort(const std::string& if_name, PortID idx)
{
    if (if_name == \"inst_port\") {
        return instPort;
    } else if (if_name == \"data_port\") {
        return dataPort;
    } else {
        return MemObject::getSlavePort(if_name, idx);
    }
}
```
Implementing slave and master port functions
The implementation of both the slave and master port is relatively simple. For the most part, each of the port functions just forwards the information to the main memory object (SimpleMemobj).
Starting with two simple functions: getAddrRanges and recvFunctional simply call into the SimpleMemobj.
``` {.sourceCode .c++}
AddrRangeList
SimpleMemobj::CPUSidePort::getAddrRanges() const
{
    return owner->getAddrRanges();
}

void
SimpleMemobj::CPUSidePort::recvFunctional(PacketPtr pkt)
{
    return owner->handleFunctional(pkt);
}
```
The implementations of these functions in the `SimpleMemobj` are equally simple. They just pass through the request to the memory side. We can use `DPRINTF` calls here to track what is happening for debug purposes as well.
``` {.sourceCode .c++}
void
SimpleMemobj::handleFunctional(PacketPtr pkt)
{
    memPort.sendFunctional(pkt);
}

AddrRangeList
SimpleMemobj::getAddrRanges() const
{
    DPRINTF(SimpleMemobj, \"Sending new ranges\\n\");
    return memPort.getAddrRanges();
}
```
Similarly for the MemSidePort, we need to implement recvRangeChange and forward the request through the SimpleMemobj to the slave ports.
``` {.sourceCode .c++}
void
SimpleMemobj::MemSidePort::recvRangeChange()
{
    owner->sendRangeChange();
}
```
``` {.sourceCode .c++}
void
SimpleMemobj::sendRangeChange()
{
    instPort.sendRangeChange();
    dataPort.sendRangeChange();
}
```
Implementing receiving requests
The implementation of recvTimingReq is slightly more complicated. We need to check to see if the SimpleMemobj can accept the request. The SimpleMemobj is a very simple blocking structure; we only allow a single request outstanding at a time. Therefore, if we get a request while another request is outstanding, the SimpleMemobj will block the second request.
To simplify the implementation, the CPUSidePort stores all of the flow-control information for the port interface. Thus, we need to add an extra member variable, needRetry, to the CPUSidePort: a boolean that stores whether we need to send a retry whenever the SimpleMemobj becomes free. Then, if the SimpleMemobj is blocked on a request, we note that we need to send a retry sometime in the future.
``` {.sourceCode .c++}
bool
SimpleMemobj::CPUSidePort::recvTimingReq(PacketPtr pkt)
{
    if (!owner->handleRequest(pkt)) {
        needRetry = true;
        return false;
    } else {
        return true;
    }
}
```
To handle the request for the `SimpleMemobj`, we first check if the `SimpleMemobj` is already blocked waiting for a response to another request. If it is blocked, then we return `false` to signal the calling master port that we cannot accept the request right now. Otherwise, we mark the port as blocked and send the packet out of the memory port. For this, we can define a helper function in the `MemSidePort` object to hide the flow control from the `SimpleMemobj` implementation.
We will assume the memPort handles all of the flow control, and always return true from handleRequest since we were successful in consuming the request.
``` {.sourceCode .c++}
bool
SimpleMemobj::handleRequest(PacketPtr pkt)
{
    if (blocked) {
        return false;
    }
    DPRINTF(SimpleMemobj, \"Got request for addr %#x\\n\", pkt->getAddr());
    blocked = true;
    memPort.sendPacket(pkt);
    return true;
}
```
Next, we need to implement the sendPacket function in the MemSidePort. This function will handle the flow control in case its peer slave port cannot accept the request. For this, we need to add a member to the MemSidePort to store the packet in case it is blocked. It is the responsibility of the sender to store the packet if the receiver cannot receive the request (or response).
This function simply sends the packet by calling the function sendTimingReq. If the send fails, then this object stores the packet in the blockedPacket member variable so it can send the packet later (when it receives a recvReqRetry). This function also contains some defensive code to make sure there is not a bug and we never try to overwrite the blockedPacket variable incorrectly.
``` {.sourceCode .c++}
void
SimpleMemobj::MemSidePort::sendPacket(PacketPtr pkt)
{
    panic_if(blockedPacket != nullptr, \"Should never try to send if blocked!\");
    if (!sendTimingReq(pkt)) {
        blockedPacket = pkt;
    }
}
```
Next, we need to implement the code to resend the packet. In this function, we try to resend the packet by calling the sendPacket function we wrote above.
``` {.sourceCode .c++}
void
SimpleMemobj::MemSidePort::recvReqRetry()
{
    assert(blockedPacket != nullptr);
    PacketPtr pkt = blockedPacket;
    blockedPacket = nullptr;
    sendPacket(pkt);
}
```
Implementing receiving responses
The response codepath is similar to the receiving codepath. When the MemSidePort gets a response, we forward the response through the SimpleMemobj to the appropriate CPUSidePort.
``` {.sourceCode .c++}
bool
SimpleMemobj::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    return owner->handleResponse(pkt);
}
```
In the SimpleMemobj, first, it should always be blocked when we receive a response since the object is blocking. Before sending the packet back to the CPU side, we need to mark that the object is no longer blocked. This must be done *before* calling sendTimingResp. Otherwise, it is possible to get stuck in an infinite loop, as it is possible that the master port has a single callchain between receiving a response and sending another request.
After unblocking the SimpleMemobj, we check to see if the packet is an instruction or data packet and send it back across the appropriate port. Finally, since the object is now unblocked, we may need to notify the CPU side ports that they can now retry their requests that failed.
``` {.sourceCode .c++}
bool
SimpleMemobj::handleResponse(PacketPtr pkt)
{
    assert(blocked);
    DPRINTF(SimpleMemobj, \"Got response for addr %#x\\n\", pkt->getAddr());

    blocked = false;

    // Simply forward to the memory port
    if (pkt->req->isInstFetch()) {
        instPort.sendPacket(pkt);
    } else {
        dataPort.sendPacket(pkt);
    }

    instPort.trySendRetry();
    dataPort.trySendRetry();

    return true;
}
```
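To see why this ordering matters, trace one call chain that can occur entirely inside handleResponse (a sketch; the indented comments mark what the peer may do within the same stack):
``` {.sourceCode .c++}
// Inside SimpleMemobj::handleResponse(pkt):
blocked = false;            // 1. unblock FIRST
dataPort.sendPacket(pkt);   // 2. calls the CPU's recvTimingResp...
                            // 3. ...which may issue the CPU's next request
                            //    in the same call stack...
                            // 4. ...so our recvTimingReq/handleRequest runs
                            //    now and must see blocked == false
// If blocked were cleared only after step 2, the request in step 4 would be
// rejected and the object could get stuck waiting for a retry.
```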
Similar to how we implemented a convenience function for sending packets in the MemSidePort, we can implement a sendPacket function in the CPUSidePort to send the responses to the CPU side. This function calls sendTimingResp, which will in turn call recvTimingResp on the peer master port. If this call fails and the peer port is currently blocked, then we store the packet to be sent later.
``` {.sourceCode .c++}
void
SimpleMemobj::CPUSidePort::sendPacket(PacketPtr pkt)
{
    panic_if(blockedPacket != nullptr, \"Should never try to send if blocked!\");

    if (!sendTimingResp(pkt)) {
        blockedPacket = pkt;
    }
}
```
We will send this blocked packet later, when we receive a recvRespRetry. This function is exactly the same as the recvReqRetry above and simply tries to resend the packet, which may be blocked again.
``` {.sourceCode .c++}
void
SimpleMemobj::CPUSidePort::recvRespRetry()
{
    assert(blockedPacket != nullptr);

    PacketPtr pkt = blockedPacket;
    blockedPacket = nullptr;

    sendPacket(pkt);
}
```
Finally, we need to implement the extra function trySendRetry for the CPUSidePort. This function is called by the SimpleMemobj whenever the SimpleMemobj may be unblocked. trySendRetry checks to see if a retry is needed, which we marked in recvTimingReq whenever the SimpleMemobj was blocked on a new request. Then, if the retry is needed, this function calls sendRetryReq, which in turn calls recvReqRetry on the peer master port (the CPU in this case).
``` {.sourceCode .c++}
void
SimpleMemobj::CPUSidePort::trySendRetry()
{
    if (needRetry && blockedPacket == nullptr) {
        needRetry = false;
        DPRINTF(SimpleMemobj, \"Sending retry req for %d\\n\", id);
        sendRetryReq();
    }
}
```
You can download the implementation for the `SimpleMemobj` here <../_static/scripts/part2/memoryobject/simple_memobj.cc>.
The following figure, memobj-api-figure, shows the relationships between the `CPUSidePort`, `MemSidePort`, and `SimpleMemobj`. It shows how the peer ports interact with the implementation of the `SimpleMemobj`. Each bold function is one that we had to implement, and the non-bold functions are the port interfaces to the peer ports. The colors highlight one API path through the object (e.g., receiving a request or updating the memory ranges).
![Interaction between SimpleMemobj and its ports](../_static/figures/memobj_api.png)
For this simple memory object, packets are just forwarded from the CPU-side to the memory side. However, by modifying `handleRequest` and `handleResponse`, we can create rich, featureful objects, like a cache in the next chapter <simplecache-chapter>.
Create a config file
This is all of the code needed to implement a simple memory object! In the next chapter <simplecache-chapter>, we will take this framework and add some caching logic to make this memory object into a simple cache. However, before that, let's look at the config file to add the SimpleMemobj to your system.
This config file builds off of the simple config file in simple-config-chapter.
However, instead of connecting the CPU directly to the memory bus, we are going to instantiate a `SimpleMemobj` and place it between the CPU and the memory bus.
``` {.sourceCode .python}
import m5
from m5.objects import *

system = System()

system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '1GHz'
system.clk_domain.voltage_domain = VoltageDomain()

system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('512MB')]

system.cpu = TimingSimpleCPU()

system.memobj = SimpleMemobj()

system.cpu.icache_port = system.memobj.inst_port
system.cpu.dcache_port = system.memobj.data_port

system.membus = SystemXBar()

system.memobj.mem_side = system.membus.slave

system.cpu.createInterruptController()
system.cpu.interrupts[0].pio = system.membus.master
system.cpu.interrupts[0].int_master = system.membus.slave
system.cpu.interrupts[0].int_slave = system.membus.master

system.mem_ctrl = DDR3_1600_8x8()
system.mem_ctrl.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

system.system_port = system.membus.slave

process = Process()
process.cmd = ['tests/test-progs/hello/bin/x86/linux/hello']
system.cpu.workload = process
system.cpu.createThreads()

root = Root(full_system = False, system = system)
m5.instantiate()

print \"Beginning simulation!\"
exit_event = m5.simulate()
print 'Exiting @ tick %i because %s' % (m5.curTick(), exit_event.getCause())
```
You can download this config script here <../_static/scripts/part2/memoryobject/simple_memobj.py>.
Now, when you run this config file you get the following output.
gem5 Simulator System. http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.
gem5 compiled Jan 5 2017 13:40:18
gem5 started Jan 9 2017 10:17:17
gem5 executing on chinook, pid 5138
command line: build/X86/gem5.opt configs/learning_gem5/part2/simple_memobj.py
Global frequency set at 1000000000000 ticks per second
warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
warn: CoherentXBar system.membus has no snooping ports attached!
warn: ClockedObject: More than one power state change request encountered within the same simulation tick
Beginning simulation!
info: Entering event queue @ 0. Starting simulation...
Hello world!
Exiting @ tick 507841000 because target called exit()
If you run with the SimpleMemobj debug flag, you can see all of the memory requests and responses from and to the CPU.
gem5 Simulator System. http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.
gem5 compiled Jan 5 2017 13:40:18
gem5 started Jan 9 2017 10:18:51
gem5 executing on chinook, pid 5157
command line: build/X86/gem5.opt --debug-flags=SimpleMemobj configs/learning_gem5/part2/simple_memobj.py
Global frequency set at 1000000000000 ticks per second
Beginning simulation!
info: Entering event queue @ 0. Starting simulation...
0: system.memobj: Got request for addr 0x190
77000: system.memobj: Got response for addr 0x190
77000: system.memobj: Got request for addr 0x190
132000: system.memobj: Got response for addr 0x190
132000: system.memobj: Got request for addr 0x190
187000: system.memobj: Got response for addr 0x190
187000: system.memobj: Got request for addr 0x94e30
250000: system.memobj: Got response for addr 0x94e30
250000: system.memobj: Got request for addr 0x190
...
You may also want to change the CPU model to the out-of-order model (DerivO3CPU). When using the out-of-order CPU, you will potentially see a different address stream since it allows multiple memory requests outstanding at once.
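That swap is a single line in the script above (a sketch; everything else, including the port connections, stays the same):
``` {.sourceCode .python}
system.cpu = DerivO3CPU()   # out-of-order CPU instead of TimingSimpleCPU
```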
When using the out-of-order CPU, there will now be many stalls because the SimpleMemobj is blocking.", |
| "url": "/memoryobject/" |
| } |
| , |
| |
| "parameters": { |
| "title": "Adding parameters to SimObjects and more events", |
| "content": " authors Jason Lowe-PowerAdding parameters to SimObjects and more eventsOne of the most powerful parts of gem5’s Python interface is the abilityto pass parameters from Python to the C++ objects in gem5. In thischapter, we will explore some of the kinds of parameters for SimObjectsand how to use them building off of the simple HelloObject from theprevious chapters <events-chapter>.Simple parametersFirst, we will add parameters for the latency and number of times tofire the event in the HelloObject. To add a parameter, modify theHelloObject class in the SimObject Python file(src/learning_gem5/HelloObject.py). Parameters are set by adding newstatements to the Python class that include a Param type.For instance, the following code as a parameter time_to_wait which isa “Latency” parameter and number_of_fires which is an integerparameter.``` {.sourceCode .python}class HelloObject(SimObject): type = ‘HelloObject’ cxx_header = “learning_gem5/hello_object.hh”time_to_wait = Param.Latency(\"Time before firing the event\")number_of_fires = Param.Int(1, \"Number of times to fire the event before \" \"goodbye\") ```Param.<TypeName> declares a parameter of type TypeName. Common typesare Int for integers, Float for floats, etc. These types act likeregular Python classes.Each parameter declaration takes one or two parameters. When given twoparameters (like number_of_fires above), the first parameter is thedefault value for the parameter. In this case, if you instantiate aHelloObject in your Python config file without specifying any valuefor number_of_fires, it will take the default value of 1.The second parameter to the parameter declaration is a short descriptionof the parameter. This must be a Python string. If you only specify asingle parameter to the parameter declaration, it is the description (asfor time_to_wait).gem5 also supports many complex parameter types that are not justbuiltin types. For instance, time_to_wait is a Latency. Latencytakes a value as a time value as a string and converts it into simulatorticks. For instance, with a default tick rate of 1 picosecond(10\\^12 ticks per second or 1 THz), \"1ns\" is automatically convertedto 1000. There are other convenience parameters like Percent,Cycles, MemorySize and many more.Once you have declared these parameters in the SimObject file, you needto copy their values to your C++ class in its constructor. The followingcode shows the changes to the HelloObject constructor.``` {.sourceCode .c++}HelloObject::HelloObject(HelloObjectParams params) : SimObject(params), event(this), myName(params->name), latency(params->time_to_wait), timesLeft(params->number_of_fires){ DPRINTF(Hello, “Created the hello object with the name %s\\n”, myName);}Here, we use the parameter's values for the default values of latencyand timesLeft. Additionally, we store the `name` from the parameterobject to use it later in the member variable `myName`. Each `params`instantiation has a name which comes from the Python config file when itis instantiated.However, assigning the name here is just an example of using the paramsobject. For all SimObjects, there is a `name()` function that alwaysreturns the name. 
Once you have declared these parameters in the SimObject file, you need to copy their values to your C++ class in its constructor. The following code shows the changes to the HelloObject constructor.
``` {.sourceCode .c++}
HelloObject::HelloObject(HelloObjectParams *params) :
    SimObject(params), event(this), myName(params->name),
    latency(params->time_to_wait), timesLeft(params->number_of_fires)
{
    DPRINTF(Hello, \"Created the hello object with the name %s\\n\", myName);
}
```
Here, we use the parameter's values for the default values of latency and timesLeft. Additionally, we store the `name` from the parameter object to use it later in the member variable `myName`. Each `params` instantiation has a name which comes from the Python config file when it is instantiated.
However, assigning the name here is just an example of using the params object. For all SimObjects, there is a `name()` function that always returns the name. Thus, there is never a need to store the name like above.
To the HelloObject class declaration, add a member variable for the name.
``` {.sourceCode .c++}
class HelloObject : public SimObject
{
  private:
    void processEvent();

    EventWrapper<HelloObject, &HelloObject::processEvent> event;

    std::string myName;

    Tick latency;

    int timesLeft;

  public:
    HelloObject(HelloObjectParams *p);

    void startup();
};
```
When we run gem5 with the above, we get the following error:
gem5 Simulator System. http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.
gem5 compiled Jan 4 2017 14:46:36
gem5 started Jan 4 2017 14:46:52
gem5 executing on chinook, pid 3422
command line: build/X86/gem5.opt --debug-flags=Hello configs/learning_gem5/part2/run_hello.py
Global frequency set at 1000000000000 ticks per second
fatal: hello.time_to_wait without default or user set value
This is because the time_to_wait parameter does not have a default value. Therefore, we need to update the Python config file (run_hello.py) to specify this value.
``` {.sourceCode .python}
root.hello = HelloObject(time_to_wait = '2us')
```
Or, we can set `time_to_wait` on the object after it is created. Either option is exactly the same because the C++ objects are not created until `m5.instantiate()` is called.
``` {.sourceCode .python}
root.hello = HelloObject()
root.hello.time_to_wait = '2us'
```
The output of this simple script is the following when running with the Hello debug flag.
gem5 Simulator System. http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.
gem5 compiled Jan 4 2017 14:46:36
gem5 started Jan 4 2017 14:50:08
gem5 executing on chinook, pid 3455
command line: build/X86/gem5.opt --debug-flags=Hello configs/learning_gem5/part2/run_hello.py
Global frequency set at 1000000000000 ticks per second
      0: hello: Created the hello object with the name hello
Beginning simulation!
info: Entering event queue @ 0. Starting simulation...
2000000: hello: Hello world! Processing the event! 0 left
2000000: hello: Done firing!
Exiting @ tick 18446744073709551615 because simulate() limit reached
You can also modify the config script to fire the event multiple times.
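For instance, using the parameters declared above, the following fires the event ten times with a 2 microsecond period (a sketch):
``` {.sourceCode .python}
root.hello = HelloObject(time_to_wait = '2us', number_of_fires = 10)
```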
Other SimObjects as parameters
You can also specify other SimObjects as parameters. To demonstrate this, we are going to create a new SimObject, GoodbyeObject. This object is going to have a simple function that says \"Goodbye\" to another SimObject. To make it a little more interesting, the GoodbyeObject is going to have a buffer to write the message, and a limited bandwidth to write the message.
First, declare the SimObject in the SConscript file:
``` {.sourceCode .python}
Import('*')

SimObject('HelloObject.py')
Source('hello_object.cc')
Source('goodbye_object.cc')
DebugFlag('Hello')
```
The new SConscript file can be downloaded here <../_static/scripts/part2/parameters/SConscript>.
Next, you need to declare the new SimObject in a SimObject Python file. Since the `GoodbyeObject` is highly related to the `HelloObject`, we will use the same file. You can add the following code to `HelloObject.py`.
This object has two parameters, both with default values. The first parameter is the size of a buffer and is a `MemorySize` parameter. Second is the `write_bandwidth`, which specifies the speed to fill the buffer. Once the buffer is full, the simulation will exit.
``` {.sourceCode .python}
class GoodbyeObject(SimObject):
    type = 'GoodbyeObject'
    cxx_header = \"learning_gem5/goodbye_object.hh\"

    buffer_size = Param.MemorySize('1kB',
                                   \"Size of buffer to fill with goodbye\")
    write_bandwidth = Param.MemoryBandwidth('100MB/s', \"Bandwidth to fill \"
                                            \"the buffer\")
```
The updated HelloObject.py file can be downloaded here <../_static/scripts/part2/parameters/HelloObject.py>.
Now, we need to implement the GoodbyeObject.
``` {.sourceCode .c++}
#ifndef __LEARNING_GEM5_GOODBYE_OBJECT_HH__
#define __LEARNING_GEM5_GOODBYE_OBJECT_HH__

#include <string>

#include \"params/GoodbyeObject.hh\"
#include \"sim/sim_object.hh\"

class GoodbyeObject : public SimObject
{
  private:
    void processEvent();

    /**
     * Fills the buffer for one iteration. If the buffer isn't full, this
     * function will enqueue another event to continue filling.
     */
    void fillBuffer();

    EventWrapper<GoodbyeObject, &GoodbyeObject::processEvent> event;

    /// The bytes processed per tick
    float bandwidth;

    /// The size of the buffer we are going to fill
    int bufferSize;

    /// The buffer we are putting our message in
    char *buffer;

    /// The message to put into the buffer.
    std::string message;

    /// The amount of the buffer we've used so far.
    int bufferUsed;

  public:
    GoodbyeObject(GoodbyeObjectParams *p);
    ~GoodbyeObject();

    /**
     * Called by an outside object. Starts off the events to fill the buffer
     * with a goodbye message.
     *
     * @param name the name of the object we are saying goodbye to.
     */
    void sayGoodbye(std::string name);
};

#endif // __LEARNING_GEM5_GOODBYE_OBJECT_HH__
```
``` {.sourceCode .c++}
#include \"learning_gem5/goodbye_object.hh\"

#include \"debug/Hello.hh\"
#include \"sim/sim_exit.hh\"

GoodbyeObject::GoodbyeObject(GoodbyeObjectParams *params) :
    SimObject(params), event(*this), bandwidth(params->write_bandwidth),
    bufferSize(params->buffer_size), buffer(nullptr), bufferUsed(0)
{
    buffer = new char[bufferSize];
    DPRINTF(Hello, \"Created the goodbye object\\n\");
}

GoodbyeObject::~GoodbyeObject()
{
    delete[] buffer;
}

void
GoodbyeObject::processEvent()
{
    DPRINTF(Hello, \"Processing the event!\\n\");
    fillBuffer();
}

void
GoodbyeObject::sayGoodbye(std::string other_name)
{
    DPRINTF(Hello, \"Saying goodbye to %s\\n\", other_name);

    message = \"Goodbye \" + other_name + \"!! \";

    fillBuffer();
}

void
GoodbyeObject::fillBuffer()
{
    // There better be a message
    assert(message.length() > 0);

    // Copy from the message to the buffer per byte.
    int bytes_copied = 0;
    for (auto it = message.begin();
         it < message.end() && bufferUsed < bufferSize - 1;
         it++, bufferUsed++, bytes_copied++) {
        // Copy the character into the buffer
        buffer[bufferUsed] = *it;
    }

    if (bufferUsed < bufferSize - 1) {
        // Wait for the next copy for as long as it would have taken
        DPRINTF(Hello, \"Scheduling another fillBuffer in %d ticks\\n\",
                bandwidth * bytes_copied);
        schedule(event, curTick() + bandwidth * bytes_copied);
    } else {
        DPRINTF(Hello, \"Goodbye done copying!\\n\");
        // Be sure to take into account the time for the last bytes
        exitSimLoop(buffer, 0, curTick() + bandwidth * bytes_copied);
    }
}

GoodbyeObject*
GoodbyeObjectParams::create()
{
    return new GoodbyeObject(this);
}
```
The header file can be downloaded here <../_static/scripts/part2/parameters/goodbye_object.hh> and the implementation can be downloaded here <../_static/scripts/part2/parameters/goodbye_object.cc>.
The interface to this GoodbyeObject is simply a function, sayGoodbye, which takes a string as a parameter. When this function is called, the simulator builds the message and saves it in a member variable. Then, we begin filling the buffer.
To model the limited bandwidth, each time we write the message to the buffer, we pause for the latency it takes to write the message. We use a simple event to model this pause.
Since we used a MemoryBandwidth parameter in the SimObject declaration, the bandwidth variable is automatically converted into ticks per byte, so calculating the latency is simply the bandwidth times the bytes we want to write to the buffer.
Finally, when the buffer is full, we call the function exitSimLoop, which will exit the simulation. This function takes three parameters: the first is the message to return to the Python config script (exit_event.getCause()), the second is the exit code, and the third is when to exit.
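To make the conversion concrete, here is the arithmetic behind the fillBuffer intervals that appear in the example output at the end of this chapter (a sketch; the binary-megabyte interpretation and rounding are inferred from that output):
``` {.sourceCode .python}
ticks_per_second = 10**12         # gem5's default 1 THz tick rate
bytes_per_second = 100 * 2**20    # '100MB/s' with binary megabytes
ticks_per_byte = round(ticks_per_second / bytes_per_second)   # 9537
print(len('Goodbye hello!! ') * ticks_per_byte)  # 16 * 9537 = 152592 ticks
```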
%d left\\n\", timesLeft); if (timesLeft <= 0) { DPRINTF(Hello, \"Done firing!\\n\"); goodbye.sayGoodbye(myName); } else { schedule(event, curTick() + latency); }}You can find the updated header filehere <../_static/scripts/part2/parameters/hello_object.hh> and theimplementation filehere <../_static/scripts/part2/parameters/hello_object.cc>.Updating the config scriptLastly, we need to add the GoodbyeObject to the config script. Createa new config script, hello_goodbye.py and instantiate both the helloand the goodbye objects. For instance, one possible script is thefollowing.``` {.sourceCode .python}import m5from m5.objects import *root = Root(full_system = False)root.hello = HelloObject(time_to_wait = ‘2us’, number_of_fires = 5)root.hello.goodbye_object = GoodbyeObject(buffer_size=’100B’)m5.instantiate()print “Beginning simulation!”exit_event = m5.simulate()print ‘Exiting @ tick %i because %s’ % (m5.curTick(), exit_event.getCause())```You can download this scripthere <../_static/scripts/part2/parameters/hello_goodbye.py>Running this script generates the following output.gem5 Simulator System. http://gem5.orggem5 is copyrighted software; use the --copyright option for details.gem5 compiled Jan 4 2017 15:17:14gem5 started Jan 4 2017 15:18:41gem5 executing on chinook, pid 3838command line: build/X86/gem5.opt --debug-flags=Hello configs/learning_gem5/part2/hello_goodbye.pyGlobal frequency set at 1000000000000 ticks per second 0: hello.goodbye_object: Created the goodbye object 0: hello: Created the hello objectBeginning simulation!info: Entering event queue @ 0. Starting simulation...2000000: hello: Hello world! Processing the event! 4 left4000000: hello: Hello world! Processing the event! 3 left6000000: hello: Hello world! Processing the event! 2 left8000000: hello: Hello world! Processing the event! 1 left10000000: hello: Hello world! Processing the event! 0 left10000000: hello: Done firing!10000000: hello.goodbye_object: Saying goodbye to hello10000000: hello.goodbye_object: Scheduling another fillBuffer in 152592 ticks10152592: hello.goodbye_object: Processing the event!10152592: hello.goodbye_object: Scheduling another fillBuffer in 152592 ticks10305184: hello.goodbye_object: Processing the event!10305184: hello.goodbye_object: Scheduling another fillBuffer in 152592 ticks10457776: hello.goodbye_object: Processing the event!10457776: hello.goodbye_object: Scheduling another fillBuffer in 152592 ticks10610368: hello.goodbye_object: Processing the event!10610368: hello.goodbye_object: Scheduling another fillBuffer in 152592 ticks10762960: hello.goodbye_object: Processing the event!10762960: hello.goodbye_object: Scheduling another fillBuffer in 152592 ticks10915552: hello.goodbye_object: Processing the event!10915552: hello.goodbye_object: Goodbye done copying!Exiting @ tick 10944163 because Goodbye hello!! Goodbye hello!! Goodbye hello!! Goodbye hello!! Goodbye hello!! Goodbye hello!! GooYou can modify the parameters to these two SimObjects and see how theoverall execution time (Exiting @ tick 10944163) changes. To runthese tests, you may want to remove the debug flag so there is lessoutput to the terminal.In the next chapters, we will create a more complex and more usefulSimObject, culminating with a simple blocking uniprocessor cacheimplementation.", |
| "url": "/parameters/" |
| } |
| , |
| |
| "simplecache": { |
| "title": "Creating a simple cache object", |
| "content": " authors Jason Lowe-PowerCreating a simple cache objectIn this chapter, we will take the framework for a memory object wecreated in the last chapter <memoryobject-chapter> and add cachinglogic to it.SimpleCache SimObjectAfter creating the SConscript file, that you can downloadhere <../_static/scripts/part2/simplecache/SConscript>, we can createthe SimObject Python file. We will call this simple memory objectSimpleCache and create the SimObject Python file insrc/learning_gem5/simple_cache.``` {.sourceCode .python}from m5.params import *from m5.proxy import *from MemObject import MemObjectclass SimpleCache(MemObject): type = ‘SimpleCache’ cxx_header = “learning_gem5/simple_cache/simple_cache.hh”cpu_side = VectorSlavePort(\"CPU side port, receives requests\")mem_side = MasterPort(\"Memory side port, sends requests\")latency = Param.Cycles(1, \"Cycles taken on a hit or to resolve a miss\")size = Param.MemorySize('16kB', \"The size of the cache\")system = Param.System(Parent.any, \"The system this cache is part of\") ```There are a couple of differences between this SimObject file and theone from the previous chapter <memoryobject-chapter>. First, we have acouple of extra parameters. Namely, a latency for cache accesses and thesize of the cache. parameters-chapter goes into more detail about thesekinds of SimObject parameters.Next, we include a System parameter, which is a pointer to the mainsystem this cache is connected to. This is needed so we can get thecache block size from the system object when we are initializing thecache. To reference the system object this cache is connected to, we usea special proxy parameter. In this case, we use Parent.any.In the Python config file, when a SimpleCache is instantiated, thisproxy parameter searches through all of the parents of the SimpleCacheinstance to find a SimObject that matches the System type. Since weoften use a System as the root SimObject, you will often see asystem parameter resolved with this proxy parameter.The third and final difference between the SimpleCache and theSimpleMemobj is that instead of having two named CPU ports(inst_port and data_port), the SimpleCache use another specialparameter: the VectorPort. VectorPorts behave similarly to regularports (e.g., they are resolved via getMasterPort and getSlavePort),but they allow this object to connect to multiple peers. Then, in theresolution functions the parameter we ignored before (PortID idx) isused to differentiate between the different ports. By using a vectorport, this cache can be connected into the system more flexibly than theSimpleMemobj.Implementing the SimpleCacheMost of the code for the `SimpleCache is the same as theSimpleMemobj. There are a couple of changes in the constructor and thekey memory object functions.First, we need to create the CPU side ports dynamically in theconstructor and initialize the extra member functions based on theSimObject parameters.``` {.sourceCode .c++}SimpleCache::SimpleCache(SimpleCacheParams *params) : MemObject(params), latency(params->latency), blockSize(params->system->cacheLineSize()), capacity(params->size / blockSize), memPort(params->name + “.mem_side”, this), blocked(false), outstandingPacket(nullptr), waitingPortId(-1){ for (int i = 0; i < params->port_cpu_side_connection_count; ++i) { cpuPorts.emplace_back(name() + csprintf(“.cpu_side[%d]”, i), i, this); }}In this function, we use the `cacheLineSize` from the system parametersto set the `blockSize` for this cache. 
We also initialize the capacity based on the block size and the size parameter, and initialize the other member variables we will need below. Finally, we must create a number of `CPUSidePorts` based on the number of connections to this object. Since the `cpu_side` port was declared as a `VectorSlavePort` in the SimObject Python file, the parameter automatically has a variable `port_cpu_side_connection_count`. This is based on the Python name of the parameter. For each of these connections we add a new `CPUSidePort` to a `cpuPorts` vector declared in the `SimpleCache` class.
We also add one extra member variable to the `CPUSidePort` to save its id, and we add this as a parameter to its constructor.
Next, we need to implement `getMasterPort` and `getSlavePort`. The `getMasterPort` is exactly the same as in the `SimpleMemobj`. For `getSlavePort`, we now need to return the port based on the id requested.
``` {.sourceCode .c++}
BaseSlavePort&
SimpleCache::getSlavePort(const std::string& if_name, PortID idx)
{
    if (if_name == \"cpu_side\" && idx < cpuPorts.size()) {
        return cpuPorts[idx];
    } else {
        return MemObject::getSlavePort(if_name, idx);
    }
}
```
The implementation of the CPUSidePort and the MemSidePort is almost the same as in the SimpleMemobj. The only difference is that we need to add an extra parameter to handleRequest: the id of the port from which the request originated. Without this id, we would not be able to forward the response to the correct port. The SimpleMemobj knew which port to send replies to based on whether the original request was an instruction or data access. However, this information is not useful to the SimpleCache since it uses a vector of ports, not named ports.
The new handleRequest function does two things differently than the handleRequest function in the SimpleMemobj. First, it stores the port id of the request as discussed above. Since the SimpleCache is blocking and only allows a single request outstanding at a time, we only need to save a single port id.
Second, it takes time to access a cache. Therefore, we need to take into account the latency to access the cache tags and the cache data for a request. We added an extra parameter to the cache object for this, and in handleRequest we now use an event to stall the request for the needed amount of time. We schedule a new event for latency cycles in the future. The clockEdge function returns the tick that the nth cycle in the future occurs on.
``` {.sourceCode .c++}
bool
SimpleCache::handleRequest(PacketPtr pkt, int port_id)
{
    if (blocked) {
        return false;
    }
    DPRINTF(SimpleCache, \"Got request for addr %#x\\n\", pkt->getAddr());

    blocked = true;
    waitingPortId = port_id;

    schedule(new AccessEvent(this, pkt), clockEdge(latency));

    return true;
}
```
The AccessEvent is a little more complicated than the EventWrapper we used in events-chapter. Instead of using an EventWrapper, in the SimpleCache we will use a new class. The reason we cannot use an EventWrapper is that we need to pass the packet (pkt) from handleRequest to the event handler function. The following code is the AccessEvent class. We only need to implement the process function, which calls the function we want to use as our event handler, in this case accessTiming. We also pass the flag AutoDelete to the event constructor so we do not need to worry about freeing the memory for the dynamically created object.
The event code will automatically delete the object after the process function has executed.
``` {.sourceCode .c++}
class AccessEvent : public Event
{
  private:
    SimpleCache *cache;
    PacketPtr pkt;

  public:
    AccessEvent(SimpleCache *cache, PacketPtr pkt) :
        Event(Default_Pri, AutoDelete), cache(cache), pkt(pkt)
    { }

    void process() override {
        cache->accessTiming(pkt);
    }
};
```
Now, we need to implement the event handler, `accessTiming`.
``` {.sourceCode .c++}
void
SimpleCache::accessTiming(PacketPtr pkt)
{
    bool hit = accessFunctional(pkt);
    if (hit) {
        pkt->makeResponse();
        sendResponse(pkt);
    } else {
        <miss handling>
    }
}
```
This function first functionally accesses the cache. The function accessFunctional (described below) performs the functional access of the cache and either reads or writes the cache on a hit, or returns that the access was a miss.
If the access is a hit, we simply need to respond to the packet. To respond, you first must call the function makeResponse on the packet. This converts the packet from a request packet to a response packet. For instance, if the memory command in the packet was a ReadReq, it gets converted into a ReadResp. Writes behave similarly. Then, we can send the response back to the CPU.
The sendResponse function does the same things as the handleResponse function in the SimpleMemobj except that it uses the waitingPortId to send the packet to the right port. In this function, we need to mark the SimpleCache unblocked before calling sendPacket in case the peer on the CPU side immediately calls sendTimingReq. Then, we try to send retries to the CPU side ports if the SimpleCache can now receive requests and the ports need to be sent retries.
``` {.sourceCode .c++}
void
SimpleCache::sendResponse(PacketPtr pkt)
{
    int port = waitingPortId;

    blocked = false;
    waitingPortId = -1;

    cpuPorts[port].sendPacket(pkt);

    for (auto& port : cpuPorts) {
        port.trySendRetry();
    }
}
```
Back to the accessTiming function, we now need to handle the cache miss case. On a miss, we first have to check to see if the missing packet is to an entire cache block. If the packet is aligned and the size of the request is the size of a cache block, then we can simply forward the request to memory, just like in the SimpleMemobj.
However, if the packet is smaller than a cache block, then we need to create a new packet to read the entire cache block from memory. Here, whether the packet is a read or a write request, we send a read request to memory to load the data for the cache block into the cache. In the case of a write, the write will occur in the cache after we have loaded the data from memory.
Then, we create a new packet that is blockSize in size, and we call the allocate function to allocate memory in the Packet object for the data that we will read from memory. Note: this memory is freed when we free the packet. We use the original request object in the packet so the memory-side objects know the original requestor and the original request type for statistics.
Finally, we save the original packet pointer (pkt) in a member variable outstandingPacket so we can recover it when the SimpleCache receives a response.
Back in the `accessTiming` function, we now need to handle the cache miss case. On a miss, we first have to check whether the missing packet targets an entire cache block. If the packet is aligned and the size of the request is the size of a cache block, then we can simply forward the request to memory, just like in the `SimpleMemobj`.

However, if the packet is smaller than a cache block, then we need to create a new packet to read the entire cache block from memory. Here, whether the packet is a read or a write request, we send a read request to memory to load the data for the cache block into the cache. In the case of a write, the write will occur in the cache after we have loaded the data from memory.

Then, we create a new packet that is `blockSize` in size, and we call the `allocate` function to allocate memory in the `Packet` object for the data that we will read from memory. Note: this memory is freed when we free the packet. We use the original request object in the packet so the memory-side objects know the original requestor and the original request type for statistics.

Finally, we save the original packet pointer (`pkt`) in a member variable `outstandingPacket` so we can recover it when the `SimpleCache` receives a response. Then, we send the new packet across the memory side port.

``` {.sourceCode .c++}
void
SimpleCache::accessTiming(PacketPtr pkt)
{
    bool hit = accessFunctional(pkt);
    if (hit) {
        pkt->makeResponse();
        sendResponse(pkt);
    } else {
        Addr addr = pkt->getAddr();
        Addr block_addr = pkt->getBlockAddr(blockSize);
        unsigned size = pkt->getSize();
        if (addr == block_addr && size == blockSize) {
            DPRINTF(SimpleCache, "forwarding packet\n");
            memPort.sendPacket(pkt);
        } else {
            DPRINTF(SimpleCache, "Upgrading packet to block size\n");
            panic_if(addr - block_addr + size > blockSize,
                     "Cannot handle accesses that span multiple cache lines");

            assert(pkt->needsResponse());
            MemCmd cmd;
            if (pkt->isWrite() || pkt->isRead()) {
                cmd = MemCmd::ReadReq;
            } else {
                panic("Unknown packet type in upgrade size");
            }

            PacketPtr new_pkt = new Packet(pkt->req, cmd, blockSize);
            new_pkt->allocate();

            outstandingPacket = pkt;

            memPort.sendPacket(new_pkt);
        }
    }
}
```

On a response from memory, we know that the response was caused by a cache miss. The first step is to insert the responding packet into the cache. Then, either there is an `outstandingPacket`, in which case we need to forward that packet to the original requestor, or there is no `outstandingPacket`, which means we should forward the `pkt` in the response to the original requestor.

If the packet we are receiving as a response was an upgrade packet because the original request was smaller than a cache line, then we need to copy the new data to the `outstandingPacket` packet, or write to the cache on a write. Then, we need to delete the new packet that we made in the miss handling logic.

``` {.sourceCode .c++}
bool
SimpleCache::handleResponse(PacketPtr pkt)
{
    assert(blocked);
    DPRINTF(SimpleCache, "Got response for addr %#x\n", pkt->getAddr());
    insert(pkt);

    if (outstandingPacket != nullptr) {
        accessFunctional(outstandingPacket);
        outstandingPacket->makeResponse();
        delete pkt;
        pkt = outstandingPacket;
        outstandingPacket = nullptr;
    } // else, pkt contains the data it needs

    sendResponse(pkt);

    return true;
}
```

Functional cache logic
----------------------

Now, we need to implement two more functions: `accessFunctional` and `insert`. These two functions make up the key components of the cache logic.

First, to functionally update the cache, we need storage for the cache contents. The simplest possible cache storage is a map (hash table) that maps from addresses to data. Thus, we will add the following member to the `SimpleCache`.

``` {.sourceCode .c++}
std::unordered_map<Addr, uint8_t*> cacheStore;
```

To access the cache, we first check to see if there is an entry in the map which matches the address in the packet. We use the `getBlockAddr` function of the `Packet` type to get the block-aligned address. Then, we simply search for that address in the map. If we do not find the address, then this function returns `false`, the data is not in the cache, and it is a miss.
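As an aside, the block alignment that `getBlockAddr` performs is simple bit masking. A hypothetical standalone version (not part of the SimpleCache code, purely for illustration) would be:

``` {.sourceCode .c++}
// Clear the low-order offset bits, assuming blockSize is a power of
// two. E.g., with blockSize = 64, address 0x4aec maps to block 0x4ac0.
Addr
blockAlign(Addr addr, unsigned blockSize)
{
    return addr & ~(Addr(blockSize) - 1);
}
```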
Otherwise, if the packet is a write request, we need to update the data in the cache. To do this, we write the data from the packet to the cache. We use the `writeDataToBlock` function, which writes the data in the packet at the right offset into a potentially larger block of data. This function takes the block size (as a parameter), computes the cache block offset, and writes the data at the correct offset into the pointer passed as the first parameter.

If the packet is a read request, we need to update the packet's data with the data from the cache. The `setDataFromBlock` function performs the same offset calculation as the `writeDataToBlock` function, but fills the packet with the data from the pointer in the first parameter.

``` {.sourceCode .c++}
bool
SimpleCache::accessFunctional(PacketPtr pkt)
{
    Addr block_addr = pkt->getBlockAddr(blockSize);
    auto it = cacheStore.find(block_addr);
    if (it != cacheStore.end()) {
        if (pkt->isWrite()) {
            pkt->writeDataToBlock(it->second, blockSize);
        } else if (pkt->isRead()) {
            pkt->setDataFromBlock(it->second, blockSize);
        } else {
            panic("Unknown packet type!");
        }
        return true;
    }
    return false;
}
```

Finally, we also need to implement the `insert` function. This function is called every time the memory side port responds to a request.

The first step is to check if the cache is currently full. If the cache has more entries (blocks) than the capacity of the cache as set by the SimObject parameter, then we need to evict something. The following code evicts a random entry by leveraging the hash table implementation of the C++ `unordered_map`.

On an eviction, we need to write the data back to the backing memory in case it has been updated. For this, we create a new `Request`-`Packet` pair. The packet uses a new memory command: `MemCmd::WritebackDirty`. Then, we send the packet across the memory side port (`memPort`) and erase the entry in the cache storage map.

Then, after a block has potentially been evicted, we add the new address to the cache. For this, we simply allocate space for the block and add an entry to the map. Finally, we write the data from the response packet into the newly allocated block. This data is guaranteed to be the size of the cache block, since we made sure to make a new packet in the cache miss logic if the packet was smaller than a cache block.

``` {.sourceCode .c++}
void
SimpleCache::insert(PacketPtr pkt)
{
    if (cacheStore.size() >= capacity) {
        // Select random thing to evict. This is a little convoluted
        // since we are using a std::unordered_map. See
        // http://bit.ly/2hrnLP2
        int bucket, bucket_size;
        do {
            bucket = random_mt.random(0, (int)cacheStore.bucket_count() - 1);
        } while ( (bucket_size = cacheStore.bucket_size(bucket)) == 0 );
        auto block = std::next(cacheStore.begin(bucket),
                               random_mt.random(0, bucket_size - 1));

        RequestPtr req = new Request(block->first, blockSize, 0, 0);
        PacketPtr new_pkt = new Packet(req, MemCmd::WritebackDirty, blockSize);
        new_pkt->dataDynamic(block->second); // This will be deleted later

        DPRINTF(SimpleCache, "Writing packet back %s\n", pkt->print());
        memPort.sendTimingReq(new_pkt);

        cacheStore.erase(block->first);
    }
    uint8_t *data = new uint8_t[blockSize];
    cacheStore[pkt->getAddr()] = data;

    pkt->writeDataToBlock(data, blockSize);
}
```
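The `capacity` checked above is the member initialized in the constructor, as mentioned at the start of this section. A sketch of that initialization (assuming the SimObject Python file declares the `size` and `latency` parameters used in this chapter; the exact member list is illustrative) might be:

``` {.sourceCode .c++}
SimpleCache::SimpleCache(SimpleCacheParams *params) :
    MemObject(params),
    latency(params->latency),
    // Take the block size from the system so it matches the rest of
    // the memory system.
    blockSize(params->system->cacheLineSize()),
    // Capacity in blocks: total cache size divided by the block size.
    capacity(params->size / blockSize),
    memPort(params->name + ".mem_side", this),
    blocked(false), outstandingPacket(nullptr), waitingPortId(-1)
{
    // Create one CPUSidePort per connection, passing each its id,
    // using the automatically generated connection-count parameter.
    for (int i = 0; i < params->port_cpu_side_connection_count; ++i) {
        cpuPorts.emplace_back(name() + csprintf(".cpu_side[%d]", i),
                              i, this);
    }
}
```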
Creating a config file for the cache
------------------------------------

The last step in our implementation is to create a new Python config script that uses our cache. We can use the outline from the last chapter <memoryobject-chapter> as a starting point. The only difference is that we may want to set the parameters of this cache (e.g., set the size of the cache to 1kB), and, instead of using the named ports (`data_port` and `inst_port`), we just use the `cpu_side` port twice. Since `cpu_side` is a `VectorPort`, it will automatically create multiple port connections.

``` {.sourceCode .python}
import m5
from m5.objects import *

...

system.cache = SimpleCache(size='1kB')

system.cpu.icache_port = system.cache.cpu_side
system.cpu.dcache_port = system.cache.cpu_side

system.membus = SystemXBar()

system.cache.mem_side = system.membus.slave

...
```

The Python config file can be downloaded here <../_static/scripts/part2/simplecache/simple_cache.py>.

Running this script should produce the expected output from the hello binary.

    gem5 Simulator System.  http://gem5.org
    gem5 is copyrighted software; use the --copyright option for details.

    gem5 compiled Jan 10 2017 17:38:15
    gem5 started Jan 10 2017 17:40:03
    gem5 executing on chinook, pid 29031
    command line: build/X86/gem5.opt configs/learning_gem5/part2/simple_cache.py

    Global frequency set at 1000000000000 ticks per second
    warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
    0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
    warn: CoherentXBar system.membus has no snooping ports attached!
    warn: ClockedObject: More than one power state change request encountered within the same simulation tick
    Beginning simulation!
    info: Entering event queue @ 0.  Starting simulation...
    Hello world! Exiting @ tick 56082000 because target called exit()

Modifying the size of the cache, for instance to 128 KB, should improve the performance of the system: the program now finishes at tick 32685000 instead of 56082000, roughly 1.7x faster.

    gem5 Simulator System.  http://gem5.org
    gem5 is copyrighted software; use the --copyright option for details.

    gem5 compiled Jan 10 2017 17:38:15
    gem5 started Jan 10 2017 17:41:10
    gem5 executing on chinook, pid 29037
    command line: build/X86/gem5.opt configs/learning_gem5/part2/simple_cache.py

    Global frequency set at 1000000000000 ticks per second
    warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
    0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
    warn: CoherentXBar system.membus has no snooping ports attached!
    warn: ClockedObject: More than one power state change request encountered within the same simulation tick
    Beginning simulation!
    info: Entering event queue @ 0.  Starting simulation...
    Hello world! Exiting @ tick 32685000 because target called exit()
Adding statistics to the cache
------------------------------

Knowing the overall execution time of the system is one important metric. However, you may want to include other statistics as well, such as the hit and miss rates of the cache. To do this, we need to add some statistics to the `SimpleCache` object.

First, we need to declare the statistics in the `SimpleCache` object. They are part of the `Stats` namespace. In this case, we'll make four statistics. The number of `hits` and the number of `misses` are just simple `Scalar` counts. We will also add a `missLatency`, which is a histogram of the time it takes to satisfy a miss. Finally, we'll add a special statistic called a `Formula` for the `hitRatio` that is a combination of other statistics (the number of hits and misses).

``` {.sourceCode .c++}
class SimpleCache : public MemObject
{
  private:
    ...

    Tick missTime; // To track the miss latency

    Stats::Scalar hits;
    Stats::Scalar misses;
    Stats::Histogram missLatency;
    Stats::Formula hitRatio;

  public:
    ...

    void regStats() override;
};
```

Next, we have to override the `regStats` function so the statistics are registered with gem5's statistics infrastructure. Here, for each statistic, we give it a name based on the "parent" SimObject name, plus a description. For the histogram statistic, we also need to initialize it with how many buckets we want in the histogram. Finally, for the formula, we simply need to write the formula down in code.

``` {.sourceCode .c++}
void
SimpleCache::regStats()
{
    // If you don't do this you get errors about uninitialized stats.
    MemObject::regStats();

    hits.name(name() + ".hits")
        .desc("Number of hits")
        ;

    misses.name(name() + ".misses")
        .desc("Number of misses")
        ;

    missLatency.name(name() + ".missLatency")
        .desc("Ticks for misses to the cache")
        .init(16) // number of buckets
        ;

    hitRatio.name(name() + ".hitRatio")
        .desc("The ratio of hits to the total accesses to the cache")
        ;

    hitRatio = hits / (hits + misses);
}
```

Finally, we need to update the statistics in our code. In the `accessTiming` function, we can increment `hits` and `misses` on a hit and miss, respectively. Additionally, on a miss, we save the current time so we can measure the latency.

``` {.sourceCode .c++}
void
SimpleCache::accessTiming(PacketPtr pkt)
{
    bool hit = accessFunctional(pkt);
    if (hit) {
        hits++; // update stats
        pkt->makeResponse();
        sendResponse(pkt);
    } else {
        misses++; // update stats
        missTime = curTick();
        ...
```

Then, when we get a response, we need to add the measured latency to our histogram. For this, we use the `sample` function. This adds a single point to the histogram. The histogram automatically resizes the buckets to fit the data it receives.

``` {.sourceCode .c++}
bool
SimpleCache::handleResponse(PacketPtr pkt)
{
    insert(pkt);

    missLatency.sample(curTick() - missTime);

    ...
```

The complete code for the `SimpleCache` header file can be downloaded here <../_static/scripts/part2/simplecache/simple_cache.hh>, and the complete code for the implementation of the `SimpleCache` can be downloaded here <../_static/scripts/part2/simplecache/simple_cache.cc>.
Now, if we run the above config file, we can check on the statistics in the `stats.txt` file. For the 1 kB case, we get the following statistics: 91% of the accesses are hits (8431 / (8431 + 877) = 0.906, matching the `hitRatio` formula), and the average miss latency is 53334 ticks (or about 53 ns at 10^12 ticks per second).

    system.cache.hits                         8431       # Number of hits
    system.cache.misses                        877       # Number of misses
    system.cache.missLatency::samples          877       # Ticks for misses to the cache
    system.cache.missLatency::mean    53334.093501       # Ticks for misses to the cache
    system.cache.missLatency::gmean   44506.409356       # Ticks for misses to the cache
    system.cache.missLatency::stdev   36749.446469       # Ticks for misses to the cache
    system.cache.missLatency::0-32767          305     34.78%     34.78% # Ticks for misses to the cache
    system.cache.missLatency::32768-65535      365     41.62%     76.40% # Ticks for misses to the cache
    system.cache.missLatency::65536-98303      164     18.70%     95.10% # Ticks for misses to the cache
    system.cache.missLatency::98304-131071      12      1.37%     96.47% # Ticks for misses to the cache
    system.cache.missLatency::131072-163839     17      1.94%     98.40% # Ticks for misses to the cache
    system.cache.missLatency::163840-196607      7      0.80%     99.20% # Ticks for misses to the cache
    system.cache.missLatency::196608-229375      0      0.00%     99.20% # Ticks for misses to the cache
    system.cache.missLatency::229376-262143      0      0.00%     99.20% # Ticks for misses to the cache
    system.cache.missLatency::262144-294911      2      0.23%     99.43% # Ticks for misses to the cache
    system.cache.missLatency::294912-327679      4      0.46%     99.89% # Ticks for misses to the cache
    system.cache.missLatency::327680-360447      1      0.11%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::360448-393215      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::393216-425983      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::425984-458751      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::458752-491519      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::491520-524287      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::total            877       # Ticks for misses to the cache
    system.cache.hitRatio                 0.905780       # The ratio of hits to the total access
And when using a 128 KB cache, we get a slightly higher hit ratio (96%). It seems like our cache is working as expected!

    system.cache.hits                         8944       # Number of hits
    system.cache.misses                        364       # Number of misses
    system.cache.missLatency::samples          364       # Ticks for misses to the cache
    system.cache.missLatency::mean    64222.527473       # Ticks for misses to the cache
    system.cache.missLatency::gmean   61837.584812       # Ticks for misses to the cache
    system.cache.missLatency::stdev   27232.443748       # Ticks for misses to the cache
    system.cache.missLatency::0-32767            0      0.00%      0.00% # Ticks for misses to the cache
    system.cache.missLatency::32768-65535      254     69.78%     69.78% # Ticks for misses to the cache
    system.cache.missLatency::65536-98303      106     29.12%     98.90% # Ticks for misses to the cache
    system.cache.missLatency::98304-131071       0      0.00%     98.90% # Ticks for misses to the cache
    system.cache.missLatency::131072-163839      0      0.00%     98.90% # Ticks for misses to the cache
    system.cache.missLatency::163840-196607      0      0.00%     98.90% # Ticks for misses to the cache
    system.cache.missLatency::196608-229375      0      0.00%     98.90% # Ticks for misses to the cache
    system.cache.missLatency::229376-262143      0      0.00%     98.90% # Ticks for misses to the cache
    system.cache.missLatency::262144-294911      2      0.55%     99.45% # Ticks for misses to the cache
    system.cache.missLatency::294912-327679      1      0.27%     99.73% # Ticks for misses to the cache
    system.cache.missLatency::327680-360447      1      0.27%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::360448-393215      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::393216-425983      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::425984-458751      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::458752-491519      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::491520-524287      0      0.00%    100.00% # Ticks for misses to the cache
    system.cache.missLatency::total            364       # Ticks for misses to the cache
    system.cache.hitRatio                 0.960894       # The ratio of hits to the total access",
| "url": "/simplecache/" |
| } |
| , |
| |
| "msibuilding": { |
| "title": "Compiling a SLICC protocol", |
| "content": " authors Jason Lowe-PowerCompiling a SLICC protocolThe SLICC fileNow that we have finished implementing the protocol, we need to compileit. You can download the complete SLICC files below: MSI-cache.sm <../../_static/scripts/part3/MSI_protocol/MSI-cache.sm> MSI-dir.sm <../../_static/scripts/part3/MSI_protocol/MSI-dir.sm> MSI-msg.sm <../../_static/scripts/part3/MSI_protocol/MSI-msg.sm>Before building the protocol, we need to create one more file:MSI.slicc. This file tells the SLICC compiler which state machinefiles to compile for this protocol. The first line contains the name ofour protocol. Then, the file has a number of include statements. Eachinclude statement has a file name. This filename can come from any ofthe protocol_dirs directories. We declared the current directory aspart of the protocol_dirs in the SConsopts file(protocol_dirs.append(str(Dir('.').abspath))). The other directory issrc/mem/protocol/. These files are included like C++h header files.Effectively, all of the files are processed as one large SLICC file.Thus, any files that declare types that are used in other files mustcome before the files they are used in (e.g., MSI-msg.sm must comebefore MSI-cache.sm since MSI-cache.sm uses the RequestMsg type).``` {.sourceCode .c++}protocol “MSI”;include “RubySlicc_interfaces.slicc”;include “MSI-msg.sm”;include “MSI-cache.sm”;include “MSI-dir.sm”;You can download the fill filehere \\<../../\\_static/scripts/part3/MSI\\_protocol/MSI.slicc\\>Compiling a protocol with SCons-------------------------------Most SCons defaults (found in `build_opts/`) specify the protocol as`MI_example`, an example, but poor performing protocol. Therefore, wecannot simply use a default build name (e.g., `X86` or `ARM`). We haveto specify the SCons options on the command line. The command line belowwill build our new protocol with the X86 ISA.``` {.sourceCode .sh}scons build/X86_MSI/gem5.opt --default=X86 PROTOCOL=MSI SLICC_HTML=TrueThis command will build gem5.opt in the directory build/X86_MSI. Youcan specify any directory here. This command line has two newparameters: --default and PROTOCOL. First, --default specifieswhich file to use in build_opts for defaults for all of the SConsvariables (e.g., ISA, CPU_MODELS). Next, PROTOCOL overrides anydefault for the PROTOCOL SCons variable in the default specified.Thus, we are telling SCons to specifically compile our new protocol, notwhichever protocol was specified in build_opts/X86.There is one more variable on this command line to build gem5:SLICC_HTML=True. When you specify this on the building command line,SLICC will generate the HTML tables for your protocol. You can find theHTML tables in <build directory>/mem/protocol/html. By default, theSLICC compiler skips building the HTML tables because it impacts theperformance of compiling gem5, especially when compiling on a networkfile system.After gem5 finishes compiling, you will have a gem5 binary with your newprotocol! If you want to build another protocol into gem5, you have tochange the PROTOCOL SCons variable. Thus, it is a good idea to use adifferent build directory for each protocol, especially if you will becomparing protocols.When building your protocol, you will likely encounter errors in yourSLICC code reported by the SLICC compiler. Most errors include the fileand line number of the error. Sometimes, this line number is the lineafter the error occurs. In fact, the line number can be far below theactual error. 
For instance, if the curly brackets do not match correctly, the error will report the last line in the file as the location.",
| "url": "/MSIbuilding/" |
| } |
| , |
| |
| "msidebugging": { |
| "title": "Debugging SLICC Protocols", |
| "content": " authors Jason Lowe-PowerDebugging SLICC ProtocolsIn this section, I present the steps that I took while debugging the MSIprotocol implemented earlier in this chapter. Learning to debugcoherence protocols is a challenge. The best way is by working withothers who have written SLICC protocols in the past. However, since you,the reader, cannot look over my shoulder while I am debugging aprotocol, I am trying to present the next-best thing.Here, I first present some high-level suggestions to tackling protocolerrors. Next, I discuss some details about deadlocks, and how tounderstand protocol traces that can be used to fix them. Then, I presentmy experience debugging the MSI protocol in this chapter in astream-of-consciousness style. I will show the error that was generated,then the solution to the error, sometimes with some commentary of thedifferent tactics I tried to solve the error.General debugging tipsRuby has many useful debug flags. However, the most useful, by far, isProtocolTrace. Below, you will see several examples of using theprotocol trace to debug a protocol. The protocol trace prints everytransition for all controllers. Thus, you can simply trace the entireexecution of the cache system.Other useful debug flags include: RubyGenerated Prints a bunch of stuff from the ruby generated code. RubyPort/RubySequencer See the details of sending/receiving messages into/out of ruby. RubyNetwork Prints entire network messages including the sender/receiver and thedata within the message for all messages. This flag is useful whenthere is a data mismatch.The first step to debugging a Ruby protocol is to run it with the Rubyrandom tester. The random tester issues semi-random requests into theRuby system and checks to make sure the returned data is correct. Tomake debugging faster, the random tester issues read requests from onecontroller for a block and a write request for the same cache block (buta different byte) from a different controller. Thus, the Ruby randomtester does a good job exercising the transient states and raceconditions in the protocol.Unfortunately, the random tester’s configuration is slightly differentthan when using normal CPUs. Thus, we need to use a differentMyCacheSystem than before. You can download this different cachesystem filehere <../../_static/scripts/part3/configs/test_caches.py> and youcan download the modified run scripthere <../../_static/scripts/part3/configs/ruby_test.py>. The testrun script is mostly the same as the simple run script, but creates theRubyRandomTester instead of CPUs.It is often a good idea to first run the random tester with a single“CPU”. Then, increase the number of loads from the default of 100 tosomething that takes a few minutes to execute on your host system. Next,if there are no errors, then increase the number of “CPUs” to two andreduce the number of loads to 100 again. Then, start increasing thenumber of loads. Finally, you can increase the number of CPUs tosomething reasonable for the system you are trying to simulate. If youcan run the random tester for 10-15 minutes, you can be slightlyconfident that the random tester isn’t going to find any other bugs.Once you have your protocol working with the random tester, you can moveon to using real applications. It is likely that real applications willexpose even more bugs in the protocol. 
Understanding Protocol Traces
-----------------------------

Unfortunately, despite extensive effort to catch bugs in them, coherence protocols (even heavily tested ones) will have bugs. Sometimes these bugs are relatively simple fixes, while other times they are very insidious and difficult to track down. In the worst case, the bugs manifest themselves as deadlocks: bugs that literally prevent the application from making progress. A similar problem is a livelock, where the program runs forever due to a cycle somewhere in the system.

Whenever livelocks or deadlocks occur, the next thing to do is generate a protocol trace. Traces print a running list of every transition that is happening in the memory system: memory requests starting and completing, L1 and directory transitions, etc. You can then use these traces to identify why the deadlock is occurring. However, as we will discuss in more detail below, debugging deadlocks with protocol traces is often extremely challenging.

Here, we discuss what appears in the protocol trace to help explain what is happening. To start with, let's look at a small snippet of a protocol trace (we will discuss the details of this trace further below):

    ...
    4541   0    L1Cache Replacement     MI_A>MI_A   [0x4ac0, line 0x4ac0]
    4542   0    L1Cache PutAck          MI_A>I      [0x4ac0, line 0x4ac0]
    4549   0    Directory MemAck        MI_M>I      [0x4ac0, line 0x4ac0]
    4641   0    Seq     Begin           >           [0x4aec, line 0x4ac0] LD
    4652   0    L1Cache Load            I>IS_D      [0x4ac0, line 0x4ac0]
    4657   0    Directory GetS          I>S_M       [0x4ac0, line 0x4ac0]
    4669   0    Directory MemData       S_M>S       [0x4ac0, line 0x4ac0]
    4674   0    Seq     Done            >           [0x4aec, line 0x4ac0] 33 cycles
    4674   0    L1Cache DataDirNoAcks   IS_D>S      [0x4ac0, line 0x4ac0]
    5321   0    Seq     Begin           >           [0x4aec, line 0x4ac0] ST
    5322   0    L1Cache Store           S>SM_AD     [0x4ac0, line 0x4ac0]
    5327   0    Directory GetM          S>M_M       [0x4ac0, line 0x4ac0]

Every line in this trace has a set pattern in terms of what information appears on that line. Specifically, the fields are:

- Current Tick: the tick in which the print occurs.
- Machine Version: the number of the machine where this request is coming from. For example, if there are 4 L1 caches, then the numbers would be 0-3. Assuming you have 1 L1 cache per core, you can think of this as representing the core the request is coming from.
- Component: which part of the system is doing the print. Generally, "Seq" is shorthand for Sequencer, "L1Cache" represents the L1 cache, "Directory" represents the directory, and so on. For L1 caches and the directory, this is the name of the machine type (i.e., what is after "MachineType:" in the `machine()` definition).
- Action: what the component is doing. For example, "Begin" means the Sequencer has received a new request, "Done" means the Sequencer is completing a previous request, and "DataDirNoAcks" means that our DataDirNoAcks event is being triggered.
- Transition (e.g., `MI_A>MI_A`): what state transition this action is doing (format: "currentState>nextState"). If no transition is happening, this is denoted with ">".
- Address (e.g., `[0x4ac0, line 0x4ac0]`): the physical address of the request (format: [wordAddress, lineAddress]). This address will always be cache-block aligned, except for requests from the Sequencer and `mandatoryQueue`.
- (Optional) Comments: optionally, there is one additional field to pass comments. For example, the "LD", "ST", and "33 cycles" lines use this extra field to pass additional information to the trace, such as identifying the request as a load or store. For SLICC transitions, `APPEND_TRANSITION_COMMENT` is often used for this, as we discussed previously <MSI-actions-section>.
Generally, spaces are used to separate each of these fields (the spaces between fields are added implicitly; you do not need to add them). However, sometimes if a field is very long, there may be no spaces, or the line may be shifted compared to other lines.

Using this information, let's analyze the above snippet. The first (tick) field tells us that this trace snippet is showing what was happening in the memory system between ticks 4541 and 5327. In this snippet, all of the requests are coming from L1Cache-0 (core 0) and going to Directory-0 (the first bank of the directory). During this time, we see several memory requests and state transitions for the cache line 0x4ac0, both at the L1 caches and the directory. For example, in tick 5322, the core executes a store to 0x4ac0. However, it currently does not have that line in Modified in its cache (it is in Shared after the core loaded it in ticks 4641-4674), so it needs to request ownership of that line from the directory (which receives the request in tick 5327). While waiting for ownership, L1Cache-0 transitions from S (Shared) to SM_AD (a transient state: was in S, going to M, waiting for Acks and Data).

To add a print to the protocol trace, you will need to add a print with these fields using the `ProtocolTrace` flag. For example, if you look at `src/mem/ruby/system/Sequencer.cc`, you can see where the "Seq Begin" and "Seq Done" trace prints come from (search for `ProtocolTrace`).
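As a rough sketch of what such a print can look like (the format string and the names `m_version` and `addr` below are illustrative, not copied from Sequencer.cc):

``` {.sourceCode .c++}
// Hypothetical example: emit a ProtocolTrace line with the same
// tick / version / component / action / transition / address fields
// described above. DPRINTFR omits the usual tick/name prefix, since
// protocol trace lines carry their own tick field.
DPRINTFR(ProtocolTrace, "%15s %3s %10s %20s %6s>%-6s %#x\n",
         curTick(), m_version, "MyComponent", "MyAction",
         "S", "SM_AD", addr);
```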
Errors I ran into debugging MSI
-------------------------------

    gem5.opt: build/MSI/mem/ruby/system/Sequencer.cc:423: void Sequencer::readCallback(Addr, DataBlock&, bool, MachineType, Cycles, Cycles, Cycles): Assertion `m_readRequestTable.count(makeLineAddress(address))' failed.

I'm an idiot: I called `readCallback` in `externalStoreHit` instead of `writeCallback`. It's good to start simple!

    gem5.opt: build/MSI/mem/ruby/network/MessageBuffer.cc:220: Tick MessageBuffer::dequeue(Tick, bool): Assertion `isReady(current_time)' failed.

I ran gem5 in GDB to get more information. Look at `L1Cache_Controller::doTransitionWorker`. The current transition is: `event=L1Cache_Event_PutAck, state=L1Cache_State_MI_A, <next_state=@0x7fffffffd0a0>: L1Cache_State_FIRST`. This is more simply MI_A->I on a PutAck, and the failure is in `popResponseQueue`. The problem is that the PutAck is on the forward network, not the response network.

    panic: Invalid transition
    system.caches.controllers0 time: 3594 addr: 3264 event: DataDirAcks state: IS_D

Hmm. I think this shouldn't have happened. The needed acks should always be 0, or you get data from the owner. Ah. So I implemented `sendDataToReq` at the directory to always send the number of sharers. If we get this response in IS_D, we don't care whether or not there are sharers. Thus, to make things simpler, I'm just going to transition to S on DataDirAcks. This is a slight difference from the original implementation in Sorin et al.

Well, actually, I think it's that we send the request after we add ourselves to the sharer list. The above is incorrect. Sorin et al. were not wrong! Let's try not doing that!

So, I fixed this by checking to see if the requestor is the owner before sending the data to the requestor at the directory. Only if the requestor is the owner do we include the number of sharers. Otherwise, it doesn't matter at all, and we just set the sharers to 0.

    panic: Invalid transition
    system.caches.controllers0 time: 5332 addr: 0x4ac0 event: Inv state: SM_AD

First, let's look at where Inv is triggered: only when you get an invalidate. Maybe it's that we are on the sharer list and shouldn't be? We can use protocol trace and grep to find out what's going on.

``` {.sourceCode .sh}
build/MSI/gem5.opt --debug-flags=ProtocolTrace configs/learning_gem5/part6/ruby_test.py | grep 0x4ac0
```

    ...
    4541   0    L1Cache Replacement     MI_A>MI_A   [0x4ac0, line 0x4ac0]
    4542   0    L1Cache PutAck          MI_A>I      [0x4ac0, line 0x4ac0]
    4549   0    Directory MemAck        MI_M>I      [0x4ac0, line 0x4ac0]
    4641   0    Seq     Begin           >           [0x4aec, line 0x4ac0] LD
    4652   0    L1Cache Load            I>IS_D      [0x4ac0, line 0x4ac0]
    4657   0    Directory GetS          I>S_M       [0x4ac0, line 0x4ac0]
    4669   0    Directory MemData       S_M>S       [0x4ac0, line 0x4ac0]
    4674   0    Seq     Done            >           [0x4aec, line 0x4ac0] 33 cycles
    4674   0    L1Cache DataDirNoAcks   IS_D>S      [0x4ac0, line 0x4ac0]
    5321   0    Seq     Begin           >           [0x4aec, line 0x4ac0] ST
    5322   0    L1Cache Store           S>SM_AD     [0x4ac0, line 0x4ac0]
    5327   0    Directory GetM          S>M_M       [0x4ac0, line 0x4ac0]

Maybe there is a sharer in the sharers list when there shouldn't be? We can add defensive asserts in `clearOwner` and `setOwner`.

``` {.sourceCode .c++}
action(setOwner, "sO", desc="Set the owner") {
    assert(getDirectoryEntry(address).Sharers.count() == 0);
    peek(request_in, RequestMsg) {
        getDirectoryEntry(address).Owner.add(in_msg.Requestor);
    }
}

action(clearOwner, "cO", desc="Clear the owner") {
    assert(getDirectoryEntry(address).Sharers.count() == 0);
    getDirectoryEntry(address).Owner.clear();
}
```

Now, I get the following error:

    panic: Runtime Error at MSI-dir.sm:301: assert failure.

This is in `setOwner`. Well, actually this is OK, since we need to have the sharers still set until we count them to send the ack count to the requestor. Let's remove that assert and see what happens. Nothing. That didn't help anything.

When are invalidations sent from the directory? Only on S->M_M. So, here, we need to remove ourselves from the invalidation list. I think we need to keep ourselves in the sharer list, since we subtract one when sending the number of acks.

Note: I'm coming back to this a little later. It turns out that both of these asserts are wrong. I found this out when running with more than one CPU below. The sharers are set before clearing the Owner in M->S_D on a GetS.

So, on to the next problem!

    panic: Deadlock detected: current_time: 56091 last_progress_time: 6090 difference: 50001 processor: 0

Deadlocks are the worst kind of error. Whatever caused the deadlock is ancient history (i.e., it likely happened many cycles earlier), and it is often very hard to track down.

Looking at the tail of the protocol trace (note: sometimes you must put the protocol trace into a file because it grows very big), I see that there is an address that is trying to be replaced. Let's start there.

    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]
    56091  0    L1Cache Replacement     SM_A>SM_A   [0x5ac0, line 0x5ac0]

Before this replacement got stuck, I see the following in the protocol trace.
Note: this is 50,000 cycles in the past!

    ...
    5592   0    L1Cache Store           S>SM_AD     [0x5ac0, line 0x5ac0]
    5597   0    Directory GetM          S>M_M       [0x5ac0, line 0x5ac0]
    ...
    5641   0    Directory MemData       M_M>M       [0x5ac0, line 0x5ac0]
    ...
    5646   0    L1Cache DataDirAcks     SM_AD>SM_A  [0x5ac0, line 0x5ac0]

Ah! This clearly should not be DataDirAcks, since we only have a single CPU! So, we seem to not be subtracting properly. Going back to the previous error, I was wrong about needing to keep ourselves in the list. I forgot that we no longer had the -1 thing. So, let's remove ourselves from the sharing list before sending the invalidations when we originally get the S->M request.

So! With those changes, the Ruby tester completes with a single core. Now, to make it harder, we need to increase the number of loads we do and then the number of cores.

And, of course, when I increase it to 10,000 loads, there is a deadlock. Fun!

What I'm seeing at the end of the protocol trace is the following, repeated for a long time.

    144684 0    L1Cache Replacement     MI_A>MI_A   [0x5bc0, line 0x5bc0]
    ...
    144685 0    Directory GetM          MI_M>MI_M   [0x54c0, line 0x54c0]
    ...
    144685 0    L1Cache Replacement     MI_A>MI_A   [0x5bc0, line 0x5bc0]
    ...
    144686 0    Directory GetM          MI_M>MI_M   [0x54c0, line 0x54c0]
    ...
    144686 0    L1Cache Replacement     MI_A>MI_A   [0x5bc0, line 0x5bc0]
    ...
    144687 0    Directory GetM          MI_M>MI_M   [0x54c0, line 0x54c0]
    ...

It seems that there is a circular dependence or something like that causing this deadlock.

Well, it seems that I was correct. The order of the `in_port`s really matters! In the directory, I previously had the order: request, response, memory. However, there was a memory packet that was blocked because the request queue was blocked, which caused the circular dependence and the deadlock. The order should be memory, response, and then request. I believe the memory/response order doesn't matter, since no responses depend on memory and vice versa.

Now, let's try with two CPUs. The first thing I run into is an assert failure: the first assert in `setState` fails.

``` {.sourceCode .c++}
void setState(Addr addr, State state) {
    if (directory.isPresent(addr)) {
        if (state == State:M) {
            assert(getDirectoryEntry(addr).Owner.count() == 1);
            assert(getDirectoryEntry(addr).Sharers.count() == 0);
        }
        getDirectoryEntry(addr).DirState := state;
        if (state == State:I)  {
            assert(getDirectoryEntry(addr).Owner.count() == 0);
            assert(getDirectoryEntry(addr).Sharers.count() == 0);
        }
    }
}
```

To track this problem down, let's add a debug statement (`DPRINTF`) and run with protocol trace. I added the following line just before the assert. Note that you are required to use the `RubySlicc` debug flag: this is the only debug flag included in the generated SLICC files.

``` {.sourceCode .c++}
DPRINTF(RubySlicc, "Owner %s\n", getDirectoryEntry(addr).Owner);
```

Then, I see the following output when running with `ProtocolTrace` and `RubySlicc`.

    118    0    Directory MemData       M_M>M       [0x400, line 0x400]
    118: system.caches.controllers2: MSI-dir.sm:160: Owner [NetDest (16) 1 0  -  -  -  0  -  -  -  -  -  -  -  -  -  -  -  -  -  ]
    118    0    Directory GetM          M>M         [0x400, line 0x400]
    118: system.caches.controllers2: MSI-dir.sm:160: Owner [NetDest (16) 1 1  -  -  -  0  -  -  -  -  -  -  -  -  -  -  -  -  -  ]

It looks like when we process the GetM in state M, we need to first clear the owner before adding the new owner. The other option is that in `setOwner` we could set the owner explicitly instead of adding it to the `NetDest`.

Oooo! This is a new error!

    panic: Runtime Error at MSI-dir.sm:229: Unexpected message type..
What is this message that fails? Let's use the `RubyNetwork` debug flag to try to track down which message is causing this error. A few lines above the error, I see the following message whose destination is the directory. The destination is a `NetDest`, which is a bitvector of `MachineID`s, split into multiple sections. I know I'm running with two CPUs, so the first two 0's are for the CPUs, and the other 1 must be for the directory.

    2285: PerfectSwitch-2: Message: [ResponseMsg: addr = [0x8c0, line 0x8c0] Type = InvAck Sender = L1Cache-1 Destination = [NetDest (16) 0 0  -  -  -  1  -  -  -  -  -  -  -  -  -  -  -  -  -  ] DataBlk = [ 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0xb1 0xb2 0xb3 0xb4 0xca 0xcb 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 ] MessageSize = Control Acks = 0 ]

This message has the type InvAck, which is clearly wrong! It seems that we are setting the requestor incorrectly when we send the invalidate (Inv) message to the L1 caches from the directory.

Yes. This is the problem. We need to make the requestor the original requestor. This was already correct for the FwdGetS/M, but I missed the invalidate somehow. On to the next error!

    panic: Invalid transition
    system.caches.controllers0 time: 2287 addr: 0x8c0 event: LastInvAck state: SM_AD

This seems to be that I am not counting the acks correctly. It could also be that the directory is much slower than the other caches at responding, since it has to get the data from memory.

If it's the latter (which I should be sure to verify), what we could do is include an ack requirement for the directory, too. Then, when the directory sends the data (and the owner, too) we decrement the needed acks and trigger the event based on the new ack count.

Actually, that first hypothesis was not quite right. I printed out the number of acks whenever we receive an InvAck, and what's happening is that the other cache is responding with an InvAck before the directory has told the requestor how many acks to expect.

So, what we need to do is something like what I was talking about above. First of all, we will need to let the acks drop below 0 and add the total acks to the count when the directory message shows up. Then, we are going to have to complicate the logic for triggering the last ack, etc.

OK. So now we're letting `tbe.Acks` drop below 0 and then adding the directory acks whenever they show up.

Next error: this is a tough one. The error is now that the data doesn't match as it should. As with the deadlock, the data could have been corrupted in the ancient past. I believe the address is the last one in the protocol trace.

    panic: Action/check failure: proc: 0 address: 19688 data: 0x779e6d0 byte_number: 0 m_value+byte_number: 53 byte: 0 [19688, value: 53, status: Check_Pending, initiating node: 0, store_count: 4]
    Time: 5843

So, it could be something to do with ack counts, though I don't think this is the issue. Either way, it's a good idea to annotate the protocol trace with the ack information.
To do this, we can add comments to the transition with `APPEND_TRANSITION_COMMENT`.

``` {.sourceCode .c++}
action(decrAcks, "da", desc="Decrement the number of acks") {
    assert(is_valid(tbe));
    tbe.Acks := tbe.Acks - 1;
    APPEND_TRANSITION_COMMENT("Acks: ");
    APPEND_TRANSITION_COMMENT(tbe.Acks);
}
```

This shows up in the protocol trace as, for example:

    5737   1    L1Cache InvAck          SM_AD>SM_AD [0x400, line 0x400] Acks: -1

For these data issues, the debug flag `RubyNetwork` is useful because it prints the value of the data blocks at every point the data is in the network. For instance, for the address in question above, it looks like the data block is all 0's after loading from main memory. I believe this should have valid data. In fact, if we go back in time some, we see that there were some non-zero elements.

    5382   1    L1Cache Inv             S>I         [0x4cc0, line 0x4cc0]
    5383: PerfectSwitch-1: Message: [ResponseMsg: addr = [0x4cc0, line 0x4cc0] Type = InvAck Sender = L1Cache-1 Destination = [NetDest (16) 1 0  -  -  -  0  -  -  -  -  -  -  -  -  -  -  -  -  -  ] DataBlk = [ 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x35 0x36 0x37 0x61 0x6d 0x6e 0x6f 0x70 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 ] MessageSize = Control Acks = 0 ]
    ...
    5389   0    Directory MemData       M_M>M       [0x4cc0, line 0x4cc0]
    5390: PerfectSwitch-2: incoming: 0
    5390: PerfectSwitch-2: Message: [ResponseMsg: addr = [0x4cc0, line 0x4cc0] Type = Data Sender = Directory-0 Destination = [NetDest (16) 1 0  -  -  -  0  -  -  -  -  -  -  -  -  -  ] DataBlk = [ 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 ] MessageSize = Data Acks = 1 ]

It seems that memory is not being updated correctly on the M->S transition. After lots of digging and using the `MemoryAccess` debug flag to see exactly what was being read and written to main memory, I found that in `sendDataToMem` I was using the `request_in`. This is right for a PutM, but not right for Data. We need another action to send data from the response queue!

    panic: Invalid transition
    system.caches.controllers0 time: 44381 addr: 0x7c0 event: Inv state: SM_AD

Invalid transition is my personal favorite kind of SLICC error. For this error, you know exactly what address caused it, and it's very easy to trace through the protocol trace to find what went wrong. However, in this case, nothing went wrong; I just forgot to put this transition in! Easy fix!",
| "url": "/MSIdebugging/" |
| } |
| , |
| |
| "msiintro": { |
| "title": "Introduction to Ruby", |
| "content": " authors Jason Lowe-PowerIntroduction to RubyRuby comes from the multifacet GEMSproject. Ruby provides a detailedcache memory and cache coherence models as well as a detailed networkmodel (Garnet).Ruby is flexible. It can model many different kinds of coherenceimplementations, including broadcast, directory, token, region-basedcoherence, and is simple to extend to new coherence models.Ruby is a mostly drop-in replacement for the classic memory system.There are interfaces between the classic gem5 MemObjects and Ruby, butfor the most part, the classic caches and Ruby are not compatible.In this part of the book, we will first go through creating an exampleprotocol from the protocol description to debugging and running theprotocol.Before diving into a protocol, we will first talk about some of thearchitecture of Ruby. The most important structure in Ruby is thecontroller, or state machine. Controllers are implemented by writing aSLICC state machine file.SLICC is a domain-specific language (Specification Language includingCache Coherence) for specifying coherence protocols. SLICC files end in“.sm” because they are state machine files. Each file describesstates, transitions from a begin to an end state on some event, andactions to take during the transition.Each coherence protocol is made up of multiple SLICC state machinefiles. These files are compiled with the SLICC compiler which is writtenin Python and part of the gem5 source. The SLICC compiler takes thestate machine files and output a set of C++ files that are compiled withall of gem5’s other files. These files include the SimObject declarationfile as well as implementation files for SimObjects and other C++objects.Currently, gem5 supports compiling only a single coherence protocol at atime. For instance, you can compile MI_example into gem5 (the default,poor performance, protocol), or you can use MESI_Two_Level. But, touse MESI_Two_Level, you have to recompile gem5 so the SLICC compilercan generate the correct files for the protocol. We discuss this furtherin the compilation section <MSI-building-section>Now, let’s dive into implementing our first coherence protocol!", |
| "url": "/MSIintro/" |
| } |
| , |
| |
| "cache-actions": { |
| "title": "Action code blocks", |
| "content": " authors Jason Lowe-PowerAction code blocksThe next section of the state machine file is the action blocks. Theaction blocks are executed during a transition from one state toanother, and are called by the transition code blocks (which we willdiscuss in the next section <MSI-transitions-section>). Actions aresingle action blocks. Some examples are “send a message to thedirectory” and “pop the head of the buffer”. Each action should be smalland only perform a single action.The first action we will implement is an action to send a GetS requestto the directory. We need to send a GetS request to the directorywhenever we want to read some data that is not in the Modified or Sharedstates in our cache. As previously mentioned, there are three variablesthat are automatically populated inside the action block (like thein_msg in peek blocks). address is the address that was passedinto the trigger function, cache_entry is the cache entry passedinto the trigger function, and tbe is the TBE passed into thetrigger function.``` {.sourceCode .c++}action(sendGetS, ‘gS’, desc=”Send GetS to the directory”) { enqueue(request_out, RequestMsg, 1) { out_msg.addr := address; out_msg.Type := CoherenceRequestType:GetS; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); // See mem/protocol/RubySlicc_Exports.sm for possible sizes. out_msg.MessageSize := MessageSizeType:Control; // Set that the requestor is this machine so we get the response. out_msg.Requestor := machineID; }}When specifying the action block, there are two parameters: adescription and a \"shorthand\". These two parameters are used in the HTMLtable generation. The shorthand shows up in the transition cell, so itshould be as short as possible. SLICC provides a special syntax to allowfor bold (''), superscript ('\\^'), and spaces ('\\_') in the shorthand tohelp keep them short. Second, the description also shows up in the HTMLtable when you click on a particular action. The description can belonger and help explain what the action does.Next, in this action we are going to send a message to the directory onthe `request_out` port as declared above the `in_port` blocks. The`enqueue` function is similar to the `peek` function since it requires acode block. `enqueue`, however, has the special variable `out_msg`. Inthe `enqueue` block, you can modify the `out_msg` with the current data.The `enqueue` block takes three parameters, the message buffer to sendthe message, the type of the message, and a latency. This latency (1cycle in the example above and throughout this cache controller) is the*cache latency*. This is where you specify the latency of accessing thecache, in this case for a miss. Below we will see that specifying thelatency for a hit is similar.Inside the `enqueue` block is where the message data is populated. Forthe address of the request, we can use the automatically populated`address` variable. We are sending a GetS message, so we use thatmessage type. Next, we need to specify the destination of the message.For this, we use the `mapAddressToMachine` function that takes theaddress and the machine type we are sending to. This will look up in thecorrect `MachineID` based on the address. 
Similarly, we can create actions for sending the other get and put requests. Note that get requests represent requests for data, and put requests represent requests where we are downgrading or evicting our copy of the data.

``` {.sourceCode .c++}
action(sendGetM, "gM", desc="Send GetM to the directory") {
    enqueue(request_out, RequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := CoherenceRequestType:GetM;
        out_msg.Destination.add(mapAddressToMachine(address,
                                MachineType:Directory));
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Requestor := machineID;
    }
}

action(sendPutS, "pS", desc="Send PutS to the directory") {
    enqueue(request_out, RequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := CoherenceRequestType:PutS;
        out_msg.Destination.add(mapAddressToMachine(address,
                                MachineType:Directory));
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Requestor := machineID;
    }
}

action(sendPutM, "pM", desc="Send putM+data to the directory") {
    enqueue(request_out, RequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := CoherenceRequestType:PutM;
        out_msg.Destination.add(mapAddressToMachine(address,
                                MachineType:Directory));
        out_msg.DataBlk := cache_entry.DataBlk;
        out_msg.MessageSize := MessageSizeType:Data;
        out_msg.Requestor := machineID;
    }
}
```

Next, we need an action to send data to another cache for the case where we get a forwarded request from the directory on behalf of another cache. Here, we have to peek into the request queue to get information from the requesting message. This `peek` code block is exactly the same as the ones in the `in_port`. When you nest an `enqueue` block in a `peek` block, both the `in_msg` and `out_msg` variables are available. This is needed so we know which other cache to send the data to. Additionally, in this action we use the `cache_entry` variable to get the data to send to the other cache.
``` {.sourceCode .c++}
action(sendCacheDataToReq, "cdR", desc="Send cache data to requestor") {
    assert(is_valid(cache_entry));
    peek(forward_in, RequestMsg) {
        enqueue(response_out, ResponseMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceResponseType:Data;
            out_msg.Destination.add(in_msg.Requestor);
            out_msg.DataBlk := cache_entry.DataBlk;
            out_msg.MessageSize := MessageSizeType:Data;
            out_msg.Sender := machineID;
        }
    }
}
```

Next, we specify actions for sending data to the directory, and for sending an invalidation ack to the original requestor on a forward request when this cache does not have the data.

``` {.sourceCode .c++}
action(sendCacheDataToDir, "cdD", desc="Send the cache data to the dir") {
    enqueue(response_out, ResponseMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := CoherenceResponseType:Data;
        out_msg.Destination.add(mapAddressToMachine(address,
                                MachineType:Directory));
        out_msg.DataBlk := cache_entry.DataBlk;
        out_msg.MessageSize := MessageSizeType:Data;
        out_msg.Sender := machineID;
    }
}

action(sendInvAcktoReq, "iaR", desc="Send inv-ack to requestor") {
    peek(forward_in, RequestMsg) {
        enqueue(response_out, ResponseMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceResponseType:InvAck;
            out_msg.Destination.add(in_msg.Requestor);
            out_msg.DataBlk := cache_entry.DataBlk;
            out_msg.MessageSize := MessageSizeType:Control;
            out_msg.Sender := machineID;
        }
    }
}
```

Another required action decrements the number of acks we are waiting for. This is used when we get an invalidation ack from another cache, to track the total number of acks. For this action, we assume that there is a valid TBE, and we modify the implicit `tbe` variable in the action block.

Additionally, we have another example of making debugging easier in protocols: `APPEND_TRANSITION_COMMENT`. This function takes a string, or something that can easily be converted to a string (e.g., an int), as a parameter. It modifies the protocol trace output, which we will discuss in the debugging section <MSI-debugging-section>. Each protocol trace line that executes this action will print the total number of acks this cache is still waiting on. This is useful since the number of remaining acks is part of the cache block state.

``` {.sourceCode .c++}
action(decrAcks, "da", desc="Decrement the number of acks") {
    assert(is_valid(tbe));
    tbe.AcksOutstanding := tbe.AcksOutstanding - 1;
    APPEND_TRANSITION_COMMENT("Acks: ");
    APPEND_TRANSITION_COMMENT(tbe.AcksOutstanding);
}
```

We also need an action to store the acks when we receive a message from the directory with an ack count. For this action, we peek into the directory's response message to get the number of acks and store them in the (required to be valid) TBE.

``` {.sourceCode .c++}
action(storeAcks, "sa", desc="Store the needed acks to the TBE") {
    assert(is_valid(tbe));
    peek(response_in, ResponseMsg) {
        tbe.AcksOutstanding := in_msg.Acks + tbe.AcksOutstanding;
    }
    assert(tbe.AcksOutstanding > 0);
}
```

The next set of actions responds to CPU requests on hits and misses. For these actions, we need to notify the sequencer (the interface between Ruby and the rest of gem5) of the new data. In the case of a store, we give the sequencer a pointer to the data block, and the sequencer updates the data in place.
``` {.sourceCode .c++}
action(loadHit, "Lh", desc="Load hit") {
    assert(is_valid(cache_entry));
    cacheMemory.setMRU(cache_entry);
    sequencer.readCallback(address, cache_entry.DataBlk, false);
}

action(externalLoadHit, "xLh", desc="External load hit (was a miss)") {
    assert(is_valid(cache_entry));
    peek(response_in, ResponseMsg) {
        cacheMemory.setMRU(cache_entry);
        // Forward the type of machine that responded to this request
        // E.g., another cache or the directory. This is used for
        // tracking statistics.
        sequencer.readCallback(address, cache_entry.DataBlk, true,
                               machineIDToMachineType(in_msg.Sender));
    }
}

action(storeHit, "Sh", desc="Store hit") {
    assert(is_valid(cache_entry));
    cacheMemory.setMRU(cache_entry);
    // The same as the read callback above.
    sequencer.writeCallback(address, cache_entry.DataBlk, false);
}

action(externalStoreHit, "xSh", desc="External store hit (was a miss)") {
    assert(is_valid(cache_entry));
    peek(response_in, ResponseMsg) {
        cacheMemory.setMRU(cache_entry);
        sequencer.writeCallback(address, cache_entry.DataBlk, true,
                               // Note: this could be the last ack.
                               machineIDToMachineType(in_msg.Sender));
    }
}

action(forwardEviction, "e", desc="sends eviction notification to CPU") {
    if (send_evictions) {
        sequencer.evictionCallback(address);
    }
}
```

In each of these actions, it is vital that we call `setMRU` on the cache entry. The `setMRU` function is what allows the replacement policy to know which blocks were most recently accessed. If you leave out the `setMRU` call, the replacement policy will not operate correctly!

On loads and stores, we call the `readCallback`/`writeCallback` functions on the `sequencer`. These notify the sequencer of the new data or allow it to write the data into the data block. These functions take four parameters (the last is optional): the address, the data block, a boolean for whether the original request was a miss, and, finally, an optional `MachineType`. The final optional parameter is used for tracking statistics on where the data for the request was found. It allows you to track whether the data came from a cache-to-cache transfer or from memory.

Finally, we also have an action to forward evictions to the CPU. This is required for gem5's out-of-order models to squash speculative loads if the cache block is evicted before the load is committed. We use the parameter specified at the top of the state machine file to check whether this is needed.

Next, we have a set of cache management actions that allocate and free cache entries and TBEs. To create a new cache entry, we must have space in the `CacheMemory` object. Then, we can call the `allocate` function. This allocate function doesn't actually allocate the host memory for the cache entry, since this controller specialized the `Entry` type; this is why we need to pass a `new Entry` to the `allocate` function. Additionally, in these actions we call `set_cache_entry`, `unset_cache_entry`, and similar functions for the TBE. These set and unset the implicit variables that were passed in via the `trigger` function. For instance, when allocating a new cache block, we call `set_cache_entry`, and in all actions following `allocateCacheBlock` the `cache_entry` variable will be valid.

There is also an action that copies the data from the cache data block to the TBE. This allows us to keep the data around, even after removing the cache block, until we are sure that this cache is no longer responsible for the data.
``` {.sourceCode .c++}
action(allocateCacheBlock, "a", desc="Allocate a cache block") {
    assert(is_invalid(cache_entry));
    assert(cacheMemory.cacheAvail(address));
    set_cache_entry(cacheMemory.allocate(address, new Entry));
}

action(deallocateCacheBlock, "d", desc="Deallocate a cache block") {
    assert(is_valid(cache_entry));
    cacheMemory.deallocate(address);
    // clear the cache_entry variable (now it's invalid)
    unset_cache_entry();
}

action(writeDataToCache, "wd", desc="Write data to the cache") {
    peek(response_in, ResponseMsg) {
        assert(is_valid(cache_entry));
        cache_entry.DataBlk := in_msg.DataBlk;
    }
}

action(allocateTBE, "aT", desc="Allocate TBE") {
    assert(is_invalid(tbe));
    TBEs.allocate(address);
    // this updates the tbe variable for other actions
    set_tbe(TBEs[address]);
}

action(deallocateTBE, "dT", desc="Deallocate TBE") {
    assert(is_valid(tbe));
    TBEs.deallocate(address);
    // this makes the tbe variable invalid
    unset_tbe();
}

action(copyDataFromCacheToTBE, "Dct", desc="Copy data from cache to TBE") {
    assert(is_valid(cache_entry));
    assert(is_valid(tbe));
    tbe.DataBlk := cache_entry.DataBlk;
}
```

The next set of actions manages the message buffers. We need to add actions that pop the head message off of a buffer after the message has been satisfied. The `dequeue` function takes a single parameter: the time at which the dequeue takes place. Delaying the dequeue for a cycle prevents the `in_port` logic from consuming another message from the same message buffer in a single cycle.

``` {.sourceCode .c++}
action(popMandatoryQueue, "pQ", desc="Pop the mandatory queue") {
    mandatory_in.dequeue(clockEdge());
}

action(popResponseQueue, "pR", desc="Pop the response queue") {
    response_in.dequeue(clockEdge());
}

action(popForwardQueue, "pF", desc="Pop the forward queue") {
    forward_in.dequeue(clockEdge());
}
```

Finally, the last action is a stall. Below, we are using a "z_stall", which is the simplest kind of stall in SLICC. By leaving the action blank, it generates a "protocol stall" in the `in_port` logic, which stalls all messages in the current message buffer and all lower-priority message buffers. Protocols using "z_stall" are usually simpler, but lower performance, since a stall on a high-priority buffer can stall many requests that may not need to be stalled.

``` {.sourceCode .c++}
action(stall, "z", desc="Stall the incoming request") {
    // z_stall
}
```

There are two other ways to deal with messages that cannot currently be processed, both of which can improve the performance of protocols. (Note: we will not be using these more complicated techniques in this simple example protocol.) The first is `recycle`. The message buffers have a `recycle` function that moves the request at the head of the queue to the tail. This allows other requests in the buffer, or requests in other buffers, to be processed immediately. `recycle` actions often improve the performance of protocols significantly.

However, `recycle` is not very realistic when compared to real implementations of cache coherence. For a more realistic high-performance solution to stalling messages, Ruby provides the `stall_and_wait` function on message buffers. This function takes the head request and moves it into a separate structure tagged by an address. The address is user-specified, but it is usually the request's address.
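A sketch of how these calls might appear in actions (not part of this example protocol; the buffer and action names here are illustrative, and the `wakeUpBuffers` function is described just below):

``` {.sourceCode .c++}
// Hypothetical: park the head request of the mandatory queue,
// keyed by its address, instead of z_stalling the whole port.
action(stallAndWait, "zw", desc="Stall and wait on the address") {
    stall_and_wait(mandatory_in, address);
}

// Hypothetical: once the block is no longer busy, re-queue every
// request that was parked on this address.
action(wakeUpDependents, "wa", desc="Wake up requests for this address") {
    wakeUpBuffers(address);
}
```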
", |
| "url": "/cache-actions/" |
| } |
| , |
| |
| "cache-declarations": { |
| "title": "Declaring a state machine", |
| "content": " authors Jason Lowe-PowerDeclaring a state machineLet’s start on our first state machine file! First, we will create theL1 cache controller for our MSI protocol.Create a file called MSI-cache.sm and the following code declares thestate machine.``` {.sourceCode .c++}machine(MachineType:L1Cache, “MSI cache”) : { }The first thing you'll notice about the state machine code is that islooks very C++-like. The state machine file is like creating a C++object in a header file, if you included all of the code there as well.When in doubt, C++ syntax with *probably* work in SLICC. However, thereare many cases where C++ syntax is incorrect syntax for SLICC as well ascases where SLICC extends the syntax.With `MachineType:L1Cache`, we are naming this state machine `L1Cache`.SLICC will generate many different objects for us from the state machineusing that name. For instance, once this file is compiled, there will bea new SimObject: `L1Cache_Controller` that is the cache controller. Alsoincluded in this declaration is a description of this state machine:\"MSI cache\".There are many cases in SLICC where you must include a description to goalong with the variable. The reason for this is that SLICC wasoriginally designed to just describe, not implement, coherenceprotocols. Today, these extra descriptions serve two purposes. First,they act as comments on what the author intended each variable, orstate, or event, to be used for. Second, many of them are still exportedinto HTML when building the HTML tables for the SLICC protocol. Thus,while browsing the HTML table, you can see the more detailed commentsfrom the author of the protocol. It is important to be clear with thesedescriptions since coherence protocols can get quite complicated.State machine parameters------------------------Proceeding the `machine()` declaration is a colon, after which all ofthe parameters to the state machine are declared. These parameters aredirectly exported to the SimObject that is generated by the statemachine.For our MSI L1 cache, we have the following parameters:``` {.sourceCode .c++}machine(MachineType:L1Cache, \"MSI cache\"): Sequencer *sequencer; CacheMemory *cacheMemory; bool send_evictions; <Message buffer declarations> { }First, we have a Sequencer. This is a special class that isimplemented in Ruby to interface with the rest of gem5. The Sequencer isa gem5 MemObject with a slave port so it can accept memory requestsfrom other objects. The sequencer accepts requests from a CPU (or othermaster port) and converts the gem5 the packet into a RubyRequest.Finally, the RubyRequest is pushed onto the mandatoryQueue of thestate machine. We will revisit the mandatoryQueue inin port section <MSI-in-ports-section>.Next, there is a CacheMemory object. This is what holds the cache data(i.e., cache entries). The exact implementation, size, etc. isconfigurable at runtime.Finally, we can specify any other parameters we would like, similar to ageneral SimObject. In this case, we have a boolean variablesend_evictions. This is used for out-of-order core models to notifythe load-store queue if an address is evicted after a load to squash aload if it is speculative.Next, also in the parameter block (i.e., before the first open bracket),we need to declare all of the message buffers that this state machinewill use. Message buffers are the interface between the state machineand the Ruby network. Messages are sent and received via the messagebuffers. 
The MSI protocol needs three different virtual networks. Virtual networks are needed to prevent deadlock (e.g., it is bad if a response gets stuck behind a stalled request). In this protocol, the highest priority is responses (virtual network 2), followed by forwarded requests (virtual network 1); requests have the lowest priority (virtual network 0). See Sorin et al. for details on why these three virtual networks are needed.

The following code declares all of the needed message buffers.

``` {.sourceCode .c++}
machine(MachineType:L1Cache, \"MSI cache\") :
    Sequencer *sequencer;
    CacheMemory *cacheMemory;
    bool send_evictions;

    MessageBuffer * requestToDir, network=\"To\", virtual_network=\"0\", vnet_type=\"request\";
    MessageBuffer * responseToDirOrSibling, network=\"To\", virtual_network=\"2\", vnet_type=\"response\";

    MessageBuffer * forwardFromDir, network=\"From\", virtual_network=\"1\", vnet_type=\"forward\";
    MessageBuffer * responseFromDirOrSibling, network=\"From\", virtual_network=\"2\", vnet_type=\"response\";

    MessageBuffer * mandatoryQueue;
{

}
```

We have five different message buffers: two \"To\", two \"From\", and one special message buffer. The \"To\" message buffers are similar to master ports in gem5. These are the message buffers that this controller uses to send messages to other controllers in the system. The \"From\" message buffers are like slave ports. This controller receives messages on \"From\" buffers from other controllers in the system.

We have two different \"To\" buffers: one for low priority requests and one for high priority responses. The priority for the networks is not inherent. The priority is based on the order in which other controllers look at the message buffers. It is a good idea to number the virtual networks so that higher numbers mean higher priority, but the virtual network number is ignored by Ruby except that messages on network 2 can only go to other message buffers on network 2 (i.e., messages can't jump from one network to another).

Similarly, there are two different ways this cache can receive messages: either as a forwarded request from the directory (e.g., another cache requests a writable block and we have a readable copy) or as a response to a request this controller made. The responses are higher priority than the forwarded requests.

Finally, there is a special message buffer, the `mandatoryQueue`. This message buffer is used by the `Sequencer` to convert gem5 packets into Ruby requests. Unlike the other message buffers, `mandatoryQueue` does not connect to the Ruby network. Note: the name of this message buffer is hard-coded and must be exactly \"mandatoryQueue\".

As previously mentioned, this parameter block is converted into the SimObject description file. Any parameters you put in this block will be SimObject parameters that are accessible from the Python configuration files. If you look at the generated file L1Cache_Controller.py, it will look very familiar.
Note: this is a generated file and you should never modify generated files directly!

``` {.sourceCode .python}
from m5.params import *
from m5.SimObject import SimObject
from Controller import RubyController

class L1Cache_Controller(RubyController):
    type = 'L1Cache_Controller'
    cxx_header = 'mem/protocol/L1Cache_Controller.hh'
    sequencer = Param.RubySequencer(\"\")
    cacheMemory = Param.RubyCache(\"\")
    send_evictions = Param.Bool(\"\")
    requestToDir = Param.MessageBuffer(\"\")
    responseToDirOrSibling = Param.MessageBuffer(\"\")
    forwardFromDir = Param.MessageBuffer(\"\")
    responseFromDirOrSibling = Param.MessageBuffer(\"\")
    mandatoryQueue = Param.MessageBuffer(\"\")
```

State declarations

The next part of the state machine is the state declaration. Here, we are going to declare all of the stable and transient states for the state machine. We will follow the naming convention in Sorin et al. For instance, the transient state \"IM_AD\" corresponds to moving from Invalid to Modified while waiting on acks and data. These states come directly from the left column of Table 8.3 in Sorin et al.

``` {.sourceCode .c++}
state_declaration(State, desc=\"Cache states\") {
    I,      AccessPermission:Invalid,    desc=\"Not present/Invalid\";

    // States moving out of I
    IS_D,   AccessPermission:Invalid,    desc=\"Invalid, moving to S, waiting for data\";
    IM_AD,  AccessPermission:Invalid,    desc=\"Invalid, moving to M, waiting for acks and data\";
    IM_A,   AccessPermission:Busy,       desc=\"Invalid, moving to M, waiting for acks\";

    S,      AccessPermission:Read_Only,  desc=\"Shared. Read-only, other caches may have the block\";

    // States moving out of S
    SM_AD,  AccessPermission:Read_Only,  desc=\"Shared, moving to M, waiting for acks and 'data'\";
    SM_A,   AccessPermission:Read_Only,  desc=\"Shared, moving to M, waiting for acks\";

    M,      AccessPermission:Read_Write, desc=\"Modified. Read & write permissions. Owner of block\";

    // States moving to Invalid
    MI_A,   AccessPermission:Busy,       desc=\"Was modified, moving to I, waiting for put ack\";
    SI_A,   AccessPermission:Busy,       desc=\"Was shared, moving to I, waiting for put ack\";
    II_A,   AccessPermission:Invalid,    desc=\"Sent valid data before receiving put ack. Waiting for put ack.\";
}
```

Each state has an associated access permission: \"Invalid\", \"NotPresent\", \"Busy\", \"Read_Only\", or \"Read_Write\". The access permission is used for functional accesses to the cache. Functional accesses are debug-like accesses where the simulator wants to read or update the data immediately. One example of this is reading in files in SE mode, which are directly loaded into memory.

For functional accesses, all caches are checked to see if they have a corresponding block with a matching address. For functional reads, all of the blocks with a matching address that have read-only or read-write permission are accessed (they should all have the same data). For functional writes, all blocks are updated with new data if they have busy, read-only, or read-write permission.

Event declarations

Next, we need to declare all of the events that are triggered by incoming messages for this cache controller. These events come directly from the first row of Table 8.3 in Sorin et al.

``` {.sourceCode .c++}
enumeration(Event, desc=\"Cache events\") {
    // From the processor/sequencer/mandatory queue
    Load,           desc=\"Load from processor\";
    Store,          desc=\"Store from processor\";

    // Internal event (only triggered from processor requests)
    Replacement,    desc=\"Triggered when block is chosen as victim\";

    // Forwarded request from other cache via dir on the forward network
    FwdGetS,        desc=\"Directory sent us a request to satisfy GetS. We must have the block in M to respond to this.\";
    FwdGetM,        desc=\"Directory sent us a request to satisfy GetM. We must have the block in M to respond to this.\";
    Inv,            desc=\"Invalidate from the directory.\";
    PutAck,         desc=\"Response from directory after we issue a put. This must be on the fwd network to avoid deadlock.\";

    // Responses from directory
    DataDirNoAcks,  desc=\"Data from directory (acks = 0)\";
    DataDirAcks,    desc=\"Data from directory (acks > 0)\";

    // Responses from other caches
    DataOwner,      desc=\"Data from owner\";
    InvAck,         desc=\"Invalidation ack from other cache after Inv\";

    // Special event to simplify implementation
    LastInvAck,     desc=\"Triggered after the last ack is received\";
}
```

User-defined structures

Next, we need to define some structures that we will use in other places in this controller. The first one we will define is `Entry`. This is the structure that is stored in the `CacheMemory`. It only needs to contain data and a state, but it may contain any other data you want. Note: the state that this structure is storing is the `State` type that was defined above, not a hardcoded state type.

You can find the abstract version of this class (`AbstractCacheEntry`) in src/mem/ruby/slicc_interface/AbstractCacheEntry.hh. If you want to use any of the member functions of `AbstractCacheEntry`, you need to declare them here (this isn't used in this protocol).

``` {.sourceCode .c++}
structure(Entry, desc=\"Cache entry\", interface=\"AbstractCacheEntry\") {
    State CacheState,   desc=\"cache state\";
    DataBlock DataBlk,  desc=\"Data in the block\";
}
```

Another structure we will need is a TBE. TBE is the \"transaction buffer entry\". This stores information needed during transient states. It is *like* an MSHR. It functions as an MSHR in this protocol, but the entry is also allocated for other uses. In this protocol, it will store the state (usually needed), data (also usually needed), and the number of acks that this block is currently waiting for. The `AcksOutstanding` is used for the transitions where other controllers send acks instead of the data.

``` {.sourceCode .c++}
structure(TBE, desc=\"Entry for transient requests\") {
    State TBEState,         desc=\"State of block\";
    DataBlock DataBlk,      desc=\"Data for the block. Needed for MI_A\";
    int AcksOutstanding, default=0, desc=\"Number of acks left to receive.\";
}
```

Next, we need a place to store all of the TBEs. This is an externally defined class; it is defined in C++ outside of SLICC. Therefore, we need to declare that we are going to use it, and also declare any of the functions that we will call on it. You can find the code for the `TBETable` in src/mem/ruby/structures/TBETable.hh. It is templatized on the TBE structure defined above, which gets a little confusing, as we will see.

``` {.sourceCode .c++}
structure(TBETable, external=\"yes\") {
    TBE lookup(Addr);
    void allocate(Addr);
    void deallocate(Addr);
    bool isPresent(Addr);
}
```

The `external=\"yes\"` tells SLICC not to look for the definition of this structure. This is similar to declaring a variable `extern` in C/C++.

Other declarations and definitions required
-------------------------------------------

Finally, we are going to go through some boilerplate: declaring variables, declaring functions in `AbstractController` that we will use in this controller, and defining abstract functions of `AbstractController`.

First, we need to have a variable that stores the TBE table. We have to do this in SLICC because it is not until this point that we know the true type of the TBE table, since the TBE type was defined above.
This is some particularly tricky (or nasty) code to get SLICC to generate the right C++ code. The difficulty is that we want to templatize `TBETable` on the `TBE` type above. The key is that SLICC mangles the names of all types declared in the machine with the machine's name. For instance, `TBE` is actually `L1Cache_TBE` in C++.

We also want to pass a parameter to the constructor of the `TBETable`. This is a parameter that is actually part of the `AbstractController`; thus, we need to use the C++ name for the variable since it doesn't have a SLICC name.

``` {.sourceCode .c++}
TBETable TBEs, template=\"<L1Cache_TBE>\", constructor=\"m_number_of_TBEs\";
```

If you can understand the above code, then you are an official SLICC ninja!

Next, any functions that are part of `AbstractController` need to be declared if we are going to use them in the rest of the file. In this case, we are only going to use `clockEdge()`.

``` {.sourceCode .c++}
Tick clockEdge();
```

There are a few other functions we're going to use in actions. These functions are used in actions to set and unset implicit variables available in action code blocks. Action code blocks will be explained in detail in the action section <MSI-actions-section>. These may be needed when a transition has many actions.

``` {.sourceCode .c++}
void set_cache_entry(AbstractCacheEntry a);
void unset_cache_entry();
void set_tbe(TBE b);
void unset_tbe();
```

Another useful function is `mapAddressToMachine`. This allows us to change the address mappings for banked directories or caches at runtime, so we don't have to hardcode them in the SLICC file.

``` {.sourceCode .c++}
MachineID mapAddressToMachine(Addr addr, MachineType mtype);
```

Finally, you can also add any functions you may want to use in the file and implement them here. For instance, it is convenient to access cache blocks by address with a single function. Again, in this function there is some SLICC trickery. We need to access \"by pointer\" since the cache block is something that we need to be able to mutate later (\"by reference\" would have been a better name). The cast is also necessary since we defined a specific `Entry` type in this file, but the `CacheMemory` holds the abstract type.

``` {.sourceCode .c++}
// Convenience function to look up the cache entry.
// Needs a pointer so it will be a reference and can be updated in actions
Entry getCacheEntry(Addr address), return_by_pointer=\"yes\" {
    return static_cast(Entry, \"pointer\", cacheMemory.lookup(address));
}
```

The next set of boilerplate code rarely changes between different protocols. There is a set of functions that are pure-virtual in `AbstractController` that we must implement.

getState
  Given a TBE, cache entry, and address, return the state of the block. This is called on the block to decide which transition to execute when an event is triggered. Usually, you return the state in the TBE or cache entry, whichever is valid.

setState
  Given a TBE, cache entry, and address, make sure the state is set correctly on the block. This is called at the end of the transition to set the final state on the block.

getAccessPermission
  Get the access permission of a block. This is used during functional accesses to decide whether or not to functionally access the block. It is similar to getState: get the information from the TBE if valid, or the cache entry if valid; otherwise the block is not present.

setAccessPermission
  Like getAccessPermission, but sets the permission.

functionalRead
  Functionally read the data. It is possible the TBE has more up-to-date information, so check that first.
  Note: testAndRead/testAndWrite are defined in src/mem/ruby/slicc_interface/Util.hh.

functionalWrite
  Functionally write the data. Similarly, you may need to update the data in both the TBE and the cache entry.

``` {.sourceCode .c++}
State getState(TBE tbe, Entry cache_entry, Addr addr) {
    // The TBE state will override the state in cache memory, if valid
    if (is_valid(tbe)) { return tbe.TBEState; }
    // Next, if the cache entry is valid, it holds the state
    else if (is_valid(cache_entry)) { return cache_entry.CacheState; }
    // If the block isn't present, then its state must be I.
    else { return State:I; }
}

void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
    if (is_valid(tbe)) {
        tbe.TBEState := state;
    }
    if (is_valid(cache_entry)) {
        cache_entry.CacheState := state;
    }
}

AccessPermission getAccessPermission(Addr addr) {
    TBE tbe := TBEs[addr];
    if (is_valid(tbe)) {
        return L1Cache_State_to_permission(tbe.TBEState);
    }

    Entry cache_entry := getCacheEntry(addr);
    if (is_valid(cache_entry)) {
        return L1Cache_State_to_permission(cache_entry.CacheState);
    }

    return AccessPermission:NotPresent;
}

void setAccessPermission(Entry cache_entry, Addr addr, State state) {
    if (is_valid(cache_entry)) {
        cache_entry.changePermission(L1Cache_State_to_permission(state));
    }
}

void functionalRead(Addr addr, Packet *pkt) {
    TBE tbe := TBEs[addr];
    if (is_valid(tbe)) {
        testAndRead(addr, tbe.DataBlk, pkt);
    } else {
        testAndRead(addr, getCacheEntry(addr).DataBlk, pkt);
    }
}

int functionalWrite(Addr addr, Packet *pkt) {
    int num_functional_writes := 0;

    TBE tbe := TBEs[addr];
    if (is_valid(tbe)) {
        num_functional_writes := num_functional_writes +
            testAndWrite(addr, tbe.DataBlk, pkt);
        return num_functional_writes;
    }

    num_functional_writes := num_functional_writes +
        testAndWrite(addr, getCacheEntry(addr).DataBlk, pkt);
    return num_functional_writes;
}
```", |
| "url": "/cache-declarations/" |
| } |
| , |
| |
| "cache-in-ports": { |
| "title": "In port code blocks", |
| "content": " authors Jason Lowe-PowerIn port code blocksAfter declaring all of the structures we need in the state machine file,the first “functional” part of the file are the “in ports”. This sectionspecifies what events to trigger on different incoming messages.However, before we get to the in ports, we must declare our out ports.``` {.sourceCode .c++}out_port(request_out, RequestMsg, requestToDir);out_port(response_out, ResponseMsg, responseToDirOrSibling);This code essentially just renames `requestToDir` and`responseToDirOrSibling` to `request_out` and `response_out`. Later inthe file, when we want to *enqueue* messages to these message buffers wewill use the new names `request_out` and `response_out`. This alsospecifies the exact implementation of the messages that we will sendacross these ports. We will look at the exact definition of these typesbelow in the file `MSI-msg.sm`.Next, we create an *in port code block*. In SLICC, there are many caseswhere there are code blocks that look similar to `if` blocks, but theyencode specific information. For instance, the code inside an`in_port()` block is put in a special generated file:`L1Cache_Wakeup.cc`.All of the `in_port` code blocks are executed in order (or based on thepriority if it is specified). On each active cycle for the controller,the first `in_port` code is executed. If it is successful, it isre-executed to see if there are other messages that can be consumed onthe port. If there are no messages or no events are triggered, then thenext `in_port` code block is executed.There are three different kinds of *stalls* that can be generated whenexecuting `in_port` code blocks. First, there is a parameterized limitfor the number of transitions per cycle at each controller. If thislimit is reached (i.e., there are more messages on the message buffersthan the transition per cycle limit), then all `in_port` will stopprocessing and wait to continue until the next cycle. Second, therecould be a *resource stall*. This happens if some needed resource isunavailable. For instance, if using the `BankedArray` bandwidth model,the needed bank of the cache may be currently occupied. Third, therecould be a *protocol stall*. This is a special kind of action thatcauses the state machine to stall until the next cycle.It is important to note that protocol stalls and resource stalls prevent**all** `in_port` blocks from executing. For instance, if the first`in_port` block generates a protocol stall, none of the other ports willbe executed, blocking all messages. This is why it is important to usethe correct number and ordering of virtual networks.Below, is the full code for the `in_port` block for the highest prioritymessages to our L1 cache controller, the response from directory orother caches. 
``` {.sourceCode .c++}
in_port(response_in, ResponseMsg, responseFromDirOrSibling) {
    if (response_in.isReady(clockEdge())) {
        peek(response_in, ResponseMsg) {
            Entry cache_entry := getCacheEntry(in_msg.addr);
            TBE tbe := TBEs[in_msg.addr];
            assert(is_valid(tbe));

            if (machineIDToMachineType(in_msg.Sender) ==
                        MachineType:Directory) {
                if (in_msg.Type != CoherenceResponseType:Data) {
                    error(\"Directory should only reply with data\");
                }
                assert(in_msg.Acks + tbe.AcksOutstanding >= 0);
                if (in_msg.Acks + tbe.AcksOutstanding == 0) {
                    trigger(Event:DataDirNoAcks, in_msg.addr, cache_entry, tbe);
                } else {
                    trigger(Event:DataDirAcks, in_msg.addr, cache_entry, tbe);
                }
            } else {
                if (in_msg.Type == CoherenceResponseType:Data) {
                    trigger(Event:DataOwner, in_msg.addr, cache_entry, tbe);
                } else if (in_msg.Type == CoherenceResponseType:InvAck) {
                    DPRINTF(RubySlicc, \"Got inv ack. %d left\\n\",
                            tbe.AcksOutstanding);
                    if (tbe.AcksOutstanding == 1) {
                        trigger(Event:LastInvAck, in_msg.addr, cache_entry, tbe);
                    } else {
                        trigger(Event:InvAck, in_msg.addr, cache_entry, tbe);
                    }
                } else {
                    error(\"Unexpected response from other cache\");
                }
            }
        }
    }
}
```

First, like the `out_port` above, \"response_in\" is the name we'll use later when we refer to this port, and \"ResponseMsg\" is the type of message we expect on this port (since this port processes responses to our requests). The first step in all `in_port` code blocks is to check the message buffer to see if there are any messages to be processed. If not, then this `in_port` code block is skipped and the next one is executed.

``` {.sourceCode .c++}
in_port(response_in, ResponseMsg, responseFromDirOrSibling) {
    if (response_in.isReady(clockEdge())) {
        . . .
    }
}
```

Assuming there is a valid message in the message buffer, we next grab that message by using the special code block `peek`. Peek is a special function. Any code inside a peek statement has a special variable declared and populated: `in_msg`. This contains the message (of type `ResponseMsg` in this case, as specified by the second parameter of the `peek` call) at the head of the port. Here, `response_in` is the port we want to peek into.

Then, we need to grab the cache entry and the TBE for the incoming address. (We will look at the other fields of the response message below.) Above, we implemented `getCacheEntry`. It will return either the valid matching entry for the address, or an invalid entry if there is not a matching cache block.

For the TBE, since this is a response to a request this cache controller initiated, there *must* be a valid TBE in the TBE table. Hence, we see our first debug statement, an *assert*. This is one of the ways to ease debugging of cache coherence protocols. You are encouraged to use asserts liberally to make debugging easier.

``` {.sourceCode .c++}
peek(response_in, ResponseMsg) {
    Entry cache_entry := getCacheEntry(in_msg.addr);
    TBE tbe := TBEs[in_msg.addr];
    assert(is_valid(tbe));
    . . .
}
```

Next, we need to decide what event to trigger based on the message. For this, we first need to discuss what data the response messages are carrying. To declare a new message type, first create a new file for all of the message types: MSI-msg.sm. In this file, you can declare any structures that will be globally used across all of the SLICC files for your protocol. We will include this file in all of the state machine definitions via the MSI.slicc file later. This is similar to including global definitions in header files in C/C++.
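We have not written MSI.slicc yet, but as a preview, it is nothing more than the protocol name plus the list of files to compile. A sketch, assuming the file names used in this chapter (the downloadable scripts are authoritative):

``` {.sourceCode .c++}
protocol \"MSI\";
include \"RubySlicc_interfaces.slicc\";
include \"MSI-msg.sm\";
include \"MSI-cache.sm\";
include \"MSI-dir.sm\";
```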
In the MSI-msg.sm file, add the following code block:

``` {.sourceCode .c++}
structure(ResponseMsg, desc=\"Used for Dir->Cache and Fwd message responses\",
          interface=\"Message\") {
    Addr addr,                   desc=\"Physical address for this response\";
    CoherenceResponseType Type,  desc=\"Type of response\";
    MachineID Sender,            desc=\"Node who is responding to the request\";
    NetDest Destination,         desc=\"Multicast destination mask\";
    DataBlock DataBlk,           desc=\"data for the cache line\";
    MessageSizeType MessageSize, desc=\"size category of the message\";
    int Acks,                    desc=\"Number of acks required from others\";

    // This must be overridden here to support functional accesses
    bool functionalRead(Packet *pkt) {
        if (Type == CoherenceResponseType:Data) {
            return testAndRead(addr, DataBlk, pkt);
        }
        return false;
    }

    bool functionalWrite(Packet *pkt) {
        // No check on message type required since the protocol should read
        // data block from only those messages that contain valid data
        return testAndWrite(addr, DataBlk, pkt);
    }
}
```

The message is just another SLICC structure, similar to the structures we've defined before. However, this time, it implements a specific interface: `Message`. Within this message, we can add any members that we need for our protocol. In this case, we first have the address. Note: a common \"gotcha\" is that you cannot use \"Addr\" with a capital \"A\" for the name of the member, since it is the same name as the type!

Next, we have the type of response. In our case, there are two types of response: data, and invalidation acks from other caches after they have invalidated their copy. Thus, we need to define an enumeration, the `CoherenceResponseType`, to use in this message. Add the following code before the `ResponseMsg` declaration in the same file.

``` {.sourceCode .c++}
enumeration(CoherenceResponseType, desc=\"Types of response messages\") {
    Data,   desc=\"Contains the most up-to-date data\";
    InvAck, desc=\"Message from another cache that they have inv. the blk\";
}
```

Next, in the response message type, we have the `MachineID` that sent the response. `MachineID` is the *specific machine* that sent the response. For instance, it might be directory 0 or cache 12. The `MachineID` contains both the `MachineType` (e.g., we have been creating an `L1Cache`, as declared in the first `machine()`) and the specific *version* of that machine type. We will come back to machine version numbers when configuring the system.

Next, all messages need a *destination* and a *size*. The destination is specified as a `NetDest`, which is a bitmap of all the `MachineID` in the system. This allows messages to be broadcast to a flexible set of receivers. The message also has a size. You can find the possible message sizes in `src/mem/protocol/RubySlicc_Exports.sm`.

This message may also contain a data block and the number of acks that are expected. Thus, we can include these in the message definition as well.

Finally, we also have to define functional read and write functions. These are used by Ruby to inspect in-flight messages on functional reads and writes. Note: this functionality currently is very brittle; if there are messages in flight for an address that is functionally read or written, the functional access may fail.
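To give a feel for how the destination field is filled in, here is a hedged sketch of manipulating a `NetDest` inside an enqueue block (enqueue blocks are covered with the actions). The `broadcast` and `remove` helpers are used by other Ruby protocols and are assumptions here, not code from this chapter:

``` {.sourceCode .c++}
// Hypothetical sketch: a NetDest can hold a single destination...
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
// ...or a broadcast set, e.g., every L1 cache except this one.
out_msg.Destination.broadcast(MachineType:L1Cache);
out_msg.Destination.remove(machineID);
```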
You can download the complete file MSI-msg.sm here <../../_static/scripts/part3/MSI_protocol/MSI-msg.sm>.

Now that we have defined the data in the response message, we can look at how we choose which action to trigger in the `in_port` for responses to the cache.

``` {.sourceCode .c++}
// If it's from the directory...
if (machineIDToMachineType(in_msg.Sender) ==
            MachineType:Directory) {
    if (in_msg.Type != CoherenceResponseType:Data) {
        error(\"Directory should only reply with data\");
    }
    assert(in_msg.Acks + tbe.AcksOutstanding >= 0);
    if (in_msg.Acks + tbe.AcksOutstanding == 0) {
        trigger(Event:DataDirNoAcks, in_msg.addr, cache_entry, tbe);
    } else {
        trigger(Event:DataDirAcks, in_msg.addr, cache_entry, tbe);
    }
} else {
    // This is from another cache.
    if (in_msg.Type == CoherenceResponseType:Data) {
        trigger(Event:DataOwner, in_msg.addr, cache_entry, tbe);
    } else if (in_msg.Type == CoherenceResponseType:InvAck) {
        DPRINTF(RubySlicc, \"Got inv ack. %d left\\n\", tbe.AcksOutstanding);
        if (tbe.AcksOutstanding == 1) {
            // If there is exactly one ack remaining then we
            // know it is the last ack.
            trigger(Event:LastInvAck, in_msg.addr, cache_entry, tbe);
        } else {
            trigger(Event:InvAck, in_msg.addr, cache_entry, tbe);
        }
    } else {
        error(\"Unexpected response from other cache\");
    }
}
```

First, we check to see if the message comes from the directory or another cache. If it comes from the directory, we know that it must be a data response (the directory will never respond with an ack). Here, we meet our second way to add debug information to protocols: the `error` function. This function breaks simulation and prints out the string parameter, similar to `panic`.

Next, when we receive data from the directory, we expect that the number of acks we are waiting for will never be less than 0. That number is the count of acks the directory tells us to expect (`in_msg.Acks`) plus the running count in the TBE (`tbe.AcksOutstanding`). We need to check it this way because it is possible that we have received acks from other caches before we get the message from the directory telling us how many acks to wait for, in which case the running count in the TBE is negative.

There are two possibilities for the acks: either we have already received all of the acks and now we are getting the data (\"data from dir (acks = 0)\" in Table 8.3), or we need to wait for more acks. Thus, we check this condition and trigger two different events, one for each possibility.

When triggering transitions, you need to pass four parameters. The first parameter is the event to trigger. These events were specified earlier in the `Event` declaration. The next parameter is the (physical memory) address of the cache block to operate on. Usually this is the same as the address of the `in_msg`, but it may be different; for instance, on a replacement the address is that of the block being replaced. Next are the cache entry and the TBE for the block. These may be invalid if there is no valid entry for the address in the cache or there is not a valid TBE in the TBE table.

When we implement the actions below, we will see how these last three parameters are used. They are passed into the actions as implicit variables: `address`, `cache_entry`, and `tbe`.

If the trigger function is executed, after the transition is complete the `in_port` logic is executed again, assuming there have been fewer transitions than the maximum transitions per cycle.
If there are other messages in the message buffer, more transitions can be triggered.

If the response is from another cache instead of the directory, then other events are triggered, as shown in the code above. These events come directly from Table 8.3 in Sorin et al.

Importantly, you should use the `in_port` logic to check all conditions. After an event is triggered, it should only have a single code path; i.e., there should be no `if` statements in any action blocks. If you want to conditionally execute actions, you should use different states or different events in the `in_port` logic.

The reason for this constraint is the way Ruby checks resources before executing a transition. In the code generated from the `in_port` blocks, all of the resources are checked before the transition is actually executed. In other words, transitions are atomic and either execute all of the actions or none. Conditional statements inside the actions prevent the SLICC compiler from correctly tracking the resource usage and can lead to strange performance, deadlocks, and other bugs.

After specifying the `in_port` logic for the highest priority network, the response network, we need to add the `in_port` logic for the forwarded request network. However, before specifying this logic, we need to define the `RequestMsg` type and the `CoherenceRequestType`, which contains the types of requests. These two definitions go in the MSI-msg.sm file, not in MSI-cache.sm, since they are global definitions. It is possible to implement this as two different messages and request type enumerations, one for forwarded requests and one for normal requests, but it simplifies the code to use a single message and type.

``` {.sourceCode .c++}
enumeration(CoherenceRequestType, desc=\"Types of request messages\") {
    GetS, desc=\"Request from cache for a block with read permission\";
    GetM, desc=\"Request from cache for a block with write permission\";
    PutS, desc=\"Sent to directory when evicting a block in S (clean WB)\";
    PutM, desc=\"Sent to directory when evicting a block in M\";

    // \"Requests\" from the directory to the caches on the fwd network
    Inv,    desc=\"Probe the cache and invalidate any matching blocks\";
    PutAck, desc=\"The put request has been processed.\";
}
```

``` {.sourceCode .c++}
structure(RequestMsg, desc=\"Used for Cache->Dir and Fwd messages\",
          interface=\"Message\") {
    Addr addr,                   desc=\"Physical address for this request\";
    CoherenceRequestType Type,   desc=\"Type of request\";
    MachineID Requestor,         desc=\"Node who initiated the request\";
    NetDest Destination,         desc=\"Multicast destination mask\";
    DataBlock DataBlk,           desc=\"data for the cache line\";
    MessageSizeType MessageSize, desc=\"size category of the message\";

    bool functionalRead(Packet *pkt) {
        // Requests should never have the only copy of the most
        // up-to-date data
        return false;
    }

    bool functionalWrite(Packet *pkt) {
        // No check on message type required since the protocol should read
        // data block from only those messages that contain valid data
        return testAndWrite(addr, DataBlk, pkt);
    }
}
```

You can download the complete file MSI-msg.sm here <../../_static/scripts/part3/MSI_protocol/MSI-msg.sm>.
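These request messages are filled in inside *enqueue* blocks in the controller's actions. Enqueuing is covered in the actions section; as a hedged preview only, a GetM request is sent roughly like this (the action shorthand and the latency of 1 are illustrative):

``` {.sourceCode .c++}
// Hypothetical sketch of populating a RequestMsg in an action.
action(sendGetM, \"gM\", desc=\"Send GetM to the directory\") {
    enqueue(request_out, RequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := CoherenceRequestType:GetM;
        out_msg.Destination.add(mapAddressToMachine(address,
                                MachineType:Directory));
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Requestor := machineID;
    }
}
```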
Now, we can specify the logic for the forward network `in_port`. This logic is straightforward and triggers a different event for each request type.

``` {.sourceCode .c++}
in_port(forward_in, RequestMsg, forwardFromDir) {
    if (forward_in.isReady(clockEdge())) {
        peek(forward_in, RequestMsg) {
            // Grab the entry and tbe if they exist.
            Entry cache_entry := getCacheEntry(in_msg.addr);
            TBE tbe := TBEs[in_msg.addr];

            if (in_msg.Type == CoherenceRequestType:GetS) {
                trigger(Event:FwdGetS, in_msg.addr, cache_entry, tbe);
            } else if (in_msg.Type == CoherenceRequestType:GetM) {
                trigger(Event:FwdGetM, in_msg.addr, cache_entry, tbe);
            } else if (in_msg.Type == CoherenceRequestType:Inv) {
                trigger(Event:Inv, in_msg.addr, cache_entry, tbe);
            } else if (in_msg.Type == CoherenceRequestType:PutAck) {
                trigger(Event:PutAck, in_msg.addr, cache_entry, tbe);
            } else {
                error(\"Unexpected forward message!\");
            }
        }
    }
}
```

The final `in_port` is for the mandatory queue. This is the lowest priority queue, so it must be lowest in the state machine file. The mandatory queue has a special message type: `RubyRequest`. This type is specified in src/mem/protocol/RubySlicc_Types.sm. It contains two different addresses: the `LineAddress`, which is cache-block aligned, and the `PhysicalAddress`, which holds the original request's address and may not be cache-block aligned. It also has other members that may be useful in some protocols. However, for this simple protocol we only need the `LineAddress`.

``` {.sourceCode .c++}
in_port(mandatory_in, RubyRequest, mandatoryQueue) {
    if (mandatory_in.isReady(clockEdge())) {
        peek(mandatory_in, RubyRequest, block_on=\"LineAddress\") {
            Entry cache_entry := getCacheEntry(in_msg.LineAddress);
            TBE tbe := TBEs[in_msg.LineAddress];

            if (is_invalid(cache_entry) &&
                    cacheMemory.cacheAvail(in_msg.LineAddress) == false ) {
                Addr addr := cacheMemory.cacheProbe(in_msg.LineAddress);
                Entry victim_entry := getCacheEntry(addr);
                TBE victim_tbe := TBEs[addr];
                trigger(Event:Replacement, addr, victim_entry, victim_tbe);
            } else {
                if (in_msg.Type == RubyRequestType:LD ||
                        in_msg.Type == RubyRequestType:IFETCH) {
                    trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
                } else if (in_msg.Type == RubyRequestType:ST) {
                    trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
                } else {
                    error(\"Unexpected type from processor\");
                }
            }
        }
    }
}
```

There are a couple of new concepts shown in this code block. First, we use `block_on=\"LineAddress\"` in the peek function. What this does is ensure that any other requests to the same cache line will be blocked until the current request is complete.

Next, we check whether the cache entry for this line is valid. If it is not, and there are no more entries available in the set, then we need to evict another entry. To get the victim address, we can use the `cacheProbe` function on the `CacheMemory` object. This function uses the parameterized replacement policy and returns the physical (line) address of the victim.

Importantly, when we trigger the `Replacement` event, we use the address of the victim block and the victim's cache entry and tbe. Thus, when we take actions in the replacement transitions, we will be acting on the victim block, not the requesting block. Additionally, we need to remember not to remove the requesting message from the mandatory queue (pop) until it has been satisfied; the message is not popped even after the replacement is complete, only once the original request is handled.

If the cache block was found to be valid, then we simply trigger the `Load` or `Store` event.", |
| "url": "/cache-in-ports/" |
| } |
| , |
| |
| "cache-intro": { |
| "title": "MSI example cache protocol", |
| "content": " authors Jason Lowe-PowerMSI example cache protocolBefore we implement a cache coherence protocol, it is important to havea solid understanding of cache coherence. This section leans heavily onthe great book A Primer on Memory Consistency and Cache Coherence byDaniel J. Sorin, Mark D. Hill, and David A. Wood which was published aspart of the Synthesis Lectures on Computer Architecture in 2011(DOI:`10.2200/S00346ED1V01Y201104CAC016<<https://doi.org/10.2200/S00346ED1V01Y201104CAC016>>_).If you are unfamiliar with cache coherence, I strongly advise reading that book before continuing.In this chapter, we will be implementing an MSI protocol.(An MSI protocol has three stable states, modified with read-write permission, shared with read-only permission, and invalid with no permissions.)We will implement this as a three-hop directory protocol (i.e., caches can send data directly to other caches without going through the directory).Details for the protocol can be found in Section 8.2 of the *Primer on Memory Consistency and Cache Coherence* (pages 141-149).It will be helpful to print out Section 8.2 to reference as you are implementing the protocol.You can download an exceprt of Sorin et al. that contains Section 8.2 :download:here<../../_static/external/Sorin_et-al_Excerpt_8.2.pdf>.First steps to writing a protocol~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Let’s start by creating a new directory for our protocol: src/learning_gem5/MSI_protocol.In this directory, like in all gem5 source directories, we need to create a file for SCons to know what to compile.However, this time, instead of creating a SConscript` file, we aregoing to create a SConsopts file. (The SConsopts files are processedbefore the SConscript files and we need to run the SLICC compilerbefore SCons executes.)We need to create a SConsopts file with the following:``` {.sourceCode .python}Import(‘*’)all_protocols.extend([‘MSI’,])protocol_dirs.append(str(Dir(‘.’).abspath))```We do two things in this file. First, we register the name of ourprotocol ('MSI'). Since we have named our protocol MSI, SCons willassume that there is a file named MSI.slicc which specifies all of thestate machine files and auxiliary files. We will create that file afterwriting all of our state machine files. Second, the SConsopts filestells the SCons to look in the current directory for files to pass tothe SLICC compiler.You can download the SConsopts filehere <../../_static/scripts/part3/MSI_protocol/SConsopts>.Writing a state machine fileThe next step, and most of the effort in writing a protocol, is tocreate the state machine files. State machine files generally follow theoutline: Parameters These are the parameters for the SimObject that will be generatedfrom the SLICC code. Declaring required structures and functions This section declares the states, events, and many other requiredstructures for the state machine. In port code blocks Contain code that looks at incoming messages from the (in_port)message buffers and determines what events to trigger. Actions These are simple one-effect code blocks (e.g., send a message) thatare executed when going through a transition. Transitions Specify actions to execute given a starting state and an event andthe final state. This is the meat of the state machine definition.", |
| "url": "/cache-intro/" |
| } |
| , |
| |
| "cache-transitions": { |
| "title": "Transition code blocks", |
| "content": " authors Jason Lowe-PowerTransition code blocksFinally, we’ve reached the final section of the state machine file! Thissection contains the details for all of the transitions between statesand what actions to execute during the transition.So far in this chapter we have written the state machine top to bottomone section at a time. However, in most cache coherence implementationsyou will find that you need to move around between sections. Forinstance, when writing the transitions you will realize you forgot toadd an action, or you notice that you actually need another transientstate to implement the protocol. This is the normal way to writeprotocols, but for simplicity this chapter goes through the file top tobottom.Transition blocks consist of two parts. First, the first line of atransition block contains the begin state, event to transition on, andend state (the end state may not be required, as we will discuss below).Second, the transition block contains all of the actions to execute onthis transition. For instance, a simple transition in the MSI protocolis transitioning out of Invalid on a Load.``` {.sourceCode .c++}transition(I, Load, IS_D) { allocateCacheBlock; allocateTBE; sendGetS; popMandatoryQueue;}First, you specify the transition as the \"parameters\" to the`transition` statement. In this case, if the initial state is `I` andthe event is `Load` then transition to `IS_D` (was invalid, going toshared, waiting for data). This transition is straight out of Table 8.3in Sorin et al.Then, inside the `transition` code block, all of the actions that willexecute are listed in order. For this transition first we allocate thecache block. Remember that in the `allocateCacheBlock` action the newlyallocated entry is set to the entry that will be used in the rest of theactions. After allocating the cache block, we also allocate a TBE. Thiscould be used if we need to wait for acks from other caches. Next, wesend a GetS request to the directory, and finally we pop the head entryoff of the mandatory queue since we have fully handled it.``` {.sourceCode .c++}transition(IS_D, {Load, Store, Replacement, Inv}) { stall;}In this transition, we use slightly different syntax. According to Table8.3 from Sorin et al., we should stall if the cache is in IS_D onloads, stores, replacements, and invalidates. We can specify a singletransition statement for this by including multiple events in curlybrackets as above. Additionally, the final state isn’t required. If thefinal state isn’t specified, then the transition is executed and thestate is not updated (i.e., the block stays in its beginning state). 
You can read the above transition as: \"If the cache block is in state IS_D and there is a load, store, replacement, or invalidate, stall the protocol and do not transition out of the state.\" You can also use curly brackets for beginning states, as shown in some of the transitions below.

Below is the rest of the transitions needed to implement the L1 cache of the MSI protocol.

``` {.sourceCode .c++}
transition(IS_D, {DataDirNoAcks, DataOwner}, S) {
    writeDataToCache;
    deallocateTBE;
    externalLoadHit;
    popResponseQueue;
}

transition({IM_AD, IM_A}, {Load, Store, Replacement, FwdGetS, FwdGetM}) {
    stall;
}

transition({IM_AD, SM_AD}, {DataDirNoAcks, DataOwner}, M) {
    writeDataToCache;
    deallocateTBE;
    externalStoreHit;
    popResponseQueue;
}

transition(IM_AD, DataDirAcks, IM_A) {
    writeDataToCache;
    storeAcks;
    popResponseQueue;
}

transition({IM_AD, IM_A, SM_AD, SM_A}, InvAck) {
    decrAcks;
    popResponseQueue;
}

transition({IM_A, SM_A}, LastInvAck, M) {
    deallocateTBE;
    externalStoreHit;
    popResponseQueue;
}

transition({S, SM_AD, SM_A, M}, Load) {
    loadHit;
    popMandatoryQueue;
}

transition(S, Store, SM_AD) {
    allocateTBE;
    sendGetM;
    popMandatoryQueue;
}

transition(S, Replacement, SI_A) {
    sendPutS;
    forwardEviction;
}

transition(S, Inv, I) {
    sendInvAcktoReq;
    deallocateCacheBlock;
    forwardEviction;
    popForwardQueue;
}

transition({SM_AD, SM_A}, {Store, Replacement, FwdGetS, FwdGetM}) {
    stall;
}

transition(SM_AD, Inv, IM_AD) {
    sendInvAcktoReq;
    forwardEviction;
    popForwardQueue;
}

transition(SM_AD, DataDirAcks, SM_A) {
    writeDataToCache;
    storeAcks;
    popResponseQueue;
}

transition(M, Store) {
    storeHit;
    popMandatoryQueue;
}

transition(M, Replacement, MI_A) {
    sendPutM;
    forwardEviction;
}

transition(M, FwdGetS, S) {
    sendCacheDataToReq;
    sendCacheDataToDir;
    popForwardQueue;
}

transition(M, FwdGetM, I) {
    sendCacheDataToReq;
    deallocateCacheBlock;
    popForwardQueue;
}

transition({MI_A, SI_A, II_A}, {Load, Store, Replacement}) {
    stall;
}

transition(MI_A, FwdGetS, SI_A) {
    sendCacheDataToReq;
    sendCacheDataToDir;
    popForwardQueue;
}

transition(MI_A, FwdGetM, II_A) {
    sendCacheDataToReq;
    popForwardQueue;
}

transition({MI_A, SI_A, II_A}, PutAck, I) {
    deallocateCacheBlock;
    popForwardQueue;
}

transition(SI_A, Inv, II_A) {
    sendInvAcktoReq;
    popForwardQueue;
}
```

You can download the complete MSI-cache.sm file here <../../_static/scripts/part3/MSI_protocol/MSI-cache.sm>.", |
| "url": "/cache-transitions/" |
| } |
| , |
| |
| "configuration": { |
| "title": "Configuring a simple Ruby system", |
| "content": " authors Jason Lowe-PowerConfiguring a simple Ruby systemFirst, create a new configuration directory in configs/. Just like allgem5 configuration files, we will have a configuration run script. Forthe run script, we can start with simple.py fromsimple-config-chapter. Copy this file to simple_ruby.py in your newdirectory.We will make a couple of small changes to this file to use Ruby insteadof directly connecting the CPU to the memory controllers.First, so we can test our coherence protocol, let’s use two CPUs.``` {.sourceCode .python}system.cpu = [TimingSimpleCPU(), TimingSimpleCPU()]Next, after the memory controllers have been instantiated, we are goingto create the cache system and set up all of the caches. Add thefollowing lines *after the CPU interrupts have been created, but beforeinstantiating the system*.``` {.sourceCode .python}system.caches = MyCacheSystem()system.caches.setup(system, system.cpu, [system.mem_ctrl])Like the classic cache example in cache-config-chapter, we are going tocreate a second file that contains the cache configuration code. In thisfile we are going to have a class called MyCacheSystem and we willcreate a setup function that takes as parameters the CPUs in thesystem and the memory controllers.You can download the complete run scripthere <../../_static/scripts/part3/configs/simple_ruby.py>Cache system configurationNow, let’s create a file msi_caches.py. In this file, we will createfour classes: MyCacheSystem which will inherit from RubySystem,L1Cache and Directory which will inherit from the SimObjects createdby SLICC from our two state machines, and MyNetwork which will inheritfrom SimpleNetwork.L1 CacheLet’s start with the L1Cache. First, we will inherit fromL1Cache_Controller since we named our L1 cache “L1Cache” in the statemachine file. We also include a special class variable and class methodfor tracking the “version number”. For each SLICC state machine, youhave to number them in ascending order from 0. Each machine of the sametype should have a unique version number. This is used to differentiatethe individual machines. (Hopefully, in the future this requirement willbe removed.)``` {.sourceCode .python}class L1Cache(L1Cache_Controller):_version = 0@classmethoddef versionCount(cls): cls._version += 1 # Use count for this particular type return cls._version - 1 ```Next, we implement the constructor for the class.``` {.sourceCode .python}def init(self, system, ruby_system, cpu): super(L1Cache, self).init()self.version = self.versionCount()self.cacheMemory = RubyCache(size = '16kB', assoc = 8, start_index_bit = self.getBlockSizeBits(system))self.clk_domain = cpu.clk_domainself.send_evictions = self.sendEvicts(cpu)self.ruby_system = ruby_systemself.connectQueues(ruby_system) ```We need the CPUs in this function to grab the clock domain and system isneeded for the cache block size. Here, we set all of the parameters thatwe named in the state machine file (e.g., cacheMemory). We will setsequencer later. We also hardcode the size an associativity of thecache. You could add command line parameters for these options, if it isimportant to vary them at runtime.Next, we implement a couple of helper functions. First, we need tofigure out how many bits of the address to use for indexing into thecache, which is a simple log operation. We also need to decide whetherto send eviction notices to the CPU. 
Next, we implement a couple of helper functions. First, we need to figure out how many bits of the address to use for indexing into the cache, which is a simple log operation. We also need to decide whether to send eviction notices to the CPU. As the code below shows, we forward evictions if we are using the out-of-order CPU model or the x86 or ARM ISA.

``` {.sourceCode .python}
def getBlockSizeBits(self, system):
    bits = int(math.log(system.cache_line_size, 2))
    if 2**bits != system.cache_line_size.value:
        panic(\"Cache line size not a power of 2!\")
    return bits

def sendEvicts(self, cpu):
    \"\"\"True if the CPU model or ISA requires sending evictions from caches
       to the CPU. Three scenarios warrant forwarding evictions to the CPU:
       1. The O3 model must keep the LSQ coherent with the caches
       2. The x86 mwait instruction is built on top of coherence
       3. The local exclusive monitor in ARM systems
    \"\"\"
    if type(cpu) is DerivO3CPU or \\
       buildEnv['TARGET_ISA'] in ('x86', 'arm'):
        return True
    return False
```

Finally, we need to implement `connectQueues` to connect all of the message buffers to the Ruby network. First, we create a message buffer for the mandatory queue. Since this is an L1 cache and it will have a sequencer, we need to instantiate this special message buffer. Next, we instantiate a message buffer for each buffer in the controller. For all of the \"to\" buffers, we must set the \"master\" to the network (i.e., the buffer will send messages into the network), and for all of the \"from\" buffers, we must set the \"slave\" to the network. These *names* are the same as the gem5 ports, but *message buffers are not currently implemented as gem5 ports*. In this protocol, we are assuming the message buffers are ordered for simplicity.

``` {.sourceCode .python}
def connectQueues(self, ruby_system):
    self.mandatoryQueue = MessageBuffer()

    self.requestToDir = MessageBuffer(ordered = True)
    self.requestToDir.master = ruby_system.network.slave
    self.responseToDirOrSibling = MessageBuffer(ordered = True)
    self.responseToDirOrSibling.master = ruby_system.network.slave
    self.forwardFromDir = MessageBuffer(ordered = True)
    self.forwardFromDir.slave = ruby_system.network.master
    self.responseFromDirOrSibling = MessageBuffer(ordered = True)
    self.responseFromDirOrSibling.slave = ruby_system.network.master
```

Directory

Now, we can similarly implement the directory. There are three differences from the L1 cache. First, we need to set the address ranges for the directory. Since each directory corresponds to a particular memory controller for a (possibly proper) subset of the address range, we need to make sure the ranges match. The default address range for Ruby controllers is `AllMemory`.

Next, we need to set the master port `memory`. This is the port that sends messages when `queueMemoryRead/Write` is called in the SLICC code. We set it to the memory controller port. Similarly, in `connectQueues` we need to instantiate the special message buffer `responseFromMemory`, like the `mandatoryQueue` in the L1 cache.

``` {.sourceCode .python}
class DirController(Directory_Controller):

    _version = 0
    @classmethod
    def versionCount(cls):
        cls._version += 1 # Use count for this particular type
        return cls._version - 1

    def __init__(self, ruby_system, ranges, mem_ctrls):
        \"\"\"ranges are the memory ranges assigned to this controller.
        \"\"\"
        if len(mem_ctrls) > 1:
            panic(\"This cache system can only be connected to one mem ctrl\")
        super(DirController, self).__init__()
        self.version = self.versionCount()
        self.addr_ranges = ranges
        self.ruby_system = ruby_system
        self.directory = RubyDirectoryMemory()
        # Connect this directory to the memory side.
        self.memory = mem_ctrls[0].port
        self.connectQueues(ruby_system)

    def connectQueues(self, ruby_system):
        self.requestFromCache = MessageBuffer(ordered = True)
        self.requestFromCache.slave = ruby_system.network.master
        self.responseFromCache = MessageBuffer(ordered = True)
        self.responseFromCache.slave = ruby_system.network.master
        self.responseToCache = MessageBuffer(ordered = True)
        self.responseToCache.master = ruby_system.network.slave
        self.forwardToCache = MessageBuffer(ordered = True)
        self.forwardToCache.master = ruby_system.network.slave
        self.responseFromMemory = MessageBuffer()
```

Ruby System

Now, we can implement the Ruby system object. For this object, the constructor is simple. It just checks the SCons variable `PROTOCOL` to be sure that we are using the right configuration file for the protocol that was compiled. We cannot create the controllers in the constructor because they require a pointer to this object. If we were to create them in the constructor, there would be a circular dependence in the SimObject hierarchy, which would cause infinite recursion when the system is instantiated with `m5.instantiate`.

``` {.sourceCode .python}
class MyCacheSystem(RubySystem):

    def __init__(self):
        if buildEnv['PROTOCOL'] != 'MSI':
            fatal(\"This system assumes MSI from learning gem5!\")
        super(MyCacheSystem, self).__init__()
```

Instead of creating the controllers in the constructor, we create a new function to create all of the needed objects: `setup`. First, we create the network. We will look at this object next. With the network, we need to set the number of virtual networks in the system.

Next, we instantiate all of the controllers. Here, we use a single global list of the controllers to make it easier to connect them to the network later. However, for more complicated cache topologies, it can make sense to use multiple lists of controllers. We create one L1 cache for each CPU and one directory for the system.

Then, we instantiate all of the sequencers, one for each CPU. Each sequencer needs a pointer to the instruction and data cache to simulate the correct latency when initially accessing the cache. In more complicated systems, you also have to create sequencers for other objects like DMA controllers.

After creating the sequencers, we set the `sequencer` variable on each L1 cache controller.

Then, we connect all of the controllers to the network and call the `setup_buffers` function on the network.

We then have to set the \"port proxy\" for both the Ruby system and the system for making functional accesses (e.g., loading the binary in SE mode).

Finally, we connect all of the CPUs to the Ruby system. In this example, we assume that there are only CPU sequencers, so the first CPU is connected to the first sequencer, and so on.
We also have to connect the TLBs and interrupt ports (if we are using x86).

``` {.sourceCode .python}
def setup(self, system, cpus, mem_ctrls):
    self.network = MyNetwork(self)

    self.number_of_virtual_networks = 3
    self.network.number_of_virtual_networks = 3

    self.controllers = \\
        [L1Cache(system, self, cpu) for cpu in cpus] + \\
        [DirController(self, system.mem_ranges, mem_ctrls)]

    self.sequencers = [RubySequencer(version = i,
                            # I/D cache is combined and grab from ctrl
                            icache = self.controllers[i].cacheMemory,
                            dcache = self.controllers[i].cacheMemory,
                            clk_domain = self.controllers[i].clk_domain,
                            ) for i in range(len(cpus))]

    for i,c in enumerate(self.controllers[0:len(self.sequencers)]):
        c.sequencer = self.sequencers[i]

    self.num_of_sequencers = len(self.sequencers)

    self.network.connectControllers(self.controllers)
    self.network.setup_buffers()

    self.sys_port_proxy = RubyPortProxy()
    system.system_port = self.sys_port_proxy.slave

    for i,cpu in enumerate(cpus):
        cpu.icache_port = self.sequencers[i].slave
        cpu.dcache_port = self.sequencers[i].slave
        isa = buildEnv['TARGET_ISA']
        if isa == 'x86':
            cpu.interrupts[0].pio = self.sequencers[i].master
            cpu.interrupts[0].int_master = self.sequencers[i].slave
            cpu.interrupts[0].int_slave = self.sequencers[i].master
        if isa == 'x86' or isa == 'arm':
            cpu.itb.walker.port = self.sequencers[i].slave
            cpu.dtb.walker.port = self.sequencers[i].slave
```

Network

Finally, the last object we have to implement is the network. The constructor is simple, but we need to declare an empty list for the list of network interfaces (`netifs`).

Most of the code is in `connectControllers`. This function implements a very simple, unrealistic point-to-point network. In other words, every controller has a direct link to every other controller.

The Ruby network is made of three parts: routers that route data from one router to another or to external controllers, external links that link a controller to a router, and internal links that link two routers together. First, we create a router for each controller. Then, we create an external link from that router to the controller. Finally, we add all of the \"internal\" links. Each router is connected to all other routers to make the point-to-point network.

``` {.sourceCode .python}
class MyNetwork(SimpleNetwork):

    def __init__(self, ruby_system):
        super(MyNetwork, self).__init__()
        self.netifs = []
        self.ruby_system = ruby_system

    def connectControllers(self, controllers):
        self.routers = [Switch(router_id = i)
                        for i in range(len(controllers))]
        self.ext_links = [SimpleExtLink(link_id=i, ext_node=c,
                                        int_node=self.routers[i])
                          for i, c in enumerate(controllers)]
        link_count = 0
        self.int_links = []
        for ri in self.routers:
            for rj in self.routers:
                if ri == rj: continue # Don't connect a router to itself!
                link_count += 1
                self.int_links.append(SimpleIntLink(link_id = link_count,
                                                    src_node = ri,
                                                    dst_node = rj))
```

You can download the complete msi_caches.py file here <../../_static/scripts/part3/configs/msi_caches.py>.", |
| "url": "/configuration/" |
| } |
| , |
| |
| "directory": { |
| "title": "MSI Directory implementation", |
| "content": " authors Jason Lowe-PowerMSI Directory implementationImplementing a directory controller is very similar to the L1 cachecontroller, except using a different state machine table. The statemachine fore the directory can be found in Table 8.2 in Sorin et al.Since things are mostly similar to the L1 cache, this section mostlyjust discusses a few more SLICC details and a few differences betweendirectory controllers and cache controllers. Let’s dive straight in andstart modifying a new file MSI-dir.sm.``` {.sourceCode .c++}machine(MachineType:Directory, “Directory protocol”): DirectoryMemory * directory; Cycles toMemLatency := 1;MessageBuffer *forwardToCache, network=”To”, virtual_network=”1”, vnet_type=”forward”;MessageBuffer *responseToCache, network=”To”, virtual_network=”2”, vnet_type=”response”;MessageBuffer *requestFromCache, network=”From”, virtual_network=”0”, vnet_type=”request”;MessageBuffer *responseFromCache, network=”From”, virtual_network=”2”, vnet_type=”response”;MessageBuffer *responseFromMemory;{. . .}First, there are two parameter to this directory controller,`DirectoryMemory` and a `toMemLatency`. The `DirectoryMemory` is alittle weird. It is allocated at initialization time such that it cancover *all* of physical memory, like a complete directory *not adirectory cache*. I.e., there are pointers in the `DirectoryMemory`object for every 64-byte block in physical memory. However, the actualentries (as defined below) are lazily created via `getDirEntry()`. We'llsee more details about `DirectoryMemory` below.Next, is the `toMemLatency` parameter. This will be used in the`enqueue` function when enqueuing requests to model the directorylatency. We didn't use a parameter for this in the L1 cache, but it issimple to make the controller latency parameterized. This parameterdefaults to 1 cycle. It is not required to set a default here. Thedefault is propagated to the generated SimObject description file as thedefault to the SimObject parameter.Next, we have the message buffers for the directory. Importantly, *theseneed to have the same virtual network numbers* as the message buffers inthe L1 cache. These virtual network numbers are how the Ruby networkdirects messages between controllers.There is also one more special message buffer: `responseFromMemory`.This is similar to the `mandatoryQueue`, except instead of being like aslave port for CPUs it is like a master port. The `responseFromMemory`buffer will deliver response sent across the the memory port, as we willsee below in the action section.After the parameters and message buffers, we need to declare all of thestates, events, and other local structures.``` {.sourceCode .c++}state_declaration(State, desc=\"Directory states\", default=\"Directory_State_I\") { // Stable states. // NOTE: These are \"cache-centric\" states like in Sorin et al. // However, The access permissions are memory-centric. 
Next, we have the message buffers for the directory. Importantly, *these need to have the same virtual network numbers* as the message buffers in the L1 cache. These virtual network numbers are how the Ruby network directs messages between controllers. There is also one more special message buffer: `responseFromMemory`. This is similar to the `mandatoryQueue`, except instead of being like a slave port for CPUs it is like a master port. The `responseFromMemory` buffer will deliver responses sent across the memory port, as we will see below in the action section. After the parameters and message buffers, we need to declare all of the states, events, and other local structures.``` {.sourceCode .c++}
state_declaration(State, desc=\"Directory states\",
                  default=\"Directory_State_I\") {
    // Stable states.
    // NOTE: These are \"cache-centric\" states like in Sorin et al.
    // However, the access permissions are memory-centric.
    I, AccessPermission:Read_Write,  desc=\"Invalid in the caches.\";
    S, AccessPermission:Read_Only,   desc=\"At least one cache has the blk\";
    M, AccessPermission:Invalid,     desc=\"A cache has the block in M\";

    // Transient states
    S_D, AccessPermission:Busy,      desc=\"Moving to S, but need data\";

    // Waiting for data from memory
    S_m, AccessPermission:Read_Write, desc=\"In S waiting for mem\";
    M_m, AccessPermission:Read_Write, desc=\"Moving to M waiting for mem\";

    // Waiting for write-ack from memory
    MI_m, AccessPermission:Busy,      desc=\"Moving to I waiting for ack\";
    SS_m, AccessPermission:Busy,      desc=\"Moving to I waiting for ack\";
}

enumeration(Event, desc=\"Directory events\") {
    // Data requests from the cache
    GetS, desc=\"Request for read-only data from cache\";
    GetM, desc=\"Request for read-write data from cache\";

    // Writeback requests from the cache
    PutSNotLast, desc=\"PutS and the block has other sharers\";
    PutSLast, desc=\"PutS and the block has no other sharers\";
    PutMOwner, desc=\"Dirty data writeback from the owner\";
    PutMNonOwner, desc=\"Dirty data writeback from non-owner\";

    // Cache responses
    Data, desc=\"Response to fwd request with data\";

    // From Memory
    MemData, desc=\"Data from memory\";
    MemAck, desc=\"Ack from memory that write is complete\";
}

structure(Entry, desc=\"...\", interface=\"AbstractEntry\") {
    State DirState,   desc=\"Directory state\";
    NetDest Sharers,  desc=\"Sharers for this block\";
    NetDest Owner,    desc=\"Owner of this block\";
}
```In the state_declaration we define a default. For many things in SLICC you can specify a default. However, this default must use the C++ name (mangled SLICC name). For the state below you have to use the controller name and the name we use for states. In this case, since the name of the machine is “Directory”, the name for “I” is “Directory”+”State” (for the name of the structure)+”I”. Note that the permissions in the directory are “memory-centric”, whereas all of the states are cache-centric as in Sorin et al. In the Entry definition for the directory, we use a NetDest for both the sharers and the owner. This makes sense for the sharers, since we want a full bitvector for all L1 caches that may be sharing the block. The reason we also use a NetDest for the owner is to simply copy the structure into the message we send as a response, as shown below. In this implementation, we use a few more transient states than in Table 8.2 in Sorin et al. to deal with the fact that the memory latency is unknown. In Sorin et al., the authors assume that the directory state and memory data is stored together in main memory to simplify the protocol. Similarly, we also include new actions: the responses from memory. Next, we have the functions that need to be overridden or declared. The function getDirectoryEntry either returns the valid directory entry, or, if it hasn’t been allocated yet, allocates the entry. Implementing it this way may save some host memory since this is lazily populated.``` {.sourceCode .c++}
Tick clockEdge();

Entry getDirectoryEntry(Addr addr), return_by_pointer = \"yes\" {
    Entry dir_entry := static_cast(Entry, \"pointer\", directory[addr]);
    if (is_invalid(dir_entry)) {
        // This first time we see this address allocate an entry for it.
        dir_entry := static_cast(Entry, \"pointer\",
                                 directory.allocate(addr, new Entry));
    }
    return dir_entry;
}

State getState(Addr addr) {
    if (directory.isPresent(addr)) {
        return getDirectoryEntry(addr).DirState;
    } else {
        return State:I;
    }
}

void setState(Addr addr, State state) {
    if (directory.isPresent(addr)) {
        if (state == State:M) {
            DPRINTF(RubySlicc, \"Owner %s\\n\", getDirectoryEntry(addr).Owner);
            assert(getDirectoryEntry(addr).Owner.count() == 1);
            assert(getDirectoryEntry(addr).Sharers.count() == 0);
        }
        getDirectoryEntry(addr).DirState := state;
        if (state == State:I) {
            assert(getDirectoryEntry(addr).Owner.count() == 0);
            assert(getDirectoryEntry(addr).Sharers.count() == 0);
        }
    }
}

AccessPermission getAccessPermission(Addr addr) {
    if (directory.isPresent(addr)) {
        Entry e := getDirectoryEntry(addr);
        return Directory_State_to_permission(e.DirState);
    } else {
        return AccessPermission:NotPresent;
    }
}

void setAccessPermission(Addr addr, State state) {
    if (directory.isPresent(addr)) {
        Entry e := getDirectoryEntry(addr);
        e.changePermission(Directory_State_to_permission(state));
    }
}

void functionalRead(Addr addr, Packet *pkt) {
    functionalMemoryRead(pkt);
}

int functionalWrite(Addr addr, Packet *pkt) {
    if (functionalMemoryWrite(pkt)) {
        return 1;
    } else {
        return 0;
    }
}
```Next, we need to implement the ports for the directory. First we specify the `out_port` and then the `in_port` code blocks. The only difference between the `in_port` in the directory and in the L1 cache is that the directory does not have a TBE or cache entry. Thus, we do not pass either into the `trigger` function.``` {.sourceCode .c++}
out_port(forward_out, RequestMsg, forwardToCache);
out_port(response_out, ResponseMsg, responseToCache);

in_port(memQueue_in, MemoryMsg, responseFromMemory) {
    if (memQueue_in.isReady(clockEdge())) {
        peek(memQueue_in, MemoryMsg) {
            if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
                trigger(Event:MemData, in_msg.addr);
            } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
                trigger(Event:MemAck, in_msg.addr);
            } else {
                error(\"Invalid message\");
            }
        }
    }
}

in_port(response_in, ResponseMsg, responseFromCache) {
    if (response_in.isReady(clockEdge())) {
        peek(response_in, ResponseMsg) {
            if (in_msg.Type == CoherenceResponseType:Data) {
                trigger(Event:Data, in_msg.addr);
            } else {
                error(\"Unexpected message type.\");
            }
        }
    }
}

in_port(request_in, RequestMsg, requestFromCache) {
    if (request_in.isReady(clockEdge())) {
        peek(request_in, RequestMsg) {
            Entry e := getDirectoryEntry(in_msg.addr);
            if (in_msg.Type == CoherenceRequestType:GetS) {
                trigger(Event:GetS, in_msg.addr);
            } else if (in_msg.Type == CoherenceRequestType:GetM) {
                trigger(Event:GetM, in_msg.addr);
            } else if (in_msg.Type == CoherenceRequestType:PutS) {
                assert(is_valid(e));
                // If there is only a single sharer (i.e., the requestor)
                if (e.Sharers.count() == 1) {
                    assert(e.Sharers.isElement(in_msg.Requestor));
                    trigger(Event:PutSLast, in_msg.addr);
                } else {
                    trigger(Event:PutSNotLast, in_msg.addr);
                }
            } else if (in_msg.Type == CoherenceRequestType:PutM) {
                assert(is_valid(e));
                if (e.Owner.isElement(in_msg.Requestor)) {
                    trigger(Event:PutMOwner, in_msg.addr);
                } else {
                    trigger(Event:PutMNonOwner, in_msg.addr);
                }
            } else {
                error(\"Unexpected message type.\");
            }
        }
    }
}
```The next part of the state machine file is the actions. First, we define actions for queuing memory reads and writes. For this, we will use a special function defined in the AbstractController: queueMemoryRead. This function takes an address and converts it to a gem5 request and packet and sends it across the port that is connected to this controller.
We will see how to connect this port in the configuration section <MSI-config-section>. Note that we need two different actions to send data to memory for both requests and responses since there are two different message buffers (virtual networks) that data might arrive on.``` {.sourceCode .c++}
action(sendMemRead, \"r\", desc=\"Send a memory read request\") {
    peek(request_in, RequestMsg) {
        queueMemoryRead(in_msg.Requestor, address, toMemLatency);
    }
}

action(sendDataToMem, \"w\", desc=\"Write data to memory\") {
    peek(request_in, RequestMsg) {
        DPRINTF(RubySlicc, \"Writing memory for %#x\\n\", address);
        DPRINTF(RubySlicc, \"Writing %s\\n\", in_msg.DataBlk);
        queueMemoryWrite(in_msg.Requestor, address, toMemLatency,
                         in_msg.DataBlk);
    }
}

action(sendRespDataToMem, \"rw\", desc=\"Write data to memory from resp\") {
    peek(response_in, ResponseMsg) {
        DPRINTF(RubySlicc, \"Writing memory for %#x\\n\", address);
        DPRINTF(RubySlicc, \"Writing %s\\n\", in_msg.DataBlk);
        queueMemoryWrite(in_msg.Sender, address, toMemLatency,
                         in_msg.DataBlk);
    }
}
```In this code, we also see the last way to add debug information to SLICC protocols: `DPRINTF`. This is exactly the same as a `DPRINTF` in gem5, except in SLICC only the `RubySlicc` debug flag is available.
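To actually see these statements, enable that flag on the gem5 command line; for example (using the run script from the Running section of this part, whose path may differ in your checkout):``` {.sourceCode .sh}
# RubySlicc is the only debug flag visible to DPRINTFs inside SLICC files.
build/MSI/gem5.opt --debug-flags=RubySlicc \\
    configs/learning_gem5/part6/simple_ruby.py
```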
Next, we specify actions to update the sharers and owner of a particular block.``` {.sourceCode .c++}
action(addReqToSharers, \"aS\", desc=\"Add requestor to sharer list\") {
    peek(request_in, RequestMsg) {
        getDirectoryEntry(address).Sharers.add(in_msg.Requestor);
    }
}

action(setOwner, \"sO\", desc=\"Set the owner\") {
    peek(request_in, RequestMsg) {
        getDirectoryEntry(address).Owner.add(in_msg.Requestor);
    }
}

action(addOwnerToSharers, \"oS\", desc=\"Add the owner to sharers\") {
    Entry e := getDirectoryEntry(address);
    assert(e.Owner.count() == 1);
    e.Sharers.addNetDest(e.Owner);
}

action(removeReqFromSharers, \"rS\", desc=\"Remove requestor from sharers\") {
    peek(request_in, RequestMsg) {
        getDirectoryEntry(address).Sharers.remove(in_msg.Requestor);
    }
}

action(clearSharers, \"cS\", desc=\"Clear the sharer list\") {
    getDirectoryEntry(address).Sharers.clear();
}

action(clearOwner, \"cO\", desc=\"Clear the owner\") {
    getDirectoryEntry(address).Owner.clear();
}
```The next set of actions send invalidates and forward requests to caches that the directory cannot deal with alone.``` {.sourceCode .c++}
action(sendInvToSharers, \"i\", desc=\"Send invalidate to all sharers\") {
    peek(request_in, RequestMsg) {
        enqueue(forward_out, RequestMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceRequestType:Inv;
            out_msg.Requestor := in_msg.Requestor;
            out_msg.Destination := getDirectoryEntry(address).Sharers;
            out_msg.MessageSize := MessageSizeType:Control;
        }
    }
}

action(sendFwdGetS, \"fS\", desc=\"Send forward getS to owner\") {
    assert(getDirectoryEntry(address).Owner.count() == 1);
    peek(request_in, RequestMsg) {
        enqueue(forward_out, RequestMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceRequestType:GetS;
            out_msg.Requestor := in_msg.Requestor;
            out_msg.Destination := getDirectoryEntry(address).Owner;
            out_msg.MessageSize := MessageSizeType:Control;
        }
    }
}

action(sendFwdGetM, \"fM\", desc=\"Send forward getM to owner\") {
    assert(getDirectoryEntry(address).Owner.count() == 1);
    peek(request_in, RequestMsg) {
        enqueue(forward_out, RequestMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceRequestType:GetM;
            out_msg.Requestor := in_msg.Requestor;
            out_msg.Destination := getDirectoryEntry(address).Owner;
            out_msg.MessageSize := MessageSizeType:Control;
        }
    }
}
```Now we have responses from the directory. Here we are peeking into the special buffer `responseFromMemory`. You can find the definition of `MemoryMsg` in `src/mem/protocol/RubySlicc_MemControl.sm`.``` {.sourceCode .c++}
action(sendDataToReq, \"d\", desc=\"Send data from memory to requestor. May need to send sharer number, too\") {
    peek(memQueue_in, MemoryMsg) {
        enqueue(response_out, ResponseMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceResponseType:Data;
            out_msg.Sender := machineID;
            out_msg.Destination.add(in_msg.OriginalRequestorMachId);
            out_msg.DataBlk := in_msg.DataBlk;
            out_msg.MessageSize := MessageSizeType:Data;
            Entry e := getDirectoryEntry(address);
            // Only need to include acks if we are the owner.
            if (e.Owner.isElement(in_msg.OriginalRequestorMachId)) {
                out_msg.Acks := e.Sharers.count();
            } else {
                out_msg.Acks := 0;
            }
            assert(out_msg.Acks >= 0);
        }
    }
}

action(sendPutAck, \"a\", desc=\"Send the put ack\") {
    peek(request_in, RequestMsg) {
        enqueue(forward_out, RequestMsg, 1) {
            out_msg.addr := address;
            out_msg.Type := CoherenceRequestType:PutAck;
            out_msg.Requestor := machineID;
            out_msg.Destination.add(in_msg.Requestor);
            out_msg.MessageSize := MessageSizeType:Control;
        }
    }
}
```Then, we have the queue management and stall actions.``` {.sourceCode .c++}
action(popResponseQueue, \"pR\", desc=\"Pop the response queue\") {
    response_in.dequeue(clockEdge());
}

action(popRequestQueue, \"pQ\", desc=\"Pop the request queue\") {
    request_in.dequeue(clockEdge());
}

action(popMemQueue, \"pM\", desc=\"Pop the memory queue\") {
    memQueue_in.dequeue(clockEdge());
}

action(stall, \"z\", desc=\"Stall the incoming request\") {
    // Do nothing.
}
```Finally, we have the transition section of the state machine file. These mostly come from Table 8.2 in Sorin et al., but there are some extra transitions to deal with the unknown memory latency.``` {.sourceCode .c++}
transition({I, S}, GetS, S_m) {
    sendMemRead;
    addReqToSharers;
    popRequestQueue;
}

transition(I, {PutSNotLast, PutSLast, PutMNonOwner}) {
    sendPutAck;
    popRequestQueue;
}

transition(S_m, MemData, S) {
    sendDataToReq;
    popMemQueue;
}

transition(I, GetM, M_m) {
    sendMemRead;
    setOwner;
    popRequestQueue;
}

transition(M_m, MemData, M) {
    sendDataToReq;
    clearSharers; // NOTE: This isn't *required* in some cases.
    popMemQueue;
}

transition(S, GetM, M_m) {
    sendMemRead;
    removeReqFromSharers;
    sendInvToSharers;
    setOwner;
    popRequestQueue;
}

transition({S, S_D, SS_m, S_m}, {PutSNotLast, PutMNonOwner}) {
    removeReqFromSharers;
    sendPutAck;
    popRequestQueue;
}

transition(S, PutSLast, I) {
    removeReqFromSharers;
    sendPutAck;
    popRequestQueue;
}

transition(M, GetS, S_D) {
    sendFwdGetS;
    addReqToSharers;
    addOwnerToSharers;
    clearOwner;
    popRequestQueue;
}

transition(M, GetM) {
    sendFwdGetM;
    clearOwner;
    setOwner;
    popRequestQueue;
}

transition({M, M_m, MI_m}, {PutSNotLast, PutSLast, PutMNonOwner}) {
    sendPutAck;
    popRequestQueue;
}

transition(M, PutMOwner, MI_m) {
    sendDataToMem;
    clearOwner;
    sendPutAck;
    popRequestQueue;
}

transition(MI_m, MemAck, I) {
    popMemQueue;
}

transition(S_D, {GetS, GetM}) {
    stall;
}

transition(S_D, PutSLast) {
    removeReqFromSharers;
    sendPutAck;
    popRequestQueue;
}

transition(S_D, Data, SS_m) {
    sendRespDataToMem;
    popResponseQueue;
}

transition(SS_m, MemAck, S) {
    popMemQueue;
}

// If we get another request for a block that's waiting on memory,
// stall that request.
transition({MI_m, SS_m, S_m, M_m}, {GetS, GetM}) {
    stall;
}
```You can download the complete MSI-dir.sm file here <../../_static/scripts/part3/MSI_protocol/MSI-dir.sm>.", |
| "url": "/directory/" |
| } |
| , |
| |
| "running": { |
| "title": "Running the simple Ruby system", |
| "content": " authors Jason Lowe-PowerRunning the simple Ruby systemNow, we can run our system with the MSI protocol!As something interesting, below is a simple multithreaded program (note:as of this writing there is a bug in gem5 preventing this code fromexecuting).``` {.sourceCode .c++}#include #include using namespace std;/* c = a + b */void array_add(int *a, int *b, int *c, int tid, int threads, int num_values){ for (int i = tid; i < num_values; i += threads) { c[i] = a[i] + b[i]; }}int main(int argc, char *argv[]){ unsigned num_values; if (argc == 1) { num_values = 100; } else if (argc == 2) { num_values = atoi(argv[1]); if (num_values <= 0) { cerr « “Usage: “ « argv[0] « ” [num_values]” « endl; return 1; } } else { cerr « “Usage: “ « argv[0] « ” [num_values]” « endl; return 1; }unsigned cpus = thread::hardware_concurrency();cout << \"Running on \" << cpus << \" cores. \";cout << \"with \" << num_values << \" values\" << endl;int *a, *b, *c;a = new int[num_values];b = new int[num_values];c = new int[num_values];if (!(a && b && c)) { cerr << \"Allocation error!\" << endl; return 2;}for (int i = 0; i < num_values; i++) { a[i] = i; b[i] = num_values - i; c[i] = 0;}thread **threads = new thread*[cpus];// NOTE: -1 is required for this to work in SE mode.for (int i = 0; i < cpus - 1; i++) { threads[i] = new thread(array_add, a, b, c, i, cpus, num_values);}// Execute the last thread with this thread context to appease SE modearray_add(a, b, c, cpus - 1, cpus, num_values);cout << \"Waiting for other threads to complete\" << endl;for (int i = 0; i < cpus - 1; i++) { threads[i]->join();}delete[] threads;cout << \"Validating...\" << flush;int num_valid = 0;for (int i = 0; i < num_values; i++) { if (c[i] == num_values) { num_valid++; } else { cerr << \"c[\" << i << \"] is wrong.\"; cerr << \" Expected \" << num_values; cerr << \" Got \" << c[i] << \".\" << endl; }}if (num_valid == num_values) { cout << \"Success!\" << endl; return 0;} else { return 2;} } ```With the above code compiled as threads, we can run gem5!{.sourceCode .sh}build/MSI/gem5.opt configs/learning_gem5/part6/simple_ruby.pyThe output should be something like the following. Most of the warningsare unimplemented syscalls in SE mode due to using pthreads and can besafely ignored for this simple example.gem5 Simulator System. http://gem5.orggem5 is copyrighted software; use the --copyright option for details.gem5 compiled Sep 7 2017 12:39:51gem5 started Sep 10 2017 20:56:35gem5 executing on fuggle, pid 6687command line: build/MSI/gem5.opt configs/learning_gem5/part6/simple_ruby.pyGlobal frequency set at 1000000000000 ticks per secondwarn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)0: system.remote_gdb.listener: listening for remote gdb #0 on port 70000: system.remote_gdb.listener: listening for remote gdb #1 on port 7001Beginning simulation!info: Entering event queue @ 0. Starting simulation...warn: Replacement policy updates recently became the responsibility of SLICC state machines. 
With the above code compiled as threads, we can run gem5!``` {.sourceCode .sh}
build/MSI/gem5.opt configs/learning_gem5/part6/simple_ruby.py
```The output should be something like the following. Most of the warnings are unimplemented syscalls in SE mode due to using pthreads and can be safely ignored for this simple example.```
gem5 Simulator System. http://gem5.org
gem5 is copyrighted software; use the --copyright option for details.

gem5 compiled Sep 7 2017 12:39:51
gem5 started Sep 10 2017 20:56:35
gem5 executing on fuggle, pid 6687
command line: build/MSI/gem5.opt configs/learning_gem5/part6/simple_ruby.py

Global frequency set at 1000000000000 ticks per second
warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
0: system.remote_gdb.listener: listening for remote gdb #0 on port 7000
0: system.remote_gdb.listener: listening for remote gdb #1 on port 7001
Beginning simulation!
info: Entering event queue @ 0.  Starting simulation...
warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files!
warn: ignoring syscall access(...)
warn: ignoring syscall access(...)
warn: ignoring syscall access(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall access(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall access(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall access(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall access(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall mprotect(...)
warn: ignoring syscall set_robust_list(...)
warn: ignoring syscall rt_sigaction(...)
      (further warnings will be suppressed)
warn: ignoring syscall rt_sigprocmask(...)
      (further warnings will be suppressed)
info: Increasing stack size by one page.
info: Increasing stack size by one page.
Running on 2 cores. with 100 values
warn: ignoring syscall mprotect(...)
warn: ClockedObject: Already in the requested power state, request ignored
warn: ignoring syscall set_robust_list(...)
Waiting for other threads to complete
warn: ignoring syscall madvise(...)
Validating...Success!
Exiting @ tick 9386342000 because exiting with last active thread context
```", |
| "url": "/running/" |
| } |
| , |
| |
| "simple-mi-example": { |
| "title": "Configuring for a standard protocol", |
| "content": " authors Jason Lowe-PowerConfiguring for a standard protocolYou can easily adapt the simple example configurations from this part tothe other SLICC protocols in gem5. In this chapter, we will briefly lookat an example with MI_example, though this can be easily extended toother protocols.However, these simple configuration files will only work in syscallemulation mode. Full system mode adds some complications such as DMAcontrollers. These scripts can be extended to full system.For MI_example, we can use exactly the same runscript as before(simple_ruby.py), we just need to implement a differentMyCacheSystem (and import that file in simple_ruby.py). Below, isthe classes needed for MI_example. There are only a couple of changesfrom MSI, mostly due to different naming schemes. You can download thefilehere <../_static/scripts/part3/configs/ruby_caches_MI_example.py>``` {.sourceCode .python}class MyCacheSystem(RubySystem):def __init__(self): if buildEnv['PROTOCOL'] != 'MI_example': fatal(\"This system assumes MI_example!\") super(MyCacheSystem, self).__init__()def setup(self, system, cpus, mem_ctrls): \"\"\"Set up the Ruby cache subsystem. Note: This can't be done in the constructor because many of these items require a pointer to the ruby system (self). This causes infinite recursion in initialize() if we do this in the __init__. \"\"\" # Ruby's global network. self.network = MyNetwork(self) # MI example uses 5 virtual networks self.number_of_virtual_networks = 5 self.network.number_of_virtual_networks = 5 # There is a single global list of all of the controllers to make it # easier to connect everything to the global network. This can be # customized depending on the topology/network requirements. # Create one controller for each L1 cache (and the cache mem obj.) # Create a single directory controller (Really the memory cntrl) self.controllers = \\ [L1Cache(system, self, cpu) for cpu in cpus] + \\ [DirController(self, system.mem_ranges, mem_ctrls)] # Create one sequencer per CPU. In many systems this is more # complicated since you have to create sequencers for DMA controllers # and other controllers, too. self.sequencers = [RubySequencer(version = i, # I/D cache is combined and grab from ctrl icache = self.controllers[i].cacheMemory, dcache = self.controllers[i].cacheMemory, clk_domain = self.controllers[i].clk_domain, ) for i in range(len(cpus))] for i,c in enumerate(self.controllers[0:len(cpus)]): c.sequencer = self.sequencers[i] self.num_of_sequencers = len(self.sequencers) # Create the network and connect the controllers. # NOTE: This is quite different if using Garnet! self.network.connectControllers(self.controllers) self.network.setup_buffers() # Set up a proxy port for the system_port. Used for load binaries and # other functional-only things. 
``` {.sourceCode .python}
class MyCacheSystem(RubySystem):

    def __init__(self):
        if buildEnv['PROTOCOL'] != 'MI_example':
            fatal(\"This system assumes MI_example!\")
        super(MyCacheSystem, self).__init__()

    def setup(self, system, cpus, mem_ctrls):
        \"\"\"Set up the Ruby cache subsystem. Note: This can't be done in the
           constructor because many of these items require a pointer to the
           ruby system (self). This causes infinite recursion in initialize()
           if we do this in the __init__.
        \"\"\"
        # Ruby's global network.
        self.network = MyNetwork(self)

        # MI example uses 5 virtual networks
        self.number_of_virtual_networks = 5
        self.network.number_of_virtual_networks = 5

        # There is a single global list of all of the controllers to make it
        # easier to connect everything to the global network. This can be
        # customized depending on the topology/network requirements.
        # Create one controller for each L1 cache (and the cache mem obj.)
        # Create a single directory controller (Really the memory cntrl)
        self.controllers = \\
            [L1Cache(system, self, cpu) for cpu in cpus] + \\
            [DirController(self, system.mem_ranges, mem_ctrls)]

        # Create one sequencer per CPU. In many systems this is more
        # complicated since you have to create sequencers for DMA controllers
        # and other controllers, too.
        self.sequencers = [RubySequencer(version = i,
                                # I/D cache is combined and grab from ctrl
                                icache = self.controllers[i].cacheMemory,
                                dcache = self.controllers[i].cacheMemory,
                                clk_domain = self.controllers[i].clk_domain,
                                ) for i in range(len(cpus))]

        for i,c in enumerate(self.controllers[0:len(cpus)]):
            c.sequencer = self.sequencers[i]

        self.num_of_sequencers = len(self.sequencers)

        # Create the network and connect the controllers.
        # NOTE: This is quite different if using Garnet!
        self.network.connectControllers(self.controllers)
        self.network.setup_buffers()

        # Set up a proxy port for the system_port. Used for load binaries and
        # other functional-only things.
        self.sys_port_proxy = RubyPortProxy()
        system.system_port = self.sys_port_proxy.slave

        # Connect the cpu's cache, interrupt, and TLB ports to Ruby
        for i,cpu in enumerate(cpus):
            cpu.icache_port = self.sequencers[i].slave
            cpu.dcache_port = self.sequencers[i].slave
            isa = buildEnv['TARGET_ISA']
            if isa == 'x86':
                cpu.interrupts[0].pio = self.sequencers[i].master
                cpu.interrupts[0].int_master = self.sequencers[i].slave
                cpu.interrupts[0].int_slave = self.sequencers[i].master
            if isa == 'x86' or isa == 'arm':
                cpu.itb.walker.port = self.sequencers[i].slave
                cpu.dtb.walker.port = self.sequencers[i].slave

class L1Cache(L1Cache_Controller):

    _version = 0
    @classmethod
    def versionCount(cls):
        cls._version += 1 # Use count for this particular type
        return cls._version - 1

    def __init__(self, system, ruby_system, cpu):
        \"\"\"CPUs are needed to grab the clock domain and system is needed for
           the cache block size.
        \"\"\"
        super(L1Cache, self).__init__()

        self.version = self.versionCount()
        # This is the cache memory object that stores the cache data and tags
        self.cacheMemory = RubyCache(size = '16kB',
                               assoc = 8,
                               start_index_bit = self.getBlockSizeBits(system))
        self.clk_domain = cpu.clk_domain
        self.send_evictions = self.sendEvicts(cpu)
        self.ruby_system = ruby_system
        self.connectQueues(ruby_system)

    def getBlockSizeBits(self, system):
        bits = int(math.log(system.cache_line_size, 2))
        if 2**bits != system.cache_line_size.value:
            panic(\"Cache line size not a power of 2!\")
        return bits

    def sendEvicts(self, cpu):
        \"\"\"True if the CPU model or ISA requires sending evictions from caches
           to the CPU. Three scenarios warrant forwarding evictions to the CPU:
           1. The O3 model must keep the LSQ coherent with the caches
           2. The x86 mwait instruction is built on top of coherence
           3. The local exclusive monitor in ARM systems
        \"\"\"
        if type(cpu) is DerivO3CPU or \\
           buildEnv['TARGET_ISA'] in ('x86', 'arm'):
            return True
        return False

    def connectQueues(self, ruby_system):
        \"\"\"Connect all of the queues for this controller.
        \"\"\"
        self.mandatoryQueue = MessageBuffer()
        self.requestFromCache = MessageBuffer(ordered = True)
        self.requestFromCache.master = ruby_system.network.slave
        self.responseFromCache = MessageBuffer(ordered = True)
        self.responseFromCache.master = ruby_system.network.slave
        self.forwardToCache = MessageBuffer(ordered = True)
        self.forwardToCache.slave = ruby_system.network.master
        self.responseToCache = MessageBuffer(ordered = True)
        self.responseToCache.slave = ruby_system.network.master

class DirController(Directory_Controller):

    _version = 0
    @classmethod
    def versionCount(cls):
        cls._version += 1 # Use count for this particular type
        return cls._version - 1

    def __init__(self, ruby_system, ranges, mem_ctrls):
        \"\"\"ranges are the memory ranges assigned to this controller.
        \"\"\"
        if len(mem_ctrls) > 1:
            panic(\"This cache system can only be connected to one mem ctrl\")
        super(DirController, self).__init__()
        self.version = self.versionCount()
        self.addr_ranges = ranges
        self.ruby_system = ruby_system
        self.directory = RubyDirectoryMemory()
        # Connect this directory to the memory side.
        self.memory = mem_ctrls[0].port
        self.connectQueues(ruby_system)

    def connectQueues(self, ruby_system):
        self.requestToDir = MessageBuffer(ordered = True)
        self.requestToDir.slave = ruby_system.network.master
        self.dmaRequestToDir = MessageBuffer(ordered = True)
        self.dmaRequestToDir.slave = ruby_system.network.master

        self.responseFromDir = MessageBuffer()
        self.responseFromDir.master = ruby_system.network.slave
        self.dmaResponseFromDir = MessageBuffer(ordered = True)
        self.dmaResponseFromDir.master = ruby_system.network.slave
        self.forwardFromDir = MessageBuffer()
        self.forwardFromDir.master = ruby_system.network.slave
        self.responseFromMemory = MessageBuffer()

class MyNetwork(SimpleNetwork):
    \"\"\"A simple point-to-point network. This does not use garnet.
    \"\"\"

    def __init__(self, ruby_system):
        super(MyNetwork, self).__init__()
        self.netifs = []
        self.ruby_system = ruby_system

    def connectControllers(self, controllers):
        \"\"\"Connect all of the controllers to routers and connect the routers
           together in a point-to-point network.
        \"\"\"
        # Create one router/switch per controller in the system
        self.routers = [Switch(router_id = i) for i in range(len(controllers))]

        # Make a link from each controller to the router. The link goes
        # externally to the network.
        self.ext_links = [SimpleExtLink(link_id=i, ext_node=c,
                                        int_node=self.routers[i])
                          for i, c in enumerate(controllers)]

        # Make an \"internal\" link (internal to the network) between every pair
        # of routers.
        link_count = 0
        self.int_links = []
        for ri in self.routers:
            for rj in self.routers:
                if ri == rj: continue # Don't connect a router to itself!
                link_count += 1
                self.int_links.append(SimpleIntLink(link_id = link_count,
                                                    src_node = ri,
                                                    dst_node = rj))
```", |
| "url": "/simple-MI_example/" |
| } |
| , |
| |
| "governance": { |
| "title": "Governance", |
| "content": " Overview Philosophy gem5 Roadmap Roles And Responsibilities Users Contributors Committers Project management committee PMC Chair Support Contribution Process Reviewing Patches Decision Making Process Lazy consensus Voting Overviewgem5 is a meritocratic, consensus-based community project. Anyone with an interest in the project can join the community, contribute to the project design and participate in the decision-making process. Historically, gem5 development has been carried out both in industry and in academia. This document describes how that participation takes place and how to set about earning merit within the project community.The document is broken into a number of sections. Philosophy describes the ideas behind the gem5 community. The Roadmap section points to the roadmap document for gem5’s development. Users and Responsibilities describes the classes of users that use gem5, the types of gem5 contributors, and their responsibilities. Support describes how the community supports users and the Contribution process describes how to contribute. Finally, the Decision Process describes how decisions are made and then we conclude.PhilosophyThe goal of gem5 is to provide a tool to further the state of the art in computer architecture. gem5 can be used for (but is not limited to) computer-architecture research, advanced development, system-level performance analysis and design-space exploration, hardware-software co-design, and low-level software performance analysis. Another goal of gem5 is to be a common framework for computer architecture. A common framework in the academic community makes it easier for other researchers to share workloads as well as models and to compare and contrast with other architectural techniques.The gem5 community strives to balance the needs of its three user types (academic researchers, industry researchers, and students, detailed below). For instance, gem5 strives to balance adding new features (important to researchers) and a stable code base (important for students). Specific user needs important to the community are enumerated below: Effectively and efficiently emulate the behavior of modern processors in a way that balances simulation performance and accuracy Serve as a malleable baseline infrastructure that can easily be adapted to emulate the desired behaviors Provide a core set of APIs and features that remain relatively stable Incorporate features that make it easy for companies and research groups to stay up to date with the tip and continue contributing to the projectAdditionally, the gem5 community is committed to openness, transparency, and inclusiveness. Participants in the gem5 community of all backgrounds should feel welcome and encouraged to contribute.gem5 RoadmapThe roadmap for gem5 can be found on Roadmap page. The roadmap document details the short and long term goals for the gem5 software. Users of all types are encouraged to contribute to this document and shape the future of gem5. Users are especially encouraged to update the roadmap (and get consensus) before submitting large changes to gem5.Roles And ResponsibilitiesUsersUsers are community members who have a need for the project. They are the most important members of the community and without them the project would have no purpose. Anyone can be a user; there are no special requirements. There are currently three main categories of gem5 users: academic researchers, industry researchers, and students. 
Individuals may transition between categories, e.g., when a graduate student takes an industry internship, then returns to school; or when a student graduates and takes a job in industry. These three users are described below. Academic Researchers This type of user primarily encompasses individuals that use gem5 in academic research. Examples include, but are not limited to, graduate students, research scientists, and post-graduates. This user often uses gem5 as a tool to discover and invent new computer architecture mechanisms. Academic Researchers often are first exposed to gem5 as Students (see below) and transition from Students to Academic Researchers over time. Because of these users’ goals, they primarily add new features to gem5. It is important to the gem5 community to encourage these users to contribute their work to the mainline gem5 repository. By encouraging these users to commit their research contributions, gem5 will make it much easier for other researchers to compare and contrast with other architectural techniques (see Philosophy section). Industry Researchers This type of user primarily encompasses individuals working for companies that use gem5. These users are distinguished from academic researchers in two ways. First, industry researchers are often part of a larger team, rather than working individually on gem5. Second, industry researchers often want to incorporate proprietary information into private branches of gem5. Therefore, industry researchers tend to have rather sophisticated software infrastructures built around gem5. For these users, the stability of gem5 features and baseline source code is important. Another key consideration is the fidelity of the models, and their ability to accurately reflect realistic implementations. To enable industry participation, it is critical to maintain licensing terms that do not restrict or burden the use of gem5 in conjunction with proprietary IP. Students This type of user primarily encompasses individuals that are using gem5 in a classroom setting. These users typically have some foundation in computer architecture, but they have little or no background using simulation tools. Additionally, these users may not use gem5 for an extended period of time, after finishing their short-term goals (e.g., a semester-long class). The project asks its users to participate in the project and community as much as possible. User contributions enable the project team to ensure that they are satisfying the needs of those users. Common user contributions include (but are not limited to): evangelising about the project (e.g., a link on a website and word-of-mouth awareness raising) informing developers of strengths and weaknesses from a new user perspective providing moral support (a ‘thank you’ goes a long way) providing financial support (the software is open source, but its developers need to eat) Users who continue to engage with the project and its community will often become more and more involved. Such users may find themselves becoming contributors, as described in the next section. Contributors Contributors are community members who contribute in concrete ways to the project. Anyone can become a contributor, and contributions can take many forms. There are no specific skill requirements and no selection process. There is only one expectation of commitment to the project: contributors must be respectful to each other during the review process and work together to reach compromises. 
See the “Reviewing Patches” section for more on the process of contributing. In addition to their actions as users, contributors may also find themselves doing one or more of the following: answering questions on the mailing lists, particularly the “easy” questions from new users (existing users are often the best people to support new users), or those that relate to the particular contributor’s experiences reporting bugs identifying requirements providing graphics and web design programming assisting with project infrastructure writing documentation fixing bugs adding features acting as an ambassador and helping to promote the project Contributors engage with the project through the Review Board and mailing list, or by writing or editing documentation. They submit changes to the project source code via patches submitted to Review Board, which will be considered for inclusion in the project by existing committers (see next section). The developer mailing list is the most appropriate place to ask for help when making that first contribution. As contributors gain experience and familiarity with the project, their profile within, and commitment to, the community will increase. At some stage, they may find themselves being nominated for committership. Committers Committers are community members who have shown that they are committed to the continued development of the project through ongoing engagement with the community. Committership allows contributors to more easily carry on with their project related activities by giving them direct access to the project’s resources. That is, they can make changes directly to project outputs, although they still have to submit code changes via Review Board. Additionally, committers are expected to have an ongoing record of contributions in terms of code, reviews, and/or discussion. Committers have no more authority over the project than contributors. While committership indicates a valued member of the community who has demonstrated a healthy respect for the project’s aims and objectives, their work continues to be reviewed by the community. The key difference between a committer and a contributor is that committers have the extra responsibility of pushing patches to the mainline. Additionally, committers are expected to contribute to discussions on the gem5-dev list and review patches. Anyone can become a committer. The only expectation is that a committer has demonstrated an ability to participate in the project as a team player. Specifically, refer to the 2nd paragraph of the Contributors section. Typically, a potential committer will need to show that they have an understanding of the project, its objectives and its strategy (see Philosophy section). They will also have provided valuable contributions to the project over a period of time. New committers can be nominated by any existing committer. Once they have been nominated, there will be a vote by the project management committee (PMC; see below). Committer nomination and voting is one of the few activities that takes place on the project’s private management list. This is to allow PMC members to freely express their opinions about a nominee without causing embarrassment. Once the vote has been held, the nominee is notified of the result. The nominee is entitled to request an explanation of any ‘no’ votes against them, regardless of the outcome of the vote. 
This explanation will be provided by the PMC Chair (see below) and will be anonymous and constructive in nature. Nominees may decline their appointment as a committer. However, this is unusual, as the project does not expect any specific time or resource commitment from its community members. The intention behind the role of committer is to allow people to contribute to the project more easily, not to tie them into the project in any formal way. It is important to recognise that committership is a privilege, not a right. That privilege must be earned and once earned it can be removed by the PMC (see next section) in extreme circumstances. However, under normal circumstances committership exists for as long as the committer wishes to continue engaging with the project. A committer who shows an above-average level of contribution to the project, particularly with respect to its strategic direction and long-term health, may be nominated to become a member of the PMC. This role is described below. Project management committee The project management committee consists of those individuals identified as ‘project owners’ on the development site. The PMC has additional responsibilities over and above those of a committer. These responsibilities ensure the smooth running of the project. PMC members are expected to review code contributions, participate in strategic planning, approve changes to the governance model and manage how the software is distributed and licensed. Some PMC members are responsible for specific components of the gem5 project. This includes gem5 source modules (e.g., classic caches, O3CPU model, etc.) and project assets (e.g., the website). A list of the current components and the responsible members can be found on Module owners. Members of the PMC do not have significant authority over other members of the community, although it is the PMC that votes on new committers. It also makes decisions when community consensus cannot be reached. In addition, the PMC has access to the project’s private mailing list. This list is used for sensitive issues, such as votes for new committers and legal matters that cannot be discussed in public. It is never used for project management or planning. Membership of the PMC is by invitation from the existing PMC members. A nomination will result in discussion and then a vote by the existing PMC members. PMC membership votes are subject to consensus approval of the current PMC members. Additions to the PMC require unanimous agreement of the PMC members. Removing someone from the PMC requires N-1 positive votes, where N is the number of PMC members not including the individual who is being voted out. Members Ali Saidi Andreas Hansson Andreas Sandberg Anthony Gutierrez Brad Beckmann Jason Lowe-Power Nathan Binkert Steve Reinhardt PMC Chair The PMC Chair is a single individual, voted for by the PMC members. Once someone has been appointed Chair, they remain in that role until they choose to retire, or the PMC casts a two-thirds majority vote to remove them. The PMC Chair has no additional authority over other members of the PMC: the role is one of coordinator and facilitator. The Chair is also expected to ensure that all governance processes are adhered to, and has the casting vote when any project decision fails to reach consensus. Support All participants in the community are encouraged to provide support for new users within the project management infrastructure. This support is provided as a way of growing the community. 
Those seeking support should recognise that all support activity within the project is voluntary and is therefore provided as and when time allows. Contribution Process Anyone capable of showing respect to others can contribute to the project, regardless of their skills, as there are many ways to contribute. For instance, a contributor might be active on the project mailing list and issue tracker, or might supply patches. The various ways of contributing are described in more detail in a separate document, Submitting Contributions. The developer mailing list is the most appropriate place for a contributor to ask for help when making their first contribution. See the Submitting Contributions page on the gem5 wiki for details of the gem5 contribution process. Each new contribution should be submitted as a patch to our Review Board site. Then, other gem5 developers will review your patch, possibly asking for minor changes. After the patch has received consensus (see Decision Making Process), the patch is ready to be committed to the gem5 tree. For committers, this is as simple as pushing the changeset. For contributors, a committer should push the changeset for you. If a committer does not push the changeset within a reasonable window (a couple of days), send a friendly reminder email to the gem5-dev list. Before a patch is committed to gem5, it must receive at least 2 “Ship its” from Review Board. If there are no reviews on a patch, users should send follow-up emails to the gem5-dev list asking for reviews. Reviewing Patches An important part of the contribution process is providing feedback on patches that other developers submit. The purpose of reviewing patches is to weed out obvious bugs and to ensure that the code in gem5 is of sufficient quality. All users are encouraged to review the contributions that are posted on Review Board. If you are an active gem5 user, it’s a good idea to keep your eye on the contributions that are posted there (typically by subscribing to the gem5-dev mailing list) so you can speak up when you see a contribution that could impact your use of gem5. It is far more effective to contribute your opinion in a review before a patch gets committed than to complain after the patch is committed, you update your repository, and you find that your simulations no longer work. We greatly value the efforts of reviewers to maintain gem5’s code quality and consistency. However, it is important that reviews balance the desire to maintain the quality of the code in gem5 with the need to be open to accepting contributions from a broader community. People will base their desire to contribute (or continue contributing) on how they and other contributors are received. With that in mind, here are some guidelines for reviewers: Remember that submitting a contribution is a generous act, and is very rarely a requirement for the person submitting it. It’s always a good idea to start a review with something like “thank you for submitting this contribution”. A thank-you is particularly important for new or occasional submitters. Overall, the attitude of a reviewer should be “how can we take this contribution and put it to good use”, not “what shortcomings in this work must the submitter address before the contribution can be considered worthy”. As the saying goes, “the perfect is the enemy of the good”. While we don’t want gem5 to deteriorate, we also don’t want to bypass useful functionality or improvements simply because they are not optimal. 
If the optimal solution is not likely to happen, then accepting a suboptimal solution may be preferable to having no solution. A suboptimal solution can always be replaced by the optimal solution later. Perhaps the suboptimal solution can be incrementally improved to reach that point. When asking a submitter for additional changes, consider the cost-benefit ratio of those changes. In particular, reviewers should not discount the costs of requested changes just because the cost to the reviewer is near zero. Asking for extensive changes, particularly from someone who is not a long-time gem5 developer, may be imposing a significant burden on someone who is just trying to be helpful by submitting their code. If you as a reviewer really feel that some extensive reworking of a patch is necessary, consider volunteering to make the changes yourself. Not everyone uses gem5 in the same way or has the same needs. It’s easy to reject a solution due to its flaws when it solves a problem you don’t have—so there’s no loss to you if we end up with no solution. That’s probably not an acceptable result for the person submitting the patch though. Another way to look at this point is as the flip side of the previous item: just as your cost-benefit analysis should not discount the costs to the submitter of making changes, just because the costs to you are low, it should also not discount the benefits to the submitter of accepting the submission, just because the benefits to you are low. Be independent and unbiased while commenting on review requests. Do not support a patch just because you or your organization will benefit from it or oppose it because you will need to do more work. Whether you are an individual or someone working with an organization, think about the patch from the community’s perspective. Try to keep the arguments technical and the language simple. If you make some claim about a patch, substantiate it. Decision Making Process Decisions about the future of the project are made through discussion with all members of the community, from the newest user to the most experienced PMC member. All non-sensitive project management discussion takes place on the gem5-dev mailing list. Occasionally, sensitive discussion occurs on a private list. In order to ensure that the project is not bogged down by endless discussion and continual voting, the project operates a policy of lazy consensus. This allows the majority of decisions to be made without resorting to a formal vote. Lazy consensus Decision making typically involves the following steps: Proposal Discussion Vote (if consensus is not reached through discussion) Decision Any community member can make a proposal for consideration by the community. In order to initiate a discussion about a new idea, they should send an email to the gem5-dev list or submit a patch implementing the idea to Review Board. This will prompt a review and, if necessary, a discussion of the idea. The goal of this review and discussion is to gain approval for the contribution. Since most people in the project community have a shared vision, there is often little need for discussion in order to reach consensus. In general, as long as nobody explicitly opposes a proposal, it is recognised as having the support of the community. This is called lazy consensus—that is, those who have not stated their opinion explicitly have implicitly agreed to the implementation of the proposal. Lazy consensus is a very important concept within the project. 
It is this process that allows a large group of people to efficiently reach consensus, as someone with no objections to a proposal need not spend time stating their position, and others need not spend time reading such mails. For lazy consensus to be effective, it is necessary to allow at least two weeks before assuming that there are no objections to the proposal. This requirement ensures that everyone is given enough time to read, digest and respond to the proposal. This time period is chosen so as to be as inclusive as possible of all participants, regardless of their location and time commitments. For Review Board requests, if there are no reviews after two weeks, the submitter should send a reminder email to the mailing list. Reviewers may ask patch submitters to delay submitting a patch when they have a desire to review a patch and need more time to do so. As discussed in the Contributing section, each patch should have at least two “Ship its” before it is committed. Voting Not all decisions can be made using lazy consensus. Issues such as those affecting the strategic direction or legal standing of the project must gain explicit approval in the form of a vote. Every member of the community is encouraged to express their opinions in all discussion and all votes. However, only project committers and/or PMC members (as defined above) have binding votes for the purposes of decision making. A separate document on voting within a meritocratic governance model (http://oss-watch.ac.uk/resources/meritocraticgovernancevoting) describes in more detail how voting is conducted in projects following the practice established within the Apache Software Foundation. This document is based on the example (http://oss-watch.ac.uk/resources/meritocraticgovernancemodel) by Ross Gardler and Gabriel Hanganu and is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License", |
| "url": "/governance/" |
| } |
| , |
| |
| "help": { |
| "title": "Help", |
| "content": "Help content goes here. A list item Another list item", |
| "url": "/help/" |
| } |
| , |
| |
| "publications": { |
| "title": "Publications", |
| "content": " Original Paper Special Features of gem5 GPUs DRAM Controller, DRAM Power Estimation KVM Elastic Traces SystemC Couping Other Publications related to gem5 Publications using gem5 / m5 2017 2016 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 Derivative projects MV5 gem5-gpu If you use gem5 in your research, we would appreciate a citation to the original paper in any publications you produce. Moreover, we would appreciate if you cite also the speacial features of gem5 which have been developed and contributed to the main line since the publication of the original paper in 2011. In other words, if you use feature X please also cite the according paper Y from the list below.Original Paper The gem5 Simulator. Nathan Binkert, Bradford Beckmann, Gabriel Black, Steven K. Reinhardt, Ali Saidi, Arkaprava Basu, Joel Hestness, Derek R. Hower, Tushar Krishna, Somayeh Sardashti, Rathijit Sen, Korey Sewell, Muhammad Shoaib, Nilay Vaish, Mark D. Hill, and David A. Wood. May 2011, ACM SIGARCH Computer Architecture News.Special Features of gem5GPUs Lost in Abstraction: Pitfalls of Analyzing GPUs at the Intermediate Language Level. Anthony Gutierrez, Bradford M. Beckmann, Alexandru Dutu, Joseph Gross, John Kalamatianos, Onur Kayiran, Michael LeBeane, Matthew Poremba, Brandon Potter, Sooraj Puthoor, Matthew D. Sinclair, Mark Wyse, Jieming Yin, Xianwei Zhang, Akshay Jain, Timothy G. Rogers. In Proceedings of the 24th IEEE International Symposium on High-Performance Computer Architecture (HPCA), February 2018. NoMali: Simulating a realistic graphics driver stack using a stub GPU. René de Jong, Andreas Sandberg. In Proceedings of the International Symposium on Performance Analysis of Systems and Software (ISPASS), March 2016. gem5-gpu: A Heterogeneous CPU-GPU Simulator. Jason Power, Joel Hestness, Marc S. Orr, Mark D. Hill, David A. Wood. Computer Architecture Letters vol. 13, no. 1, Jan 2014 DRAM Controller, DRAM Power Estimation Simulating DRAM controllers for future system architecture exploration. Andreas Hansson, Neha Agarwal, Aasheesh Kolli, Thomas Wenisch and Aniruddha N. Udipi. In Proceedings of the International Symposium on Performance Analysis of Systems and Software (ISPASS), March 2014. DRAMPower: Open-source DRAM Power & Energy Estimation Tool. Karthik Chandrasekar, Christian Weis, Yonghui Li, Sven Goossens, Matthias Jung, Omar Naji, Benny Akesson, Norbert Wehn, and Kees Goossens, URL: http://www.drampower.info. KVM Full Speed Ahead: Detailed Architectural Simulation at Near-Native Speed. Andreas Sandberg, Nikos Nikoleris, Trevor E. Carlson, Erik Hagersten, Stefanos Kaxiras, David Black-Schaffer. 2015 IEEE International Symposium on Workload CharacterizationElastic Traces Exploring system performance using elastic traces: Fast, accurate and portable. Radhika Jagtap, Matthias Jung, Stephan Diestelhorst, Andreas Hansson, Norbert Wehn. IEEE International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation (SAMOS), 2016SystemC Couping System Simulation with gem5 and SystemC: The Keystone for Full Interoperability. C. Menard, M. Jung, J. Castrillon, N. Wehn. IEEE International Conference on Embedded Computer Systems Architectures Modeling and Simulation (SAMOS), July, 2017Other Publications related to gem5 Enabling Realistic Logical Device Interface and Driver for NVM Express Enabled Full System Simulations. Donghyun Gouk, Jie Zhang and Myoungsoo Jung. 
IFIP International Conference on Network and Parallel Computing (NPC) and Invited for International Journal of Parallel Programming (IJPP), 2017. SimpleSSD: Modeling Solid State Drives for Holistic System Simulation. Myoungsoo Jung, Jie Zhang, Ahmed Abulila, Miryeong Kwon, Narges Shahidi, John Shalf, Nam Sung Kim and Mahmut Kandemir. IEEE Computer Architecture Letters (CAL), 2017. “dist-gem5: Distributed Simulation of Computer Clusters,” Mohammad Alian, Gabor Dozsa, Umur Darbaz, Stephan Diestelhorst, Daehoon Kim, and Nam Sung Kim. IEEE International Symposium on Performance Analysis of Systems (ISPASS), April 2017. pd-gem5: Simulation Infrastructure for Parallel/Distributed Computer Systems. Mohammad Alian, Daehoon Kim, and Nam Sung Kim. Computer Architecture Letters (CAL), 2016. A Full-System Approach to Analyze the Impact of Next-Generation Mobile Flash Storage. Rene de Jong and Andreas Hansson. In Proceedings of the International Symposium on Performance Analysis of Systems and Software (ISPASS), March 2015. Sources of Error in Full-System Simulation. A. Gutierrez, J. Pusdesris, R.G. Dreslinski, T. Mudge, C. Sudanthi, C.D. Emmons, M. Hayenga, and N. Paver. In Proceedings of the International Symposium on Performance Analysis of Systems and Software (ISPASS), March 2014. Introducing DVFS-Management in a Full-System Simulator. Vasileios Spiliopoulos, Akash Bagdia, Andreas Hansson, Peter Aldworth and Stefanos Kaxiras. In Proceedings of the 21st International Symposium on Modeling, Analysis & Simulation of Computer and Telecommunication Systems (MASCOTS), August 2013. Accuracy Evaluation of GEM5 Simulator System. A. Butko, R. Garibotti, L. Ost, and G. Sassatelli. In the proceedings of the IEEE International Workshop on Reconfigurable Communication-centric Systems-on-Chip (ReCoSoC), York, United Kingdom, July 2012. The M5 Simulator: Modeling Networked Systems. N. L. Binkert, R. G. Dreslinski, L. R. Hsu, K. T. Lim, A. G. Saidi, S. K. Reinhardt. IEEE Micro, vol. 26, no. 4, pp. 52-60, July/August, 2006. Multifacet’s General Execution-driven Multiprocessor Simulator (GEMS) Toolset. Milo M.K. Martin, Daniel J. Sorin, Bradford M. Beckmann, Michael R. Marty, Min Xu, Alaa R. Alameldeen, Kevin E. Moore, Mark D. Hill, and David A. Wood. Computer Architecture News (CAN), September 2005. Publications using gem5 / m5 2017 [https://chess.eecs.berkeley.edu/pubs/1194/KimEtAl_CyPhy17.pdf An Integrated Simulation Tool for Computer Architecture and Cyber-Physical Systems]. Hokeun Kim, Armin Wasicek, and Edward A. Lee. In Proceedings of the 6th Workshop on Design, Modeling and Evaluation of Cyber-Physical Systems (CyPhy’17), Seoul, Korea, October 19, 2017. [http://www.lirmm.fr/~sassate/ADAC/wp-content/uploads/2017/06/opensuco17.pdf Efficient Programming for Multicore Processor Heterogeneity: OpenMP versus OmpSs]. Anastasiia Butko, Florent Bruguier, Abdoulaye Gamatié and Gilles Sassatelli. In Open Source Supercomputing (OpenSuCo’17) Workshop co-located with ISC’17, June 2017. [https://hal-lirmm.ccsd.cnrs.fr/lirmm-01467328 MAGPIE: System-level Evaluation of Manycore Systems with Emerging Memory Technologies]. Thibaud Delobelle, Pierre-Yves Péneau, Abdoulaye Gamatié, Florent Bruguier, Sophiane Senni, Gilles Sassatelli and Lionel Torres, 2nd International Workshop on Emerging Memory Solutions (EMS) co-located with DATE’17, March 2017. 2016 [http://ieeexplore.ieee.org/document/7776838 An Agile Post-Silicon Validation Methodology for the Address Translation Mechanisms of Modern Microprocessors]. G. Papadimitriou, A. 
Chatzidimitriou, D. Gizopoulos, R. Morad, IEEE Transactions on Device and Materials Reliability (TDMR 2016), Volume: PP, Issue: 99, December 2016. [http://ieeexplore.ieee.org/document/7753339Unveiling Difficult Bugs in Address Translation Caching Arrays for Effective Post-Silicon Validation]. G. Papadimitriou, D. Gizopoulos, A. Chatzidimitriou, T. Kolan, A. Koyfman, R. Morad, V. Sokhin, IEEE International Conference on Computer Design (ICCD 2016), Phoenix, AZ, USA, October 2016. [http://ieeexplore.ieee.org/document/7833682/Loop optimization in presence of STT-MRAM caches: A study of performance-energy tradeoffs]. Pierre-Yves Péneau, Rabab Bouziane, Abdoulaye Gamatié, Erven Rohou, Florent Bruguier, Gilles Sassatelli, Lionel Torres and Sophiane Senni, 26th International Workshop on Power and Timing Modeling, Optimization and Simulation (PATMOS), September 21-23 2016. [http://ieeexplore.ieee.org/abstract/document/7774439Full-System Simulation of big.LITTLE Multicore Architecture for Performance and Energy Exploration]. Anastasiia Butko, Florent Bruguier, Abdoulaye Gamatié, Gilles Sassatelli, David Novo, Lionel Torres and Michel Robert. Embedded Multicore/Many-core Systems-on-Chip (MCSoC), 2016 IEEE 10th International Symposium on, September 21-23, 2016. [http://ieeexplore.ieee.org/document/7448986Exploring MRAM Technologies for Energy Efficient Systems-On-Chip]. Sophiane Senni, Lionel Torres, Gilles Sassatelli, Abdoulaye Gamatié and Bruno Mussard, IEEE Journal on Emerging and Selected Topics in Circuits and Systems , Volume: 6, Issue: 3, Sept. 2016. [https://cpc2016.infor.uva.es/wp-content/uploads/2016/06/CPC2016_paper_11.pdfArchitectural exploration of heterogeneous memory systems]. Marcos Horro, Gabriel Rodríguez, Juan Touriño and Mahmut T. Kandemir. 19th Workshop on Compilers for Parallel Computing (CPC), July 2016. [http://ieeexplore.ieee.org/document/7604675ISA-Independent Post-Silicon Validation for the Address Translation Mechanisms of Modern Microprocessors]. G. Papadimitriou, A. Chatzidimitriou, D. Gizopoulos and R. Morad, IEEE International Symposium on On-Line Testing and Robust System Design (IOLTS 2016), Sant Feliu de Guixols, Spain, July 2016. Anatomy of microarchitecture-level reliability assessment: Throughput and accuracy. A.Chatzidimitriou, D.Gizopoulos, IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), Uppsala, Sweden, April 2016. Agave: A benchmark suite for exploring the complexities of the Android software stack. Martin Brown, Zachary Yannes, Michael Lustig, Mazdak Sanati, Sally A. McKee, Gary S. Tyson, Steven K. Reinhardt, IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), Uppsala, Sweden, April 2016. 2015 [http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7314163Differential Fault Injection on Microarchitectural Simulators]. M.Kaliorakis, S.Tselonis, A.Chatzidimitriou, N.Foutris, D.Gizopoulos, IEEE International Symposium on Workload Characterization (IISWC), Atlanta, GA, USA, October 2015. Live Introspection of Target-Agnostic JIT in Simulation. B. Shingarov. International Workshop IWST’15 in cooperation with ACM, Brescia, Italy, 2015. Security in MPSoCs: A NoC Firewall and an Evaluation Framework. M.D. Grammatikakis, K. Papadimitriou, P. Petrakis, A. Papagrigoriou, G. Kornaros, I. Christoforakis, O. Tomoutzoglou, G. Tsamis and M. Coppola. In IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (TCAD), vol.34, no.8, pp.1344-1357, Aug. 
2015 DPCS: Dynamic Power/Capacity Scaling for SRAM Caches in the Nanoscale Era. Mark Gottscho, Abbas BanaiyanMofrad, Nikil Dutt, Alex Nicolau, and Puneet Gupta. ACM Transactions on Architecture and Code Optimization (TACO), Vol. 12, No. 3, Article 27. Pre-print June 2015, published August 2015, print October 2015. A predictable and command-level priority-based DRAM controller for mixed-criticality systems. Hokeun Kim, David Broman, Edward A. Lee, Michael Zimmer, Aviral Shrivastava, Junkwang Oh. Proceedings of the 21st IEEE Real-Time and Embedded Technology and Application Symposium (RTAS), Seattle, WA, USA, April, 2015. Security Enhancements for Building Saturation-free, Low-Power NoC-based MPSoCs. Kyprianos Papadimitriou, Polydoros Petrakis, Miltos Grammatikakis, Marcello Coppola. In IEEE Conference on Communications and Network Security (CNS) - 1st IEEE Workshop on Security and Privacy in Cybermatics, Florence, Italy, 2015 Design Exploration For Next Generation High-Performance Manycore On-chip Systems: Application To big.LITTLE Architectures. Anastasiia Butko, Abdoulaye Gamatie, Gilles Sassatelli, Lionel Torres and Michel Robert. VLSI (ISVLSI), 2015 IEEE Computer Society Annual Symposium on, July 10, 2015 [http://dx.doi.org/10.1007/s11227-014-1375-7 Gem5v: a modified gem5 for simulating virtualized systems]. Seyed Hossein Nikounia, Siamak Mohammadi. Springer Journal of Supercomputing. The source code is available [https://github.com/nikoonia/gem5v here]. Micro-architectural simulation of embedded core heterogeneity with gem5 and McPAT. Fernando A. Endo, Damien Couroussé, Henri-Pierre Charles. RAPIDO ‘15 Proceedings of the 2015 Workshop on Rapid Simulation and Performance Evaluation: Methods and Tools. January 2015. A trace-driven approach for fast and accurate simulation of manycore architectures. Anastasiia Butko, Rafael Garibotti, Luciano Ost, Vianney Lapotre, Abdoulaye Gamatie, Gilles Sassatelli and Chris Adeniyi-Jones. Design Automation Conference (ASP-DAC), 2015 20th Asia and South Pacific. January 19, 2015 2014 Evaluating Private vs. Shared Last-Level Caches for Energy Efficiency in Asymmetric Multi-Cores. A. Gutierrez, R.G. Dreslinski, and Trevor Mudge. In Proceedings of the 14th International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS), 2014. [http://dx.doi.org/10.1109/HPCC.2014.173 Security Effectiveness and a Hardware Firewall for MPSoCs]. M. D. Grammatikakis, K. Papadimitriou, P. Petrakis, A. Papagrigoriou, G. Kornaros, I. Christoforakis and M. Coppola. In 16th IEEE International Conference on High Performance Computing and Communications - Workshop on Multicore and Multithreaded Architectures and Algorithms, 2014, pp. 1032-1039 Aug. 2014 [http://dx.doi.org/10.1145/2541940.2541951 Integrated 3D-Stacked Server Designs for Increasing Physical Density of Key-Value Stores]. Anthony Gutierrez, Michael Cieslak, Bharan Giridhar, Ronald G. Dreslinski, Luis Ceze, and Trevor Mudge. ASPLOS XIX [http://dx.doi.org/10.1145/2593069.2593184 Power / Capacity Scaling: Energy Savings With Simple Fault-Tolerant Caches]. Mark Gottscho, Abbas BanaiyanMofrad, Nikil Dutt, Alex Nicolau, and Puneet Gupta. DAC, 2014. ”‘Write-Aware Replacement Policies for PCM-Based Systems “’. R. Rodríguez-Rodríguez, F. Castro, D. Chaver*, R. Gonzalez-Alberquilla, L. Piñuel and F. Tirado. The Computer Journal, 2014. ”‘Micro-architectural simulation of in-order and out-of-order ARM microprocessors with gem5 “’. Fernando A. Endo, Damien Couroussé, Henri-Pierre Charles. 
2014 International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS XIV). July 2014. 2013 Continuous Real-World Inputs Can Open Up Alternative Accelerator Designs. Bilel Belhadj, Antoine Joubert, Zheng Li, Rodolphe Héliot, and Olivier Temam. ISCA ‘13 Cache Coherence for GPU Architectures. Inderpreet Singh, Arrvindh Shriraman, Wilson WL Fung, Mike O’Connor, and Tor M. Aamodt. HPCA, 2013. Navigating Heterogeneous Processors with Market Mechanisms. Marisabel Guevara, Benjamin Lubin, and Benjamin C. Lee. HPCA, 2013 Power Struggles: Revisiting the RISC vs. CISC Debate on Contemporary ARM and x86 Architectures. Emily Blem, Jaikrishnan Menon, and Karthikeyan Sankaralingam. HPCA 2013. Coset coding to extend the lifetime of memory. Adam N. Jacobvitz, Robert Calderbank, Daniel J. Sorin. HPCA ‘13. The McPAT Framework for Multicore and Manycore Architectures: Simultaneously Modeling Power, Area, and Timing. Sheng Li, Jung Ho Ahn, Richard D. Strong, Jay B. Brockman, Dean M. Tullsen, Norman P. Jouppi. ACM Transactions on Architecture and Code Optimization (TACO), Volume 10, Issue 1, April 2013 Optimization and Mathematical Modeling in Computer Architecture Nowatzki, T., Ferris, M., Sankaralingam, K., Estan, C., Vaish, N., & Wood, David A. (2013). Synthesis Lectures on Computer Architecture, 8(4), 1-144. Limits of Parallelism and Boosting in Dim Silicon. Nathaniel Pinckney, Ronald G. Dreslinski, Korey Sewell, David Fick, Trevor Mudge, Dennis Sylvester, David Blaauw, IEEE Micro, vol. 33, no. 5, pp. 30-37, Sept.-Oct., 20132012 Hardware Prefetchers for Emerging Parallel Applications, Biswabandan Panda, Shankar Balachandran. In the proceedings of the IEEE/ACM Parallel Architectures and Compilation Techniques,PACT, Minneapolis, October 2012. Lazy Cache Invalidation for Self-Modifying Codes. A. Gutierrez, J. Pusdesris, R.G. Dreslinski, and T. Mudge. In the proceedings of the International Conference on Compilers, Architecture and Synthesis for Embedded Systems (CASES), Tampere, Finland, October 2012. Accuracy Evaluation of GEM5 Simulator System. A. Butko, R. Garibotti, L. Ost, and G. Sassatelli. In the proceeding of the IEEE International Workshop on Reconfigurable Communication-centric Systems-on-Chip (ReCoSoC), York, United Kingdom, July 2012. Viper: Virtual Pipelines for Enhanced Reliability. A. Pellegrini, J. L. Greathouse, and V. Bertacco. In the proceedings of the International Symposium on Computer Architecture (ISCA), Portland, OR, June 2012. Reducing memory reference energy with opportunistic virtual caching. Arkaprava Basu, Mark D. Hill, Michael M. Swift. In the proceedings of the 39th International Symposium on Computer Architecture (ISCA 2012). Cache Revive: Architecting Volatile STT-RAM Caches for Enhanced Performance in CMPs. Adwait Jog, Asit Mishra, Cong Xu, Yuan Xie, V. Narayanan, Ravi Iyer, Chita Das. In the proceedings oF the IEEE/ACM Design Automation Conference (DAC), San Francisco, CA, June 2012.2011 Full-System Analysis and Characterization of Interactive Smartphone Applications. A. Gutierrez, R.G. Dreslinski, T.F. Wenisch, T. Mudge, A. Saidi, C. Emmons, and N. Paver. In the proceeding of the IEEE International Symposium on Workload Characterization (IISWC), pages 81-90, Austin, TX, November 2011. Universal Rules Guided Design Parameter Selection for Soft Error Resilient Processors, L. Duan, Y. Zhang, B. Li, and L. Peng. 
Proceedings of the International Symposium on Performance Analysis of Systems and Software(ISPASS), Austin, TX, April 2011.2010 Using Hardware Vulnerability Factors to Enhance AVF Analysis, V. Sridharan, D. R. Kaeli. Proceedings of the International Symposium on Computer Architecture (ISCA-37), Saint-Malo, France, June 2010. Leveraging Unused Cache Block Words to Reduce Power in CMP Interconnect, H. Kim, P. Gratz. IEEE Computer Architecture Letters, vol. 99, (RapidPosts), 2010. A Fast Timing-Accurate MPSoC HW/SW Co-Simulation Platform based on a Novel Synchronization Scheme, Mingyan Yu, Junjie Song, Fangfa Fu, Siyue Sun, and Bo Liu. Proceedings of the International MultiConfernce of Engineers and Computer Scientists. 2010 pdf Simulation of Standard Benchmarks in Hardware Implementations of L2 Cache Models in Verilog HDL, Rosario M. Reas, Anastacia B. Alvarez, Joy Alinda P. Reyes, Computer Modeling and Simulation, International Conference on, pp. 153-158, 2010 12th International Conference on Computer Modelling and Simulation, 2010 A Simulation of Cache Sub-banking and Block Buffering as Power Reduction Techniques for Multiprocessor Cache Design, Jestoni V. Zarsuela, Anastacia Alvarez, Joy Alinda Reyes, Computer Modeling and Simulation, International Conference on, pp. 515-520, 2010 12th International Conference on Computer Modelling and Simulation, 20102009 Efficient Implementation of Decoupling Capacitors in 3D Processor-DRAM Integrated Computing Systems. Q. Wu, J. Lu, K. Rose, and T. Zhang. Great Lakes Symposium on VLSI. 2009. Evaluating the Impact of Job Scheduling and Power Management on Processor Lifetime for Chip Multiprocessors. A. K. Coskun, R. Strong, D. M. Tullsen, and T. S. Rosing. Proceedings of the eleventh international joint conference on Measurement and modeling of computer systems. 2009. ” Devices and architectures for photonic chip-scale integration.” J. Ahn, M. Fiorentino1, R. G. Beausoleil, N. Binkert, A. Davis, D. Fattal, N. P. Jouppi, M. McLaren, C. M. Santori, R. S. Schreiber, S. M. Spillane, D. Vantrease and Q. Xu. Journal of Applied Physics A: Materials Science & Processing. February 2009. System-Level Power, Thermal and Reliability Optimization. C. Zhu. Thesis at Queen’s University. 2009. A light-weight fairness mechanism for chip multiprocessor memory systems. M. Jahre, L. Natvig. Proceedings of the 6th ACM conference on Computing Frontiers. 2009. Decoupled DIMM: building high-bandwidth memory system using low-speed DRAM devices. H. Zheng, J. Lin, Z. Zhang, and Z. Zhu. International Symposium on Computer Architecture (ISCA). 2009. On the Performance of Commit-Time-Locking Based Software Transactional Memory. Z. He and B. Hong. The 11th IEEE International Conference on. High Performance Computing and Communications (HPCC-09). 2009. A Quantitative Study of Memory System Interference in Chip Multiprocessor Architectures. M. Jahre, M. Grannaes and L. Natvig. The 11th IEEE International Conference on. High Performance Computing and Communications (HPCC-09). 2009. Hardware Support for Debugging Message Passing Applications for Many-Core Architectures. C. Svensson. Masters Thesis at the University of Illinois at Urbana-Champaign, 2009. Initial Experiments in Visualizing Fine-Grained Execution of Parallel Software Through Cycle-Level Simulation. R. Strong, J. Mudigonda, J. C. Mogul, N. Binkert. USENIX Workshop on Hot Topics in Parallelism (HotPar). 2009. 
MPreplay: Architecture Support for Deterministic Replay of Message Passing Programs on Message Passing Many-core Processors. C. Erik-Svensson, D. Kesler, R. Kumar, and G. Pokam. University of Illinois Technical Report number UILU-09-2209. Low-power Inter-core Communication through Cache Partitioning in Embedded Multiprocessors. C. Yu, X. Zhou, and P. Petrov .Symposium on Integrated Circuits and System Design (sbcci). 2009. Integrating NAND flash devices onto servers. D. Roberts, T. Kgil, T. Mudge. Communications of the ACM (CACM). 2009. A High-Performance Low-Power Nanophotonic On-Chip Network. Z. Li, J. Wu, L. Shang, A. Mickelson, M. Vachharajani, D. Filipovic, W. Park∗ and Y. Sun. International Symposium on Low Power Electronic Design (ISLPED). 2009. Core monitors: monitoring performance in multicore processors. P. West, Y. Peress, G. S. Tyson, and S. A. McKee. Computing Frontiers. 2009. Parallel Assertion Processing using Memory Snapshots. M. F. Iqbal, J. H. Siddiqui, and D. Chiou. Workshop on Unique Chips and Systems (UCAS). April 2009. Leveraging Memory Level Parallelism Using Dynamic Warp Subdivision. J. Meng, D. Tarjan, and K. Skadron. Univ. of Virginia Dept. of Comp. Sci. Tech Report (CS-2009-02). Reconfigurable Multicore Server Processors for Low Power Operation. R. G. Dreslinski, D. Fick, D. Blaauw, D. Sylvester and T. Mudge. 9th International Symposium on Systems, Architectures, MOdeling and Simulation (SAMOS). July 2009. Near Threshold Computing: Overcoming Performance Degradation from Aggressive Voltage Scaling R. G. Dreslinski, M. Wieckowski, D. Blaauw, D. Sylvester, and T. Mudge. Workshop on Energy Efficient Design (WEED), June 2009. Workload Adaptive Shared Memory Multicore Processors with Reconfigurable Interconnects. S. Akram, R. Kumar, and D. Chen. IEEE Symposium on Application Specific Processors, July 2009. Eliminating Microarchitectural Dependency from Architectural Vulnerability. V. Sridharan, D. R. Kaeli. Proceedings of the 15th International Symposium on High-Performance Computer Architecture (HPCA-15), February 2009. Producing Wrong Data Without Doing Anything Obviously Wrong! T. Mytkowicz, A. Diwan, M. Hauswirth, P. F. Sweeney. Proceedings of the 14th international conference on Architectural support for programming languages and operating systems (ASPLOS). 2009. End-To-End Performance Forecasting: Finding Bottlenecks Before They Happen A. Saidi, N. Binkert, S. Reinhardt, T. Mudge. Proceedings of the 36th International Symposium on Computer Architecture (ISCA-36), June 2009. Fast Switching of Threads Between Cores. R. Strong, J. Mudigonda, J. C. Mogul, N. Binkert, D. Tullsen. ACM SIGOPS Operating Systems Review. 2009. Express Cube Topologies for On-Chip Interconnects. B. Grot, J. Hestness, S. W. Keckler, O. Mutlu. Proceedings of the 15th International Symposium on High-Performance Computer Architecture (HPCA-15), February 2009. Enhancing LTP-Driven Cache Management Using Reuse Distance Information. W. Lieu, D. Yeung. Journal of Instruction-Level Parallelism 11 (2009).2008 Analyzing the Impact of Data Prefetching on Chip MultiProcessors. N. Fukumoto, T. Mihara, K. Inoue, and K. Murakami. Asia-Pacific Computer Systems Architecture Conference. 2008. Historical Study of the Development of Branch Predictors. Y. Peress. Masters Thesis at Florida State University. 2008. Hierarchical Domain Partitioning For Hierarchical Architectures. J. Meng, S. Che, J. W. Sheaffer, J. Li, J. Huang, and K. Skadron. Univ. of Virginia Dept. of Comp. Sci. Tech Report CS-2008-08. 2008. 
Memory Access Scheduling Schemes for Systems with Multi-Core Processors. H. Zheng, J. Lin, Z. Zhang, and Z. Zhu. International Conference on Parallel Processing, 2008. Register Multimapping: Reducing Register Bank Conflicts Through One-To-Many Logical-To-Physical Register Mapping. N. L. Duong and R. Kumar. Tehnical Report CHRC-08-07. Cross-Layer Custimization Platform for Low-Power and Real-Time Embedded Applications. X. Zhou. Dissertation at the University of Maryland. 2008. Probabilistic Replacement: Enabling Flexible Use of Shared Caches for CMPs. W. Liu and D. Yeung. University of Maryland Technical Report UMIACS-TR-2008-13. 2008. Observer Effect and Measurement Bias in Performance Analysis. T. Mytkowicz, P. F. Sweeney, M. Hauswirth, and A. Diwan. University of Colorado at Boulder Technical Report CU-CS 1042-08. June, 2008. Power-Aware Dynamic Cache Partitioning for CMPs. I. Kotera, K. Abe, R. Egawa, H. Takizawa, and H. Kobayashi. 3rd International Conference on High Performance and Embedded Architectures and Compilers (HiPEAC). 2008. Modeling of Cache Access Behavior Based on Zipf’s Law. I. Kotera, H. Takizawa, R. Egawa, H. Kobayashi. MEDEA 2008. Hierarchical Verification for Increasing Performance in Reliable Processors. J. Yoo, M. Franklin. Journal of Electronic Testing. 2008. Transaction-Aware Network-on-Chip Resource Reservation. Z. Li, C. Zhu, L. Shang, R. Dick, Y. Sun. Computer Architecture Letters. Volume PP, Issue 99, Page(s):1 - 1. Predictable Out-of-order Execution Using Virtual Traces. J. Whitham, N. Audsley. Proceedings of the 29th IEEE Real-time Systems Symposium, December 2008. pdf Architectural and Compiler Mechanisms for Acelerating Single Thread Applications on Multicore Processors. H. Zhong. Dissertation at The University of Michigan. 2008. Mini-Rank: Adaptive DRAM Architecture for Improving Memory Power Efficiency. H. Zheng, J. Lin, Z. Zhang, E. Gorbatov, H. David, Z. Zhu. Proceedings of the 41st Annual Symposium on Microarchitecture (MICRO-41), November 2008. Reconfigurable Energy Efficient Near Threshold Cache Architectures. R. Dreslinski, G. Chen, T. Mudge, D. Blaauw, D. Sylvester, K. Flautner. Proceedings of the 41st Annual Symposium on Microarchitecture (MICRO-41), November 2008. Distributed and low-power synchronization architecture for embedded multiprocessors. C. Yu, P. Petrov. Internation Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), October 2008. Thermal Monitoring Mechanisms for Chip Multiprocessors. J. Long, S.O. Memik, G. Memik, R. Mukherjee. ACM Transactions on Architecture and Code Optimization (TACO), August 2008. Multi-optimization power management for chip multiprocessors. K. Meng, R. Joseph, R. Dick, L. Shang. Proceedings of the 17th international conference on Parallel Architectures and Compilation Techniques (PACT), 2008. ” Three-Dimensional Chip-Multiprocessor Run-Time Thermal Management.” C. Zhu, Z. Gu, L. Shang, R.P. Dick, R. Joseph. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (TCAD), August 2008. ” Latency and bandwidth efficient communication through system customization for embedded multiprocessors”. C. Yu and P. Petrov. DAC 2008, June 2008. Corona: System Implications of Emerging Nanophotonic Technology. D. Vantrease, R. Schreiber, M. Monchiero, M. McLaren, N., P. Jouppi, M. Fiorentino, A. Davis, N. Binkert, R. G. Beausoleil, and J. Ahn. Proceedings of the 35th International Symposium on Computer Architecture (ISCA-35), June 2008. Improving NAND Flash Based Disk Caches. T. 
Kgil, D. Roberts and T. N. Mudge. Proceedings of the 35th International Symposium on Computer Architecture (ISCA-35), June 2008. A Taxonomy to Enable Error Recovery and Correction in Software. V. Sridharan, D. A. Liberty, and D. R. Kaeli. Workshop on Quality-Aware Design (W-QUAD), in conjunction with the 35th International Symposium on Computer Architecture (ISCA-35), June 2008. Quantifying Software Vulnerability. V. Sridharan and D. R. Kaeli. First Workshop on Radiation Effects and Fault Tolerance in Nanometer Technologies, in conjunction with the ACM International Conference on Computing Frontiers, May 2008. Core Monitors: Monitoring Performance in Multicore Processors. P. West. Masters Thesis at Florida State University. April 2008. Full System Critical Path Analysis. A. Saidi, N. Binkert, T. N. Mudge, and S. K. Reinhardt. 2008 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), April 2008. A Power and Temperature Aware DRAM Architecture. S. Liu, S. O. Memik, Y. Zhang, G. Memik. 45th annual conference on Design automation (DAC), 2008. Streamware: Programming General-Purpose Multicore Processors Using Streams. J. Gummaraju, J. Coburn, Y. Turner, M. Rosenblum. Procedings of the Thirteenth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), March 2008. Application-aware snoop filtering for low-power cache coherence in embedded multiprocessors. X. Zhou, C. Yu, A. Dash, and P. Petrov. Transactions on Design Automation of Electronic Systems (TODAES). January 2008. An approach for adaptive DRAM temperature and power management. Song Liu, S. O. Memik, Y. Zhang, and G. Memik. Proceedings of the 22nd annual international conference on Supercomputing. 2008.2007 Modeling and Characterizing Power Variability in Multicore Architectures. K. Meng, F. Huebbers, R, Joseph, and Y. Ismail. ISPASS-2007. A High Performance Adaptive Miss Handling Architecture for Chip Multiprocessors. M. Jahre, and L. Natvig. HiPEAC Journal 2007. Performance Effects of a Cache Miss Handling Architecture in a Multi-core Processor. M. Jahre and L. Natvig. NIK-2007 conference. 2007. Prioritizing Verification via Value-based Correctness Criticality. J. Yoo, M. Franklin. Proceedings of the 25th International Conference on Computer Design (ICCD), 2007. DRAM-Level Prefetching for Fully-Buffered DIMM: Design, Performance and Power Saving. J. Lin, H. Zheng, Z. Zhu, Z. Zhang ,H. David. ISPASS 2007. ” Virtual Exclusion: An architectural approach to reducing leakage energy in caches for multiprocessor systems”. M. Ghosh, H. Lee. Proceedings of the International Conference on Parallel and Distributed Systems. December 2007. Dependability-Performance Trade-off on Multiple Clustered Core Processors. T. Funaki, T. Sato. Proceedings of the 4th International Workshop on Dependable Embedded Systems. October 2007. Predictive Thread-to-Core Assignment on a Heterogeneous Multi-core Processor. T. Sondag, V. Krishnamurthy, H. Rajan. PLOS ‘07: ACM SIGOPS 4th Workshop on Programming Languages and Operating Systems. October 2007. Power deregulation: eliminating off-chip voltage regulation circuitry from embedded systems. S. Kim, R. P. Dick, R. Joseph. 5th IEEE/ACM International Conference on Hardware/Software Co-Design and System Synthesis (CODES+ISSS). October 2007. Aggressive Snoop Reduction for Synchronized Producer-Consumer Communication in Energy-Efficient Embedded Multi-Processors. C. Yu, P. Petrov. 
5th IEEE/ACM International Conference on Hardware/Software Co-Design and System Synthesis (CODES+ISSS). October 2007. Three-Dimensional Multiprocessor System-on-Chip Thermal Optimization. C. Sun, L. Shang, R.P. Dick. 5th IEEE/ACM International Conference on Hardware/Software Co-Design and System Synthesis (CODES+ISSS). October 2007. Sampled Simulation for Multithreaded Processors. M. Van Biesbrouck. (Thesis) UC San Diego Technical Report CS2007-XXXX. September 2007. Representative Multiprogram Workloads for Multithreaded Processor Simulation. M. Van Biesbroucky, L. Eeckhoutz, B. Calder. IEEE International Symposium on Workload Characterization (IISWC). September 2007. The Interval Page Table: Virtual Memory Support in Real-Time and Memory-Constrained Embedded Systems. X. Zhou, P. Petrov. Proceedings of the 20th annual conference on Integrated circuits and systems design. 2007. A power-aware shared cache mechanism based on locality assessment of memory reference for CMPs. I. Kotera, R. Egawa, H. Takizawa, H. Kobayashi. Proceedings of the 2007 workshop on MEmory performance: DEaling with Applications, systems and architecture (MEDEA). September 2007. Architectural Support for the Stream Execution Model on General-Purpose Processors. J. Gummaraju, M. Erez, J. Coburn, M. Rosenblum, W. J. Dally. The Sixteenth International Conference on Parallel Architectures and Compilation Techniques (PACT). September 2007. An Energy Efficient Parallel Architecture Using Near Threshold Operation. R. Dreslinski, B. Zhai, T. Mudge, D. Blaauw, D. Sylvester. The Sixteenth International Conference on Parallel Architectures and Compilation Techniques (PACT). September 2007. When Homogeneous becomes Heterogeneous: Wearout Aware Task Scheduling for Streaming Applications. D. Roberts, R. Dreslinski, E. Karl, T. Mudge, D. Sylvester, D. Blaauw. Workshop on Operationg System Support for Heterogeneous Multicore Architectures (OSHMA). September 2007. ” On-Chip Cache Device Scaling Limits and Effective Fault Repair Techniques in Future Nanoscale Technology”. D. Roberts, N. Kim,T. Mudge. Digital System Design Architectures, Methods and Tools (DSD). August 2007. Energy Efficient Near-threshold Chip Multi-processing. B. Zhai, R. Dreslinski, D. Blaauw, T. Mudge, D. Sylvester. International Symposium on Low Power Electronics and Design (ISLPED). August 2007. ” A Burst Scheduling Access Reordering Mechanism”. J. Shao, B.T. Davis. IEEE 13th International Symposium on High Performance Computer Architecture (HPCA). 2007. Enhancing LTP-Driven Cache Management Using Reuse Distance Information. W. Liu, D. Yeung. University of Maryland Technical Report UMIACS-TR-2007-33. June 2007. Thermal modeling and management of DRAM memory systems. J. Lin, H. Zheng, Z. Zhu, H. David, and Z. Zhang. Proceedings of the 34th Annual international Symposium on Computer Architecture (ISCA). June 2007. Duplicating and Verifying LogTM with OS Support in the M5 Simulator. G. Blake, T. Mudge. Sixth Annual Workshop on Duplicating, Deconstructing, and Debunking (WDDD). June 2007. Analysis of Hardware Prefetching Across Virtual Page Boundaries. R. Dreslinski, A. Saidi, T. Mudge, S. Reinhardt. Proc. of the 4th Conference on Computing Frontiers. May 2007. Reliability in the Shadow of Long-Stall Instructions. V. Sridharan, D. Kaeli, A. Biswas. Third Workshop on Silicon Errors in Logic - System Effects (SELSE-3). April 2007. Extending Multicore Architectures to Exploit Hybrid Parallelism in Single-thread Applications. H. Zhong, S. A. Lieberman, S. A. Mahlke. Proc. 
13th Intl. Symposium on High Performance Computer Architecture (HPCA). February 2007.2006 Evaluation of the Data Vortex Photonic All-Optical Path Interconnection Network for Next-Generation Supercomputers. W. C. Hawkins. Dissertation at Georgia Tech. December 2006. Running the manual: an approach to high-assurance microkernel development. P. Derrin, K. Elphinstone, G. Klein, D. Cock, M. M. T. Chakravarty. Proceedings of the 2006 ACM SIGPLAN workshop on Haskell. 2006. The Filter Checker: An Active Verification Management Approach. J. Yoo, M. Franklin. 21st IEEE International Symposium on Defect and Fault-Tolerance in VLSI Systems (DFT’06), 2006. Physical Resource Matching Under Power Asymmetry. K. Meng, F. Huebbers, R. Joseph, Y. Ismail. Presented at the 2006 P=ac2 Conference. 2006. pdf Process Variation Aware Cache Leakage Management. K. Meng, R. Joseph. Proceedings of the 2006 International Symposium on Low Power Electronics and Design (ISLPED). October 2006. FlashCache: a NAND flash memory file cache for low power web servers. T. Kgil, T. Mudge. Proceedings of the 2006 international conference on Compilers, Architecture and Synthesis for Embedded Systems (CASES). October 2006. PicoServer: Using 3D Stacking Technology To Enable A Compact Energy Efficient Chip Multiprocessor. T. Kgil, S. D’Souza, A. Saidi, N. Binkert, R. Dreslinski, S. Reinhardt, K. Flautner, T. Mudge. 12th Int’l Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS). October 2006. Integrated Network Interfaces for High-Bandwidth TCP/IP. N. L. Binkert, A. G. Saidi, S. K. Reinhardt. 12th Int’l Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS). October 2006. Communist, utilitarian, and capitalist cache policies on CMPs: caches as a shared resource. L. R. Hsu, S. K. Reinhardt, R. Iyer, S. Makineni. Proc. 15th Int’l Conf. on Parallel Architectures and Compilation Techniques (PACT), September 2006. Impact of CMP Design on High-Performance Embedded Computing. P. Crowley, M. A. Franklin, J. Buhler, and R. D. Chamberlain. Proc. of 10th High Performance Embedded Computing Workshop. September 2006. BASS: A Benchmark suite for evaluating Architectural Security Systems. J. Poe, T. Li. ACM SIGARCH Computer Architecture News. Vol. 34, No. 4, September 2006. The M5 Simulator: Modeling Networked Systems. N. L. Binkert, R. G. Dreslinski, L. R. Hsu, K. T. Lim, A. G. Saidi, S. K. Reinhardt. IEEE Micro, vol. 26, no. 4, pp. 52-60, July/August, 2006.Link Considering All Starting Points for Simultaneous Multithreading Simulation. M. Van Biesbrouck, L. Eeckhout, B. Calder. Proc. of the Int’l Symp. on Performance Analysis of Systems and Software (ISPASS). 2006.pdf Dynamic Thread Assignment on Heterogeneous Multiprocessor Architectures. M. Becchi, P. Crowley. Proc. of the 3rd Conference on Computing Frontiers. pp29-40. May 2006. pdf Integrated System Architectures for High-Performance Internet Servers. N. L. Binkert. Dissertation at the University of Michigan. February 2006. Exploring Salvage Techniques for Multi-core Architectures. R. Joseph. 2nd Workshop on High Performance Computing Reliability Issues. February 2006. pdf A Simple Integrated Network Interface for High-Bandwidth Servers. N. L. Binkert, A. G. Saidi, S. K. Reinhardt. University of Michigan Technical Report CSE-TR-514-06, January 2006. pdf 2005 Software Defined Radio - A High Performance Embedded Challenge. H. lee, Y. Lin, Y. Harel, M. Woh, S. Mahlke, T. Mudge, K. Flautner. Proc. 2005 Int’l Conf. 
on High Performance Embedded Architectures and Compilers (HiPEAC). November 2005. pdf How to Fake 1000 Registers. D. W. Oehmke, N. L. Binkert, S. K. Reinhardt, and T. Mudge. Proc. 38th Ann. Int’l Symp. on Microarchitecture (MICRO), November 2005. pdf Virtualizing Register Context. D. W. Oehmke. Dissertation at the University of Michigan, 2005. pdf Performance Validation of Network-Intensive Workloads on a Full-System Simulator. A. G. Saidi, N. L. Binkert, L. R. Hsu, and S. K. Reinhardt. First Ann. Workshop on Iteraction between Operating System and Computer Architecture (IOSCA), October 2005. pdf An extended version appears as University of Michigan Technical Report CSE-TR-511-05, July 2005. pdf Performance Analysis of System Overheads in TCP/IP Workloads. N. L. Binkert, L. R. Hsu, A. G. Saidi, R. G. Dreslinski, A. L. Schultz, and S. K. Reinhardt. Proc. 14th Int’l Conf. on Parallel Architectures and Compilation Techniques (PACT), September 2005. pdf Sampling and Stability in TCP/IP Workloads. L. R. Hsu, A. G. Saidi, N. L. Binkert, and S. K. Reinhardt. Proc. First Annual Workshop on Modeling, Benchmarking, and Simulation (MoBS), June pdf A Unified Compressed Memory Hierarchy. E. G. Hallnor and S. K. Reinhardt. Proc. 11th Int’l Symp. on High-Performance Computer Architecture (HPCA), February 2005. pdf Analyzing NIC Overheads in Network-Intensive Workloads. N. L. Binkert, L. R. Hsu, A. G. Saidi, R. G. Dreslinski, A. L. Schultz, and S. K. Reinhardt. Eighth Workshop on Computer Architecture Evaluation using Commercial Workloads (CAECW), February 2005. pdf An extended version appears as University of Michigan Technical Report CSE-TR-505-04, December 2004. pdf 2004 Emulation of realisitic network traffic patterns on an eight-node data vortex interconnection network subsytem. B. Small, A. Shacham, K. Bergman, K. Athikulwongse, C. Hawkins, and D.S. Will. Journal of Optical Networking Vol. 3, No.11, pp 802-809, November 2004. pdf ChipLock: Support for Secure Microarchitectures. T. Kgil, L Falk, and T. Mudge. Proc. Workshop on Architectural Support for Security and Anti-virus (WASSA), October 2004, pp. 130-139. pdf Design and Applications of a Virtual Context Architecture. D. Oehmke, N. Binkert, S. Reinhardt, and T. Mudge. University of Michigan Technical Report CSE-TR-497-04, September 2004. pdf The Performance Potential of an Integrated Network Interface. N. L. Binkert, R. G. Dreslinski, E. G. Hallnor, L. R. Hsu, S. E. Raasch, A. L. Schultz, and S. K. Reinhardt. Proc. Advanced Networking and Communications Hardware Workshop (ANCHOR), June 2004. pdf A Co-Phase Matrix to Guide Simultaneous Multithreading Simulation. M. Van Biesbrouck, T. Sherwood, and B. Calder. IEEE International Symposium on Performance Analysis and Software (ISPASS), March 2004. pdf A Compressed Memory Hierarchy using an Indirect Index Cache. E. G. Hallnor and S. K. Reinhardt. Proc. 3rd Workshop on Memory Performance Issues (WMPI), June 2004. pdf An extended version appears as University of Michigan Technical Report CSE-TR-488-04, March 2004. pdf 2003 The Impact of Resource Partitioning on SMT Processors. S. E. Raasch and S. K. Reinhardt. Proc. 12th Int’l Conf. on Parallel Architectures and Compilation Techniques (PACT), pp. 15-25, Sept. pdf Network-Oriented Full-System Simulation using M5. N. L. Binkert, E. G. Hallnor, and S. K. Reinhardt. 
Sixth Workshop on Computer Architecture Evaluation using Commercial Workloads (CAECW), February pdf Design, Implementation and Use of the MIRV Experimental Compiler for Computer Architecture Research. D. A. Greene. Dissertation at the Universtiy of Michigan, 2003. [http://www.eecs.umich.edu/~tnm/theses/daveg.pdg“>pdf ]2002 A Scalable Instruction Queue Design Using Dependence Chains. S. E. Raasch, N. L. Binkert, and S. K. Reinhardt. Proc. 29th Annual Int’l Symp. on Computer Architecture (ISCA), pp. 318-329, May 2002. pdf ps ps.gzDerivative projectsBelow is a list of projects that are based on gem5, are extensions of gem5, or use gem5.MV5 MV5 is a reconfigurable simulator for heterogeneous multicore architectures. It is based on M5v2.0 beta 4. Typical usage: simulating data-parallel applications on SIMT cores that operate over directory-based cache hierarchies. You can also add out-of-order cores to have a heterogeneous system, and all different types of cores can operate under the same address space through the same cache hierarchy. Research projects based on MV5 have been published in ISCA’10, ICCD’09, and IPDPS’10.Features Single-Instruction, Multiple-Threads (SIMT) cores Directory-based Coherence Cache: MESI/MSI. (Not based on gems/ruby) Interconnect: Fully connected and 2D Mesh. (Not based on gems/ruby) Threading API/library in system emulation mode (No support for full-system simulation. A benchmark suite using the thread API is provided)Resources Home Page: 1 Tutorial at ISPASS ‘11: 2 Google group: 3gem5-gpu Merges 2 popular simulators: gem5 and gpgpu-sim Simulates CPUs, GPUs, and the interactions between them Models a flexible memory system with support for heterogeneous processors and coherence Supports full-system simulation through GPU driver emulationResources Home Page: 4 Overview slides: 5 Mailing list: 6", |
| "url": "/publications/" |
| } |
| , |
| |
| "search": { |
| "title": "Search", |
| "content": " Search ", |
| "url": "/search/" |
| } |
| |
| |
| }; |
| </script> |
| <script src="/assets/js/lunr.min.js"></script> |
| <script src="/assets/js/search.js"></script> |
| |
| |
| </div> |
| |
| <!-- button to scroll to top of page --> |
| <button onclick="topFunction()" id="myBtn" title="Go to top">△</button> |
| |
| </main> |
| |
| <footer class="page-footer"> |
| <div class="container"> |
| <div class="row"> |
| |
| <div class="col-12 col-sm-4"> |
| <p>gem5</p> |
| <p><a href="/about">About</a></p> |
| <p><a href="/publications">Publications</a></p> |
| <p><a href="/contributing">Contributing</a></p> |
| <p><a href="/governance">Governance</a></p> |
| <br></div> |
| |
| <div class="col-12 col-sm-4"> |
| <p>Docs</p> |
| <p><a href="/introduction">Documentation</a></p> |
| <p><a href="http://gem5.org/Documentation">Old Documentation</a></p> |
| <p><a href="https://gem5.googlesource.com/public/gem5">Source</a></p> |
| <br></div> |
| |
| <div class="col-12 col-sm-4"> |
| <p>Help</p> |
| <p><a href="/search">Search</a></p> |
| <p><a href="#">Mailing Lists</a></p> |
| <p><a href="https://github.com/gem5/new-website/tree/master/">Website Source</a></p> |
| <br></div> |
| |
| </div> |
| </div> |
| </footer> |
| |
| |
| |
| <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script> |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script> |
| <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/js/bootstrap.min.js" integrity="sha384-ChfqqxuZUCnJSK3+MXmPNIyE6ZbWh2IMqE241rYiqJxyMiZ6OW/JmZQ5stwEULTy" crossorigin="anonymous"></script> |
| |
| <script> |
| // When the user scrolls down 20px from the top of the document, show the button |
| window.onscroll = function() {scrollFunction()}; |
| |
| function scrollFunction() { |
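| // Check both body and documentElement: browsers differ in which |
| // element carries the scroll offset. |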
| if (document.body.scrollTop > 20 || document.documentElement.scrollTop > 20) { |
| document.getElementById("myBtn").style.display = "block"; |
| } else { |
| document.getElementById("myBtn").style.display = "none"; |
| } |
| } |
| |
| // When the user clicks on the button, scroll to the top of the document |
| function topFunction() { |
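| // Reset both scroll containers, for the same cross-browser reason as above. |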
| document.body.scrollTop = 0; |
| document.documentElement.scrollTop = 0; |
| } |
| </script> |
| |
| </body> |
| |
| |
| </html> |