/*
* Copyright (c) 2018 Inria
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Daniel Carvalho
*/
/** @file
* Implementation of the BDI cache compressor.
*/
#include "mem/cache/compressors/bdi.hh"
#include <algorithm>
#include <climits>
#include <cstring>
#include <type_traits>
#include "debug/CacheComp.hh"
#include "params/BDI.hh"
// Number of bytes in a qword
#define BYTES_PER_QWORD 8
// Declare BDI encoding names
const char* BDI::ENCODING_NAMES[] =
{"Zero", "Repeated_Values", "Base8_1", "Base8_2", "Base8_4", "Base4_1",
"Base4_2", "Base2_1", "Uncompressed"};
BDI::BDICompData::BDICompData(const uint8_t encoding)
: CompressionData(), _encoding(encoding)
{
}
uint8_t
BDI::BDICompData::getEncoding() const
{
return _encoding;
}
std::string
BDI::BDICompData::getName() const
{
return ENCODING_NAMES[_encoding];
}
BDI::BDICompDataZeros::BDICompDataZeros()
: BDICompData(ZERO)
{
// Calculate compressed size
calculateCompressedSize();
}
uint64_t
BDI::BDICompDataZeros::access(const int index) const
{
return 0;
}
void
BDI::BDICompDataZeros::calculateCompressedSize()
{
// Number of bits used by Encoding
std::size_t size = encodingBits;
setSizeBits(size);
}
BDI::BDICompDataRep::BDICompDataRep(const uint64_t rep_value)
: BDICompData(REP_VALUES)
{
// Set base value
base = rep_value;
// Calculate compressed size
calculateCompressedSize();
}
uint64_t
BDI::BDICompDataRep::access(const int index) const
{
return base;
}
void
BDI::BDICompDataRep::calculateCompressedSize()
{
// Number of bits used by Encoding
std::size_t size = encodingBits;
// Number of bits used by Base
size += sizeof(base)*CHAR_BIT;
setSizeBits(size);
}
BDI::BDICompDataUncompressed::BDICompDataUncompressed(
const uint64_t* data, const std::size_t blk_size)
: BDICompData(UNCOMPRESSED), blkSize(blk_size),
_data(data, data + blk_size/BYTES_PER_QWORD)
{
// Calculate compressed size
calculateCompressedSize();
}
uint64_t
BDI::BDICompDataUncompressed::access(const int index) const
{
return _data[index];
}
void
BDI::BDICompDataUncompressed::calculateCompressedSize()
{
// Number of bits used by Encoding
std::size_t size = encodingBits;
// Number of bits used by uncompressed line
size += blkSize*CHAR_BIT;
setSizeBits(size);
}
template <class TB, class TD>
BDI::BDICompDataBaseDelta<TB, TD>::BDICompDataBaseDelta(const uint8_t encoding,
const std::size_t blk_size, const std::size_t max_num_bases)
: BDICompData(encoding), maxNumBases(max_num_bases)
{
// Reserve the maximum possible size for the vectors
bases.reserve(maxNumBases);
bitMask.reserve(blk_size/sizeof(TD));
deltas.reserve(blk_size/sizeof(TD));
// Push virtual base 0 to bases list
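// (Clarifying note: the implicit zero base lets small absolute values be
// encoded as deltas without consuming the explicit base)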
bases.push_back(0);
}
template <class TB, class TD>
void
BDI::BDICompDataBaseDelta<TB, TD>::calculateCompressedSize()
{
// Number of bits used by Encoding
std::size_t size = encodingBits;
// Number of bits used by BitMask
size += bitMask.size()*std::ceil(std::log2(maxNumBases));
// Number of bits used by Bases. bases[0] is implicit in a hardware
// implementation, therefore its size is 0
size += (maxNumBases-1)*sizeof(TB)*CHAR_BIT;
// Number of bits used by Deltas
size += deltas.size()*sizeof(TD)*CHAR_BIT;
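// Worked example (illustrative, assuming the default of two bases): for
// a 64-byte line compressed as Base8_1 (TB = uint64_t, TD = int8_t) this
// is encodingBits + 8*1 (bit mask) + 1*8*8 (explicit base) + 8*8 (deltas)
// = encodingBits + 136 bits, versus encodingBits + 512 bits for the
// uncompressed line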
CompressionData::setSizeBits(size);
}
template <class TB, class TD>
bool
BDI::BDICompDataBaseDelta<TB, TD>::addBase(const TB base)
{
// Can't add base if reached limit of number of bases
if (bases.size() >= maxNumBases) {
return false;
}
// Push new base to end of bases list
bases.push_back(base);
// The new delta is 0, as it is the difference between the new base and itself
addDelta(bases.size() - 1, 0);
return true;
}
template <class TB, class TD>
void
BDI::BDICompDataBaseDelta<TB, TD>::addDelta(const std::size_t base_index,
const TD delta)
{
// Insert new delta with respect to the given base
bitMask.push_back(base_index);
// Insert new delta
deltas.push_back(delta);
}
template <class TB, class TD>
bool
BDI::BDICompDataBaseDelta<TB, TD>::compress(const uint64_t* data,
const std::size_t blk_size)
{
// Parse through data in a sizeof(TB) granularity
for (std::size_t byte_start = 0; byte_start < blk_size;
byte_start += sizeof(TB))
{
// Get current value
TB curValue;
std::memcpy(&curValue, ((uint8_t*)data) + byte_start,
sizeof(TB));
// Iterate through all bases to search for a valid delta
bool foundDelta = false;
for (int base_index = 0; base_index < bases.size(); base_index++) {
// Calculate delta relative to currently parsed base
typename std::make_signed<TB>::type delta = curValue -
bases[base_index];
// Check if the delta is within the limits of the delta size. If
// that is the case, add delta to compressed data and keep parsing
// the input data
typename std::make_signed<TB>::type limit =
ULLONG_MAX>>((BYTES_PER_QWORD-sizeof(TD))*CHAR_BIT+1);
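// For illustration (added example): with TD = int8_t the shift amount is
// (8 - 1) * 8 + 1 = 57, so limit = ULLONG_MAX >> 57 = 127 and only
// deltas in [-127, 127] are accepted; with TD = int16_t the limit is
// 32767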
if ((delta >= -limit) && (delta <= limit)) {
addDelta(base_index, delta);
foundDelta = true;
break;
}
}
// If no existing base can accommodate this value, add it as a new base
// and insert its respective delta of 0. If the new base can't be added,
// the base limit has been reached, so the line is not compressible with
// the given encoding
if (!foundDelta && !addBase(curValue)) {
return false;
}
}
// Calculate compressed size
calculateCompressedSize();
return true;
}
template <class TB, class TD>
uint64_t
BDI::BDICompDataBaseDelta<TB, TD>::access(const int index) const
{
// We decompress all base-delta pairs that form the 64-bit entry
// corresponding to the given 64-bit-array index
uint64_t value = 0;
// Get relationship between the size of an uint64_t base and size of TB
const std::size_t size_diff = sizeof(uint64_t)/sizeof(TB);
// Mask for a base entry
const uint64_t mask = ULLONG_MAX>>((BYTES_PER_QWORD-sizeof(TB))*CHAR_BIT);
// Size, in bits, of a base entry. Can't be const because the compiler
// would fold it into a 64-bit constant, which would generate a
// shift-size compilation error
int base_size_bits = sizeof(TB)*CHAR_BIT;
// Concatenate all base-delta entries until they form a 64-bit entry
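// For example (illustrative): with Base2_1 (TB = uint16_t), size_diff is
// 4, so deltas[4*index] through deltas[4*index + 3] are reconstructed,
// and the entry at 4*index + 3 ends up in the most significant 16 bits
// of the returned qword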
for (int delta_index = size_diff * (index + 1) - 1;
delta_index >= (int)(size_diff * index); delta_index--) {
// Get base and delta entries corresponding to the current delta
assert(delta_index < deltas.size());
const TD delta = deltas[delta_index];
const int base_index = bitMask[delta_index];
assert(base_index < bases.size());
const TB base = bases[base_index];
// Concatenate decompressed value
value <<= base_size_bits;
value |= static_cast<uint64_t>((base + delta) & mask);
}
return value;
}
BDI::BDI(const Params *p)
: BaseCacheCompressor(p), useMoreCompressors(p->use_more_compressors),
qwordsPerCacheLine(blkSize/BYTES_PER_QWORD)
{
static_assert(sizeof(ENCODING_NAMES)/sizeof(char*) == NUM_ENCODINGS,
"Number of encodings doesn't match the number of names");
}
bool
BDI::isZeroPackable(const uint64_t* data) const
{
return std::all_of(data, data + qwordsPerCacheLine,
[](const uint64_t entry){ return entry == 0; });
}
bool
BDI::isSameValuePackable(const uint64_t* data) const
{
// We don't want to copy the whole array to the lambda expression
const uint64_t rep_value = data[0];
return std::all_of(data, data + qwordsPerCacheLine,
[rep_value](const uint64_t entry)
{return entry == rep_value;});
}
template <class TB, class TD>
std::unique_ptr<BDI::BDICompData>
BDI::tryCompress(const uint64_t* data, const uint8_t encoding) const
{
// Instantiate compressor
auto temp_data = std::unique_ptr<BDICompDataBaseDelta<TB, TD>>(
new BDICompDataBaseDelta<TB, TD>(encoding, blkSize));
// Try compressing. Return nullptr if compressor can't compress given line
if (temp_data->compress(data, blkSize)) {
return std::move(temp_data);
} else {
return std::unique_ptr<BDICompData>{};
}
}
void
BDI::decompress(const BaseCacheCompressor::CompressionData* comp_data,
uint64_t* data)
{
// Decompress and go back to host endianness
for (std::size_t i = 0; i < qwordsPerCacheLine; i++)
data[i] = static_cast<const BDICompData*>(comp_data)->access(i);
}
std::unique_ptr<BaseCacheCompressor::CompressionData>
BDI::compress(const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat)
{
std::unique_ptr<BDICompData> bdi_data;
// Check if it is a zero line
if (isZeroPackable(data)) {
bdi_data = std::unique_ptr<BDICompData>(new BDICompDataZeros());
// Set compression latency (compare 1 qword per cycle)
comp_lat = Cycles(qwordsPerCacheLine);
// Check if all values in the line are the same
} else if (isSameValuePackable(data)) {
bdi_data = std::unique_ptr<BDICompData>(new BDICompDataRep(data[0]));
// Set compression latency (compare 1 qword per cycle)
comp_lat = Cycles(qwordsPerCacheLine);
} else {
// Initialize compressed data as an uncompressed instance
bdi_data = std::unique_ptr<BDICompData>(
new BDICompDataUncompressed(data, blkSize));
// Base-size to delta-size ratio. Used to speed up the run by skipping
// combinations whose size would be worse than the current best. It
// does not reflect the compression ratio, as it does not take the
// metadata into account.
int base_delta_ratio = 2;
// Check which base-delta size combination is the best. This is
// optimized by giving priority to trying the compressor that would
// generate the smallest compression size. This way we waste less
// simulation time by not doing all possible combinations
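// (Added note: under this ordering, ratio 8 tries Base8_1; ratio 4
// tries Base8_2 and Base4_1; ratio 2 tries Base8_4, Base4_2 and
// Base2_1, subject to the useMoreCompressors filter below)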
for (int ratio = 8; ratio >= base_delta_ratio; ratio/=2) {
for (int base_size = 8; base_size >= ratio; base_size/=2) {
// If using more compressors, try every delta size from 1 byte up
// to half the base size; otherwise only try the largest possible
// delta size (half the base size). Instantiating only one delta
// size per base size uses less area and energy, at the cost of
// lower compression efficiency
const int delta_size = base_size/ratio;
if (!useMoreCompressors && delta_size != base_size/2) {
continue;
}
std::unique_ptr<BDICompData> temp_bdi_data;
// Get the compression result for the current combination
if ((base_size == 8)&&(delta_size == 4)) {
temp_bdi_data = tryCompress<uint64_t, int32_t>(data,
BASE8_4);
} else if ((base_size == 8)&&(delta_size == 2)) {
temp_bdi_data = tryCompress<uint64_t, int16_t>(data,
BASE8_2);
} else if ((base_size == 8)&&(delta_size == 1)) {
temp_bdi_data = tryCompress<uint64_t, int8_t>(data,
BASE8_1);
} else if ((base_size == 4)&&(delta_size == 2)) {
temp_bdi_data = tryCompress<uint32_t, int16_t>(data,
BASE4_2);
} else if ((base_size == 4)&&(delta_size == 1)) {
temp_bdi_data = tryCompress<uint32_t, int8_t>(data,
BASE4_1);
} else if ((base_size == 2)&&(delta_size == 1)) {
temp_bdi_data = tryCompress<uint16_t, int8_t>(data,
BASE2_1);
} else {
fatal("Invalid combination of base and delta sizes.");
}
// If compressor was successful, check if new compression
// improves the compression factor
if (temp_bdi_data &&
(bdi_data->getSizeBits() > temp_bdi_data->getSizeBits()))
{
bdi_data = std::move(temp_bdi_data);
base_delta_ratio = ratio;
}
// Clear temp pointer
temp_bdi_data.reset(nullptr);
}
}
// Set compression latency. A successful compressor will stop all
// other compressors when it knows no other will generate a better
// compression. However, this requires either hard-coding, or a complex
// function that can calculate the exact size of every compressor for
// every cache line size. We settle for a conservative
// optimization: assume that all compressors with a given base-size to
// delta-size ratio (no-metadata ratio) must wait for each other.
// In the worst case scenario the data is left uncompressed, so
// it loses the time of the worst base size (2 bytes per cycle)
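// (Added example: for a 64-byte line that remains uncompressed,
// base_delta_ratio stays at 2, giving comp_lat = Cycles(64/2) = 32,
// before the extra packing cycle added below)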
comp_lat = Cycles(blkSize/base_delta_ratio);
}
// Update stats
encodingStats[bdi_data->getEncoding()]++;
// Pack compression results (1 extra cycle)
comp_lat += Cycles(1);
// Set decompression latency (latency of an adder)
decomp_lat = Cycles(1);
// Print debug information
DPRINTF(CacheComp, "BDI: Compressed cache line to encoding %s (%d bits)\n",
bdi_data->getName(), bdi_data->getSizeBits());
return std::move(bdi_data);
}
void
BDI::regStats()
{
BaseCacheCompressor::regStats();
// We store the frequency of each encoding
encodingStats
.init(NUM_ENCODINGS)
.name(name() + ".encoding")
.desc("Number of data entries that were compressed to this encoding.")
;
for (unsigned i = 0; i < NUM_ENCODINGS; ++i) {
encodingStats.subname(i, ENCODING_NAMES[i]);
encodingStats.subdesc(i, "Number of data entries that match " \
"encoding " + std::string(ENCODING_NAMES[i]));
}
}
BDI*
BDIParams::create()
{
return new BDI(this);
}
#undef BYTES_PER_QWORD