/*
 * Copyright (c) 2018 Inria
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Daniel Carvalho
 */

/** @file
 * Implementation of the BDI cache compressor.
 */

#include "mem/cache/compressors/bdi.hh"

#include <algorithm>
#include <climits>
#include <cmath>
#include <cstring>
#include <type_traits>

#include "debug/CacheComp.hh"
#include "params/BDI.hh"

// Number of bytes in a qword
#define BYTES_PER_QWORD 8

// Declare BDI encoding names
const char* BDI::ENCODING_NAMES[] =
    {"Zero", "Repeated_Values", "Base8_1", "Base8_2", "Base8_4", "Base4_1",
     "Base4_2", "Base2_1", "Uncompressed"};

BDI::BDICompData::BDICompData(const uint8_t encoding)
    : CompressionData(), _encoding(encoding)
{
}

uint8_t
BDI::BDICompData::getEncoding() const
{
    return _encoding;
}

std::string
BDI::BDICompData::getName() const
{
    return ENCODING_NAMES[_encoding];
}

BDI::BDICompDataZeros::BDICompDataZeros()
    : BDICompData(ZERO)
{
    // Calculate compressed size
    calculateCompressedSize();
}

uint64_t
BDI::BDICompDataZeros::access(const int index) const
{
    return 0;
}

void
BDI::BDICompDataZeros::calculateCompressedSize()
{
    // Number of bits used by Encoding
    std::size_t size = encodingBits;

    setSizeBits(size);
}

BDI::BDICompDataRep::BDICompDataRep(const uint64_t rep_value)
    : BDICompData(REP_VALUES)
{
    // Set base value
    base = rep_value;

    // Calculate compressed size
    calculateCompressedSize();
}

uint64_t
BDI::BDICompDataRep::access(const int index) const
{
    return base;
}

void
BDI::BDICompDataRep::calculateCompressedSize()
{
    // Number of bits used by Encoding
    std::size_t size = encodingBits;

    // Number of bits used by Base
    size += sizeof(base)*CHAR_BIT;

    setSizeBits(size);
}

BDI::BDICompDataUncompressed::BDICompDataUncompressed(
    const uint64_t* data, const std::size_t blk_size)
    : BDICompData(UNCOMPRESSED), blkSize(blk_size),
      _data(data, data + blk_size/CHAR_BIT)
{
    // Calculate compressed size
    calculateCompressedSize();
}

uint64_t
BDI::BDICompDataUncompressed::access(const int index) const
{
    return _data[index];
}

void
BDI::BDICompDataUncompressed::calculateCompressedSize()
{
    // Number of bits used by Encoding
    std::size_t size = encodingBits;

    // Number of bits used by uncompressed line
    size += blkSize*CHAR_BIT;

    setSizeBits(size);
}

template <class TB, class TD>
BDI::BDICompDataBaseDelta<TB, TD>::BDICompDataBaseDelta(const uint8_t encoding,
    const std::size_t blk_size, const std::size_t max_num_bases)
    : BDICompData(encoding), maxNumBases(max_num_bases)
{
    // Reserve the maximum possible size for the vectors
    bases.reserve(maxNumBases);
    bitMask.reserve(blk_size/sizeof(TD));
    deltas.reserve(blk_size/sizeof(TD));

    // Push virtual base 0 to bases list
    bases.push_back(0);
}

template <class TB, class TD>
void
BDI::BDICompDataBaseDelta<TB, TD>::calculateCompressedSize()
{
    // Number of bits used by Encoding
    std::size_t size = encodingBits;

    // Number of bits used by BitMask
    size += bitMask.size()*std::ceil(std::log2(maxNumBases));

    // Number of bits used by Bases. bases[0] is implicit in a hardware
    // implementation, therefore its size is 0
    size += (maxNumBases-1)*sizeof(TB)*CHAR_BIT;

    // Number of bits used by Deltas
    size += deltas.size()*sizeof(TD)*CHAR_BIT;

    CompressionData::setSizeBits(size);
}
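// Illustrative size check, assuming the default of two bases (one of them the
// implicit zero base): compressing a 64-byte line with Base8_1 produces
// 64/8 = 8 deltas, so the formula above yields 8*ceil(log2(2)) = 8 bitmask
// bits, 1*8*8 = 64 bits for the non-implicit base, and 8*1*8 = 64 delta bits,
// i.e. 136 bits plus the encoding bits.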
template <class TB, class TD>
bool
BDI::BDICompDataBaseDelta<TB, TD>::addBase(const TB base)
{
    // Can't add base if reached limit of number of bases
    if (bases.size() >= maxNumBases) {
        return false;
    }

    // Push new base to end of bases list
    bases.push_back(base);

    // New delta is 0, as it is a difference between the new base and itself
    addDelta(bases.size() - 1, 0);

    return true;
}

template <class TB, class TD>
void
BDI::BDICompDataBaseDelta<TB, TD>::addDelta(const std::size_t base_index,
    const TD delta)
{
    // Insert new delta with respect to the given base
    bitMask.push_back(base_index);

    // Insert new delta
    deltas.push_back(delta);
}

template <class TB, class TD>
bool
BDI::BDICompDataBaseDelta<TB, TD>::compress(const uint64_t* data,
    const std::size_t blk_size)
{
    // Parse through data in a sizeof(TB) granularity
    for (std::size_t byte_start = 0; byte_start < blk_size;
         byte_start += sizeof(TB))
    {
        // Get current value
        TB curValue;
        std::memcpy(&curValue, ((uint8_t*)data) + byte_start,
                    sizeof(TB));

        // Iterate through all bases to search for a valid delta
        bool foundDelta = false;
        for (int base_index = 0; base_index < bases.size(); base_index++) {
            // Calculate delta relative to currently parsed base
            typename std::make_signed<TB>::type delta = curValue -
                bases[base_index];

            // Check if the delta is within the limits of the delta size. If
            // that is the case, add delta to compressed data and keep parsing
            // the input data
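            // Illustrative arithmetic for the limit below: with
            // TD = int16_t the shift amount is (8 - 2)*8 + 1 = 49, so
            // limit = ULLONG_MAX >> 49 = 0x7fff = 32767, the largest
            // positive value a signed 16-bit delta can hold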
            typename std::make_signed<TB>::type limit =
                ULLONG_MAX>>((BYTES_PER_QWORD-sizeof(TD))*CHAR_BIT+1);
            if ((delta >= -limit) && (delta <= limit)) {
                addDelta(base_index, delta);
                foundDelta = true;
                break;
            }
        }

        // If we cannot find a base that can accommodate this new line's data,
        // add this value as the new base and insert its respective delta of 0.
        // If the new base can't be added, it means that we reached the base
        // limit, so the line is incompressible using the given encoding
        if (!foundDelta && !addBase(curValue)) {
            return false;
        }
    }

    // Calculate compressed size
    calculateCompressedSize();

    return true;
}

template <class TB, class TD>
uint64_t
BDI::BDICompDataBaseDelta<TB, TD>::access(const int index) const
{
    // We decompress all base-delta pairs that form the 64-bit entry
    // corresponding to the given 64-bit-array index
    uint64_t value = 0;

    // Get relationship between the size of a uint64_t base and the size of TB
    const std::size_t size_diff = sizeof(uint64_t)/sizeof(TB);

    // Mask for a base entry
    const uint64_t mask = ULLONG_MAX>>((BYTES_PER_QWORD-sizeof(TB))*CHAR_BIT);

    // Size, in bits, of a base entry. Can't be const, because the compiler
    // would then fold it into a 64-bit constant, which would generate a
    // shift-size compilation error
    int base_size_bits = sizeof(TB)*CHAR_BIT;

    // Concatenate all base-delta entries until they form a 64-bit entry
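    // As a concrete example: with TB = uint16_t (Base2_1), size_diff is 4
    // and mask is 0xffff, so the 64-bit entry at position `index` is rebuilt
    // from deltas[4*index] through deltas[4*index + 3], with deltas[4*index]
    // ending up in the least significant 16 bits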
    for (int delta_index = size_diff * (index + 1) - 1;
         delta_index >= (int)(size_diff * index); delta_index--) {
        // Get base and delta entries corresponding to the current delta
        assert(delta_index < deltas.size());
        const TD delta = deltas[delta_index];
        const int base_index = bitMask[delta_index];
        assert(base_index < bases.size());
        const TB base = bases[base_index];

        // Concatenate decompressed value
        value <<= base_size_bits;
        value |= static_cast<uint64_t>((base + delta) & mask);
    }

    return value;
}

BDI::BDI(const Params *p)
    : BaseCacheCompressor(p), useMoreCompressors(p->use_more_compressors),
      qwordsPerCacheLine(blkSize/BYTES_PER_QWORD)
{
    static_assert(sizeof(ENCODING_NAMES)/sizeof(char*) == NUM_ENCODINGS,
                  "Number of encodings doesn't match the number of names");
}

bool
BDI::isZeroPackable(const uint64_t* data) const
{
    return std::all_of(data, data + qwordsPerCacheLine,
                       [](const uint64_t entry){ return entry == 0; });
}

bool
BDI::isSameValuePackable(const uint64_t* data) const
{
    // We don't want to copy the whole array to the lambda expression
    const uint64_t rep_value = data[0];
    return std::all_of(data, data + qwordsPerCacheLine,
                       [rep_value](const uint64_t entry)
                       {return entry == rep_value;});
}

template <class TB, class TD>
std::unique_ptr<BDI::BDICompData>
BDI::tryCompress(const uint64_t* data, const uint8_t encoding) const
{
    // Instantiate compressor
    auto temp_data = std::unique_ptr<BDICompDataBaseDelta<TB, TD>>(
        new BDICompDataBaseDelta<TB, TD>(encoding, blkSize));

    // Try compressing. Return nullptr if the compressor can't compress the
    // given line
    if (temp_data->compress(data, blkSize)) {
        return std::move(temp_data);
    } else {
        return std::unique_ptr<BDICompData>{};
    }
}

void
BDI::decompress(const BaseCacheCompressor::CompressionData* comp_data,
                uint64_t* data)
{
    // Decompress and go back to host endianness
    for (std::size_t i = 0; i < qwordsPerCacheLine; i++)
        data[i] = static_cast<const BDICompData*>(comp_data)->access(i);
}

std::unique_ptr<BaseCacheCompressor::CompressionData>
BDI::compress(const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat)
{
    std::unique_ptr<BDICompData> bdi_data;

    // Check if it is a zero line
    if (isZeroPackable(data)) {
        bdi_data = std::unique_ptr<BDICompData>(new BDICompDataZeros());

        // Set compression latency (compare 1 qword per cycle)
        comp_lat = Cycles(qwordsPerCacheLine);
    // Check if all values in the line are the same
    } else if (isSameValuePackable(data)) {
        bdi_data = std::unique_ptr<BDICompData>(new BDICompDataRep(data[0]));

        // Set compression latency (compare 1 qword per cycle)
        comp_lat = Cycles(qwordsPerCacheLine);
    } else {
        // Initialize compressed data as an uncompressed instance
        bdi_data = std::unique_ptr<BDICompData>(
            new BDICompDataUncompressed(data, blkSize));

        // Base size to delta size ratio. Used to speed up the search by
        // skipping combinations whose size would be worse than the current
        // best. It does not reflect the compression ratio, as it does not
        // take the metadata into account.
        int base_delta_ratio = 2;

        // Check which base-delta size combination is the best. This is
        // optimized by giving priority to trying the compressor that would
        // generate the smallest compression size. This way we waste less
        // simulation time by not doing all possible combinations
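        // Illustrative note on the search order: ratio 8 tries Base8_1;
        // ratio 4 tries Base8_2 and Base4_1; ratio 2 tries Base8_4, Base4_2
        // and Base2_1. When useMoreCompressors is disabled, only the
        // delta_size == base_size/2 cases (Base8_4, Base4_2 and Base2_1)
        // are attempted, as checked inside the inner loop below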
        for (int ratio = 8; ratio >= base_delta_ratio; ratio/=2) {
            for (int base_size = 8; base_size >= ratio; base_size/=2) {
                // If using more compressors, parse all delta sizes from 1 to
                // one size smaller than the base size, otherwise just parse
                // the highest possible delta. When we only instantiate one
                // delta size per base size, we use less area and energy, at
                // the cost of lower compression efficiency
                const int delta_size = base_size/ratio;
                if (!useMoreCompressors && delta_size != base_size/2) {
                    continue;
                }

                std::unique_ptr<BDICompData> temp_bdi_data;

                // Get the compression result for the current combination
                if ((base_size == 8)&&(delta_size == 4)) {
                    temp_bdi_data = tryCompress<uint64_t, int32_t>(data,
                        BASE8_4);
                } else if ((base_size == 8)&&(delta_size == 2)) {
                    temp_bdi_data = tryCompress<uint64_t, int16_t>(data,
                        BASE8_2);
                } else if ((base_size == 8)&&(delta_size == 1)) {
                    temp_bdi_data = tryCompress<uint64_t, int8_t>(data,
                        BASE8_1);
                } else if ((base_size == 4)&&(delta_size == 2)) {
                    temp_bdi_data = tryCompress<uint32_t, int16_t>(data,
                        BASE4_2);
                } else if ((base_size == 4)&&(delta_size == 1)) {
                    temp_bdi_data = tryCompress<uint32_t, int8_t>(data,
                        BASE4_1);
                } else if ((base_size == 2)&&(delta_size == 1)) {
                    temp_bdi_data = tryCompress<uint16_t, int8_t>(data,
                        BASE2_1);
                } else {
                    fatal("Invalid combination of base and delta sizes.");
                }

                // If the compressor was successful, check whether the new
                // compression improves the compression factor
                if (temp_bdi_data &&
                    (bdi_data->getSizeBits() > temp_bdi_data->getSizeBits()))
                {
                    bdi_data = std::move(temp_bdi_data);
                    base_delta_ratio = ratio;
                }

                // Clear temp pointer
                temp_bdi_data.reset(nullptr);
            }
        }

        // Set compression latency. A successful compressor would stop all
        // other compressors once it knows no other can generate a better
        // compression. However, this requires either hard-coding, or a
        // complex function that can calculate the exact size of every
        // compressor for every cache line size. We take a conservative
        // optimization instead: assume that all compressors with a given
        // base size to delta size ratio (no-metadata ratio) must wait for
        // each other. In the worst case the data is left uncompressed, so
        // it loses the time of the worst base size (2 bytes per cycle)
        comp_lat = Cycles(blkSize/base_delta_ratio);
    }

    // Update stats
    encodingStats[bdi_data->getEncoding()]++;

    // Pack compression results (1 extra cycle)
    comp_lat += Cycles(1);

    // Set decompression latency (latency of an adder)
    decomp_lat = Cycles(1);

    // Print debug information
    DPRINTF(CacheComp, "BDI: Compressed cache line to encoding %s (%d bits)\n",
            bdi_data->getName(), bdi_data->getSizeBits());

    return std::move(bdi_data);
}

void
BDI::regStats()
{
    BaseCacheCompressor::regStats();

    // We store the frequency of each encoding
    encodingStats
        .init(NUM_ENCODINGS)
        .name(name() + ".encoding")
        .desc("Number of data entries that were compressed to this encoding.")
        ;

    for (unsigned i = 0; i < NUM_ENCODINGS; ++i) {
        encodingStats.subname(i, ENCODING_NAMES[i]);
        encodingStats.subdesc(i, "Number of data entries that match "
                                 "encoding " + std::string(ENCODING_NAMES[i]));
    }
}

BDI*
BDIParams::create()
{
    return new BDI(this);
}

#undef BYTES_PER_QWORD