/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/hsail_code.hh"

// defined in code.cc, but not worth sucking in all of code.h for this
// at this point
extern const char *segmentNames[];

namespace HsailISA
{
    template<typename DestDataType, typename AddrRegOperandType>
    void
    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
    {
        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
                                     DestDataType::label,
                                     this->dest.disassemble(),
                                     this->addr.disassemble());
    }
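
    // Note: lda only materializes effective addresses. execute() below
    // writes each active lane's computed address straight into the
    // destination register; no memory request is issued and neither
    // memory pipeline is involved.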
    template<typename DestDataType, typename AddrRegOperandType>
    void
    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename DestDataType::CType CType M5_VAR_USED;
        const VectorMask &mask = w->getPred();
        std::vector<Addr> addr_vec;
        addr_vec.resize(w->computeUnit->wfSize(), (Addr)0);
        this->addr.calcVector(w, addr_vec);

        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                this->dest.set(w, lane, addr_vec[lane]);
            }
        }
        addr_vec.clear();
    }

    template<typename MemDataType, typename DestDataType,
             typename AddrRegOperandType>
    void
    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
    {
        switch (num_dest_operands) {
          case 1:
            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest.disassemble(),
                                         this->addr.disassemble());
            break;
          case 2:
            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->addr.disassemble());
            break;
          case 3:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->dest_vect[2].disassemble(),
                                         this->addr.disassemble());
            break;
          case 4:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->dest_vect[2].disassemble(),
                                         this->dest_vect[3].disassemble(),
                                         this->addr.disassemble());
            break;
          default:
            fatal("Bad ld register dest operand, num vector operands: %d\n",
                  num_dest_operands);
            break;
        }
    }
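
    // The helper below spreads each work-item's private space across the
    // wavefront at 8-byte granularity. Illustrative example (assuming a
    // 64-lane wavefront; wfSize() is configurable): a per-work-item
    // private address of 13 in lane 2 maps to
    //
    //     (13 / 8) * 8 * 64    // which 8-byte chunk, striped across lanes
    //   + 2 * 8                // this lane's slot within the stripe
    //   + 13 % 8               // byte offset within the 8-byte chunk
    //   + privBase
    //   = 512 + 16 + 5 + privBase
    //
    // so consecutive lanes' chunks for the same private address sit side
    // by side, which keeps the accesses reasonably coalescable.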
    static Addr
    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
    {
        // what is the size of the object we are accessing?
        // NOTE: the compiler doesn't generate enough information
        // to do this yet... we have to just line up all the private
        // work-item spaces back to back for now
        /*
        StorageElement* se =
            i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
        assert(se);

        return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() +
            se->offset * w->computeUnit->wfSize() +
            lane * se->size;
        */

        // addressing strategy: interleave the private spaces of the
        // work-items in a wavefront at 8-byte granularity.
        // This won't coalesce as perfectly as the spill space strategy,
        // but it's better than nothing. The spill space strategy won't
        // work for private because the same address may be accessed by
        // different-sized loads/stores.

        // Note: this assumes the largest load/store to private is 8 bytes;
        // if it is larger, the stride will have to increase.

        Addr addr_div8 = addr / 8;
        Addr addr_mod8 = addr % 8;

        Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 +
            addr_mod8 + w->privBase;

        assert(ret < w->privBase +
               (w->privSizePerItem * w->computeUnit->wfSize()));

        return ret;
    }

    template<typename MemDataType, typename DestDataType,
             typename AddrRegOperandType>
    void
    LdInst<MemDataType, DestDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename MemDataType::CType MemCType;
        const VectorMask &mask = w->getPred();

        // Kernarg references are handled uniquely for now (no Memory
        // Request is used), so special-case them up front. Someday we
        // should make this more realistic, at which point we should get
        // rid of this block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
            MemCType val;

            // we assume there are no vector loads of kernargs
            assert(num_dest_operands == 1);

            // assuming for the moment that we'll never do register
            // offsets into kernarg space... just to make life simpler
            uint64_t address = this->addr.calcUniform();

            val = *(MemCType*)&w->kernelArgs[address];

            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);

            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    this->dest.set(w, lane, val);
                }
            }

            return;
        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    MemCType val = w->readCallArgMem<MemCType>(lane, address);

                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
                            (unsigned long long)val);

                    this->dest.set(w, lane, val);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        m->m_type = MemDataType::memType;
        m->v_type = DestDataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = this->equivClass;

        if (num_dest_operands == 1) {
            m->dst_reg = this->dest.regIndex();
            m->n_reg = 1;
        } else {
            m->n_reg = num_dest_operands;
            for (int i = 0; i < num_dest_operands; ++i) {
                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
            }
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kernId;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // This is a complete hack to get around a compiler bug: the
            // compiler currently generates global accesses for private
            // addresses (starting from 0), so we need to add the private
            // offset.
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (m->addr[lane] < w->privSizePerItem) {
                    if (mask[lane]) {
                        // what is the size of the object we are accessing?
                        // find the base for this wavefront

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;
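
          // The spill segment gives each per-work-item spill slot a full
          // wavefront-wide stripe. Illustrative example (values assumed,
          // not taken from the source): with spillWidth == 256 and 4-byte
          // elements, per-item spill address 2 in lane 5 becomes
          //     2 * 256 + 5 * 4 + spillBase = 532 + spillBase,
          // so all lanes' copies of one spill slot are contiguous and the
          // per-lane accesses coalesce.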
          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_dest_operands == 1);
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    // note: this calculation will NOT WORK if the compiler
                    // ever generates loads/stores to the same address with
                    // different widths (e.g., a ld_u32 addr and a ld_u16 addr)
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(MemCType) +
                                        w->spillBase;

                        w->lastAddr[lane] = m->addr[lane];
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstandingReqsRdLm++;
            w->rdLmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_READONLY:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
                    m->addr[lane] += w->roBase;
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);

                        m->addr[lane] = m->addr[lane] +
                            lane * sizeof(MemCType) + w->privBase;
                    }
                }
            }
            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          default:
            fatal("Load to unsupported segment %d %llx\n", this->segment,
                  m->addr[0]);
        }

        w->outstandingReqs++;
        w->memReqsInPipe--;
    }
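
    // For vector stores, execute() below packs the source registers into
    // m->d_data operand-major: source operand k of a given lane lands at
    // flat index k * wfSize() + lane. For example, assuming a 64-lane
    // wavefront, operand 1 of lane 3 is written at index 1 * 64 + 3 = 67.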
    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename OperationType::CType CType;

        const VectorMask &mask = w->getPred();

        // Arg references are handled uniquely for now (no Memory Request
        // is used), so special-case them up front. Someday we should
        // make this more realistic, at which point we should get rid of
        // this block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();

            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    CType data = this->src.template get<CType>(w, lane);
                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
                    w->writeCallArgMem<CType>(lane, address, data);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        m->exec_mask = w->execMask();

        this->addr.calcVector(w, m->addr);

        if (num_src_operands == 1) {
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    ((CType*)m->d_data)[lane] =
                        this->src.template get<CType>(w, lane);
                }
            }
        } else {
            for (int k = 0; k < num_src_operands; ++k) {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    if (mask[lane]) {
                        ((CType*)m->d_data)[k * w->computeUnit->wfSize() +
                                            lane] =
                            this->src_vect[k].template get<CType>(w, lane);
                    }
                }
            }
        }

        m->m_type = OperationType::memType;
        m->v_type = OperationType::vgprType;

        m->statusBitVector = 0;
        m->equiv = this->equivClass;

        if (num_src_operands == 1) {
            m->n_reg = 1;
        } else {
            m->n_reg = num_src_operands;
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kernId;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // This is a complete hack to get around a compiler bug: the
            // compiler currently generates global accesses for private
            // addresses (starting from 0), so we need to add the private
            // offset.
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    if (m->addr[lane] < w->privSizePerItem) {

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(CType) - 1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsWrGm++;
            w->wrGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_src_operands == 1);
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(CType) + w->spillBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsWrGm++;
            w->wrGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstandingReqsWrLm++;
            w->wrLmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);
                        m->addr[lane] = m->addr[lane] +
                            lane * sizeof(CType) + w->privBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsWrGm++;
            w->wrGmReqsInPipe--;
            break;

          default:
            fatal("Store to unsupported segment %d\n", this->segment);
        }

        w->outstandingReqs++;
        w->memReqsInPipe--;
    }

    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::generateDisassembly()
    {
        switch (num_src_operands) {
          case 1:
            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src.disassemble(),
                                         this->addr.disassemble());
            break;
          case 2:
            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->addr.disassemble());
            break;
          case 4:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->src_vect[2].disassemble(),
                                         this->src_vect[3].disassemble(),
                                         this->addr.disassemble());
            break;
          default:
            fatal("Bad st register src operand, num vector operands: %d\n",
                  num_src_operands);
            break;
        }
    }
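
    // For atomics, execute() below stages the operand(s) before issuing
    // the request: src[0] is copied into m->a_data for every lane, and a
    // second source operand (present only for CAS) is copied into
    // m->x_data. Because an atomic both reads and writes memory, it is
    // counted against the read and write request counters simultaneously.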
    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
               HasDst>::execute(GPUDynInstPtr gpuDynInst)
    {
        typedef typename DataType::CType CType;

        Wavefront *w = gpuDynInst->wavefront();

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
            ((CType *)m->a_data)[lane] =
                this->src[0].template get<CType>(w, lane);
        }

        // load the second source operand for CAS
        if (NumSrcOperands > 1) {
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                ((CType*)m->x_data)[lane] =
                    this->src[1].template get<CType>(w, lane);
            }
        }

        assert(NumSrcOperands <= 2);

        m->m_type = DataType::memType;
        m->v_type = DataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;  // atomics don't have an equivalence class operand
        m->n_reg = 1;

        if (HasDst) {
            m->dst_reg = this->dest.regIndex();
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kernId;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->latency.set(w->computeUnit->shader->ticks(64));
            m->pipeId = GLBMEM_PIPE;

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsWrGm++;
            w->wrGmReqsInPipe--;
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstandingReqsWrLm++;
            w->wrLmReqsInPipe--;
            w->outstandingReqsRdLm++;
            w->rdLmReqsInPipe--;
            break;

          default:
            fatal("Atomic op to unsupported segment %d\n",
                  this->segment);
        }

        w->outstandingReqs++;
        w->memReqsInPipe--;
    }

    const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
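
    // The disassembly below joins opcode, atomic operation, segment, and
    // type into one underscore-separated mnemonic and then appends the
    // source operands; illustratively, something of the form
    //     atomic_add_global_s32 <dest>,<addr>,<src0>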
    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
               HasDst>::generateDisassembly()
    {
        if (HasDst) {
            this->disassembly =
                csprintf("%s_%s_%s_%s %s,%s", this->opcode,
                         atomicOpToString(this->atomicOperation),
                         segmentNames[this->segment],
                         DataType::label, this->dest.disassemble(),
                         this->addr.disassemble());
        } else {
            this->disassembly =
                csprintf("%s_%s_%s_%s %s", this->opcode,
                         atomicOpToString(this->atomicOperation),
                         segmentNames[this->segment],
                         DataType::label, this->addr.disassemble());
        }

        for (int i = 0; i < NumSrcOperands; ++i) {
            this->disassembly += ",";
            this->disassembly += this->src[i].disassemble();
        }
    }
} // namespace HsailISA