/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"

namespace HsailISA
{
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }

            addr_operand = nullptr;
        }

        void
        init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;

      public:
        int getMemOperandSize() { return size; }
        AddrOperandBase *getAddressOperand() { return addr_operand; }
    };
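
    // LdaInstBase models the HSAIL lda ("load address") instruction: it
    // resolves the address described by its address operand and writes that
    // address into the destination register. No memory is accessed, so the
    // instruction is flagged as an ALU op.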
    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(ALU);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override
        { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };

    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly();

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };

    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register operand type %d\n", regDataType.type);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }
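
    // LdInstBase captures the state shared by ld and by atomic loads
    // (BRIG_ATOMIC_LD): the memory segment, memory order and scope, the
    // equivalence class, and the destination/address operands. The
    // constructor translates the BRIG encoding into instruction flags.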
    template<typename MemOperandType, typename DestOperandType,
             typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        Brig::BrigWidth8_t width;
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;
        unsigned int equivClass;

        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Load);

            if (ib->opcode == BRIG_OPCODE_LD) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                width = ldst->width;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                equivClass = 0;

                width = BRIG_WIDTH_1;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("LdInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("LdInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_KERNARG:
                setFlag(KernArgSegment);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("Ld: segment %d not supported\n", segment);
            }
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
    };

    template<typename MemDataType, typename DestDataType,
             typename AddrOperandType>
    class LdInst :
        public LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
        typename DestDataType::OperandType::DestOperand dest_vect[4];
        uint16_t num_dest_operands;
        void generateDisassembly() override;

      public:
        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
            : LdInstBase<typename MemDataType::CType,
                         typename DestDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)brigOp;

                num_dest_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_dest_operands <= 4);
            } else {
                num_dest_operands = 1;
            }

            if (num_dest_operands > 1) {
                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_dest_operands; ++i) {
                    dest_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_dest_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_dest_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_dest_operands; ++k) {

                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // load from shared memory
                            *d = gpuDynInst->wavefront()->ldsChunk->
                                read<c0>(vaddr);
                        } else {
                            RequestPtr req = std::make_shared<Request>(0,
                                vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                            pkt->dataStatic(d);

                            if (gpuDynInst->computeUnit()->shader->
                                separate_acquire_release &&
                                gpuDynInst->isAcquire()) {
                                // if this load has acquire semantics,
                                // set the response continuation function
                                // to perform an Acquire request
                                gpuDynInst->execContinuation =
                                    &GPUStaticInst::execLdAcq;

                                gpuDynInst->useContinuation = true;
                            } else {
                                // the request will be finished when
                                // the load completes
                                gpuDynInst->useContinuation = false;
                            }
                            // translation is performed in sendRequest()
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c1;

            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

            /**
             * this code essentially replaces the long if-else chain
             * that was used in GlobalMemPipeline::exec() to infer the
             * size (single/double) and type (floating point/integer) of
             * the destination register. this is needed for load
             * instructions because the loaded value and the
             * destination type can be of different sizes, and we also
             * need to know if the value we're writing back is floating
             * point and signed/unsigned, so we can properly cast the
             * writeback value
             */
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float, typename std::conditional<std::is_signed<c1>::value,
                    int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double, typename std::conditional<std::is_signed<c1>::value,
                    int64_t, uint64_t>::type>::type>::type c0;


            Wavefront *w = gpuDynInst->wavefront();

            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                  *p1, i);
                    }
                    ++p1;
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write operation.
            // It does not modify the physical VGPR.
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }

      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed, and if the load has acquire
            // semantics, issue an acquire request.
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    // create request
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                }
            }
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            if (num_dest_operands > 1) {
                return dest_vect[operandIndex].isVectorRegister();
            }
            else if (num_dest_operands == 1) {
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isVectorRegister();
            }
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isCondRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isCondRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isScalarRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isScalarRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.opSize());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].opSize());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.opSize());
            return 0;
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.regIndex());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].regIndex());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.regIndex());
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return(num_dest_operands+1);
            else
                return(num_dest_operands);
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

    template<typename MemDT, typename DestDT>
    GPUStaticInst*
    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                   tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdInst<MemDT, DestDT,
                                  SRegAddrOperand>(ib, obj, "ld");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdInst<MemDT, DestDT,
                                  DRegAddrOperand>(ib, obj, "ld");
              default:
                fatal("Bad ld register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad ld register operand kind %d\n", tmp.kind);
        }
    }
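
    // decodeLd picks the destination register width (single/double) from the
    // BRIG destination operand and its type, then defers to decodeLd2, which
    // picks the address operand flavor and instantiates the matching LdInst
    // specialization.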
    template<typename MemDT>
    GPUStaticInst*
    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
        switch(dest.regKind) {
          case Brig::BRIG_REGISTER_KIND_SINGLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
              case Brig::BRIG_TYPE_B16:
              case Brig::BRIG_TYPE_B32:
                return decodeLd2<MemDT, B32>(ib, obj);
              case Brig::BRIG_TYPE_U8:
              case Brig::BRIG_TYPE_U16:
              case Brig::BRIG_TYPE_U32:
                return decodeLd2<MemDT, U32>(ib, obj);
              case Brig::BRIG_TYPE_S8:
              case Brig::BRIG_TYPE_S16:
              case Brig::BRIG_TYPE_S32:
                return decodeLd2<MemDT, S32>(ib, obj);
              case Brig::BRIG_TYPE_F16:
              case Brig::BRIG_TYPE_F32:
                return decodeLd2<MemDT, U32>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          case Brig::BRIG_REGISTER_KIND_DOUBLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B64:
                return decodeLd2<MemDT, B64>(ib, obj);
              case Brig::BRIG_TYPE_U64:
                return decodeLd2<MemDT, U64>(ib, obj);
              case Brig::BRIG_TYPE_S64:
                return decodeLd2<MemDT, S64>(ib, obj);
              case Brig::BRIG_TYPE_F64:
                return decodeLd2<MemDT, U64>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          default:
            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                  ib->type);
        }
    }
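
    // StInstBase is the store-side counterpart of LdInstBase: it decodes the
    // segment, memory order/scope, and equivalence class for st and for the
    // store form of atomics, and initializes the source and address operands.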
    template<typename MemDataType, typename SrcOperandType,
             typename AddrOperandType>
    class StInstBase : public HsailGPUStaticInst
    {
      public:
        typename SrcOperandType::SrcOperand src;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigMemoryOrder memoryOrder;
        unsigned int equivClass;

        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Store);

            if (ib->opcode == BRIG_OPCODE_ST) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const BrigOperand *baseOp = obj->getOperand(op_offs);

                if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
                    (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
                    src.init(op_offs, obj);
                }

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                equivClass = 0;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                src.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("StInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("StInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("St: segment %d not supported\n", segment);
            }
        }

        int numDstRegOperands() override { return 0; }
        int numSrcRegOperands() override
        {
            return src.isVectorRegister() + this->addr.isVectorRegister();
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isVectorRegister() :
                   this->addr.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isCondRegister() :
                   this->addr.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isScalarRegister() :
                   this->addr.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.opSize() : this->addr.opSize();
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.regIndex() : this->addr.regIndex();
        }
    };
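
    // StInst adds the bookkeeping needed for vector stores (up to four
    // source registers) and implements the timing-model hooks: initiateAcc
    // issues an optional release fence before the store, and execSt sends
    // the store either to the LDS chunk or to the memory system.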
    template<typename MemDataType, typename SrcDataType,
             typename AddrOperandType>
    class StInst :
        public StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>,
        public MemInst
    {
      public:
        typename SrcDataType::OperandType::SrcOperand src_vect[4];
        uint16_t num_src_operands;
        void generateDisassembly() override;

        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode, int srcIdx)
            : StInstBase<MemDataType, typename SrcDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(SrcDataType::memType)
        {
            init_addr(&this->addr);

            BrigRegOperandInfo rinfo;
            unsigned op_offs = obj->getOperandPtr(ib->operands, srcIdx);
            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                const Brig::BrigOperandConstantBytes *op =
                    (Brig::BrigOperandConstantBytes*)baseOp;

                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
                                           Brig::BRIG_TYPE_NONE);
            } else {
                rinfo = findRegDataType(op_offs, obj);
            }

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)baseOp;

                num_src_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_src_operands <= 4);
            } else {
                num_src_operands = 1;
            }

            if (num_src_operands > 1) {
                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_src_operands; ++i) {
                    src_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before performing a store, check if this store has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isRelease()) {

                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                    gpuDynInst->useContinuation = true;
                    // create request
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);

                    return;
                }
            }

            // if there is no release semantic, perform stores immediately
            execSt(gpuDynInst);
        }

        // stores don't write anything back, so there is nothing
        // to do here. we only override this method to avoid the
        // fatal in the base class implementation
        void completeAcc(GPUDynInstPtr gpuDynInst) override { }

      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
        // execSt in gpu_static_inst.hh
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_src_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_src_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_src_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            //store to shared memory
                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                         *d);
                        } else {
                            RequestPtr req = std::make_shared<Request>(
                                0, vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                            pkt->dataStatic<c0>(d);

                            // translation is performed in sendRequest()
                            // the request will be finished when the store
                            // completes
                            gpuDynInst->useContinuation = false;
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);

                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isVectorRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isVectorRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isVectorRegister();
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isCondRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isCondRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isScalarRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isScalarRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.opSize();
            if (num_src_operands > 1)
                return src_vect[operandIndex].opSize();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.opSize();
            return 0;
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.regIndex();
            if (num_src_operands > 1)
                return src_vect[operandIndex].regIndex();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.regIndex();
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return num_src_operands + 1;
            else
                return num_src_operands;
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

    template<typename DataType, typename SrcDataType>
    GPUStaticInst*
    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        int srcIdx = 0;
        int destIdx = 1;
        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
            srcIdx = 1;
            destIdx = 0;
        }
        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new StInst<DataType, SrcDataType,
                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new StInst<DataType, SrcDataType,
                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new StInst<DataType, SrcDataType,
                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
              default:
                fatal("Bad st register operand type %d\n", tmp.type);
            }
        } else {
            fatal("Bad st register operand kind %d\n", tmp.kind);
        }
    }
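
    // AtomicInstBase decodes the BRIG atomic fields (operation, memory order,
    // memory scope, and segment) into instruction flags. The HasDst template
    // parameter distinguishes atomic (which returns the old value) from
    // atomicnoret, which shifts the operand layout by one.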
    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
             bool HasDst>
    class AtomicInstBase : public HsailGPUStaticInst
    {
      public:
        typename OperandType::DestOperand dest;
        typename OperandType::SrcOperand src[NumSrcOperands];
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigAtomicOperation atomicOperation;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigOpcode opcode;

        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                       const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

            segment = (BrigSegment)at->segment;
            memoryScope = (BrigMemoryScope)at->memoryScope;
            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
            opcode = (BrigOpcode)ib->opcode;

            assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET ||
                   opcode == Brig::BRIG_OPCODE_ATOMIC);

            setFlag(MemoryRef);

            if (opcode == Brig::BRIG_OPCODE_ATOMIC) {
                setFlag(AtomicReturn);
            } else {
                setFlag(AtomicNoReturn);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("AtomicInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("AtomicInst has bad memory scope type\n");
            }

            switch (atomicOperation) {
              case Brig::BRIG_ATOMIC_AND:
                setFlag(AtomicAnd);
                break;
              case Brig::BRIG_ATOMIC_OR:
                setFlag(AtomicOr);
                break;
              case Brig::BRIG_ATOMIC_XOR:
                setFlag(AtomicXor);
                break;
              case Brig::BRIG_ATOMIC_CAS:
                setFlag(AtomicCAS);
                break;
              case Brig::BRIG_ATOMIC_EXCH:
                setFlag(AtomicExch);
                break;
              case Brig::BRIG_ATOMIC_ADD:
                setFlag(AtomicAdd);
                break;
              case Brig::BRIG_ATOMIC_WRAPINC:
                setFlag(AtomicInc);
                break;
              case Brig::BRIG_ATOMIC_WRAPDEC:
                setFlag(AtomicDec);
                break;
              case Brig::BRIG_ATOMIC_MIN:
                setFlag(AtomicMin);
                break;
              case Brig::BRIG_ATOMIC_MAX:
                setFlag(AtomicMax);
                break;
              case Brig::BRIG_ATOMIC_SUB:
                setFlag(AtomicSub);
                break;
              default:
                fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              default:
                panic("Atomic: segment %d not supported\n", segment);
            }

            if (HasDst) {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
                    src[i].init(op_offs, obj);
                }
            } else {

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
                    src[i].init(op_offs, obj);
                }
            }
        }

        int numSrcRegOperands()
        {
            int operands = 0;
            for (int i = 0; i < NumSrcOperands; i++) {
                if (src[i].isVectorRegister()) {
                    operands++;
                }
            }
            if (addr.isVectorRegister())
                operands++;
            return operands;
        }
        int numDstRegOperands() { return dest.isVectorRegister(); }
        int getNumOperands()
        {
            if (addr.isVectorRegister())
                return(NumSrcOperands + 2);
            return(NumSrcOperands + 1);
        }
        bool isVectorRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isVectorRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isCondRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isCondRegister());
            else
                return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isScalarRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isScalarRegister());
            else
                return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return true;
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return false;
        }
        bool isDstOperand(int operandIndex)
        {
            if (operandIndex <= NumSrcOperands)
                return false;
            else
                return true;
        }
        int getOperandSize(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].opSize());
            else if (operandIndex == NumSrcOperands)
                return(addr.opSize());
            else
                return(dest.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].regIndex());
            else if (operandIndex == NumSrcOperands)
                return(addr.regIndex());
            else
                return(dest.regIndex());
            return -1;
        }
    };
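
    // AtomicInst implements the timing-model hooks for atomics: initiateAcc
    // issues an optional release fence first, execAtomic performs the
    // read-modify-write (functionally on the LDS chunk for group-segment
    // atomics, or via a SwapReq packet for global memory), and completeAcc
    // writes the returned old value back for atomic-return operations.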
    template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
             bool HasDst>
    class AtomicInst :
        public AtomicInstBase<typename MemDataType::OperandType,
                              AddrOperandType, NumSrcOperands, HasDst>,
        public MemInst
    {
      public:
        void generateDisassembly() override;

        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
            : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
                             NumSrcOperands, HasDst>
                (ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before doing the RMW, check if this atomic has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && (gpuDynInst->isRelease()
                    || gpuDynInst->isAcquireRelease())) {

                    gpuDynInst->statusBitVector = VectorMask(1);

                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
                    gpuDynInst->useContinuation = true;

                    // create request
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);

                    return;
                }
            }

            // if there is no release semantic, execute the RMW immediately
            execAtomic(gpuDynInst);

        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if this is not an atomic return op, then we
            // have nothing more to do.
            if (this->isAtomicRet()) {
                // the size of the src operands and the
                // memory being operated on must match
                // for HSAIL atomics - this assumption may
                // not apply to all ISAs
                typedef typename MemDataType::CType CType;

                Wavefront *w = gpuDynInst->wavefront();
                int dst = this->dest.regIndex();
                std::vector<uint32_t> regVec;
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(CType), 1);
                regVec.push_back(physVgpr);
                CType *p1 = &((CType*)gpuDynInst->d_data)[0];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr,
                                                                     *p1, i);
                    }
                    ++p1;
                }

                // Schedule the write operation of the load data on the VRF.
                // This simply models the timing aspect of the VRF write
                // operation. It does not modify the physical VGPR.
                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                         sizeof(CType), gpuDynInst->time);

                if (this->isGlobalMem()) {
                    gpuDynInst->computeUnit()->globalMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                } else {
                    assert(this->isLocalMem());
                    gpuDynInst->computeUnit()->localMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                }
            }
        }

        void execute(GPUDynInstPtr gpuDynInst) override;

      private:
        // execAtomic may be called through a continuation
        // if the RMW had release semantics. see comment for
        // execContinuation in gpu_dyn_inst.hh
        void
        execAtomic(GPUDynInstPtr gpuDynInst) override
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            typedef typename MemDataType::CType c0;

            c0 *d = &((c0*) gpuDynInst->d_data)[0];
            c0 *e = &((c0*) gpuDynInst->a_data)[0];
            c0 *f = &((c0*) gpuDynInst->x_data)[0];

            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                if (gpuDynInst->exec_mask[i]) {
                    Addr vaddr = gpuDynInst->addr[i];

                    if (this->isLocalMem()) {
                        Wavefront *wavefront = gpuDynInst->wavefront();
                        *d = wavefront->ldsChunk->read<c0>(vaddr);

                        if (this->isAtomicAdd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                        } else if (this->isAtomicSub()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                        } else if (this->isAtomicMax()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::max(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                        } else if (this->isAtomicMin()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::min(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                        } else if (this->isAtomicAnd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                        } else if (this->isAtomicOr()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                        } else if (this->isAtomicXor()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                        } else if (this->isAtomicInc()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + 1);
                        } else if (this->isAtomicDec()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - 1);
                        } else if (this->isAtomicExch()) {
                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
                        } else if (this->isAtomicCAS()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
                                (*f) : wavefront->ldsChunk->read<c0>(vaddr));
                        } else {
                            fatal("Unrecognized or invalid HSAIL atomic op "
                                  "type.\n");
                        }
                    } else {
                        RequestPtr req =
                            std::make_shared<Request>(0, vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId,
                                gpuDynInst->makeAtomicOpFunctor<c0>(e, f));

                        gpuDynInst->setRequestFlags(req);
                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                        pkt->dataStatic(d);

                        if (gpuDynInst->computeUnit()->shader->
                            separate_acquire_release &&
                            (gpuDynInst->isAcquire())) {
                            // if this atomic has acquire semantics,
                            // schedule the continuation to perform an
                            // acquire after the RMW completes
                            gpuDynInst->execContinuation =
                                &GPUStaticInst::execAtomicAcq;

                            gpuDynInst->useContinuation = true;
                        } else {
                            // the request will be finished when the RMW
                            // completes
                            gpuDynInst->useContinuation = false;
                        }
                        // translation is performed in sendRequest()
                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                               pkt);
                    }
                }

                ++d;
                ++e;
                ++f;
            }

            gpuDynInst->updateStats();
        }

        // execAtomicAcq will always be called through a continuation.
        // see comment for execContinuation in gpu_dyn_inst.hh
        void
        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after performing the RMW, check to see if this instruction
            // has acquire semantics, and if so, issue an acquire
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);

                    // the request will be finished when
                    // the acquire completes
                    gpuDynInst->useContinuation = false;
                    // create request
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                }
            }
        }
    };
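
    // The atomic decode chain: decodeAtomic/decodeAtomicNoRet choose the
    // number of source operands (two for CAS, one otherwise),
    // decodeAtomicHelper selects the address operand type, and
    // constructAtomic either forwards atomic ld/st forms to the regular
    // ld/st decoders or instantiates the appropriate AtomicInst.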
    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
    GPUStaticInst*
    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
            return decodeLd<DataType>(ib, obj);
        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
                return decodeSt<S8,S8>(ib, obj);
              case Brig::BRIG_TYPE_B16:
                return decodeSt<S16,S16>(ib, obj);
              case Brig::BRIG_TYPE_B32:
                return decodeSt<S32,S32>(ib, obj);
              case Brig::BRIG_TYPE_B64:
                return decodeSt<S64,S64>(ib, obj);
              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
            }
        } else {
            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, false>(ib, obj, "atomicnoret");
            else
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, true>(ib, obj, "atomic");
        }
    }

    template<typename DataType, int NumSrcOperands>
    GPUStaticInst*
    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;

        unsigned op_offs = obj->getOperandPtr(ib->operands, addrIndex);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return constructAtomic<DataType, NoRegAddrOperand,
                                   NumSrcOperands>(ib, obj);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return constructAtomic<DataType, SRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return constructAtomic<DataType, DRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              default:
                fatal("Bad atomic register operand type %d\n", tmp.type);
            }
        } else {
            fatal("Bad atomic register operand kind %d\n", tmp.kind);
        }
    }


    template<typename DataType>
    GPUStaticInst*
    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }

    template<typename DataType>
    GPUStaticInst*
    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }
} // namespace HsailISA

#endif // __ARCH_HSAIL_INSTS_MEM_HH__