mem.hh revision 11699:c7453f485a5f
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"

namespace HsailISA
{
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }

            addr_operand = nullptr;
        }

        void
        init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;

      public:
        int getMemOperandSize() { return size; }
        AddrOperandBase *getAddressOperand() { return addr_operand; }
    };

    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(ALU);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override
        { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };

    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly();

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };

    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register operand type %d\n", regDataType.type);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }

    template<typename MemOperandType, typename DestOperandType,
             typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        Brig::BrigWidth8_t width;
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;
        unsigned int equivClass;
        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Load);

            if (ib->opcode == BRIG_OPCODE_LD) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                width = ldst->width;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                equivClass = 0;

                width = BRIG_WIDTH_1;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("LdInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("LdInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_KERNARG:
                setFlag(KernArgSegment);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("Ld: segment %d not supported\n", segment);
            }
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
    };

    template<typename MemDataType, typename DestDataType,
             typename AddrOperandType>
    class LdInst :
        public LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
        typename DestDataType::OperandType::DestOperand dest_vect[4];
        uint16_t num_dest_operands;
        void generateDisassembly() override;

      public:
        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
            : LdInstBase<typename MemDataType::CType,
                         typename DestDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)brigOp;

                num_dest_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_dest_operands <= 4);
            } else {
                num_dest_operands = 1;
            }

            if (num_dest_operands > 1) {
                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_dest_operands; ++i) {
                    dest_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_dest_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_dest_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_dest_operands; ++k) {

                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // load from shared memory
                            *d = gpuDynInst->wavefront()->ldsChunk->
                                read<c0>(vaddr);
                        } else {
                            Request *req = new Request(0, vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                            pkt->dataStatic(d);

                            if (gpuDynInst->computeUnit()->shader->
                                separate_acquire_release &&
                                gpuDynInst->isAcquire()) {
                                // if this load has acquire semantics,
                                // set the response continuation function
                                // to perform an Acquire request
                                gpuDynInst->execContinuation =
                                    &GPUStaticInst::execLdAcq;

                                gpuDynInst->useContinuation = true;
                            } else {
                                // the request will be finished when
                                // the load completes
                                gpuDynInst->useContinuation = false;
                            }
                            // translation is performed in sendRequest()
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c1;

            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

            /**
             * this code essentially replaces the long if-else chain
             * that was used in GlobalMemPipeline::exec() to infer the
             * size (single/double) and type (floating point/integer) of
             * the destination register. this is needed for load
             * instructions because the loaded value and the
             * destination type can be of different sizes, and we also
             * need to know if the value we're writing back is floating
             * point and signed/unsigned, so we can properly cast the
             * writeback value
             */
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float, typename std::conditional<std::is_signed<c1>::value,
                    int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double, typename std::conditional<std::is_signed<c1>::value,
                    int64_t, uint64_t>::type>::type>::type c0;


            Wavefront *w = gpuDynInst->wavefront();

            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                  *p1, i);
                    }
                    ++p1;
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write operation.
            // It does not modify the physical VGPR.
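            // (The value returned by exec() below is the number of cycles
            // lost to VRF bank conflicts on this write; it is accumulated
            // into the global or local memory pipeline's conflict stats.)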
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }

      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed, and if the load has acquire
            // semantics, issue an acquire request.
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
                }
            }
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            if (num_dest_operands > 1) {
                return dest_vect[operandIndex].isVectorRegister();
            }
            else if (num_dest_operands == 1) {
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isVectorRegister();
            }
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isCondRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isCondRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isScalarRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isScalarRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.opSize());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].opSize());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.opSize());
            return 0;
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.regIndex());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].regIndex());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.regIndex());
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return(num_dest_operands+1);
            else
                return(num_dest_operands);
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

    template<typename MemDT, typename DestDT>
    GPUStaticInst*
    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                   tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdInst<MemDT, DestDT,
                                  SRegAddrOperand>(ib, obj, "ld");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdInst<MemDT, DestDT,
                                  DRegAddrOperand>(ib, obj, "ld");
              default:
                fatal("Bad ld register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad ld register operand kind %d\n", tmp.kind);
        }
    }

    template<typename MemDT>
    GPUStaticInst*
    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
        switch(dest.regKind) {
          case Brig::BRIG_REGISTER_KIND_SINGLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
              case Brig::BRIG_TYPE_B16:
              case Brig::BRIG_TYPE_B32:
                return decodeLd2<MemDT, B32>(ib, obj);
              case Brig::BRIG_TYPE_U8:
              case Brig::BRIG_TYPE_U16:
              case Brig::BRIG_TYPE_U32:
                return decodeLd2<MemDT, U32>(ib, obj);
              case Brig::BRIG_TYPE_S8:
              case Brig::BRIG_TYPE_S16:
              case Brig::BRIG_TYPE_S32:
                return decodeLd2<MemDT, S32>(ib, obj);
              case Brig::BRIG_TYPE_F16:
              case Brig::BRIG_TYPE_F32:
                return decodeLd2<MemDT, U32>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          case Brig::BRIG_REGISTER_KIND_DOUBLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B64:
                return decodeLd2<MemDT, B64>(ib, obj);
              case Brig::BRIG_TYPE_U64:
                return decodeLd2<MemDT, U64>(ib, obj);
              case Brig::BRIG_TYPE_S64:
                return decodeLd2<MemDT, S64>(ib, obj);
              case Brig::BRIG_TYPE_F64:
                return decodeLd2<MemDT, U64>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          default:
            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                  ib->type);
        }
    }

    template<typename MemDataType, typename SrcOperandType,
             typename AddrOperandType>
    class StInstBase : public HsailGPUStaticInst
    {
      public:
        typename SrcOperandType::SrcOperand src;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigMemoryOrder memoryOrder;
        unsigned int equivClass;

        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Store);

            if (ib->opcode == BRIG_OPCODE_ST) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const BrigOperand *baseOp = obj->getOperand(op_offs);

                if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
                    (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
                    src.init(op_offs, obj);
                }

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                equivClass = 0;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                src.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("StInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("StInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("St: segment %d not supported\n", segment);
            }
        }

        int numDstRegOperands() override { return 0; }
        int numSrcRegOperands() override
        {
            return src.isVectorRegister() + this->addr.isVectorRegister();
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isVectorRegister() :
                   this->addr.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isCondRegister() :
                   this->addr.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isScalarRegister() :
                   this->addr.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.opSize() : this->addr.opSize();
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.regIndex() : this->addr.regIndex();
        }
    };


    template<typename MemDataType, typename SrcDataType,
             typename AddrOperandType>
    class StInst :
        public StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>,
        public MemInst
    {
      public:
        typename SrcDataType::OperandType::SrcOperand src_vect[4];
        uint16_t num_src_operands;
        void generateDisassembly() override;

        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode, int srcIdx)
            : StInstBase<MemDataType, typename SrcDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(SrcDataType::memType)
        {
            init_addr(&this->addr);

            BrigRegOperandInfo rinfo;
            unsigned op_offs = obj->getOperandPtr(ib->operands, srcIdx);
            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                const Brig::BrigOperandConstantBytes *op =
                    (Brig::BrigOperandConstantBytes*)baseOp;

                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
                                           Brig::BRIG_TYPE_NONE);
            } else {
                rinfo = findRegDataType(op_offs, obj);
            }

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)baseOp;

                num_src_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_src_operands <= 4);
            } else {
                num_src_operands = 1;
            }

            if (num_src_operands > 1) {
                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_src_operands; ++i) {
                    src_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before performing a store, check if this store has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isRelease()) {

                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                    gpuDynInst->useContinuation = true;
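                    // execSt below will then run as the continuation once
                    // the release fence completes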
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);

                    return;
                }
            }

            // if there is no release semantic, perform stores immediately
            execSt(gpuDynInst);
        }

        // stores don't write anything back, so there is nothing
        // to do here. we only override this method to avoid the
        // fatal in the base class implementation
        void completeAcc(GPUDynInstPtr gpuDynInst) override { }

      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
        // execSt in gpu_static_inst.hh
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_src_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_src_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_src_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            //store to shared memory
                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                         *d);
                        } else {
                            Request *req =
                                new Request(0, vaddr, sizeof(c0), 0,
                                            gpuDynInst->computeUnit()->masterId(),
                                            0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                            pkt->dataStatic<c0>(d);

                            // translation is performed in sendRequest()
                            // the request will be finished when the store completes
                            gpuDynInst->useContinuation = false;
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);

                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isVectorRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isVectorRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isVectorRegister();
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isCondRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isCondRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isScalarRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isScalarRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.opSize();
            if (num_src_operands > 1)
                return src_vect[operandIndex].opSize();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.opSize();
            return 0;
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.regIndex();
            if (num_src_operands > 1)
                return src_vect[operandIndex].regIndex();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.regIndex();
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return num_src_operands + 1;
            else
                return num_src_operands;
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

    template<typename DataType, typename SrcDataType>
    GPUStaticInst*
    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        int srcIdx = 0;
        int destIdx = 1;
        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
            srcIdx = 1;
            destIdx = 0;
        }
        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new StInst<DataType, SrcDataType,
                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new StInst<DataType, SrcDataType,
                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new StInst<DataType, SrcDataType,
                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
              default:
                fatal("Bad st register operand type %d\n", tmp.type);
            }
        } else {
            fatal("Bad st register operand kind %d\n", tmp.kind);
        }
    }

    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
             bool HasDst>
    class AtomicInstBase : public HsailGPUStaticInst
    {
      public:
        typename OperandType::DestOperand dest;
        typename OperandType::SrcOperand src[NumSrcOperands];
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigAtomicOperation atomicOperation;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigOpcode opcode;

        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                       const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

            segment = (BrigSegment)at->segment;
            memoryScope = (BrigMemoryScope)at->memoryScope;
            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
            opcode = (BrigOpcode)ib->opcode;

            assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET ||
                   opcode == Brig::BRIG_OPCODE_ATOMIC);

            setFlag(MemoryRef);

            if (opcode == Brig::BRIG_OPCODE_ATOMIC) {
                setFlag(AtomicReturn);
            } else {
                setFlag(AtomicNoReturn);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("AtomicInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("AtomicInst has bad memory scope type\n");
            }

            switch (atomicOperation) {
              case Brig::BRIG_ATOMIC_AND:
                setFlag(AtomicAnd);
                break;
              case Brig::BRIG_ATOMIC_OR:
                setFlag(AtomicOr);
                break;
              case Brig::BRIG_ATOMIC_XOR:
                setFlag(AtomicXor);
                break;
              case Brig::BRIG_ATOMIC_CAS:
                setFlag(AtomicCAS);
                break;
              case Brig::BRIG_ATOMIC_EXCH:
                setFlag(AtomicExch);
                break;
              case Brig::BRIG_ATOMIC_ADD:
                setFlag(AtomicAdd);
                break;
              case Brig::BRIG_ATOMIC_WRAPINC:
                setFlag(AtomicInc);
                break;
              case Brig::BRIG_ATOMIC_WRAPDEC:
                setFlag(AtomicDec);
                break;
              case Brig::BRIG_ATOMIC_MIN:
                setFlag(AtomicMin);
                break;
              case Brig::BRIG_ATOMIC_MAX:
                setFlag(AtomicMax);
                break;
              case Brig::BRIG_ATOMIC_SUB:
                setFlag(AtomicSub);
                break;
              default:
                fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              default:
                panic("Atomic: segment %d not supported\n", segment);
            }

            if (HasDst) {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
                    src[i].init(op_offs, obj);
                }
            } else {

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
                    src[i].init(op_offs, obj);
                }
            }
        }

        int numSrcRegOperands()
        {
            int operands = 0;
            for (int i = 0; i < NumSrcOperands; i++) {
                if (src[i].isVectorRegister()) {
                    operands++;
                }
            }
            if (addr.isVectorRegister())
                operands++;
            return operands;
        }
        int numDstRegOperands() { return dest.isVectorRegister(); }
        int getNumOperands()
        {
            if (addr.isVectorRegister())
                return(NumSrcOperands + 2);
            return(NumSrcOperands + 1);
        }
        bool isVectorRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isVectorRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isCondRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isCondRegister());
            else
                return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isScalarRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isScalarRegister());
            else
                return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return true;
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return false;
        }
        bool isDstOperand(int operandIndex)
        {
            if (operandIndex <= NumSrcOperands)
                return false;
            else
                return true;
        }
        int getOperandSize(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].opSize());
            else if (operandIndex == NumSrcOperands)
                return(addr.opSize());
            else
                return(dest.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].regIndex());
            else if (operandIndex == NumSrcOperands)
                return(addr.regIndex());
            else
                return(dest.regIndex());
            return -1;
        }
    };

    template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
             bool HasDst>
    class AtomicInst :
        public AtomicInstBase<typename MemDataType::OperandType,
                              AddrOperandType, NumSrcOperands, HasDst>,
        public MemInst
    {
      public:
        void generateDisassembly() override;

        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
            : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
                             NumSrcOperands, HasDst>
                (ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before doing the RMW, check if this atomic has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && (gpuDynInst->isRelease()
                    || gpuDynInst->isAcquireRelease())) {

                    gpuDynInst->statusBitVector = VectorMask(1);

                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
                    gpuDynInst->useContinuation = true;

                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);

                    return;
                }
            }

            // if there is no release semantic, execute the RMW immediately
            execAtomic(gpuDynInst);

        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if this is not an atomic return op, then we
            // have nothing more to do.
            if (this->isAtomicRet()) {
                // the size of the src operands and the
                // memory being operated on must match
                // for HSAIL atomics - this assumption may
                // not apply to all ISAs
                typedef typename MemDataType::CType CType;

                Wavefront *w = gpuDynInst->wavefront();
                int dst = this->dest.regIndex();
                std::vector<uint32_t> regVec;
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(CType), 1);
                regVec.push_back(physVgpr);
                CType *p1 = &((CType*)gpuDynInst->d_data)[0];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i);
                    }
                    ++p1;
                }

                // Schedule the write operation of the load data on the VRF.
                // This simply models the timing aspect of the VRF write operation.
                // It does not modify the physical VGPR.
                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                         sizeof(CType), gpuDynInst->time);

                if (this->isGlobalMem()) {
                    gpuDynInst->computeUnit()->globalMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                } else {
                    assert(this->isLocalMem());
                    gpuDynInst->computeUnit()->localMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                }
            }
        }

        void execute(GPUDynInstPtr gpuDynInst) override;

      private:
        // execAtomic may be called through a continuation
        // if the RMW had release semantics. see comment for
        // execContinuation in gpu_dyn_inst.hh
        void
        execAtomic(GPUDynInstPtr gpuDynInst) override
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            typedef typename MemDataType::CType c0;

            c0 *d = &((c0*) gpuDynInst->d_data)[0];
            c0 *e = &((c0*) gpuDynInst->a_data)[0];
            c0 *f = &((c0*) gpuDynInst->x_data)[0];

            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                if (gpuDynInst->exec_mask[i]) {
                    Addr vaddr = gpuDynInst->addr[i];

                    if (this->isLocalMem()) {
                        Wavefront *wavefront = gpuDynInst->wavefront();
                        *d = wavefront->ldsChunk->read<c0>(vaddr);

                        if (this->isAtomicAdd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                        } else if (this->isAtomicSub()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                        } else if (this->isAtomicMax()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            std::max(wavefront->ldsChunk->read<c0>(vaddr),
                            (*e)));
                        } else if (this->isAtomicMin()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            std::min(wavefront->ldsChunk->read<c0>(vaddr),
                            (*e)));
                        } else if (this->isAtomicAnd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                        } else if (this->isAtomicOr()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                        } else if (this->isAtomicXor()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                        } else if (this->isAtomicInc()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) + 1);
                        } else if (this->isAtomicDec()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) - 1);
                        } else if (this->isAtomicExch()) {
                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
                        } else if (this->isAtomicCAS()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
                            (*f) : wavefront->ldsChunk->read<c0>(vaddr));
                        } else {
                            fatal("Unrecognized or invalid HSAIL atomic op "
                                  "type.\n");
                        }
                    } else {
                        Request *req =
                            new Request(0, vaddr, sizeof(c0), 0,
                                        gpuDynInst->computeUnit()->masterId(),
                                        0, gpuDynInst->wfDynId,
                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
                                        f));

                        gpuDynInst->setRequestFlags(req);
                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                        pkt->dataStatic(d);

                        if (gpuDynInst->computeUnit()->shader->
                            separate_acquire_release &&
                            (gpuDynInst->isAcquire())) {
                            // if this atomic has acquire semantics,
                            // schedule the continuation to perform an
                            // acquire after the RMW completes
                            gpuDynInst->execContinuation =
                                &GPUStaticInst::execAtomicAcq;

                            gpuDynInst->useContinuation = true;
                        } else {
                            // the request will be finished when the RMW completes
                            gpuDynInst->useContinuation = false;
                        }
                        // translation is performed in sendRequest()
                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                               pkt);
                    }
                }

                ++d;
                ++e;
                ++f;
            }

            gpuDynInst->updateStats();
        }

        // execAtomicAcq will always be called through a continuation.
        // see comment for execContinuation in gpu_dyn_inst.hh
        void
        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after performing the RMW, check to see if this instruction
            // has acquire semantics, and if so, issue an acquire
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                     && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);

                    // the request will be finished when
                    // the acquire completes
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
                }
            }
        }
    };

    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
    GPUStaticInst*
    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
            return decodeLd<DataType>(ib, obj);
        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
                return decodeSt<S8,S8>(ib, obj);
              case Brig::BRIG_TYPE_B16:
                return decodeSt<S16,S16>(ib, obj);
              case Brig::BRIG_TYPE_B32:
                return decodeSt<S32,S32>(ib, obj);
              case Brig::BRIG_TYPE_B64:
                return decodeSt<S64,S64>(ib, obj);
              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
            }
        } else {
            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, false>(ib, obj, "atomicnoret");
            else
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, true>(ib, obj, "atomic");
        }
    }

    template<typename DataType, int NumSrcOperands>
    GPUStaticInst*
    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;

        unsigned op_offs = obj->getOperandPtr(ib->operands, addrIndex);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return constructAtomic<DataType, NoRegAddrOperand,
                                   NumSrcOperands>(ib, obj);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return constructAtomic<DataType, SRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return constructAtomic<DataType, DRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              default:
                fatal("Bad atomic register operand type %d\n", tmp.type);
            }
        } else {
            fatal("Bad atomic register operand kind %d\n", tmp.kind);
        }
    }


    template<typename DataType>
    GPUStaticInst*
    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }

    template<typename DataType>
    GPUStaticInst*
    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }
} // namespace HsailISA

#endif // __ARCH_HSAIL_INSTS_MEM_HH__