/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"

namespace HsailISA
{
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }

            addr_operand = nullptr;
        }

        void
        init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;

      public:
        int getMemOperandSize() { return size; }
        AddrOperandBase *getAddressOperand() { return addr_operand; }
    };
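
    // Note: MemInst only caches the per-lane access size implied by the
    // declared HSAIL memory type: 64-bit types map to 8 bytes, 32-bit
    // types to 4, 16-bit types to 2, and everything else to 1. A minimal
    // illustration, assuming the Enums::MemType values used above:
    //
    //     MemInst mi(Enums::M_F32);
    //     assert(mi.getMemOperandSize() == 4); // 32-bit float -> 4 bytes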

    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(ALU);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override
        { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };

    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType,
                           AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly();

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };

    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register operand type %d\n", regDataType.type);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }
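
    // Note: decodeLda above illustrates the dispatch pattern used by every
    // decode helper in this file: inspect the kind of the address operand
    // (a plain address, or a single/double register holding one) and
    // instantiate the instruction template with the matching
    // AddrOperandType (NoRegAddrOperand, SRegAddrOperand, or
    // DRegAddrOperand). The same three-way switch recurs below for ld,
    // st, and the atomics.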

    template<typename MemOperandType, typename DestOperandType,
             typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        Brig::BrigWidth8_t width;
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;
        unsigned int equivClass;

        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Load);

            if (ib->opcode == BRIG_OPCODE_LD) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                width = ldst->width;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                equivClass = 0;

                width = BRIG_WIDTH_1;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("LdInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("LdInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_KERNARG:
                setFlag(KernArgSegment);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("Ld: segment %d not supported\n", segment);
            }
        }
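
        // Note: both plain ld and the ld form of the atomics decode
        // through this base class. The BRIG_OPCODE_LD branch above takes
        // order, scope, and width from BrigInstMem, while the atomic
        // branch takes them from BrigInstAtomic and fixes the width at
        // BRIG_WIDTH_1 (a single work-item).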

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
    };

    template<typename MemDataType, typename DestDataType,
             typename AddrOperandType>
    class LdInst :
        public LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
        typename DestDataType::OperandType::DestOperand dest_vect[4];
        uint16_t num_dest_operands;
        void generateDisassembly() override;

      public:
        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
            : LdInstBase<typename MemDataType::CType,
                         typename DestDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)brigOp;

                num_dest_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_dest_operands <= 4);
            } else {
                num_dest_operands = 1;
            }

            if (num_dest_operands > 1) {
                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_dest_operands; ++i) {
                    dest_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }
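
        // Note: for vector loads the destination is a
        // BRIG_KIND_OPERAND_OPERAND_LIST; the first word of its data
        // entry is (presumably) the byte length of the array of 4-byte
        // register offsets that follows, so dividing by four yields the
        // register count, e.g. an HSAIL ld_v4 gives num_dest_operands == 4.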

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_dest_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_dest_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_dest_operands; ++k) {

                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // load from shared memory
                            *d = gpuDynInst->wavefront()->ldsChunk->
                                read<c0>(vaddr);
                        } else {
                            Request *req = new Request(0, vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                            pkt->dataStatic(d);

                            if (gpuDynInst->computeUnit()->shader->
                                separate_acquire_release &&
                                gpuDynInst->isAcquire()) {
                                // if this load has acquire semantics,
                                // set the response continuation function
                                // to perform an Acquire request
                                gpuDynInst->execContinuation =
                                    &GPUStaticInst::execLdAcq;

                                gpuDynInst->useContinuation = true;
                            } else {
                                // the request will be finished when
                                // the load completes
                                gpuDynInst->useContinuation = false;
                            }
                            // translation is performed in sendRequest()
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }
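
        // Note: gpuDynInst->d_data is staged as num_dest_operands
        // contiguous wavefront-sized arrays, so lane i's k-th destination
        // element lives at ((c0*)d_data)[k * wfSize() + i]. initiateAcc
        // above fills it (LDS) or targets it with read packets (global),
        // and completeAcc below drains it into the physical VGPRs.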

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c1;

            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

            /**
             * this code essentially replaces the long if-else chain
             * that was used in GlobalMemPipeline::exec() to infer the
             * size (single/double) and type (floating point/integer) of
             * the destination register. this is needed for load
             * instructions because the loaded value and the
             * destination type can be of different sizes, and we also
             * need to know if the value we're writing back is floating
             * point and signed/unsigned, so we can properly cast the
             * writeback value
             */
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float, typename std::conditional<std::is_signed<c1>::value,
                    int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double, typename std::conditional<std::is_signed<c1>::value,
                    int64_t, uint64_t>::type>::type>::type c0;

            Wavefront *w = gpuDynInst->wavefront();

            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                  *p1, i);
                    }
                    ++p1;
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write
            // operation. It does not modify the physical VGPR.
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }

      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed, and if the load has acquire
            // semantics, issue an acquire request.
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                }
            }
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            if (num_dest_operands > 1) {
                return dest_vect[operandIndex].isVectorRegister();
            }
            else if (num_dest_operands == 1) {
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isVectorRegister();
            }
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isCondRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isCondRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isScalarRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isScalarRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.opSize());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].opSize());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.opSize());
            return 0;
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.regIndex());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].regIndex());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.regIndex());
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return(num_dest_operands+1);
            else
                return(num_dest_operands);
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

    template<typename MemDT, typename DestDT>
    GPUStaticInst*
    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                   tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdInst<MemDT, DestDT,
                                  SRegAddrOperand>(ib, obj, "ld");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdInst<MemDT, DestDT,
                                  DRegAddrOperand>(ib, obj, "ld");
              default:
                fatal("Bad ld register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad ld register operand kind %d\n", tmp.kind);
        }
    }
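
    // Note: decodeLd below resolves the destination data type from the
    // register kind and the BRIG element type before delegating to
    // decodeLd2 for the address-operand dispatch. Floating-point
    // destinations are carried in unsigned integer containers of the same
    // width (F16/F32 -> U32, F64 -> U64); the std::conditional logic in
    // completeAcc then infers signedness and floatness from the memory
    // type when casting the writeback value.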

    template<typename MemDT>
    GPUStaticInst*
    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
        switch(dest.regKind) {
          case Brig::BRIG_REGISTER_KIND_SINGLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
              case Brig::BRIG_TYPE_B16:
              case Brig::BRIG_TYPE_B32:
                return decodeLd2<MemDT, B32>(ib, obj);
              case Brig::BRIG_TYPE_U8:
              case Brig::BRIG_TYPE_U16:
              case Brig::BRIG_TYPE_U32:
                return decodeLd2<MemDT, U32>(ib, obj);
              case Brig::BRIG_TYPE_S8:
              case Brig::BRIG_TYPE_S16:
              case Brig::BRIG_TYPE_S32:
                return decodeLd2<MemDT, S32>(ib, obj);
              case Brig::BRIG_TYPE_F16:
              case Brig::BRIG_TYPE_F32:
                return decodeLd2<MemDT, U32>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          case Brig::BRIG_REGISTER_KIND_DOUBLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B64:
                return decodeLd2<MemDT, B64>(ib, obj);
              case Brig::BRIG_TYPE_U64:
                return decodeLd2<MemDT, U64>(ib, obj);
              case Brig::BRIG_TYPE_S64:
                return decodeLd2<MemDT, S64>(ib, obj);
              case Brig::BRIG_TYPE_F64:
                return decodeLd2<MemDT, U64>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          default:
            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                  ib->type);
        }
    }
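
    // Stores mirror the load hierarchy: StInstBase decodes the segment,
    // memory order, and memory scope flags, while StInst (further below)
    // adds vector source operands and the release-fence handling.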

    template<typename MemDataType, typename SrcOperandType,
             typename AddrOperandType>
    class StInstBase : public HsailGPUStaticInst
    {
      public:
        typename SrcOperandType::SrcOperand src;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigMemoryOrder memoryOrder;
        unsigned int equivClass;

        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Store);

            if (ib->opcode == BRIG_OPCODE_ST) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const BrigOperand *baseOp = obj->getOperand(op_offs);

                if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
                    (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
                    src.init(op_offs, obj);
                }

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                equivClass = 0;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                src.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("StInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("StInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("St: segment %d not supported\n", segment);
            }
        }

        int numDstRegOperands() override { return 0; }
        int numSrcRegOperands() override
        {
            return src.isVectorRegister() + this->addr.isVectorRegister();
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isVectorRegister() :
                   this->addr.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isCondRegister() :
                   this->addr.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isScalarRegister() :
                   this->addr.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.opSize() : this->addr.opSize();
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.regIndex() : this->addr.regIndex();
        }
    };
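
    // Note: the srcIdx constructor argument below is the position of the
    // source operand in the Brig operand list: 0 for a plain st (source
    // first, then address), 1 for the atomic st form routed here by
    // decodeSt, where the address comes first.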

    template<typename MemDataType, typename SrcDataType,
             typename AddrOperandType>
    class StInst :
        public StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>,
        public MemInst
    {
      public:
        typename SrcDataType::OperandType::SrcOperand src_vect[4];
        uint16_t num_src_operands;
        void generateDisassembly() override;

        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode, int srcIdx)
            : StInstBase<MemDataType, typename SrcDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(SrcDataType::memType)
        {
            init_addr(&this->addr);

            BrigRegOperandInfo rinfo;
            unsigned op_offs = obj->getOperandPtr(ib->operands, srcIdx);
            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                const Brig::BrigOperandConstantBytes *op =
                    (Brig::BrigOperandConstantBytes*)baseOp;

                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
                                           Brig::BRIG_TYPE_NONE);
            } else {
                rinfo = findRegDataType(op_offs, obj);
            }

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)baseOp;

                num_src_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_src_operands <= 4);
            } else {
                num_src_operands = 1;
            }

            if (num_src_operands > 1) {
                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_src_operands; ++i) {
                    src_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before performing a store, check if this store has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isRelease()) {

                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                    gpuDynInst->useContinuation = true;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);

                    return;
                }
            }

            // if there are no release semantics, perform stores immediately
            execSt(gpuDynInst);
        }

        // stores don't write anything back, so there is nothing
        // to do here. we only override this method to avoid the
        // fatal in the base class implementation
        void completeAcc(GPUDynInstPtr gpuDynInst) override { }
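
        // Note: when separate_acquire_release is enabled and the store
        // carries release semantics, initiateAcc first injects a zero-size
        // fence request flagged Request::RELEASE and registers execSt as
        // its continuation; the actual stores are only issued once the
        // release completes.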

      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
        // execSt in gpu_static_inst.hh
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_src_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_src_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_src_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // store to shared memory
                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                         *d);
                        } else {
                            Request *req =
                                new Request(0, vaddr, sizeof(c0), 0,
                                    gpuDynInst->computeUnit()->masterId(),
                                    0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                            pkt->dataStatic<c0>(d);

                            // translation is performed in sendRequest()
                            // the request will be finished when the store
                            // completes
                            gpuDynInst->useContinuation = false;
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);

                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isVectorRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isVectorRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isVectorRegister();
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isCondRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isCondRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isScalarRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isScalarRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.opSize();
            if (num_src_operands > 1)
                return src_vect[operandIndex].opSize();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.opSize();
            return 0;
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.regIndex();
            if (num_src_operands > 1)
                return src_vect[operandIndex].regIndex();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.regIndex();
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return num_src_operands + 1;
            else
                return num_src_operands;
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

    template<typename DataType, typename SrcDataType>
    GPUStaticInst*
    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        int srcIdx = 0;
        int destIdx = 1;
        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
            srcIdx = 1;
            destIdx = 0;
        }
        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new StInst<DataType, SrcDataType,
                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new StInst<DataType, SrcDataType,
                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new StInst<DataType, SrcDataType,
                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
              default:
                fatal("Bad st register operand type %d\n", tmp.type);
            }
        } else {
            fatal("Bad st register operand kind %d\n", tmp.kind);
        }
    }
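
    // Atomics. The Brig operand layout decoded below is: for the
    // returning form (BRIG_OPCODE_ATOMIC) operand 0 is the destination,
    // operand 1 the address, and the sources follow; for
    // BRIG_OPCODE_ATOMICNORET the address comes first and the sources
    // follow it.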

    template<typename OperandType, typename AddrOperandType,
             int NumSrcOperands, bool HasDst>
    class AtomicInstBase : public HsailGPUStaticInst
    {
      public:
        typename OperandType::DestOperand dest;
        typename OperandType::SrcOperand src[NumSrcOperands];
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigAtomicOperation atomicOperation;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigOpcode opcode;

        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                       const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

            segment = (BrigSegment)at->segment;
            memoryScope = (BrigMemoryScope)at->memoryScope;
            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
            opcode = (BrigOpcode)ib->opcode;

            assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET ||
                   opcode == Brig::BRIG_OPCODE_ATOMIC);

            setFlag(MemoryRef);

            if (opcode == Brig::BRIG_OPCODE_ATOMIC) {
                setFlag(AtomicReturn);
            } else {
                setFlag(AtomicNoReturn);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("AtomicInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("AtomicInst has bad memory scope type\n");
            }

            switch (atomicOperation) {
              case Brig::BRIG_ATOMIC_AND:
                setFlag(AtomicAnd);
                break;
              case Brig::BRIG_ATOMIC_OR:
                setFlag(AtomicOr);
                break;
              case Brig::BRIG_ATOMIC_XOR:
                setFlag(AtomicXor);
                break;
              case Brig::BRIG_ATOMIC_CAS:
                setFlag(AtomicCAS);
                break;
              case Brig::BRIG_ATOMIC_EXCH:
                setFlag(AtomicExch);
                break;
              case Brig::BRIG_ATOMIC_ADD:
                setFlag(AtomicAdd);
                break;
              case Brig::BRIG_ATOMIC_WRAPINC:
                setFlag(AtomicInc);
                break;
              case Brig::BRIG_ATOMIC_WRAPDEC:
                setFlag(AtomicDec);
                break;
              case Brig::BRIG_ATOMIC_MIN:
                setFlag(AtomicMin);
                break;
              case Brig::BRIG_ATOMIC_MAX:
                setFlag(AtomicMax);
                break;
              case Brig::BRIG_ATOMIC_SUB:
                setFlag(AtomicSub);
                break;
              default:
                fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              default:
                panic("Atomic: segment %d not supported\n", segment);
            }

            if (HasDst) {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
                    src[i].init(op_offs, obj);
                }
            } else {

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
                    src[i].init(op_offs, obj);
                }
            }
        }

        int numSrcRegOperands()
        {
            int operands = 0;
            for (int i = 0; i < NumSrcOperands; i++) {
                if (src[i].isVectorRegister()) {
                    operands++;
                }
            }
            if (addr.isVectorRegister())
                operands++;
            return operands;
        }
        int numDstRegOperands() { return dest.isVectorRegister(); }
        int getNumOperands()
        {
            if (addr.isVectorRegister())
                return(NumSrcOperands + 2);
            return(NumSrcOperands + 1);
        }
        bool isVectorRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isVectorRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isCondRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isCondRegister());
            else
                return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isScalarRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isScalarRegister());
            else
                return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return true;
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return false;
        }
        bool isDstOperand(int operandIndex)
        {
            if (operandIndex <= NumSrcOperands)
                return false;
            else
                return true;
        }
        int getOperandSize(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].opSize());
            else if (operandIndex == NumSrcOperands)
                return(addr.opSize());
            else
                return(dest.opSize());
        }
        int getRegisterIndex(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].regIndex());
            else if (operandIndex == NumSrcOperands)
                return(addr.regIndex());
            else
                return(dest.regIndex());
            return -1;
        }
    };

    template<typename MemDataType, typename AddrOperandType,
             int NumSrcOperands, bool HasDst>
    class AtomicInst :
        public AtomicInstBase<typename MemDataType::OperandType,
                              AddrOperandType, NumSrcOperands, HasDst>,
        public MemInst
    {
      public:
        void generateDisassembly() override;

        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
            : AtomicInstBase<typename MemDataType::OperandType,
                             AddrOperandType, NumSrcOperands, HasDst>
                (ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);
        }
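
        // Note: a global atomic with both acquire and release semantics
        // proceeds in up to three phases, chained through continuations:
        // initiateAcc issues a release fence (continuation: execAtomic),
        // execAtomic performs the RMW itself, and, if acquire semantics
        // are present, execAtomicAcq finally issues an acquire fence.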

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before doing the RMW, check if this atomic has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && (gpuDynInst->isRelease()
                    || gpuDynInst->isAcquireRelease())) {

                    gpuDynInst->statusBitVector = VectorMask(1);

                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
                    gpuDynInst->useContinuation = true;

                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);

                    return;
                }
            }

            // if there are no release semantics, execute the RMW immediately
            execAtomic(gpuDynInst);

        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if this is not an atomic return op, then we
            // have nothing more to do.
            if (this->isAtomicRet()) {
                // the size of the src operands and the
                // memory being operated on must match
                // for HSAIL atomics - this assumption may
                // not apply to all ISAs
                typedef typename MemDataType::CType CType;

                Wavefront *w = gpuDynInst->wavefront();
                int dst = this->dest.regIndex();
                std::vector<uint32_t> regVec;
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(CType), 1);
                regVec.push_back(physVgpr);
                CType *p1 = &((CType*)gpuDynInst->d_data)[0];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr,
                                                                     *p1, i);
                    }
                    ++p1;
                }

                // Schedule the write operation of the load data on the VRF.
                // This simply models the timing aspect of the VRF write
                // operation. It does not modify the physical VGPR.
                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                         sizeof(CType), gpuDynInst->time);

                if (this->isGlobalMem()) {
                    gpuDynInst->computeUnit()->globalMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                } else {
                    assert(this->isLocalMem());
                    gpuDynInst->computeUnit()->localMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                }
            }
        }

        void execute(GPUDynInstPtr gpuDynInst) override;
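
        // Note: for the RMW, gpuDynInst->a_data holds the first source
        // operand per lane (the compare value for CAS) and
        // gpuDynInst->x_data the second (the CAS swap value); d_data
        // receives the old memory value for the returning form.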

      private:
        // execAtomic may be called through a continuation
        // if the RMW had release semantics. see comment for
        // execContinuation in gpu_dyn_inst.hh
        void
        execAtomic(GPUDynInstPtr gpuDynInst) override
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            typedef typename MemDataType::CType c0;

            c0 *d = &((c0*) gpuDynInst->d_data)[0];
            c0 *e = &((c0*) gpuDynInst->a_data)[0];
            c0 *f = &((c0*) gpuDynInst->x_data)[0];

            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                if (gpuDynInst->exec_mask[i]) {
                    Addr vaddr = gpuDynInst->addr[i];

                    if (this->isLocalMem()) {
                        Wavefront *wavefront = gpuDynInst->wavefront();
                        *d = wavefront->ldsChunk->read<c0>(vaddr);

                        if (this->isAtomicAdd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                        } else if (this->isAtomicSub()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                        } else if (this->isAtomicMax()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::max(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                        } else if (this->isAtomicMin()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::min(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                        } else if (this->isAtomicAnd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                        } else if (this->isAtomicOr()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                        } else if (this->isAtomicXor()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                        } else if (this->isAtomicInc()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + 1);
                        } else if (this->isAtomicDec()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - 1);
                        } else if (this->isAtomicExch()) {
                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
                        } else if (this->isAtomicCAS()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
                                (*f) : wavefront->ldsChunk->read<c0>(vaddr));
                        } else {
                            fatal("Unrecognized or invalid HSAIL atomic op "
                                  "type.\n");
                        }
                    } else {
                        Request *req =
                            new Request(0, vaddr, sizeof(c0), 0,
                                        gpuDynInst->computeUnit()->masterId(),
                                        0, gpuDynInst->wfDynId,
                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
                                                                            f));

                        gpuDynInst->setRequestFlags(req);
                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                        pkt->dataStatic(d);

                        if (gpuDynInst->computeUnit()->shader->
                            separate_acquire_release &&
                            (gpuDynInst->isAcquire())) {
                            // if this atomic has acquire semantics,
                            // schedule the continuation to perform an
                            // acquire after the RMW completes
                            gpuDynInst->execContinuation =
                                &GPUStaticInst::execAtomicAcq;

                            gpuDynInst->useContinuation = true;
                        } else {
                            // the request will be finished when the RMW
                            // completes
                            gpuDynInst->useContinuation = false;
                        }
                        // translation is performed in sendRequest()
                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                               pkt);
                    }
                }

                ++d;
                ++e;
                ++f;
            }

            gpuDynInst->updateStats();
        }

        // execAtomicAcq will always be called through a continuation.
        // see comment for execContinuation in gpu_dyn_inst.hh
        void
        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after performing the RMW, check to see if this instruction
            // has acquire semantics, and if so, issue an acquire
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);

                    // the request will be finished when
                    // the acquire completes
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                }
            }
        }
    };

    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
    GPUStaticInst*
    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
            return decodeLd<DataType>(ib, obj);
        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
                return decodeSt<S8,S8>(ib, obj);
              case Brig::BRIG_TYPE_B16:
                return decodeSt<S16,S16>(ib, obj);
              case Brig::BRIG_TYPE_B32:
                return decodeSt<S32,S32>(ib, obj);
              case Brig::BRIG_TYPE_B64:
                return decodeSt<S64,S64>(ib, obj);
              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
            }
        } else {
            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, false>(ib, obj, "atomicnoret");
            else
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, true>(ib, obj, "atomic");
        }
    }
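
    // Note: constructAtomic re-routes BRIG_ATOMIC_LD and BRIG_ATOMIC_ST
    // to the plain load/store decoders above, so only true
    // read-modify-write operations become AtomicInst instances.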

    template<typename DataType, int NumSrcOperands>
    GPUStaticInst*
    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;

        unsigned op_offs = obj->getOperandPtr(ib->operands, addrIndex);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return constructAtomic<DataType, NoRegAddrOperand,
                                   NumSrcOperands>(ib, obj);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return constructAtomic<DataType, SRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return constructAtomic<DataType, DRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              default:
                fatal("Bad atomic register operand type %d\n", tmp.type);
            }
        } else {
            fatal("Bad atomic register operand kind %d\n", tmp.kind);
        }
    }

    template<typename DataType>
    GPUStaticInst*
    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }

    template<typename DataType>
    GPUStaticInst*
    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }
} // namespace HsailISA

#endif // __ARCH_HSAIL_INSTS_MEM_HH__