mem.hh (11692:e772fdcd3809) | mem.hh (11693:bc1f702c25b9) |
---|---|
1/* 2 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: --- 22 unchanged lines hidden (view full) --- 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Steve Reinhardt 34 */ 35 36#ifndef __ARCH_HSAIL_INSTS_MEM_HH__ 37#define __ARCH_HSAIL_INSTS_MEM_HH__ 38 | 1/* 2 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: --- 22 unchanged lines hidden (view full) --- 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Steve Reinhardt 34 */ 35 36#ifndef __ARCH_HSAIL_INSTS_MEM_HH__ 37#define __ARCH_HSAIL_INSTS_MEM_HH__ 38 |
39#include <type_traits> 40 |
|
39#include "arch/hsail/insts/decl.hh" 40#include "arch/hsail/insts/gpu_static_inst.hh" 41#include "arch/hsail/operand.hh" | 41#include "arch/hsail/insts/decl.hh" 42#include "arch/hsail/insts/gpu_static_inst.hh" 43#include "arch/hsail/operand.hh" |
44#include "gpu-compute/compute_unit.hh" |
|
42 43namespace HsailISA 44{ 45 class MemInst 46 { 47 public: 48 MemInst() : size(0), addr_operand(nullptr) { } 49 --- 436 unchanged lines hidden (view full) --- 486 } 487 ++d; 488 } 489 } 490 491 gpuDynInst->updateStats(); 492 } 493 | 45 46namespace HsailISA 47{ 48 class MemInst 49 { 50 public: 51 MemInst() : size(0), addr_operand(nullptr) { } 52 --- 436 unchanged lines hidden (view full) --- 489 } 490 ++d; 491 } 492 } 493 494 gpuDynInst->updateStats(); 495 } 496 |
497 void 498 completeAcc(GPUDynInstPtr gpuDynInst) override 499 { 500 typedef typename MemDataType::CType c1; 501 502 constexpr bool is_vt_32 = DestDataType::vgprType == VT_32; 503 504 /** 505 * this code essentially replaces the long if-else chain 506 * that was in used GlobalMemPipeline::exec() to infer the 507 * size (single/double) and type (floating point/integer) of 508 * the destination register. this is needed for load 509 * instructions because the loaded value and the 510 * destination type can be of different sizes, and we also 511 * need to know if the value we're writing back is floating 512 * point and signed/unsigned, so we can properly cast the 513 * writeback value 514 */ 515 typedef typename std::conditional<is_vt_32, 516 typename std::conditional<std::is_floating_point<c1>::value, 517 float, typename std::conditional<std::is_signed<c1>::value, 518 int32_t, uint32_t>::type>::type, 519 typename std::conditional<std::is_floating_point<c1>::value, 520 double, typename std::conditional<std::is_signed<c1>::value, 521 int64_t, uint64_t>::type>::type>::type c0; 522 523 524 Wavefront *w = gpuDynInst->wavefront(); 525 526 std::vector<uint32_t> regVec; 527 // iterate over number of destination register operands since 528 // this is a load 529 for (int k = 0; k < num_dest_operands; ++k) { 530 assert((sizeof(c1) * num_dest_operands) 531 <= MAX_WIDTH_FOR_MEM_INST); 532 533 int dst = this->dest.regIndex() + k; 534 if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST) 535 dst = dest_vect[k].regIndex(); 536 // virtual->physical VGPR mapping 537 int physVgpr = w->remap(dst, sizeof(c0), 1); 538 // save the physical VGPR index 539 regVec.push_back(physVgpr); 540 541 c1 *p1 = 542 &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()]; 543 544 for (int i = 0; i < w->computeUnit->wfSize(); ++i) { 545 if (gpuDynInst->exec_mask[i]) { 546 DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " 547 "$%s%d <- %d global ld done (src = wavefront " 548 "ld inst)\n", 
w->computeUnit->cu_id, w->simdId, 549 w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", 550 dst, *p1); 551 // write the value into the physical VGPR. This is a 552 // purely functional operation. No timing is modeled. 553 w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, 554 *p1, i); 555 } 556 ++p1; 557 } 558 } 559 560 // Schedule the write operation of the load data on the VRF. 561 // This simply models the timing aspect of the VRF write operation. 562 // It does not modify the physical VGPR. 563 int loadVrfBankConflictCycles = gpuDynInst->computeUnit()-> 564 vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec, 565 sizeof(c0), gpuDynInst->time); 566 567 if (this->isGlobalMem()) { 568 gpuDynInst->computeUnit()->globalMemoryPipe 569 .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); 570 } else { 571 assert(this->isLocalMem()); 572 gpuDynInst->computeUnit()->localMemoryPipe 573 .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); 574 } 575 } 576 |
|
494 private: 495 void 496 execLdAcq(GPUDynInstPtr gpuDynInst) override 497 { 498 // after the load has complete and if the load has acquire 499 // semantics, issue an acquire request. 500 if (!this->isLocalMem()) { 501 if (gpuDynInst->computeUnit()->shader->separate_acquire_release --- 434 unchanged lines hidden (view full) --- 936 return; 937 } 938 } 939 940 // if there is no release semantic, perform stores immediately 941 execSt(gpuDynInst); 942 } 943 | 577 private: 578 void 579 execLdAcq(GPUDynInstPtr gpuDynInst) override 580 { 581 // after the load has complete and if the load has acquire 582 // semantics, issue an acquire request. 583 if (!this->isLocalMem()) { 584 if (gpuDynInst->computeUnit()->shader->separate_acquire_release --- 434 unchanged lines hidden (view full) --- 1019 return; 1020 } 1021 } 1022 1023 // if there is no release semantic, perform stores immediately 1024 execSt(gpuDynInst); 1025 } 1026 |
// Stores have no destination register, so there is no writeback
// work to do here. We override completeAcc() only to avoid the
// fatal in the base class implementation.
void completeAcc(GPUDynInstPtr gpuDynInst) override { }
|
944 private: 945 // execSt may be called through a continuation 946 // if the store had release semantics. see comment for 947 // execSt in gpu_static_inst.hh 948 void 949 execSt(GPUDynInstPtr gpuDynInst) override 950 { 951 typedef typename MemDataType::CType c0; --- 452 unchanged lines hidden (view full) --- 1404 } 1405 } 1406 1407 // if there is no release semantic, execute the RMW immediately 1408 execAtomic(gpuDynInst); 1409 1410 } 1411 | 1032 private: 1033 // execSt may be called through a continuation 1034 // if the store had release semantics. see comment for 1035 // execSt in gpu_static_inst.hh 1036 void 1037 execSt(GPUDynInstPtr gpuDynInst) override 1038 { 1039 typedef typename MemDataType::CType c0; --- 452 unchanged lines hidden (view full) --- 1492 } 1493 } 1494 1495 // if there is no release semantic, execute the RMW immediately 1496 execAtomic(gpuDynInst); 1497 1498 } 1499 |
1500 void 1501 completeAcc(GPUDynInstPtr gpuDynInst) override 1502 { 1503 // if this is not an atomic return op, then we 1504 // have nothing more to do. 1505 if (this->isAtomicRet()) { 1506 // the size of the src operands and the 1507 // memory being operated on must match 1508 // for HSAIL atomics - this assumption may 1509 // not apply to all ISAs 1510 typedef typename MemDataType::CType CType; 1511 1512 Wavefront *w = gpuDynInst->wavefront(); 1513 int dst = this->dest.regIndex(); 1514 std::vector<uint32_t> regVec; 1515 // virtual->physical VGPR mapping 1516 int physVgpr = w->remap(dst, sizeof(CType), 1); 1517 regVec.push_back(physVgpr); 1518 CType *p1 = &((CType*)gpuDynInst->d_data)[0]; 1519 1520 for (int i = 0; i < w->computeUnit->wfSize(); ++i) { 1521 if (gpuDynInst->exec_mask[i]) { 1522 DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " 1523 "$%s%d <- %d global ld done (src = wavefront " 1524 "ld inst)\n", w->computeUnit->cu_id, w->simdId, 1525 w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d", 1526 dst, *p1); 1527 // write the value into the physical VGPR. This is a 1528 // purely functional operation. No timing is modeled. 1529 w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i); 1530 } 1531 ++p1; 1532 } 1533 1534 // Schedule the write operation of the load data on the VRF. 1535 // This simply models the timing aspect of the VRF write operation. 1536 // It does not modify the physical VGPR. 1537 int loadVrfBankConflictCycles = gpuDynInst->computeUnit()-> 1538 vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec, 1539 sizeof(CType), gpuDynInst->time); 1540 1541 if (this->isGlobalMem()) { 1542 gpuDynInst->computeUnit()->globalMemoryPipe 1543 .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); 1544 } else { 1545 assert(this->isLocalMem()); 1546 gpuDynInst->computeUnit()->localMemoryPipe 1547 .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); 1548 } 1549 } 1550 } 1551 |
|
1412 void execute(GPUDynInstPtr gpuDynInst) override; 1413 1414 private: 1415 // execAtomic may be called through a continuation 1416 // if the RMW had release semantics. see comment for 1417 // execContinuation in gpu_dyn_inst.hh 1418 void 1419 execAtomic(GPUDynInstPtr gpuDynInst) override --- 211 unchanged lines hidden --- | 1552 void execute(GPUDynInstPtr gpuDynInst) override; 1553 1554 private: 1555 // execAtomic may be called through a continuation 1556 // if the RMW had release semantics. see comment for 1557 // execContinuation in gpu_dyn_inst.hh 1558 void 1559 execAtomic(GPUDynInstPtr gpuDynInst) override --- 211 unchanged lines hidden --- |