mem.hh (11692:e772fdcd3809 → 11693:bc1f702c25b9)
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:

--- 22 unchanged lines hidden (view full) ---

 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"

namespace HsailISA
{
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

--- 436 unchanged lines hidden (view full) ---

                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c1;

            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

            /**
             * This code essentially replaces the long if-else chain
             * that was used in GlobalMemPipeline::exec() to infer the
             * size (single/double) and type (floating point/integer) of
             * the destination register. This is needed for load
             * instructions because the loaded value and the
             * destination type can be of different sizes, and we also
             * need to know if the value we're writing back is floating
             * point and signed/unsigned, so we can properly cast the
             * writeback value.
             */
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float, typename std::conditional<std::is_signed<c1>::value,
                    int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double, typename std::conditional<std::is_signed<c1>::value,
                    int64_t, uint64_t>::type>::type>::type c0;
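
            // As a quick sanity sketch of the selection above (the
            // concrete types below are illustrative, not tied to any
            // particular HSAIL type): a signed 32-bit integer source
            // selects int32_t for writeback, and a 64-bit floating
            // point source selects double.
            static_assert(std::is_same<
                std::conditional<std::is_signed<int32_t>::value,
                    int32_t, uint32_t>::type, int32_t>::value,
                "signed 32-bit values write back as int32_t");
            static_assert(std::is_same<
                std::conditional<std::is_floating_point<double>::value,
                    double, int64_t>::type, double>::value,
                "64-bit FP values write back as double");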

            Wavefront *w = gpuDynInst->wavefront();

            std::vector<uint32_t> regVec;
            // iterate over the number of destination register operands,
            // since this is a load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                  *p1, i);
                    }
                    ++p1;
                }
            }
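
            // Layout assumed by the pointer arithmetic above: d_data
            // holds num_dest_operands * wfSize() elements of type c1,
            // grouped by operand,
            //
            //     d_data: [op0: lane 0 .. lane N-1][op1: lane 0 ..] ...
            //
            // so p1 begins at lane 0 of operand k and ++p1 steps
            // through the lanes.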

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write
            // operation. It does not modify the physical VGPR.
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }

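        // Note on the split above: the write<c0>() calls update
        // architectural state immediately (functional), while
        // vrf->exec() only charges time; the bank-conflict cycles it
        // returns are accumulated separately for the global and local
        // memory pipelines.
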
      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed, and if the load has acquire
            // semantics, issue an acquire request.
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release

--- 434 unchanged lines hidden (view full) ---

                    return;
                }
            }

            // if there is no release semantic, perform stores immediately
            execSt(gpuDynInst);
        }

        // stores don't write anything back, so there is nothing
        // to do here. we only override this method to avoid the
        // fatal in the base class implementation
        void completeAcc(GPUDynInstPtr gpuDynInst) override { }

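        // (Loads and atomics with a return value, by contrast, must
        // override completeAcc() to copy the returned data from d_data
        // into the VRF; see the implementations above and below.)
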
      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
        // execSt in gpu_static_inst.hh
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

--- 452 unchanged lines hidden (view full) ---

                }
            }

            // if there is no release semantic, execute the RMW immediately
            execAtomic(gpuDynInst);

        }

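        // The continuation mentioned above works roughly as follows
        // (a sketch; the exact member names live in gpu_dyn_inst.hh):
        // when a store or RMW carries release semantics, execute()
        // first issues the release operation and registers execSt /
        // execAtomic as the dynamic instruction's continuation, which
        // the memory pipeline invokes once the release completes;
        // otherwise, as the code above shows, the operation is
        // performed immediately.
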
        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if this is not an atomic return op, then we
            // have nothing more to do.
            if (this->isAtomicRet()) {
                // the size of the src operands and the
                // memory being operated on must match
                // for HSAIL atomics - this assumption may
                // not apply to all ISAs
                typedef typename MemDataType::CType CType;

                Wavefront *w = gpuDynInst->wavefront();
                int dst = this->dest.regIndex();
                std::vector<uint32_t> regVec;
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(CType), 1);
                regVec.push_back(physVgpr);
                CType *p1 = &((CType*)gpuDynInst->d_data)[0];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr,
                                                                     *p1, i);
                    }
                    ++p1;
                }

                // Schedule the write operation of the load data on the VRF.
                // This simply models the timing aspect of the VRF write
                // operation. It does not modify the physical VGPR.
                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                         sizeof(CType), gpuDynInst->time);

                if (this->isGlobalMem()) {
                    gpuDynInst->computeUnit()->globalMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                } else {
                    assert(this->isLocalMem());
                    gpuDynInst->computeUnit()->localMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                }
            }
        }

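        // A concrete reading of the writeback above (a sketch; exact
        // semantics depend on the atomic opcode): for an atomic
        // fetch-and-add with return, each active lane's slot in d_data
        // holds the value the memory location contained before the
        // add, and that old value is what lands in the destination
        // VGPR.
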
        void execute(GPUDynInstPtr gpuDynInst) override;

      private:
        // execAtomic may be called through a continuation
        // if the RMW had release semantics. see comment for
        // execContinuation in gpu_dyn_inst.hh
        void
        execAtomic(GPUDynInstPtr gpuDynInst) override

--- 211 unchanged lines hidden ---