wavefront.hh revision 11639
1955SN/A/* 2955SN/A * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 31762SN/A * All rights reserved. 4955SN/A * 5955SN/A * For use for simulation and test purposes only 6955SN/A * 7955SN/A * Redistribution and use in source and binary forms, with or without 8955SN/A * modification, are permitted provided that the following conditions are met: 9955SN/A * 10955SN/A * 1. Redistributions of source code must retain the above copyright notice, 11955SN/A * this list of conditions and the following disclaimer. 12955SN/A * 13955SN/A * 2. Redistributions in binary form must reproduce the above copyright notice, 14955SN/A * this list of conditions and the following disclaimer in the documentation 15955SN/A * and/or other materials provided with the distribution. 16955SN/A * 17955SN/A * 3. Neither the name of the copyright holder nor the names of its contributors 18955SN/A * may be used to endorse or promote products derived from this software 19955SN/A * without specific prior written permission. 20955SN/A * 21955SN/A * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22955SN/A * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23955SN/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24955SN/A * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25955SN/A * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26955SN/A * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27955SN/A * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 282665Ssaidi@eecs.umich.edu * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 292665Ssaidi@eecs.umich.edu * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30955SN/A * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31955SN/A * POSSIBILITY OF SUCH DAMAGE. 32955SN/A * 33955SN/A * Author: Lisa Hsu 34955SN/A */ 352632Sstever@eecs.umich.edu 362632Sstever@eecs.umich.edu#ifndef __WAVEFRONT_HH__ 372632Sstever@eecs.umich.edu#define __WAVEFRONT_HH__ 382632Sstever@eecs.umich.edu 39955SN/A#include <cassert> 402632Sstever@eecs.umich.edu#include <deque> 412632Sstever@eecs.umich.edu#include <memory> 422761Sstever@eecs.umich.edu#include <stack> 432632Sstever@eecs.umich.edu#include <vector> 442632Sstever@eecs.umich.edu 452632Sstever@eecs.umich.edu#include "base/misc.hh" 462761Sstever@eecs.umich.edu#include "base/types.hh" 472761Sstever@eecs.umich.edu#include "gpu-compute/condition_register_state.hh" 482761Sstever@eecs.umich.edu#include "gpu-compute/lds_state.hh" 492632Sstever@eecs.umich.edu#include "gpu-compute/misc.hh" 502632Sstever@eecs.umich.edu#include "params/Wavefront.hh" 512761Sstever@eecs.umich.edu#include "sim/sim_object.hh" 522761Sstever@eecs.umich.edu 532761Sstever@eecs.umich.edustatic const int MAX_NUM_INSTS_PER_WF = 12; 542761Sstever@eecs.umich.edu 552761Sstever@eecs.umich.edu/* 562632Sstever@eecs.umich.edu * Arguments for the hsail opcode call, are user defined and variable length. 572632Sstever@eecs.umich.edu * The hardware/finalizer can support arguments in hardware or use memory to 582632Sstever@eecs.umich.edu * pass arguments. For now, let's assume that an unlimited number of arguments 592632Sstever@eecs.umich.edu * are supported in hardware (the compiler inlines functions whenver it can 602632Sstever@eecs.umich.edu * anyways, so unless someone is interested in the implications of linking/ 612632Sstever@eecs.umich.edu * library functions, I think this is a reasonable assumption given the typical 622632Sstever@eecs.umich.edu * size of an OpenCL kernel). 63955SN/A * 64955SN/A * Note that call args are different than kernel arguments: 65955SN/A * * All work-items in a kernel refer the same set of kernel arguments 66955SN/A * * Each work-item has it's on set of call args. So a call argument at 67955SN/A * address 0x4 is different for work-item 0 and work-item 1. 685396Ssaidi@eecs.umich.edu * 694202Sbinkertn@umich.edu * Ok, the table below shows an example of how we organize the call arguments in 705342Sstever@gmail.com * the CallArgMem class. 71955SN/A * 725273Sstever@gmail.com * int foo(int arg1, double arg2) 735273Sstever@gmail.com * ___________________________________________________ 742656Sstever@eecs.umich.edu * | 0: return.0 | 4: return.1 | ... | 252: return.63 | 752656Sstever@eecs.umich.edu * |---------------------------------------------------| 762656Sstever@eecs.umich.edu * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | 772656Sstever@eecs.umich.edu * |---------------------------------------------------| 782656Sstever@eecs.umich.edu * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | 792656Sstever@eecs.umich.edu * ___________________________________________________ 802656Sstever@eecs.umich.edu */ 812653Sstever@eecs.umich.educlass CallArgMem 825227Ssaidi@eecs.umich.edu{ 835227Ssaidi@eecs.umich.edu public: 845227Ssaidi@eecs.umich.edu // pointer to buffer for storing function arguments 855227Ssaidi@eecs.umich.edu uint8_t *mem; 865396Ssaidi@eecs.umich.edu int wfSize; 875396Ssaidi@eecs.umich.edu // size of function args 885396Ssaidi@eecs.umich.edu int funcArgsSizePerItem; 895396Ssaidi@eecs.umich.edu 905396Ssaidi@eecs.umich.edu template<typename CType> 915396Ssaidi@eecs.umich.edu int 925396Ssaidi@eecs.umich.edu getLaneOffset(int lane, int addr) 935396Ssaidi@eecs.umich.edu { 945588Ssaidi@eecs.umich.edu return addr * wfSize + sizeof(CType) * lane; 955396Ssaidi@eecs.umich.edu } 965396Ssaidi@eecs.umich.edu 975396Ssaidi@eecs.umich.edu CallArgMem(int func_args_size_per_item, int wf_size) 985396Ssaidi@eecs.umich.edu : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) 995396Ssaidi@eecs.umich.edu { 1005396Ssaidi@eecs.umich.edu mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); 1015396Ssaidi@eecs.umich.edu } 1025396Ssaidi@eecs.umich.edu 1035396Ssaidi@eecs.umich.edu ~CallArgMem() 1045396Ssaidi@eecs.umich.edu { 1055396Ssaidi@eecs.umich.edu free(mem); 1065396Ssaidi@eecs.umich.edu } 1075396Ssaidi@eecs.umich.edu 1085396Ssaidi@eecs.umich.edu template<typename CType> 1095396Ssaidi@eecs.umich.edu uint8_t* 1105396Ssaidi@eecs.umich.edu getLaneAddr(int lane, int addr) 1115396Ssaidi@eecs.umich.edu { 1125396Ssaidi@eecs.umich.edu return mem + getLaneOffset<CType>(lane, addr); 1135396Ssaidi@eecs.umich.edu } 1145396Ssaidi@eecs.umich.edu 1155396Ssaidi@eecs.umich.edu template<typename CType> 1165396Ssaidi@eecs.umich.edu void 1175396Ssaidi@eecs.umich.edu setLaneAddr(int lane, int addr, CType val) 1185396Ssaidi@eecs.umich.edu { 1195396Ssaidi@eecs.umich.edu *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; 1205396Ssaidi@eecs.umich.edu } 1215396Ssaidi@eecs.umich.edu}; 1225396Ssaidi@eecs.umich.edu 1235396Ssaidi@eecs.umich.edu/** 1245396Ssaidi@eecs.umich.edu * A reconvergence stack entry conveys the necessary state to implement 1255396Ssaidi@eecs.umich.edu * control flow divergence. 1265396Ssaidi@eecs.umich.edu */ 1275396Ssaidi@eecs.umich.educlass ReconvergenceStackEntry { 1285396Ssaidi@eecs.umich.edu 1295396Ssaidi@eecs.umich.edu public: 1305396Ssaidi@eecs.umich.edu ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc, 1315396Ssaidi@eecs.umich.edu VectorMask new_mask) : pc(new_pc), rpc(new_rpc), 1325396Ssaidi@eecs.umich.edu execMask(new_mask) { 1335396Ssaidi@eecs.umich.edu } 1345396Ssaidi@eecs.umich.edu 1355396Ssaidi@eecs.umich.edu /** 1365396Ssaidi@eecs.umich.edu * PC of current instruction. 1375396Ssaidi@eecs.umich.edu */ 1385396Ssaidi@eecs.umich.edu uint32_t pc; 1395396Ssaidi@eecs.umich.edu /** 1405396Ssaidi@eecs.umich.edu * PC of the immediate post-dominator instruction, i.e., the value of 1415396Ssaidi@eecs.umich.edu * @a pc for the first instruction that will be executed by the wavefront 1425396Ssaidi@eecs.umich.edu * when a reconvergence point is reached. 1435396Ssaidi@eecs.umich.edu */ 1445396Ssaidi@eecs.umich.edu uint32_t rpc; 1455396Ssaidi@eecs.umich.edu /** 1465396Ssaidi@eecs.umich.edu * Execution mask. 1474781Snate@binkert.org */ 1481852SN/A VectorMask execMask; 149955SN/A}; 150955SN/A 151955SN/Aclass Wavefront : public SimObject 1523717Sstever@eecs.umich.edu{ 1533716Sstever@eecs.umich.edu public: 154955SN/A enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; 1551533SN/A enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; 1563716Sstever@eecs.umich.edu 1571533SN/A // Base pointer for array of instruction pointers 1584678Snate@binkert.org uint64_t basePtr; 1594678Snate@binkert.org 1604678Snate@binkert.org uint32_t oldBarrierCnt; 1614678Snate@binkert.org uint32_t barrierCnt; 1624678Snate@binkert.org uint32_t barrierId; 1634678Snate@binkert.org uint32_t barrierSlots; 1644678Snate@binkert.org status_e status; 1654678Snate@binkert.org // HW slot id where the WF is mapped to inside a SIMD unit 1664678Snate@binkert.org int wfSlotId; 1674678Snate@binkert.org int kernId; 1684678Snate@binkert.org // SIMD unit where the WV has been scheduled 1694678Snate@binkert.org int simdId; 1704678Snate@binkert.org // pointer to parent CU 1714678Snate@binkert.org ComputeUnit *computeUnit; 1724678Snate@binkert.org 1734678Snate@binkert.org std::deque<GPUDynInstPtr> instructionBuffer; 1744678Snate@binkert.org 1754678Snate@binkert.org bool pendingFetch; 1764678Snate@binkert.org bool dropFetch; 1774678Snate@binkert.org 1784678Snate@binkert.org // Condition Register State (for HSAIL simulations only) 1794973Ssaidi@eecs.umich.edu class ConditionRegisterState *condRegState; 1804678Snate@binkert.org // number of single precision VGPRs required by WF 1814678Snate@binkert.org uint32_t maxSpVgprs; 1824678Snate@binkert.org // number of double precision VGPRs required by WF 1834678Snate@binkert.org uint32_t maxDpVgprs; 1844678Snate@binkert.org // map virtual to physical vector register 1854678Snate@binkert.org uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); 186955SN/A void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 187955SN/A bool isGmInstruction(GPUDynInstPtr ii); 1882632Sstever@eecs.umich.edu bool isLmInstruction(GPUDynInstPtr ii); 1892632Sstever@eecs.umich.edu bool isOldestInstGMem(); 190955SN/A bool isOldestInstLMem(); 191955SN/A bool isOldestInstPrivMem(); 192955SN/A bool isOldestInstFlatMem(); 193955SN/A bool isOldestInstALU(); 1942632Sstever@eecs.umich.edu bool isOldestInstBarrier(); 195955SN/A // used for passing spill address to DDInstGPU 1962632Sstever@eecs.umich.edu std::vector<Addr> lastAddr; 1972632Sstever@eecs.umich.edu std::vector<uint32_t> workItemId[3]; 1982632Sstever@eecs.umich.edu std::vector<uint32_t> workItemFlatId; 1992632Sstever@eecs.umich.edu uint32_t workGroupId[3]; 2002632Sstever@eecs.umich.edu uint32_t workGroupSz[3]; 2012632Sstever@eecs.umich.edu uint32_t gridSz[3]; 2022632Sstever@eecs.umich.edu uint32_t wgId; 2032632Sstever@eecs.umich.edu uint32_t wgSz; 2042632Sstever@eecs.umich.edu uint32_t dynWaveId; 2052632Sstever@eecs.umich.edu uint32_t maxDynWaveId; 2062632Sstever@eecs.umich.edu uint32_t dispatchId; 2072632Sstever@eecs.umich.edu // outstanding global+local memory requests 2082632Sstever@eecs.umich.edu uint32_t outstandingReqs; 2093718Sstever@eecs.umich.edu // memory requests between scoreboard 2103718Sstever@eecs.umich.edu // and execute stage not yet executed 2113718Sstever@eecs.umich.edu uint32_t memReqsInPipe; 2123718Sstever@eecs.umich.edu // outstanding global memory write requests 2133718Sstever@eecs.umich.edu uint32_t outstandingReqsWrGm; 2143718Sstever@eecs.umich.edu // outstanding local memory write requests 2153718Sstever@eecs.umich.edu uint32_t outstandingReqsWrLm; 2163718Sstever@eecs.umich.edu // outstanding global memory read requests 2173718Sstever@eecs.umich.edu uint32_t outstandingReqsRdGm; 2183718Sstever@eecs.umich.edu // outstanding local memory read requests 2193718Sstever@eecs.umich.edu uint32_t outstandingReqsRdLm; 2203718Sstever@eecs.umich.edu uint32_t rdLmReqsInPipe; 2213718Sstever@eecs.umich.edu uint32_t rdGmReqsInPipe; 2222634Sstever@eecs.umich.edu uint32_t wrLmReqsInPipe; 2232634Sstever@eecs.umich.edu uint32_t wrGmReqsInPipe; 2242632Sstever@eecs.umich.edu 2252638Sstever@eecs.umich.edu int memTraceBusy; 2262632Sstever@eecs.umich.edu uint64_t lastTrace; 2272632Sstever@eecs.umich.edu // number of vector registers reserved by WF 2282632Sstever@eecs.umich.edu int reservedVectorRegs; 2292632Sstever@eecs.umich.edu // Index into the Vector Register File's namespace where the WF's registers 2302632Sstever@eecs.umich.edu // will live while the WF is executed 2312632Sstever@eecs.umich.edu uint32_t startVgprIndex; 2321858SN/A 2333716Sstever@eecs.umich.edu // Old value of destination gpr (for trace) 2342638Sstever@eecs.umich.edu std::vector<uint32_t> oldVgpr; 2352638Sstever@eecs.umich.edu // Id of destination gpr (for trace) 2362638Sstever@eecs.umich.edu uint32_t oldVgprId; 2372638Sstever@eecs.umich.edu // Tick count of last old_vgpr copy 2382638Sstever@eecs.umich.edu uint64_t oldVgprTcnt; 2392638Sstever@eecs.umich.edu 2402638Sstever@eecs.umich.edu // Old value of destination gpr (for trace) 2413716Sstever@eecs.umich.edu std::vector<uint64_t> oldDgpr; 2422634Sstever@eecs.umich.edu // Id of destination gpr (for trace) 2432634Sstever@eecs.umich.edu uint32_t oldDgprId; 244955SN/A // Tick count of last old_vgpr copy 2455341Sstever@gmail.com uint64_t oldDgprTcnt; 2465341Sstever@gmail.com 2475341Sstever@gmail.com // Execution mask at wavefront start 2485341Sstever@gmail.com VectorMask initMask; 249955SN/A 250955SN/A // number of barriers this WF has joined 251955SN/A std::vector<int> barCnt; 252955SN/A int maxBarCnt; 253955SN/A // Flag to stall a wave on barrier 254955SN/A bool stalledAtBarrier; 255955SN/A 2561858SN/A // a pointer to the fraction of the LDS allocated 2571858SN/A // to this workgroup (thus this wavefront) 2582632Sstever@eecs.umich.edu LdsChunk *ldsChunk; 259955SN/A 2604494Ssaidi@eecs.umich.edu // A pointer to the spill area 2614494Ssaidi@eecs.umich.edu Addr spillBase; 2623716Sstever@eecs.umich.edu // The size of the spill area 2631105SN/A uint32_t spillSizePerItem; 2642667Sstever@eecs.umich.edu // The vector width of the spill area 2652667Sstever@eecs.umich.edu uint32_t spillWidth; 2662667Sstever@eecs.umich.edu 2672667Sstever@eecs.umich.edu // A pointer to the private memory area 2682667Sstever@eecs.umich.edu Addr privBase; 2692667Sstever@eecs.umich.edu // The size of the private memory area 2701869SN/A uint32_t privSizePerItem; 2711869SN/A 2721869SN/A // A pointer ot the read-only memory area 2731869SN/A Addr roBase; 2741869SN/A // size of the read-only memory area 2751065SN/A uint32_t roSize; 2765341Sstever@gmail.com 2775341Sstever@gmail.com // pointer to buffer for storing kernel arguments 2785341Sstever@gmail.com uint8_t *kernelArgs; 2795341Sstever@gmail.com // unique WF id over all WFs executed across all CUs 2805341Sstever@gmail.com uint64_t wfDynId; 2815341Sstever@gmail.com 2825341Sstever@gmail.com // number of times instruction issue for this wavefront is blocked 2835341Sstever@gmail.com // due to VRF port availability 2845341Sstever@gmail.com Stats::Scalar numTimesBlockedDueVrfPortAvail; 2855341Sstever@gmail.com // number of times an instruction of a WF is blocked from being issued 2865341Sstever@gmail.com // due to WAR and WAW dependencies 2875341Sstever@gmail.com Stats::Scalar numTimesBlockedDueWAXDependencies; 2885341Sstever@gmail.com // number of times an instruction of a WF is blocked from being issued 2895341Sstever@gmail.com // due to WAR and WAW dependencies 2905341Sstever@gmail.com Stats::Scalar numTimesBlockedDueRAWDependencies; 2915341Sstever@gmail.com // distribution of executed instructions based on their register 2925341Sstever@gmail.com // operands; this is used to highlight the load on the VRF 2935341Sstever@gmail.com Stats::Distribution srcRegOpDist; 2945341Sstever@gmail.com Stats::Distribution dstRegOpDist; 2955341Sstever@gmail.com 2965341Sstever@gmail.com // Functions to operate on call argument memory 2975341Sstever@gmail.com // argument memory for hsail call instruction 2985341Sstever@gmail.com CallArgMem *callArgMem; 2995341Sstever@gmail.com void 3005341Sstever@gmail.com initCallArgMem(int func_args_size_per_item, int wf_size) 3015341Sstever@gmail.com { 3025341Sstever@gmail.com callArgMem = new CallArgMem(func_args_size_per_item, wf_size); 3035397Ssaidi@eecs.umich.edu } 3045397Ssaidi@eecs.umich.edu 3055341Sstever@gmail.com template<typename CType> 3065341Sstever@gmail.com CType 3075341Sstever@gmail.com readCallArgMem(int lane, int addr) 3085341Sstever@gmail.com { 3095341Sstever@gmail.com return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); 3105341Sstever@gmail.com } 3115341Sstever@gmail.com 3125341Sstever@gmail.com template<typename CType> 3135341Sstever@gmail.com void 3145341Sstever@gmail.com writeCallArgMem(int lane, int addr, CType val) 3155341Sstever@gmail.com { 3165341Sstever@gmail.com callArgMem->setLaneAddr<CType>(lane, addr, val); 3175341Sstever@gmail.com } 3185341Sstever@gmail.com 3195341Sstever@gmail.com typedef WavefrontParams Params; 3205341Sstever@gmail.com Wavefront(const Params *p); 3215341Sstever@gmail.com ~Wavefront(); 3225341Sstever@gmail.com virtual void init(); 3235341Sstever@gmail.com 3245341Sstever@gmail.com void 3255341Sstever@gmail.com setParent(ComputeUnit *cu) 3265341Sstever@gmail.com { 3275742Snate@binkert.org computeUnit = cu; 3285341Sstever@gmail.com } 3295742Snate@binkert.org 3305742Snate@binkert.org void start(uint64_t _wfDynId, uint64_t _base_ptr); 3315742Snate@binkert.org void exec(); 3325341Sstever@gmail.com void updateResources(); 3335742Snate@binkert.org int ready(itype_e type); 3345742Snate@binkert.org bool instructionBufferHasBranch(); 3355341Sstever@gmail.com void regStats(); 3362632Sstever@eecs.umich.edu VectorMask getPred() { return execMask() & initMask; } 3375199Sstever@gmail.com 3384781Snate@binkert.org bool waitingAtBarrier(int lane); 3394781Snate@binkert.org 3405550Snate@binkert.org void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 3414781Snate@binkert.org const VectorMask& exec_mask); 3424781Snate@binkert.org 3433918Ssaidi@eecs.umich.edu void popFromReconvergenceStack(); 3444781Snate@binkert.org 3454781Snate@binkert.org uint32_t pc() const; 3463940Ssaidi@eecs.umich.edu 3473942Ssaidi@eecs.umich.edu uint32_t rpc() const; 3483940Ssaidi@eecs.umich.edu 3493918Ssaidi@eecs.umich.edu VectorMask execMask() const; 3503918Ssaidi@eecs.umich.edu 351955SN/A bool execMask(int lane) const; 3521858SN/A 3533918Ssaidi@eecs.umich.edu void pc(uint32_t new_pc); 3543918Ssaidi@eecs.umich.edu 3553918Ssaidi@eecs.umich.edu void discardFetch(); 3563918Ssaidi@eecs.umich.edu 3575571Snate@binkert.org private: 3583940Ssaidi@eecs.umich.edu /** 3593940Ssaidi@eecs.umich.edu * Stack containing Control Flow Graph nodes (i.e., kernel instructions) 3603918Ssaidi@eecs.umich.edu * to be visited by the wavefront, and the associated execution masks. The 3613918Ssaidi@eecs.umich.edu * reconvergence stack grows every time the wavefront reaches a divergence 3623918Ssaidi@eecs.umich.edu * point (branch instruction), and shrinks every time the wavefront 3633918Ssaidi@eecs.umich.edu * reaches a reconvergence point (immediate post-dominator instruction). 3643918Ssaidi@eecs.umich.edu */ 3653918Ssaidi@eecs.umich.edu std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; 3663918Ssaidi@eecs.umich.edu}; 3673918Ssaidi@eecs.umich.edu 3683918Ssaidi@eecs.umich.edu#endif // __WAVEFRONT_HH__ 3693940Ssaidi@eecs.umich.edu