// compute_unit.hh — diff context: revision 11692:e772fdcd3809 vs. revision 11695:0a65922d564d
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: John Kalamatianos, Anthony Gutierrez
34 */
35
36#ifndef __COMPUTE_UNIT_HH__
37#define __COMPUTE_UNIT_HH__
38
39#include <deque>
40#include <map>
41#include <unordered_map>
42#include <vector>
43
44#include "base/callback.hh"
45#include "base/statistics.hh"
46#include "base/types.hh"
47#include "enums/PrefetchType.hh"
48#include "gpu-compute/exec_stage.hh"
49#include "gpu-compute/fetch_stage.hh"
50#include "gpu-compute/global_memory_pipeline.hh"
51#include "gpu-compute/local_memory_pipeline.hh"
52#include "gpu-compute/qstruct.hh"
53#include "gpu-compute/schedule_stage.hh"
54#include "gpu-compute/scoreboard_check_stage.hh"
55#include "mem/mem_object.hh"
56#include "mem/port.hh"
57
58static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
59static const int MAX_WIDTH_FOR_MEM_INST = 32;
60
61class NDRange;
62class Shader;
63class VectorRegisterFile;
64
65struct ComputeUnitParams;
66
// Wavefront execution (scheduling) policy used when picking which ready
// wavefront to issue on an execution resource.
enum EXEC_POLICY
{
    OLDEST = 0, // presumably issue the oldest ready wavefront first — confirm in schedule stage
    RR          // round-robin across wavefronts (see rrNextMemID/rrNextALUWp)
};
72
// List of execution units in a compute unit: four vector SIMD units
// (see isVecAlu), the global memory pipe and the LDS (local) memory pipe.
// NUM_UNITS is a count sentinel, not a real unit.
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};
84
// Classification of a memory access by its combined TLB and cache
// outcome — all four hit/miss combinations.
enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};
92
93class ComputeUnit : public MemObject
94{
95 public:
96 FetchStage fetchStage;
97 ScoreboardCheckStage scoreboardCheckStage;
98 ScheduleStage scheduleStage;
99 ExecStage execStage;
100 GlobalMemPipeline globalMemoryPipe;
101 LocalMemPipeline localMemoryPipe;
102
103 // Buffers used to communicate between various pipeline stages
104
105 // List of waves which are ready to be scheduled.
106 // Each execution resource has a ready list. readyList is
107 // used to communicate between scoreboardCheck stage and
108 // schedule stage
109 // TODO: make enum to index readyList
110 std::vector<std::vector<Wavefront*>> readyList;
111
112 // Stores the status of waves. A READY implies the
113 // wave is ready to be scheduled this cycle and
114 // is already present in the readyList. waveStatusList is
115 // used to communicate between scoreboardCheck stage and
116 // schedule stage
117 // TODO: convert std::pair to a class to increase readability
118 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
119
120 // List of waves which will be dispatched to
121 // each execution resource. A FILLED implies
122 // dispatch list is non-empty and
123 // execution unit has something to execute
124 // this cycle. Currently, the dispatch list of
125 // an execution resource can hold only one wave because
126 // an execution resource can execute only one wave in a cycle.
127 // dispatchList is used to communicate between schedule
128 // and exec stage
129 // TODO: convert std::pair to a class to increase readability
130 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
131
132 int rrNextMemID; // used by RR WF exec policy to cycle through WF's
133 int rrNextALUWp;
134 typedef ComputeUnitParams Params;
135 std::vector<std::vector<Wavefront*>> wfList;
136 int cu_id;
137
138 // array of vector register files, one per SIMD
139 std::vector<VectorRegisterFile*> vrf;
140 // Number of vector ALU units (SIMDs) in CU
141 int numSIMDs;
142 // number of pipe stages for bypassing data to next dependent single
143 // precision vector instruction inside the vector ALU pipeline
144 int spBypassPipeLength;
145 // number of pipe stages for bypassing data to next dependent double
146 // precision vector instruction inside the vector ALU pipeline
147 int dpBypassPipeLength;
148 // number of cycles per issue period
149 int issuePeriod;
150
151 // Number of global and local memory execution resources in CU
152 int numGlbMemUnits;
153 int numLocMemUnits;
154 // tracks the last cycle a vector instruction was executed on a SIMD
155 std::vector<uint64_t> lastExecCycle;
156
157 // true if we allow a separate TLB per lane
158 bool perLaneTLB;
159 // if 0, TLB prefetching is off.
160 int prefetchDepth;
161 // if fixed-stride prefetching, this is the stride.
162 int prefetchStride;
163
164 std::vector<Addr> lastVaddrCU;
165 std::vector<std::vector<Addr>> lastVaddrSimd;
166 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
167 Enums::PrefetchType prefetchType;
168 EXEC_POLICY exec_policy;
169
170 bool xact_cas_mode;
171 bool debugSegFault;
172 bool functionalTLB;
173 bool localMemBarrier;
174
175 /*
176 * for Counting page accesses
177 *
178 * cuExitCallback inherits from Callback. When you register a callback
179 * function as an exit callback, it will get added to an exit callback
180 * queue, such that on simulation exit, all callbacks in the callback
181 * queue will have their process() function called.
182 */
183 bool countPages;
184
185 Shader *shader;
186 uint32_t barrier_id;
187 // vector of Vector ALU (MACC) pipelines
188 std::vector<WaitClass> aluPipe;
189 // minimum issue period per SIMD unit (in cycles)
190 std::vector<WaitClass> wfWait;
191
192 // Resource control for Vector Register File->Global Memory pipe buses
193 std::vector<WaitClass> vrfToGlobalMemPipeBus;
194 // Resource control for Vector Register File->Local Memory pipe buses
195 std::vector<WaitClass> vrfToLocalMemPipeBus;
196 int nextGlbMemBus;
197 int nextLocMemBus;
198 // Resource control for global memory to VRF data/address bus
199 WaitClass glbMemToVrfBus;
200 // Resource control for local memory to VRF data/address bus
201 WaitClass locMemToVrfBus;
202
203 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
204 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
205 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
206 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
207
208 Tick req_tick_latency;
209 Tick resp_tick_latency;
210
211 // number of vector registers being reserved for each SIMD unit
212 std::vector<int> vectorRegsReserved;
213 // number of vector registers per SIMD unit
214 uint32_t numVecRegsPerSimd;
215 // Support for scheduling VGPR status update events
216 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
217 std::vector<uint64_t> timestampVec;
218 std::vector<uint8_t> statusVec;
219
220 void
221 registerEvent(uint32_t simdId,
222 uint32_t regIdx,
223 uint32_t operandSize,
224 uint64_t when,
225 uint8_t newStatus) {
226 regIdxVec.push_back(std::make_pair(simdId, regIdx));
227 timestampVec.push_back(when);
228 statusVec.push_back(newStatus);
229 if (operandSize > 4) {
230 regIdxVec.push_back(std::make_pair(simdId,
231 ((regIdx + 1) %
232 numVecRegsPerSimd)));
233 timestampVec.push_back(when);
234 statusVec.push_back(newStatus);
235 }
236 }
237
238 void updateEvents();
239
240 // this hash map will keep track of page divergence
241 // per memory instruction per wavefront. The hash map
242 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
243 std::map<Addr, int> pagesTouched;
244
245 ComputeUnit(const Params *p);
246 ~ComputeUnit();
247 int spBypassLength() { return spBypassPipeLength; };
248 int dpBypassLength() { return dpBypassPipeLength; };
249 int storeBusLength() { return numCyclesPerStoreTransfer; };
250 int loadBusLength() { return numCyclesPerLoadTransfer; };
251 int wfSize() const { return wavefrontSize; };
252
253 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
254 void exec();
255 void initiateFetch(Wavefront *wavefront);
256 void fetch(PacketPtr pkt, Wavefront *wavefront);
257 void fillKernelState(Wavefront *w, NDRange *ndr);
258
259 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
260 NDRange *ndr);
261
262 void StartWorkgroup(NDRange *ndr);
263 int ReadyWorkgroup(NDRange *ndr);
264
265 bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
266 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
267 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
268 int GlbMemUnitId() { return GLBMEM_PIPE; }
269 int ShrMemUnitId() { return LDSMEM_PIPE; }
270 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
271 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
272 /* This function cycles through all the wavefronts in all the phases to see
273 * if all of the wavefronts which should be associated with one barrier
274 * (denoted with _barrier_id), are all at the same barrier in the program
275 * (denoted by bcnt). When the number at the barrier matches bslots, then
276 * return true.
277 */
278 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
279 bool cedeSIMD(int simdId, int wfSlotId);
280
281 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
282 virtual void init();
283 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
284 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
285 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
286 bool kernelLaunch=true,
287 RequestPtr req=nullptr);
288 void handleMemPacket(PacketPtr pkt, int memport_index);
289 bool processTimingPacket(PacketPtr pkt);
290 void processFetchReturn(PacketPtr pkt);
291 void updatePageDivergenceDist(Addr addr);
292
293 MasterID masterId() { return _masterId; }
294
295 bool isDone() const;
296 bool isSimdDone(uint32_t) const;
297
298 protected:
299 MasterID _masterId;
300
301 LdsState &lds;
302
303 public:
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: John Kalamatianos, Anthony Gutierrez
34 */
35
36#ifndef __COMPUTE_UNIT_HH__
37#define __COMPUTE_UNIT_HH__
38
39#include <deque>
40#include <map>
41#include <unordered_map>
42#include <vector>
43
44#include "base/callback.hh"
45#include "base/statistics.hh"
46#include "base/types.hh"
47#include "enums/PrefetchType.hh"
48#include "gpu-compute/exec_stage.hh"
49#include "gpu-compute/fetch_stage.hh"
50#include "gpu-compute/global_memory_pipeline.hh"
51#include "gpu-compute/local_memory_pipeline.hh"
52#include "gpu-compute/qstruct.hh"
53#include "gpu-compute/schedule_stage.hh"
54#include "gpu-compute/scoreboard_check_stage.hh"
55#include "mem/mem_object.hh"
56#include "mem/port.hh"
57
58static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
59static const int MAX_WIDTH_FOR_MEM_INST = 32;
60
61class NDRange;
62class Shader;
63class VectorRegisterFile;
64
65struct ComputeUnitParams;
66
// Wavefront execution (scheduling) policy used when picking which ready
// wavefront to issue on an execution resource.
enum EXEC_POLICY
{
    OLDEST = 0, // presumably issue the oldest ready wavefront first — confirm in schedule stage
    RR          // round-robin across wavefronts (see rrNextMemID/rrNextALUWp)
};
72
// List of execution units in a compute unit: four vector SIMD units
// (see isVecAlu), the global memory pipe and the LDS (local) memory pipe.
// NUM_UNITS is a count sentinel, not a real unit.
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};
84
// Classification of a memory access by its combined TLB and cache
// outcome — all four hit/miss combinations.
enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};
92
93class ComputeUnit : public MemObject
94{
95 public:
96 FetchStage fetchStage;
97 ScoreboardCheckStage scoreboardCheckStage;
98 ScheduleStage scheduleStage;
99 ExecStage execStage;
100 GlobalMemPipeline globalMemoryPipe;
101 LocalMemPipeline localMemoryPipe;
102
103 // Buffers used to communicate between various pipeline stages
104
105 // List of waves which are ready to be scheduled.
106 // Each execution resource has a ready list. readyList is
107 // used to communicate between scoreboardCheck stage and
108 // schedule stage
109 // TODO: make enum to index readyList
110 std::vector<std::vector<Wavefront*>> readyList;
111
112 // Stores the status of waves. A READY implies the
113 // wave is ready to be scheduled this cycle and
114 // is already present in the readyList. waveStatusList is
115 // used to communicate between scoreboardCheck stage and
116 // schedule stage
117 // TODO: convert std::pair to a class to increase readability
118 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
119
120 // List of waves which will be dispatched to
121 // each execution resource. A FILLED implies
122 // dispatch list is non-empty and
123 // execution unit has something to execute
124 // this cycle. Currently, the dispatch list of
125 // an execution resource can hold only one wave because
126 // an execution resource can execute only one wave in a cycle.
127 // dispatchList is used to communicate between schedule
128 // and exec stage
129 // TODO: convert std::pair to a class to increase readability
130 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
131
132 int rrNextMemID; // used by RR WF exec policy to cycle through WF's
133 int rrNextALUWp;
134 typedef ComputeUnitParams Params;
135 std::vector<std::vector<Wavefront*>> wfList;
136 int cu_id;
137
138 // array of vector register files, one per SIMD
139 std::vector<VectorRegisterFile*> vrf;
140 // Number of vector ALU units (SIMDs) in CU
141 int numSIMDs;
142 // number of pipe stages for bypassing data to next dependent single
143 // precision vector instruction inside the vector ALU pipeline
144 int spBypassPipeLength;
145 // number of pipe stages for bypassing data to next dependent double
146 // precision vector instruction inside the vector ALU pipeline
147 int dpBypassPipeLength;
148 // number of cycles per issue period
149 int issuePeriod;
150
151 // Number of global and local memory execution resources in CU
152 int numGlbMemUnits;
153 int numLocMemUnits;
154 // tracks the last cycle a vector instruction was executed on a SIMD
155 std::vector<uint64_t> lastExecCycle;
156
157 // true if we allow a separate TLB per lane
158 bool perLaneTLB;
159 // if 0, TLB prefetching is off.
160 int prefetchDepth;
161 // if fixed-stride prefetching, this is the stride.
162 int prefetchStride;
163
164 std::vector<Addr> lastVaddrCU;
165 std::vector<std::vector<Addr>> lastVaddrSimd;
166 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
167 Enums::PrefetchType prefetchType;
168 EXEC_POLICY exec_policy;
169
170 bool xact_cas_mode;
171 bool debugSegFault;
172 bool functionalTLB;
173 bool localMemBarrier;
174
175 /*
176 * for Counting page accesses
177 *
178 * cuExitCallback inherits from Callback. When you register a callback
179 * function as an exit callback, it will get added to an exit callback
180 * queue, such that on simulation exit, all callbacks in the callback
181 * queue will have their process() function called.
182 */
183 bool countPages;
184
185 Shader *shader;
186 uint32_t barrier_id;
187 // vector of Vector ALU (MACC) pipelines
188 std::vector<WaitClass> aluPipe;
189 // minimum issue period per SIMD unit (in cycles)
190 std::vector<WaitClass> wfWait;
191
192 // Resource control for Vector Register File->Global Memory pipe buses
193 std::vector<WaitClass> vrfToGlobalMemPipeBus;
194 // Resource control for Vector Register File->Local Memory pipe buses
195 std::vector<WaitClass> vrfToLocalMemPipeBus;
196 int nextGlbMemBus;
197 int nextLocMemBus;
198 // Resource control for global memory to VRF data/address bus
199 WaitClass glbMemToVrfBus;
200 // Resource control for local memory to VRF data/address bus
201 WaitClass locMemToVrfBus;
202
203 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
204 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
205 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
206 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
207
208 Tick req_tick_latency;
209 Tick resp_tick_latency;
210
211 // number of vector registers being reserved for each SIMD unit
212 std::vector<int> vectorRegsReserved;
213 // number of vector registers per SIMD unit
214 uint32_t numVecRegsPerSimd;
215 // Support for scheduling VGPR status update events
216 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
217 std::vector<uint64_t> timestampVec;
218 std::vector<uint8_t> statusVec;
219
220 void
221 registerEvent(uint32_t simdId,
222 uint32_t regIdx,
223 uint32_t operandSize,
224 uint64_t when,
225 uint8_t newStatus) {
226 regIdxVec.push_back(std::make_pair(simdId, regIdx));
227 timestampVec.push_back(when);
228 statusVec.push_back(newStatus);
229 if (operandSize > 4) {
230 regIdxVec.push_back(std::make_pair(simdId,
231 ((regIdx + 1) %
232 numVecRegsPerSimd)));
233 timestampVec.push_back(when);
234 statusVec.push_back(newStatus);
235 }
236 }
237
238 void updateEvents();
239
240 // this hash map will keep track of page divergence
241 // per memory instruction per wavefront. The hash map
242 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
243 std::map<Addr, int> pagesTouched;
244
245 ComputeUnit(const Params *p);
246 ~ComputeUnit();
247 int spBypassLength() { return spBypassPipeLength; };
248 int dpBypassLength() { return dpBypassPipeLength; };
249 int storeBusLength() { return numCyclesPerStoreTransfer; };
250 int loadBusLength() { return numCyclesPerLoadTransfer; };
251 int wfSize() const { return wavefrontSize; };
252
253 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
254 void exec();
255 void initiateFetch(Wavefront *wavefront);
256 void fetch(PacketPtr pkt, Wavefront *wavefront);
257 void fillKernelState(Wavefront *w, NDRange *ndr);
258
259 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
260 NDRange *ndr);
261
262 void StartWorkgroup(NDRange *ndr);
263 int ReadyWorkgroup(NDRange *ndr);
264
265 bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
266 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
267 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
268 int GlbMemUnitId() { return GLBMEM_PIPE; }
269 int ShrMemUnitId() { return LDSMEM_PIPE; }
270 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
271 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
272 /* This function cycles through all the wavefronts in all the phases to see
273 * if all of the wavefronts which should be associated with one barrier
274 * (denoted with _barrier_id), are all at the same barrier in the program
275 * (denoted by bcnt). When the number at the barrier matches bslots, then
276 * return true.
277 */
278 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
279 bool cedeSIMD(int simdId, int wfSlotId);
280
281 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
282 virtual void init();
283 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
284 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
285 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
286 bool kernelLaunch=true,
287 RequestPtr req=nullptr);
288 void handleMemPacket(PacketPtr pkt, int memport_index);
289 bool processTimingPacket(PacketPtr pkt);
290 void processFetchReturn(PacketPtr pkt);
291 void updatePageDivergenceDist(Addr addr);
292
293 MasterID masterId() { return _masterId; }
294
295 bool isDone() const;
296 bool isSimdDone(uint32_t) const;
297
298 protected:
299 MasterID _masterId;
300
301 LdsState &lds;
302
303 public:
304 Stats::Scalar vALUInsts;
305 Stats::Formula vALUInstsPerWF;
306 Stats::Scalar sALUInsts;
307 Stats::Formula sALUInstsPerWF;
308 Stats::Scalar instCyclesVALU;
309 Stats::Scalar instCyclesSALU;
310 Stats::Scalar threadCyclesVALU;
311 Stats::Formula vALUUtilization;
312 Stats::Scalar ldsNoFlatInsts;
313 Stats::Formula ldsNoFlatInstsPerWF;
314 Stats::Scalar flatVMemInsts;
315 Stats::Formula flatVMemInstsPerWF;
316 Stats::Scalar flatLDSInsts;
317 Stats::Formula flatLDSInstsPerWF;
318 Stats::Scalar vectorMemWrites;
319 Stats::Formula vectorMemWritesPerWF;
320 Stats::Scalar vectorMemReads;
321 Stats::Formula vectorMemReadsPerWF;
322 Stats::Scalar scalarMemWrites;
323 Stats::Formula scalarMemWritesPerWF;
324 Stats::Scalar scalarMemReads;
325 Stats::Formula scalarMemReadsPerWF;
326
327 void updateInstStats(GPUDynInstPtr gpuDynInst);
328
304 // the following stats compute the avg. TLB accesslatency per
305 // uncoalesced request (only for data)
306 Stats::Scalar tlbRequests;
307 Stats::Scalar tlbCycles;
308 Stats::Formula tlbLatency;
309 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
310 Stats::Vector hitsPerTLBLevel;
311
312 Stats::Scalar ldsBankAccesses;
313 Stats::Distribution ldsBankConflictDist;
314
315 // over all memory instructions executed over all wavefronts
316 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
317 Stats::Distribution pageDivergenceDist;
318 Stats::Scalar dynamicGMemInstrCnt;
319 Stats::Scalar dynamicLMemInstrCnt;
320
321 Stats::Scalar wgBlockedDueLdsAllocation;
322 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
323 // when the instruction is committed, this number is still incremented by 1
324 Stats::Scalar numInstrExecuted;
325 // Number of cycles among successive instruction executions across all
326 // wavefronts of the same CU
327 Stats::Distribution execRateDist;
328 // number of individual vector operations executed
329 Stats::Scalar numVecOpsExecuted;
330 // Total cycles that something is running on the GPU
331 Stats::Scalar totalCycles;
332 Stats::Formula vpc; // vector ops per cycle
333 Stats::Formula ipc; // vector instructions per cycle
334 Stats::Distribution controlFlowDivergenceDist;
335 Stats::Distribution activeLanesPerGMemInstrDist;
336 Stats::Distribution activeLanesPerLMemInstrDist;
337 // number of vector ALU instructions received
338 Stats::Formula numALUInstsExecuted;
339 // number of times a WG can not start due to lack of free VGPRs in SIMDs
340 Stats::Scalar numTimesWgBlockedDueVgprAlloc;
341 Stats::Scalar numCASOps;
342 Stats::Scalar numFailedCASOps;
343 Stats::Scalar completedWfs;
344 // flag per vector SIMD unit that is set when there is at least one
345 // WV that has a vector ALU instruction as the oldest in its
346 // Instruction Buffer: Defined in the Scoreboard stage, consumed
347 // by the Execute stage.
348 std::vector<bool> vectorAluInstAvail;
349 // number of available (oldest) LDS instructions that could have
350 // been issued to the LDS at a specific issue slot
351 int shrMemInstAvail;
352 // number of available Global memory instructions that could have
353 // been issued to TCP at a specific issue slot
354 int glbMemInstAvail;
355
356 void
357 regStats();
358
359 LdsState &
360 getLds() const
361 {
362 return lds;
363 }
364
365 int32_t
366 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
367
368 bool
369 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
370
371 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
372 pageDataStruct pageAccesses;
373
374 class CUExitCallback : public Callback
375 {
376 private:
377 ComputeUnit *computeUnit;
378
379 public:
380 virtual ~CUExitCallback() { }
381
382 CUExitCallback(ComputeUnit *_cu)
383 {
384 computeUnit = _cu;
385 }
386
387 virtual void
388 process();
389 };
390
391 CUExitCallback *cuExitCallback;
392
    /** Data access Port: vector data memory requests leave the CU through
     *  these ports (one per "memory_port" index, created on demand in
     *  getMasterPort).
     */
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        // Attached to each outgoing packet so the response handler can
        // recover the originating instruction and the port it used.
        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            // any sender state already on the packet is preserved here
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        // Self-deleting (AutoDelete) event; process()/description() are
        // defined in the .cc file — presumably performs the deferred send
        // of pkt through dataPort.
        class MemReqEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        // Self-deleting (AutoDelete) event; process()/description() are
        // defined in the .cc file — presumably consumes the response pkt.
        class MemRespEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        // packets that could not be sent, paired with their instruction;
        // resent from recvReqRetry()
        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit; // owning CU; non-owning pointer
        int index;                // this port's index within memPort

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        // advertise no address ranges but request snooping
        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }

    };
470
    // Instruction cache access port: instruction fetches are sent to the
    // SQC (the I-cache) through this port.
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        // Carries the requesting wavefront with the packet so the fetch
        // return can be routed back to it.
        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            // any sender state already on the packet is preserved here
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        // fetch packets that could not be sent, paired with their
        // wavefront; resent from recvReqRetry()
        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit; // owning CU; non-owning pointer
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        // advertise no address ranges but request snooping
        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };
510
    /** Data TLB port: address-translation requests for data accesses go
     *  to the L1 data TLB through these ports (one per "translation_port"
     *  index).
     */
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false)
        { }

        // simple flow control: callers check isStalled() before sending;
        // stallPort()/unstallPort() toggle the flag
        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this translation is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }

        };

      protected:
        ComputeUnit *computeUnit; // owning CU; non-owning pointer
        int index;                // this port's index within tlbPort
        bool stalled;             // true while the port must not send

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };
559
    // Instruction TLB port: translations for instruction fetches are sent
    // to the SQC's TLB (one TLB per I-cache) through this port.
    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }


        // simple flow control: callers check isStalled() before sending;
        // stallPort()/unstallPort() toggle the flag
        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit; // owning CU; non-owning pointer
        bool stalled;             // true while the port must not send

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };
598
    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
            : MasterPort(_name, _cu, _id), computeUnit(_cu)
        {
        }

        // simple flow control: callers check isStalled() before sending;
        // stallPort()/unstallPort() toggle the flag
        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet, esp. the
         * GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
              _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        // overridden (defined in the .cc file) — presumably to interact
        // with the stall/retry machinery above; confirm against the
        // implementation
        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit; // owning CU; non-owning pointer

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };
671
672 /** The port to access the Local Data Store
673 * Can be connected to a LDS object
674 */
675 LDSPort *ldsPort = nullptr;
676
677 LDSPort *
678 getLdsPort() const
679 {
680 return ldsPort;
681 }
682
683 /** The memory port for SIMD data accesses.
684 * Can be connected to PhysMem for Ruby for timing simulations
685 */
686 std::vector<DataPort*> memPort;
687 // port to the TLB hierarchy (i.e., the L1 TLB)
688 std::vector<DTLBPort*> tlbPort;
689 // port to the SQC (i.e. the I-cache)
690 SQCPort *sqcPort;
691 // port to the SQC TLB (there's a separate TLB for each I-cache)
692 ITLBPort *sqcTLBPort;
693
694 virtual BaseMasterPort&
695 getMasterPort(const std::string &if_name, PortID idx)
696 {
697 if (if_name == "memory_port") {
698 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
699 this, idx);
700 return *memPort[idx];
701 } else if (if_name == "translation_port") {
702 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
703 this, idx);
704 return *tlbPort[idx];
705 } else if (if_name == "sqc_port") {
706 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
707 this, idx);
708 return *sqcPort;
709 } else if (if_name == "sqc_tlb_port") {
710 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
711 return *sqcTLBPort;
712 } else if (if_name == "ldsPort") {
713 if (ldsPort) {
714 fatal("an LDS port was already allocated");
715 }
716 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
717 return *ldsPort;
718 } else {
719 panic("incorrect port name");
720 }
721 }
722
723 // xact_cas_load()
724 class waveIdentifier
725 {
726 public:
727 waveIdentifier() { }
728 waveIdentifier(int _simdId, int _wfSlotId)
729 : simdId(_simdId), wfSlotId(_wfSlotId) { }
730
731 int simdId;
732 int wfSlotId;
733 };
734
735 class waveQueue
736 {
737 public:
738 std::list<waveIdentifier> waveIDQueue;
739 };
740 std::map<unsigned, waveQueue> xactCasLoadMap;
741
742 uint64_t getAndIncSeqNum() { return globalSeqNum++; }
743
744 private:
745 uint64_t globalSeqNum;
746 int wavefrontSize;
747 GPUStaticInst *kernelLaunchInst;
748};
749
750#endif // __COMPUTE_UNIT_HH__
329 // the following stats compute the avg. TLB accesslatency per
330 // uncoalesced request (only for data)
331 Stats::Scalar tlbRequests;
332 Stats::Scalar tlbCycles;
333 Stats::Formula tlbLatency;
334 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
335 Stats::Vector hitsPerTLBLevel;
336
337 Stats::Scalar ldsBankAccesses;
338 Stats::Distribution ldsBankConflictDist;
339
340 // over all memory instructions executed over all wavefronts
341 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
342 Stats::Distribution pageDivergenceDist;
343 Stats::Scalar dynamicGMemInstrCnt;
344 Stats::Scalar dynamicLMemInstrCnt;
345
346 Stats::Scalar wgBlockedDueLdsAllocation;
347 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
348 // when the instruction is committed, this number is still incremented by 1
349 Stats::Scalar numInstrExecuted;
350 // Number of cycles among successive instruction executions across all
351 // wavefronts of the same CU
352 Stats::Distribution execRateDist;
353 // number of individual vector operations executed
354 Stats::Scalar numVecOpsExecuted;
355 // Total cycles that something is running on the GPU
356 Stats::Scalar totalCycles;
357 Stats::Formula vpc; // vector ops per cycle
358 Stats::Formula ipc; // vector instructions per cycle
359 Stats::Distribution controlFlowDivergenceDist;
360 Stats::Distribution activeLanesPerGMemInstrDist;
361 Stats::Distribution activeLanesPerLMemInstrDist;
362 // number of vector ALU instructions received
363 Stats::Formula numALUInstsExecuted;
364 // number of times a WG can not start due to lack of free VGPRs in SIMDs
365 Stats::Scalar numTimesWgBlockedDueVgprAlloc;
366 Stats::Scalar numCASOps;
367 Stats::Scalar numFailedCASOps;
368 Stats::Scalar completedWfs;
369 // flag per vector SIMD unit that is set when there is at least one
370 // WV that has a vector ALU instruction as the oldest in its
371 // Instruction Buffer: Defined in the Scoreboard stage, consumed
372 // by the Execute stage.
373 std::vector<bool> vectorAluInstAvail;
374 // number of available (oldest) LDS instructions that could have
375 // been issued to the LDS at a specific issue slot
376 int shrMemInstAvail;
377 // number of available Global memory instructions that could have
378 // been issued to TCP at a specific issue slot
379 int glbMemInstAvail;
380
381 void
382 regStats();
383
384 LdsState &
385 getLds() const
386 {
387 return lds;
388 }
389
390 int32_t
391 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
392
393 bool
394 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
395
396 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
397 pageDataStruct pageAccesses;
398
399 class CUExitCallback : public Callback
400 {
401 private:
402 ComputeUnit *computeUnit;
403
404 public:
405 virtual ~CUExitCallback() { }
406
407 CUExitCallback(ComputeUnit *_cu)
408 {
409 computeUnit = _cu;
410 }
411
412 virtual void
413 process();
414 };
415
416 CUExitCallback *cuExitCallback;
417
418 /** Data access Port **/
419 class DataPort : public MasterPort
420 {
421 public:
422 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
423 : MasterPort(_name, _cu), computeUnit(_cu),
424 index(_index) { }
425
426 bool snoopRangeSent;
427
428 struct SenderState : public Packet::SenderState
429 {
430 GPUDynInstPtr _gpuDynInst;
431 int port_index;
432 Packet::SenderState *saved;
433
434 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
435 Packet::SenderState *sender_state=nullptr)
436 : _gpuDynInst(gpuDynInst),
437 port_index(_port_index),
438 saved(sender_state) { }
439 };
440
441 class MemReqEvent : public Event
442 {
443 private:
444 DataPort *dataPort;
445 PacketPtr pkt;
446
447 public:
448 MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
449 : Event(), dataPort(_data_port), pkt(_pkt)
450 {
451 setFlags(Event::AutoDelete);
452 }
453
454 void process();
455 const char *description() const;
456 };
457
458 class MemRespEvent : public Event
459 {
460 private:
461 DataPort *dataPort;
462 PacketPtr pkt;
463
464 public:
465 MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
466 : Event(), dataPort(_data_port), pkt(_pkt)
467 {
468 setFlags(Event::AutoDelete);
469 }
470
471 void process();
472 const char *description() const;
473 };
474
475 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
476
477 protected:
478 ComputeUnit *computeUnit;
479 int index;
480
481 virtual bool recvTimingResp(PacketPtr pkt);
482 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
483 virtual void recvFunctional(PacketPtr pkt) { }
484 virtual void recvRangeChange() { }
485 virtual void recvReqRetry();
486
487 virtual void
488 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
489 {
490 resp.clear();
491 snoop = true;
492 }
493
494 };
495
496 // Instruction cache access port
497 class SQCPort : public MasterPort
498 {
499 public:
500 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
501 : MasterPort(_name, _cu), computeUnit(_cu),
502 index(_index) { }
503
504 bool snoopRangeSent;
505
506 struct SenderState : public Packet::SenderState
507 {
508 Wavefront *wavefront;
509 Packet::SenderState *saved;
510
511 SenderState(Wavefront *_wavefront, Packet::SenderState
512 *sender_state=nullptr)
513 : wavefront(_wavefront), saved(sender_state) { }
514 };
515
516 std::deque<std::pair<PacketPtr, Wavefront*>> retries;
517
518 protected:
519 ComputeUnit *computeUnit;
520 int index;
521
522 virtual bool recvTimingResp(PacketPtr pkt);
523 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
524 virtual void recvFunctional(PacketPtr pkt) { }
525 virtual void recvRangeChange() { }
526 virtual void recvReqRetry();
527
528 virtual void
529 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
530 {
531 resp.clear();
532 snoop = true;
533 }
534 };
535
536 /** Data TLB port **/
537 class DTLBPort : public MasterPort
538 {
539 public:
540 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
541 : MasterPort(_name, _cu), computeUnit(_cu),
542 index(_index), stalled(false)
543 { }
544
545 bool isStalled() { return stalled; }
546 void stallPort() { stalled = true; }
547 void unstallPort() { stalled = false; }
548
549 /**
550 * here we queue all the translation requests that were
551 * not successfully sent.
552 */
553 std::deque<PacketPtr> retries;
554
555 /** SenderState is information carried along with the packet
556 * throughout the TLB hierarchy
557 */
558 struct SenderState: public Packet::SenderState
559 {
560 // the memInst that this is associated with
561 GPUDynInstPtr _gpuDynInst;
562
563 // the lane in the memInst this is associated with, so we send
564 // the memory request down the right port
565 int portIndex;
566
567 // constructor used for packets involved in timing accesses
568 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
569 : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
570
571 };
572
573 protected:
574 ComputeUnit *computeUnit;
575 int index;
576 bool stalled;
577
578 virtual bool recvTimingResp(PacketPtr pkt);
579 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
580 virtual void recvFunctional(PacketPtr pkt) { }
581 virtual void recvRangeChange() { }
582 virtual void recvReqRetry();
583 };
584
585 class ITLBPort : public MasterPort
586 {
587 public:
588 ITLBPort(const std::string &_name, ComputeUnit *_cu)
589 : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
590
591
592 bool isStalled() { return stalled; }
593 void stallPort() { stalled = true; }
594 void unstallPort() { stalled = false; }
595
596 /**
597 * here we queue all the translation requests that were
598 * not successfully sent.
599 */
600 std::deque<PacketPtr> retries;
601
602 /** SenderState is information carried along with the packet
603 * throughout the TLB hierarchy
604 */
605 struct SenderState: public Packet::SenderState
606 {
607 // The wavefront associated with this request
608 Wavefront *wavefront;
609
610 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
611 };
612
613 protected:
614 ComputeUnit *computeUnit;
615 bool stalled;
616
617 virtual bool recvTimingResp(PacketPtr pkt);
618 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
619 virtual void recvFunctional(PacketPtr pkt) { }
620 virtual void recvRangeChange() { }
621 virtual void recvReqRetry();
622 };
623
624 /**
625 * the port intended to communicate between the CU and its LDS
626 */
627 class LDSPort : public MasterPort
628 {
629 public:
630 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
631 : MasterPort(_name, _cu, _id), computeUnit(_cu)
632 {
633 }
634
635 bool isStalled() const { return stalled; }
636 void stallPort() { stalled = true; }
637 void unstallPort() { stalled = false; }
638
639 /**
640 * here we queue all the requests that were
641 * not successfully sent.
642 */
643 std::queue<PacketPtr> retries;
644
645 /**
646 * SenderState is information carried along with the packet, esp. the
647 * GPUDynInstPtr
648 */
649 class SenderState: public Packet::SenderState
650 {
651 protected:
652 // The actual read/write/atomic request that goes with this command
653 GPUDynInstPtr _gpuDynInst = nullptr;
654
655 public:
656 SenderState(GPUDynInstPtr gpuDynInst):
657 _gpuDynInst(gpuDynInst)
658 {
659 }
660
661 GPUDynInstPtr
662 getMemInst() const
663 {
664 return _gpuDynInst;
665 }
666 };
667
668 virtual bool
669 sendTimingReq(PacketPtr pkt);
670
671 protected:
672
673 bool stalled = false; ///< whether or not it is stalled
674
675 ComputeUnit *computeUnit;
676
677 virtual bool
678 recvTimingResp(PacketPtr pkt);
679
680 virtual Tick
681 recvAtomic(PacketPtr pkt) { return 0; }
682
683 virtual void
684 recvFunctional(PacketPtr pkt)
685 {
686 }
687
688 virtual void
689 recvRangeChange()
690 {
691 }
692
693 virtual void
694 recvReqRetry();
695 };
696
697 /** The port to access the Local Data Store
698 * Can be connected to a LDS object
699 */
700 LDSPort *ldsPort = nullptr;
701
702 LDSPort *
703 getLdsPort() const
704 {
705 return ldsPort;
706 }
707
708 /** The memory port for SIMD data accesses.
709 * Can be connected to PhysMem for Ruby for timing simulations
710 */
711 std::vector<DataPort*> memPort;
712 // port to the TLB hierarchy (i.e., the L1 TLB)
713 std::vector<DTLBPort*> tlbPort;
714 // port to the SQC (i.e. the I-cache)
715 SQCPort *sqcPort;
716 // port to the SQC TLB (there's a separate TLB for each I-cache)
717 ITLBPort *sqcTLBPort;
718
719 virtual BaseMasterPort&
720 getMasterPort(const std::string &if_name, PortID idx)
721 {
722 if (if_name == "memory_port") {
723 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
724 this, idx);
725 return *memPort[idx];
726 } else if (if_name == "translation_port") {
727 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
728 this, idx);
729 return *tlbPort[idx];
730 } else if (if_name == "sqc_port") {
731 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
732 this, idx);
733 return *sqcPort;
734 } else if (if_name == "sqc_tlb_port") {
735 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
736 return *sqcTLBPort;
737 } else if (if_name == "ldsPort") {
738 if (ldsPort) {
739 fatal("an LDS port was already allocated");
740 }
741 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
742 return *ldsPort;
743 } else {
744 panic("incorrect port name");
745 }
746 }
747
748 // xact_cas_load()
749 class waveIdentifier
750 {
751 public:
752 waveIdentifier() { }
753 waveIdentifier(int _simdId, int _wfSlotId)
754 : simdId(_simdId), wfSlotId(_wfSlotId) { }
755
756 int simdId;
757 int wfSlotId;
758 };
759
760 class waveQueue
761 {
762 public:
763 std::list<waveIdentifier> waveIDQueue;
764 };
765 std::map<unsigned, waveQueue> xactCasLoadMap;
766
767 uint64_t getAndIncSeqNum() { return globalSeqNum++; }
768
769 private:
770 uint64_t globalSeqNum;
771 int wavefrontSize;
772 GPUStaticInst *kernelLaunchInst;
773};
774
775#endif // __COMPUTE_UNIT_HH__