compute_unit.cc (11534:7106f550afad) compute_unit.cc (11638:b511733958d0)
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:

--- 179 unchanged lines hidden (view full) ---

188 w->kernelArgs = ndr->q.args;
189 w->privSizePerItem = ndr->q.privMemPerItem;
190 w->spillSizePerItem = ndr->q.spillMemPerItem;
191 w->roBase = ndr->q.roMemStart;
192 w->roSize = ndr->q.roMemTotal;
193}
194
195void
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:

--- 179 unchanged lines hidden (view full) ---

188 w->kernelArgs = ndr->q.args;
189 w->privSizePerItem = ndr->q.privMemPerItem;
190 w->spillSizePerItem = ndr->q.spillMemPerItem;
191 w->roBase = ndr->q.roMemStart;
192 w->roSize = ndr->q.roMemTotal;
193}
194
/**
 * Fill a WFContext with the start-up state of one wavefront of a workgroup.
 *
 * Besides writing to *wfCtx, this call advances the NDRange's private- and
 * spill-memory cursors (ndr->q.privMemStart / ndr->q.spillMemStart) by one
 * wavefront's worth, so successive calls hand out consecutive regions.
 *
 * @param wfCtx context record to populate
 * @param ndr NDRange being dispatched; read for memory cursors and the
 *            global workgroup id, and stored in the context
 * @param cnt ordinal of this wavefront; lane k is treated as flat
 *            work-item (k + cnt * wfSize())
 * @param trueWgSize not read anywhere in this body
 * @param trueWgSizeTotal exclusive upper bound on the flat work-item index
 *                        for a lane to be enabled in the mask
 * @param ldsChunk LDS chunk pointer recorded in the context
 * @param origSpillMemStart not read anywhere in this body
 */
195void
196ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
197 int trueWgSize[], int trueWgSizeTotal,
198 LdsChunk *ldsChunk, uint64_t origSpillMemStart)
199{
200 wfCtx->cnt = cnt;
201
    // Build the lane mask: start with every lane cleared, then enable lane k
    // only if its flat work-item index is below trueWgSizeTotal. A wavefront
    // that extends past the end of the workgroup gets a partial mask.
202 VectorMask init_mask;
203 init_mask.reset();
204
205 for (int k = 0; k < wfSize(); ++k) {
206 if (k + cnt * wfSize() < trueWgSizeTotal)
207 init_mask[k] = 1;
208 }
209
    // The initial mask and the execution mask start out identical.
210 wfCtx->init_mask = init_mask.to_ullong();
211 wfCtx->exec_mask = init_mask.to_ullong();
212
    // One barrier counter per lane. NOTE(review): presumably bar_cnt is a
    // std::vector; if so, resize() zero-fills only elements it appends, so
    // this relies on the context arriving with an empty bar_cnt -- confirm.
213 wfCtx->bar_cnt.resize(wfSize(), 0);
214
215 wfCtx->max_bar_cnt = 0;
216 wfCtx->old_barrier_cnt = 0;
217 wfCtx->barrier_cnt = 0;
218
    // Give this wavefront the current private-memory cursor, then bump the
    // cursor by (per-item size * wfSize()) for the next wavefront.
219 wfCtx->privBase = ndr->q.privMemStart;
220 ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
221
    // Same cursor-and-advance scheme for spill memory.
222 wfCtx->spillBase = ndr->q.spillMemStart;
223 ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
224
    // Execution starts at pc 0. NOTE(review): UINT32_MAX for rpc is
    // presumably a "no reconvergence point" sentinel -- confirm.
225 wfCtx->pc = 0;
226 wfCtx->rpc = UINT32_MAX;
227
228 // set the wavefront context to have a pointer to this section of the LDS
229 wfCtx->ldsChunk = ldsChunk;
230
231 // WG state
232 wfCtx->wg_id = ndr->globalWgId;
    // barrier_id is neither a parameter nor a local of this function;
    // presumably the ComputeUnit's own member -- confirm.
233 wfCtx->barrier_id = barrier_id;
234
235 // Kernel wide state
236 wfCtx->ndr = ndr;
237}
238
239void
240ComputeUnit::updateEvents() {
241
242 if (!timestampVec.empty()) {
243 uint32_t vecSize = timestampVec.size();
244 uint32_t i = 0;
245 while (i < vecSize) {
246 if (timestampVec[i] <= shader->tick_cnt) {
247 std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];

--- 11 unchanged lines hidden (view full) ---

259
260 for (int i = 0; i< numSIMDs; ++i) {
261 vrf[i]->updateEvents();
262 }
263}
264
265
266void
196ComputeUnit::updateEvents() {
197
198 if (!timestampVec.empty()) {
199 uint32_t vecSize = timestampVec.size();
200 uint32_t i = 0;
201 while (i < vecSize) {
202 if (timestampVec[i] <= shader->tick_cnt) {
203 std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];

--- 11 unchanged lines hidden (view full) ---

215
216 for (int i = 0; i< numSIMDs; ++i) {
217 vrf[i]->updateEvents();
218 }
219}
220
221
222void
267ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
268 int trueWgSizeTotal)
223ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
224 int cnt, LdsChunk *ldsChunk, NDRange *ndr)
269{
270 static int _n_wave = 0;
225{
226 static int _n_wave = 0;
271 int cnt = wfCtx->cnt;
272 NDRange *ndr = wfCtx->ndr;
273
274 // Fill in Kernel state
275 FillKernelState(w, ndr);
276
227
228 // Fill in Kernel state
229 FillKernelState(w, ndr);
230
231 VectorMask init_mask;
232 init_mask.reset();
233
234 for (int k = 0; k < wfSize(); ++k) {
235 if (k + cnt * wfSize() < trueWgSizeTotal)
236 init_mask[k] = 1;
237 }
238
277 w->kern_id = ndr->dispatchId;
278 w->dynwaveid = cnt;
239 w->kern_id = ndr->dispatchId;
240 w->dynwaveid = cnt;
279 w->init_mask = wfCtx->init_mask;
241 w->init_mask = init_mask.to_ullong();
280
281 for (int k = 0; k < wfSize(); ++k) {
282 w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
283 w->workitemid[1][k] =
284 ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
285 w->workitemid[2][k] =
286 (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
287
288 w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
289 trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
290 w->workitemid[0][k];
291 }
292
242
243 for (int k = 0; k < wfSize(); ++k) {
244 w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
245 w->workitemid[1][k] =
246 ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
247 w->workitemid[2][k] =
248 (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
249
250 w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
251 trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
252 w->workitemid[0][k];
253 }
254
293 w->old_barrier_cnt = wfCtx->old_barrier_cnt;
294 w->barrier_cnt = wfCtx->barrier_cnt;
295 w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
296
255 w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
256
297 for (int i = 0; i < wfSize(); ++i) {
298 w->bar_cnt[i] = wfCtx->bar_cnt[i];
299 }
257 w->bar_cnt.resize(wfSize(), 0);
300
258
301 w->max_bar_cnt = wfCtx->max_bar_cnt;
302 w->privBase = wfCtx->privBase;
303 w->spillBase = wfCtx->spillBase;
259 w->max_bar_cnt = 0;
260 w->old_barrier_cnt = 0;
261 w->barrier_cnt = 0;
304
262
305 w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
263 w->privBase = ndr->q.privMemStart;
264 ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
306
265
266 w->spillBase = ndr->q.spillMemStart;
267 ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
268
269 w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
270
307 // WG state
271 // WG state
308 w->wg_id = wfCtx->wg_id;
309 w->dispatchid = wfCtx->ndr->dispatchId;
272 w->wg_id = ndr->globalWgId;
273 w->dispatchid = ndr->dispatchId;
310 w->workgroupid[0] = w->wg_id % ndr->numWg[0];
311 w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
312 w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
313
274 w->workgroupid[0] = w->wg_id % ndr->numWg[0];
275 w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
276 w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
277
314 w->barrier_id = wfCtx->barrier_id;
278 w->barrier_id = barrier_id;
315 w->stalledAtBarrier = false;
316
279 w->stalledAtBarrier = false;
280
317 // move this from the context into the actual wavefront
318 w->ldsChunk = wfCtx->ldsChunk;
281 // set the wavefront context to have a pointer to this section of the LDS
282 w->ldsChunk = ldsChunk;
319
320 int32_t refCount M5_VAR_USED =
321 lds.increaseRefCounter(w->dispatchid, w->wg_id);
322 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
323 cu_id, w->wg_id, refCount);
324
325 w->instructionBuffer.clear();
326

--- 8 unchanged lines hidden (view full) ---

335 } else {
336 w->spillWidth = wfSize();
337 }
338
339 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
340 "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
341
342 w->start(++_n_wave, ndr->q.code_ptr);
283
284 int32_t refCount M5_VAR_USED =
285 lds.increaseRefCounter(w->dispatchid, w->wg_id);
286 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
287 cu_id, w->wg_id, refCount);
288
289 w->instructionBuffer.clear();
290

--- 8 unchanged lines hidden (view full) ---

299 } else {
300 w->spillWidth = wfSize();
301 }
302
303 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
304 "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
305
306 w->start(++_n_wave, ndr->q.code_ptr);
343 wfCtx->bar_cnt.clear();
344}
345
346void
347ComputeUnit::StartWorkgroup(NDRange *ndr)
348{
349 // reserve the LDS capacity allocated to the work group
350 // disambiguated by the dispatch ID and workgroup ID, which should be
351 // globally unique

--- 19 unchanged lines hidden (view full) ---

371
372 for (int d = 0; d < 3; ++d) {
373 trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
374 ndr->wgId[d] * ndr->q.wgSize[d]);
375
376 trueWgSizeTotal *= trueWgSize[d];
377 }
378
307}
308
309void
310ComputeUnit::StartWorkgroup(NDRange *ndr)
311{
312 // reserve the LDS capacity allocated to the work group
313 // disambiguated by the dispatch ID and workgroup ID, which should be
314 // globally unique

--- 19 unchanged lines hidden (view full) ---

334
335 for (int d = 0; d < 3; ++d) {
336 trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
337 ndr->wgId[d] * ndr->q.wgSize[d]);
338
339 trueWgSizeTotal *= trueWgSize[d];
340 }
341
379 uint64_t origSpillMemStart = ndr->q.spillMemStart;
380 // calculate the number of 32-bit vector registers required by wavefront
381 int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
382 int cnt = 0;
383
384 // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
385 for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
386 Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
387 // Check if this wavefront slot is available:

--- 10 unchanged lines hidden (view full) ---

398 uint32_t normSize = 0;
399
400 w->startVgprIndex = vrf[m % numSIMDs]->manager->
401 allocateRegion(vregDemand, &normSize);
402
403 w->reservedVectorRegs = normSize;
404 vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
405
342 // calculate the number of 32-bit vector registers required by wavefront
343 int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
344 int cnt = 0;
345
346 // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
347 for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
348 Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
349 // Check if this wavefront slot is available:

--- 10 unchanged lines hidden (view full) ---

360 uint32_t normSize = 0;
361
362 w->startVgprIndex = vrf[m % numSIMDs]->manager->
363 allocateRegion(vregDemand, &normSize);
364
365 w->reservedVectorRegs = normSize;
366 vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
367
406 WFContext wfCtx;
407
408 InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
409 ldsChunk, origSpillMemStart);
410
411 StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
368 StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr);
412 ++cnt;
413 }
414 }
415 ++barrier_id;
416}
417
418int
419ComputeUnit::ReadyWorkgroup(NDRange *ndr)

--- 1417 unchanged lines hidden ---
369 ++cnt;
370 }
371 }
372 ++barrier_id;
373}
374
375int
376ComputeUnit::ReadyWorkgroup(NDRange *ndr)

--- 1417 unchanged lines hidden ---