compute_unit.cc (11638:b511733958d0) compute_unit.cc (11639:2e8d4bd8108d)
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:

--- 164 unchanged lines hidden (view full) ---

173 delete ldsPort;
174}
175
176void
177ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
178{
179 w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
180
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:

--- 164 unchanged lines hidden (view full) ---

173 delete ldsPort;
174}
175
// ComputeUnit::FillKernelState
//
// Copies per-kernel state from the NDRange's `q` member into the wavefront
// `w`: register-file sizes, work-group and grid dimensions, the kernel
// argument pointer, per-item private/spill sizes, and the roBase/roSize pair.
//
// w   - wavefront whose fields are written.
// ndr - NDRange whose `q` member supplies every value that is read here.
//
// NOTE(review): this text appears to be a rendered side-by-side diff. File
// lines 181-187 occur twice below, first with the earlier lower-case field
// names (workgroupsz, wg_sz, gridsz) and then with the camelCase names
// (workGroupSz, wgSz, gridSz). Presumably only one set exists in any single
// revision of Wavefront -- confirm against the real file before reusing.
176void
177ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
178{
// Resize the wavefront's register files using the three register counts
// stored in ndr->q.
179 w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
180
// Earlier spelling of the fields: copy the three work-group dimensions, then
// store their product as the total work-group size; copy the grid dimensions.
181 w->workgroupsz[0] = ndr->q.wgSize[0];
182 w->workgroupsz[1] = ndr->q.wgSize[1];
183 w->workgroupsz[2] = ndr->q.wgSize[2];
184 w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2];
185 w->gridsz[0] = ndr->q.gdSize[0];
186 w->gridsz[1] = ndr->q.gdSize[1];
187 w->gridsz[2] = ndr->q.gdSize[2];
// Same seven assignments with the renamed (camelCase) Wavefront fields.
181 w->workGroupSz[0] = ndr->q.wgSize[0];
182 w->workGroupSz[1] = ndr->q.wgSize[1];
183 w->workGroupSz[2] = ndr->q.wgSize[2];
184 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
185 w->gridSz[0] = ndr->q.gdSize[0];
186 w->gridSz[1] = ndr->q.gdSize[1];
187 w->gridSz[2] = ndr->q.gdSize[2];
// Remaining kernel-wide values are copied straight across from ndr->q.
188 w->kernelArgs = ndr->q.args;
189 w->privSizePerItem = ndr->q.privMemPerItem;
190 w->spillSizePerItem = ndr->q.spillMemPerItem;
// NOTE(review): "ro" presumably stands for a read-only memory segment
// (start address and total size) -- confirm against the queue definition.
191 w->roBase = ndr->q.roMemStart;
192 w->roSize = ndr->q.roMemTotal;
193}
194
195void

--- 35 unchanged lines hidden (view full) ---

231 VectorMask init_mask;
232 init_mask.reset();
233
234 for (int k = 0; k < wfSize(); ++k) {
235 if (k + cnt * wfSize() < trueWgSizeTotal)
236 init_mask[k] = 1;
237 }
238
188 w->kernelArgs = ndr->q.args;
189 w->privSizePerItem = ndr->q.privMemPerItem;
190 w->spillSizePerItem = ndr->q.spillMemPerItem;
191 w->roBase = ndr->q.roMemStart;
192 w->roSize = ndr->q.roMemTotal;
193}
194
195void

--- 35 unchanged lines hidden (view full) ---

231 VectorMask init_mask;
232 init_mask.reset();
233
234 for (int k = 0; k < wfSize(); ++k) {
235 if (k + cnt * wfSize() < trueWgSizeTotal)
236 init_mask[k] = 1;
237 }
238
239 w->kern_id = ndr->dispatchId;
240 w->dynwaveid = cnt;
241 w->init_mask = init_mask.to_ullong();
239 w->kernId = ndr->dispatchId;
240 w->dynWaveId = cnt;
241 w->initMask = init_mask.to_ullong();
242
243 for (int k = 0; k < wfSize(); ++k) {
242
243 for (int k = 0; k < wfSize(); ++k) {
244 w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
245 w->workitemid[1][k] =
244 w->workItemId[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
245 w->workItemId[1][k] =
246 ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
246 ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
247 w->workitemid[2][k] =
247 w->workItemId[2][k] =
248 (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
249
248 (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
249
250 w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
251 trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
252 w->workitemid[0][k];
250 w->workItemFlatId[k] = w->workItemId[2][k] * trueWgSize[0] *
251 trueWgSize[1] + w->workItemId[1][k] * trueWgSize[0] +
252 w->workItemId[0][k];
253 }
254
253 }
254
255 w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
255 w->barrierSlots = divCeil(trueWgSizeTotal, wfSize());
256
256
257 w->bar_cnt.resize(wfSize(), 0);
257 w->barCnt.resize(wfSize(), 0);
258
258
259 w->max_bar_cnt = 0;
260 w->old_barrier_cnt = 0;
261 w->barrier_cnt = 0;
259 w->maxBarCnt = 0;
260 w->oldBarrierCnt = 0;
261 w->barrierCnt = 0;
262
263 w->privBase = ndr->q.privMemStart;
264 ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
265
266 w->spillBase = ndr->q.spillMemStart;
267 ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
268
269 w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
270
271 // WG state
262
263 w->privBase = ndr->q.privMemStart;
264 ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
265
266 w->spillBase = ndr->q.spillMemStart;
267 ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
268
269 w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
270
271 // WG state
272 w->wg_id = ndr->globalWgId;
273 w->dispatchid = ndr->dispatchId;
274 w->workgroupid[0] = w->wg_id % ndr->numWg[0];
275 w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
276 w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
272 w->wgId = ndr->globalWgId;
273 w->dispatchId = ndr->dispatchId;
274 w->workGroupId[0] = w->wgId % ndr->numWg[0];
275 w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1];
276 w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]);
277
277
278 w->barrier_id = barrier_id;
278 w->barrierId = barrier_id;
279 w->stalledAtBarrier = false;
280
281 // set the wavefront context to have a pointer to this section of the LDS
282 w->ldsChunk = ldsChunk;
283
284 int32_t refCount M5_VAR_USED =
279 w->stalledAtBarrier = false;
280
281 // set the wavefront context to have a pointer to this section of the LDS
282 w->ldsChunk = ldsChunk;
283
284 int32_t refCount M5_VAR_USED =
285 lds.increaseRefCounter(w->dispatchid, w->wg_id);
285 lds.increaseRefCounter(w->dispatchId, w->wgId);
286 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
286 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
287 cu_id, w->wg_id, refCount);
287 cu_id, w->wgId, refCount);
288
289 w->instructionBuffer.clear();
290
291 if (w->pendingFetch)
292 w->dropFetch = true;
293
294 // is this the last wavefront in the workgroup
295 // if set the spillWidth to be the remaining work-items

--- 167 unchanged lines hidden (view full) ---

463 for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
464 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
465 Wavefront *w = wfList[i_simd][i_wf];
466
467 if (w->status == Wavefront::S_RUNNING) {
468 DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
469
470 DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
288
289 w->instructionBuffer.clear();
290
291 if (w->pendingFetch)
292 w->dropFetch = true;
293
294 // is this the last wavefront in the workgroup
295 // if set the spillWidth to be the remaining work-items

--- 167 unchanged lines hidden (view full) ---

463 for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
464 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
465 Wavefront *w = wfList[i_simd][i_wf];
466
467 if (w->status == Wavefront::S_RUNNING) {
468 DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
469
470 DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
471 w->barrier_id, _barrier_id);
471 w->barrierId, _barrier_id);
472
473 DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
472
473 DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
474 w->barrier_cnt, bcnt);
474 w->barrierCnt, bcnt);
475 }
476
477 if (w->status == Wavefront::S_RUNNING &&
475 }
476
477 if (w->status == Wavefront::S_RUNNING &&
478 w->barrier_id == _barrier_id && w->barrier_cnt == bcnt &&
479 !w->outstanding_reqs) {
478 w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
479 !w->outstandingReqs) {
480 ++ccnt;
481
482 DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
483 "%d\n", i_simd, i_wf, ccnt);
484 }
485 }
486 }
487

--- 153 unchanged lines hidden (view full) ---

641 if (pkt->req->isKernel() && pkt->req->isRelease()) {
642 Wavefront *w =
643 computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
644
645 // Check if we are waiting on Kernel End Release
646 if (w->status == Wavefront::S_RETURNING) {
647 DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
648 computeUnit->cu_id, w->simdId, w->wfSlotId,
480 ++ccnt;
481
482 DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
483 "%d\n", i_simd, i_wf, ccnt);
484 }
485 }
486 }
487

--- 153 unchanged lines hidden (view full) ---

641 if (pkt->req->isKernel() && pkt->req->isRelease()) {
642 Wavefront *w =
643 computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
644
645 // Check if we are waiting on Kernel End Release
646 if (w->status == Wavefront::S_RETURNING) {
647 DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
648 computeUnit->cu_id, w->simdId, w->wfSlotId,
649 w->wfDynId, w->kern_id);
649 w->wfDynId, w->kernId);
650
651 computeUnit->shader->dispatcher->notifyWgCompl(w);
652 w->status = Wavefront::S_STOPPED;
653 } else {
650
651 computeUnit->shader->dispatcher->notifyWgCompl(w);
652 w->status = Wavefront::S_STOPPED;
653 } else {
654 w->outstanding_reqs--;
654 w->outstandingReqs--;
655 }
656
657 DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
658 computeUnit->cu_id, gpuDynInst->simdId,
655 }
656
657 DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
658 computeUnit->cu_id, gpuDynInst->simdId,
659 gpuDynInst->wfSlotId, w->barrier_cnt);
659 gpuDynInst->wfSlotId, w->barrierCnt);
660
661 if (gpuDynInst->useContinuation) {
662 assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
663 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
664 gpuDynInst);
665 }
666
667 delete pkt->senderState;

--- 1126 unchanged lines hidden ---
660
661 if (gpuDynInst->useContinuation) {
662 assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
663 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
664 gpuDynInst);
665 }
666
667 delete pkt->senderState;

--- 1126 unchanged lines hidden ---