compute_unit.cc (11643:42a1873be45c) compute_unit.cc (11657:5fad5a37d6fc)
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:

--- 160 unchanged lines hidden (view full) ---

169 waveStatusList.clear();
170 dispatchList.clear();
171 vectorAluInstAvail.clear();
172 delete cuExitCallback;
173 delete ldsPort;
174}
175
176void
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:

--- 160 unchanged lines hidden (view full) ---

169 waveStatusList.clear();
170 dispatchList.clear();
171 vectorAluInstAvail.clear();
172 delete cuExitCallback;
173 delete ldsPort;
174}
175
176void
177ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
177ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
178{
179 w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
180
181 w->workGroupSz[0] = ndr->q.wgSize[0];
182 w->workGroupSz[1] = ndr->q.wgSize[1];
183 w->workGroupSz[2] = ndr->q.wgSize[2];
184 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
185 w->gridSz[0] = ndr->q.gdSize[0];
186 w->gridSz[1] = ndr->q.gdSize[1];
187 w->gridSz[2] = ndr->q.gdSize[2];
188 w->kernelArgs = ndr->q.args;
189 w->privSizePerItem = ndr->q.privMemPerItem;
190 w->spillSizePerItem = ndr->q.spillMemPerItem;
191 w->roBase = ndr->q.roMemStart;
192 w->roSize = ndr->q.roMemTotal;
178{
179 w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
180
181 w->workGroupSz[0] = ndr->q.wgSize[0];
182 w->workGroupSz[1] = ndr->q.wgSize[1];
183 w->workGroupSz[2] = ndr->q.wgSize[2];
184 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
185 w->gridSz[0] = ndr->q.gdSize[0];
186 w->gridSz[1] = ndr->q.gdSize[1];
187 w->gridSz[2] = ndr->q.gdSize[2];
188 w->kernelArgs = ndr->q.args;
189 w->privSizePerItem = ndr->q.privMemPerItem;
190 w->spillSizePerItem = ndr->q.spillMemPerItem;
191 w->roBase = ndr->q.roMemStart;
192 w->roSize = ndr->q.roMemTotal;
193 w->computeActualWgSz(ndr);
193}
194
195void
196ComputeUnit::updateEvents() {
197
198 if (!timestampVec.empty()) {
199 uint32_t vecSize = timestampVec.size();
200 uint32_t i = 0;

--- 14 unchanged lines hidden (view full) ---

215
216 for (int i = 0; i< numSIMDs; ++i) {
217 vrf[i]->updateEvents();
218 }
219}
220
221
222void
194}
195
196void
197ComputeUnit::updateEvents() {
198
199 if (!timestampVec.empty()) {
200 uint32_t vecSize = timestampVec.size();
201 uint32_t i = 0;

--- 14 unchanged lines hidden (view full) ---

216
217 for (int i = 0; i< numSIMDs; ++i) {
218 vrf[i]->updateEvents();
219 }
220}
221
222
223void
223ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
224 int waveId, LdsChunk *ldsChunk, NDRange *ndr)
224ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
225 NDRange *ndr)
225{
226 static int _n_wave = 0;
227
226{
227 static int _n_wave = 0;
228
228 // Fill in Kernel state
229 FillKernelState(w, ndr);
230
231 VectorMask init_mask;
232 init_mask.reset();
233
234 for (int k = 0; k < wfSize(); ++k) {
229 VectorMask init_mask;
230 init_mask.reset();
231
232 for (int k = 0; k < wfSize(); ++k) {
235 if (k + waveId * wfSize() < trueWgSizeTotal)
233 if (k + waveId * wfSize() < w->actualWgSzTotal)
236 init_mask[k] = 1;
237 }
238
239 w->kernId = ndr->dispatchId;
240 w->wfId = waveId;
241 w->initMask = init_mask.to_ullong();
242
243 for (int k = 0; k < wfSize(); ++k) {
234 init_mask[k] = 1;
235 }
236
237 w->kernId = ndr->dispatchId;
238 w->wfId = waveId;
239 w->initMask = init_mask.to_ullong();
240
241 for (int k = 0; k < wfSize(); ++k) {
244 w->workItemId[0][k] = (k + waveId * wfSize()) % trueWgSize[0];
245 w->workItemId[1][k] =
246 ((k + waveId * wfSize()) / trueWgSize[0]) % trueWgSize[1];
247 w->workItemId[2][k] =
248 (k + waveId * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
242 w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
243 w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
244 w->actualWgSz[1];
245 w->workItemId[2][k] = (k + waveId * wfSize()) /
246 (w->actualWgSz[0] * w->actualWgSz[1]);
249
247
250 w->workItemFlatId[k] = w->workItemId[2][k] * trueWgSize[0] *
251 trueWgSize[1] + w->workItemId[1][k] * trueWgSize[0] +
248 w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
249 w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
252 w->workItemId[0][k];
253 }
254
250 w->workItemId[0][k];
251 }
252
255 w->barrierSlots = divCeil(trueWgSizeTotal, wfSize());
253 w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
256
257 w->barCnt.resize(wfSize(), 0);
258
259 w->maxBarCnt = 0;
260 w->oldBarrierCnt = 0;
261 w->barrierCnt = 0;
262
263 w->privBase = ndr->q.privMemStart;

--- 25 unchanged lines hidden (view full) ---

289 w->instructionBuffer.clear();
290
291 if (w->pendingFetch)
292 w->dropFetch = true;
293
294 // Is this the last wavefront in the workgroup?
295 // If so, set the spillWidth to the number of remaining work-items
296 // so that the vector access is correct.
254
255 w->barCnt.resize(wfSize(), 0);
256
257 w->maxBarCnt = 0;
258 w->oldBarrierCnt = 0;
259 w->barrierCnt = 0;
260
261 w->privBase = ndr->q.privMemStart;

--- 25 unchanged lines hidden (view full) ---

287 w->instructionBuffer.clear();
288
289 if (w->pendingFetch)
290 w->dropFetch = true;
291
292 // Is this the last wavefront in the workgroup?
293 // If so, set the spillWidth to the number of remaining work-items
294 // so that the vector access is correct.
297 if ((waveId + 1) * wfSize() >= trueWgSizeTotal) {
298 w->spillWidth = trueWgSizeTotal - (waveId * wfSize());
295 if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
296 w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
299 } else {
300 w->spillWidth = wfSize();
301 }
302
303 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
304 "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
305
306 w->start(++_n_wave, ndr->q.code_ptr);

--- 16 unchanged lines hidden (view full) ---

323 nullptr, 0);
324
325 gpuDynInst->useContinuation = false;
326 gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE;
327 gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM;
328 injectGlobalMemFence(gpuDynInst, true);
329 }
330
297 } else {
298 w->spillWidth = wfSize();
299 }
300
301 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
302 "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
303
304 w->start(++_n_wave, ndr->q.code_ptr);

--- 16 unchanged lines hidden (view full) ---

321 nullptr, 0);
322
323 gpuDynInst->useContinuation = false;
324 gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE;
325 gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM;
326 injectGlobalMemFence(gpuDynInst, true);
327 }
328
331 // Get true size of workgroup (after clamping to grid size)
332 int trueWgSize[3];
333 int trueWgSizeTotal = 1;
334
335 for (int d = 0; d < 3; ++d) {
336 trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
337 ndr->wgId[d] * ndr->q.wgSize[d]);
338
339 trueWgSizeTotal *= trueWgSize[d];
340 }
341
342 // calculate the number of 32-bit vector registers required by wavefront
343 int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
344 int wave_id = 0;
345
346 // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
347 for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
348 Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
349 // Check if this wavefront slot is available:
350 // it must be stopped (S_STOPPED), i.e. not still waiting
351 // for a release to complete (S_RETURNING)
352 if (w->status == Wavefront::S_STOPPED) {
329 // calculate the number of 32-bit vector registers required by wavefront
330 int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
331 int wave_id = 0;
332
333 // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
334 for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
335 Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
336 // Check if this wavefront slot is available:
337 // it must be stopped (S_STOPPED), i.e. not still waiting
338 // for a release to complete (S_RETURNING)
339 if (w->status == Wavefront::S_STOPPED) {
340 fillKernelState(w, ndr);
353 // if we have scheduled all work items then stop
354 // scheduling wavefronts
341 // if we have scheduled all work items then stop
342 // scheduling wavefronts
355 if (wave_id * wfSize() >= trueWgSizeTotal)
343 if (wave_id * wfSize() >= w->actualWgSzTotal)
356 break;
357
358 // reserve vector registers for the scheduled wavefront
359 assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
360 uint32_t normSize = 0;
361
362 w->startVgprIndex = vrf[m % numSIMDs]->manager->
363 allocateRegion(vregDemand, &normSize);
364
365 w->reservedVectorRegs = normSize;
366 vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
367
344 break;
345
346 // reserve vector registers for the scheduled wavefront
347 assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
348 uint32_t normSize = 0;
349
350 w->startVgprIndex = vrf[m % numSIMDs]->manager->
351 allocateRegion(vregDemand, &normSize);
352
353 w->reservedVectorRegs = normSize;
354 vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
355
368 StartWF(w, trueWgSize, trueWgSizeTotal, wave_id, ldsChunk, ndr);
356 startWavefront(w, wave_id, ldsChunk, ndr);
369 ++wave_id;
370 }
371 }
372 ++barrier_id;
373}
374
375int
376ComputeUnit::ReadyWorkgroup(NDRange *ndr)

--- 1417 unchanged lines hidden ---
357 ++wave_id;
358 }
359 }
360 ++barrier_id;
361}
362
363int
364ComputeUnit::ReadyWorkgroup(NDRange *ndr)

--- 1417 unchanged lines hidden ---