compute_unit.cc (11643:42a1873be45c) | compute_unit.cc (11657:5fad5a37d6fc) |
---|---|
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: --- 160 unchanged lines hidden (view full) --- 169 waveStatusList.clear(); 170 dispatchList.clear(); 171 vectorAluInstAvail.clear(); 172 delete cuExitCallback; 173 delete ldsPort; 174} 175 176void | 1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: --- 160 unchanged lines hidden (view full) --- 169 waveStatusList.clear(); 170 dispatchList.clear(); 171 vectorAluInstAvail.clear(); 172 delete cuExitCallback; 173 delete ldsPort; 174} 175 176void |
177ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr) | 177ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr) |
178{ 179 w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); 180 181 w->workGroupSz[0] = ndr->q.wgSize[0]; 182 w->workGroupSz[1] = ndr->q.wgSize[1]; 183 w->workGroupSz[2] = ndr->q.wgSize[2]; 184 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2]; 185 w->gridSz[0] = ndr->q.gdSize[0]; 186 w->gridSz[1] = ndr->q.gdSize[1]; 187 w->gridSz[2] = ndr->q.gdSize[2]; 188 w->kernelArgs = ndr->q.args; 189 w->privSizePerItem = ndr->q.privMemPerItem; 190 w->spillSizePerItem = ndr->q.spillMemPerItem; 191 w->roBase = ndr->q.roMemStart; 192 w->roSize = ndr->q.roMemTotal; | 178{ 179 w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); 180 181 w->workGroupSz[0] = ndr->q.wgSize[0]; 182 w->workGroupSz[1] = ndr->q.wgSize[1]; 183 w->workGroupSz[2] = ndr->q.wgSize[2]; 184 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2]; 185 w->gridSz[0] = ndr->q.gdSize[0]; 186 w->gridSz[1] = ndr->q.gdSize[1]; 187 w->gridSz[2] = ndr->q.gdSize[2]; 188 w->kernelArgs = ndr->q.args; 189 w->privSizePerItem = ndr->q.privMemPerItem; 190 w->spillSizePerItem = ndr->q.spillMemPerItem; 191 w->roBase = ndr->q.roMemStart; 192 w->roSize = ndr->q.roMemTotal; |
193 w->computeActualWgSz(ndr); |
|
193} 194 195void 196ComputeUnit::updateEvents() { 197 198 if (!timestampVec.empty()) { 199 uint32_t vecSize = timestampVec.size(); 200 uint32_t i = 0; --- 14 unchanged lines hidden (view full) --- 215 216 for (int i = 0; i< numSIMDs; ++i) { 217 vrf[i]->updateEvents(); 218 } 219} 220 221 222void | 194} 195 196void 197ComputeUnit::updateEvents() { 198 199 if (!timestampVec.empty()) { 200 uint32_t vecSize = timestampVec.size(); 201 uint32_t i = 0; --- 14 unchanged lines hidden (view full) --- 216 217 for (int i = 0; i< numSIMDs; ++i) { 218 vrf[i]->updateEvents(); 219 } 220} 221 222 223void |
223ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal, 224 int waveId, LdsChunk *ldsChunk, NDRange *ndr) | 224ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, 225 NDRange *ndr) |
225{ 226 static int _n_wave = 0; 227 | 226{ 227 static int _n_wave = 0; 228 |
228 // Fill in Kernel state 229 FillKernelState(w, ndr); 230 | |
231 VectorMask init_mask; 232 init_mask.reset(); 233 234 for (int k = 0; k < wfSize(); ++k) { | 229 VectorMask init_mask; 230 init_mask.reset(); 231 232 for (int k = 0; k < wfSize(); ++k) { |
235 if (k + waveId * wfSize() < trueWgSizeTotal) | 233 if (k + waveId * wfSize() < w->actualWgSzTotal) |
236 init_mask[k] = 1; 237 } 238 239 w->kernId = ndr->dispatchId; 240 w->wfId = waveId; 241 w->initMask = init_mask.to_ullong(); 242 243 for (int k = 0; k < wfSize(); ++k) { | 234 init_mask[k] = 1; 235 } 236 237 w->kernId = ndr->dispatchId; 238 w->wfId = waveId; 239 w->initMask = init_mask.to_ullong(); 240 241 for (int k = 0; k < wfSize(); ++k) { |
244 w->workItemId[0][k] = (k + waveId * wfSize()) % trueWgSize[0]; 245 w->workItemId[1][k] = 246 ((k + waveId * wfSize()) / trueWgSize[0]) % trueWgSize[1]; 247 w->workItemId[2][k] = 248 (k + waveId * wfSize()) / (trueWgSize[0] * trueWgSize[1]); | 242 w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0]; 243 w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) % 244 w->actualWgSz[1]; 245 w->workItemId[2][k] = (k + waveId * wfSize()) / 246 (w->actualWgSz[0] * w->actualWgSz[1]); |
249 | 247 |
250 w->workItemFlatId[k] = w->workItemId[2][k] * trueWgSize[0] * 251 trueWgSize[1] + w->workItemId[1][k] * trueWgSize[0] + | 248 w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] * 249 w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] + |
252 w->workItemId[0][k]; 253 } 254 | 250 w->workItemId[0][k]; 251 } 252 |
255 w->barrierSlots = divCeil(trueWgSizeTotal, wfSize()); | 253 w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize()); |
256 257 w->barCnt.resize(wfSize(), 0); 258 259 w->maxBarCnt = 0; 260 w->oldBarrierCnt = 0; 261 w->barrierCnt = 0; 262 263 w->privBase = ndr->q.privMemStart; --- 25 unchanged lines hidden (view full) --- 289 w->instructionBuffer.clear(); 290 291 if (w->pendingFetch) 292 w->dropFetch = true; 293 294 // is this the last wavefront in the workgroup 295 // if set the spillWidth to be the remaining work-items 296 // so that the vector access is correct | 254 255 w->barCnt.resize(wfSize(), 0); 256 257 w->maxBarCnt = 0; 258 w->oldBarrierCnt = 0; 259 w->barrierCnt = 0; 260 261 w->privBase = ndr->q.privMemStart; --- 25 unchanged lines hidden (view full) --- 287 w->instructionBuffer.clear(); 288 289 if (w->pendingFetch) 290 w->dropFetch = true; 291 292 // is this the last wavefront in the workgroup 293 // if set the spillWidth to be the remaining work-items 294 // so that the vector access is correct |
297 if ((waveId + 1) * wfSize() >= trueWgSizeTotal) { 298 w->spillWidth = trueWgSizeTotal - (waveId * wfSize()); | 295 if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) { 296 w->spillWidth = w->actualWgSzTotal - (waveId * wfSize()); |
299 } else { 300 w->spillWidth = wfSize(); 301 } 302 303 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " 304 "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); 305 306 w->start(++_n_wave, ndr->q.code_ptr); --- 16 unchanged lines hidden (view full) --- 323 nullptr, 0); 324 325 gpuDynInst->useContinuation = false; 326 gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; 327 gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; 328 injectGlobalMemFence(gpuDynInst, true); 329 } 330 | 297 } else { 298 w->spillWidth = wfSize(); 299 } 300 301 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " 302 "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); 303 304 w->start(++_n_wave, ndr->q.code_ptr); --- 16 unchanged lines hidden (view full) --- 321 nullptr, 0); 322 323 gpuDynInst->useContinuation = false; 324 gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; 325 gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; 326 injectGlobalMemFence(gpuDynInst, true); 327 } 328 |
331 // Get true size of workgroup (after clamping to grid size) 332 int trueWgSize[3]; 333 int trueWgSizeTotal = 1; 334 335 for (int d = 0; d < 3; ++d) { 336 trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - 337 ndr->wgId[d] * ndr->q.wgSize[d]); 338 339 trueWgSizeTotal *= trueWgSize[d]; 340 } 341 | |
342 // calculate the number of 32-bit vector registers required by wavefront 343 int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); 344 int wave_id = 0; 345 346 // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time 347 for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { 348 Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; 349 // Check if this wavefront slot is available: 350 // It must be stopped and not waiting 351 // for a release to complete S_RETURNING 352 if (w->status == Wavefront::S_STOPPED) { | 329 // calculate the number of 32-bit vector registers required by wavefront 330 int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); 331 int wave_id = 0; 332 333 // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time 334 for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { 335 Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; 336 // Check if this wavefront slot is available: 337 // It must be stopped and not waiting 338 // for a release to complete S_RETURNING 339 if (w->status == Wavefront::S_STOPPED) { |
340 fillKernelState(w, ndr); |
|
353 // if we have scheduled all work items then stop 354 // scheduling wavefronts | 341 // if we have scheduled all work items then stop 342 // scheduling wavefronts |
355 if (wave_id * wfSize() >= trueWgSizeTotal) | 343 if (wave_id * wfSize() >= w->actualWgSzTotal) |
356 break; 357 358 // reserve vector registers for the scheduled wavefront 359 assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd); 360 uint32_t normSize = 0; 361 362 w->startVgprIndex = vrf[m % numSIMDs]->manager-> 363 allocateRegion(vregDemand, &normSize); 364 365 w->reservedVectorRegs = normSize; 366 vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; 367 | 344 break; 345 346 // reserve vector registers for the scheduled wavefront 347 assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd); 348 uint32_t normSize = 0; 349 350 w->startVgprIndex = vrf[m % numSIMDs]->manager-> 351 allocateRegion(vregDemand, &normSize); 352 353 w->reservedVectorRegs = normSize; 354 vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; 355 |
368 StartWF(w, trueWgSize, trueWgSizeTotal, wave_id, ldsChunk, ndr); | 356 startWavefront(w, wave_id, ldsChunk, ndr); |
369 ++wave_id; 370 } 371 } 372 ++barrier_id; 373} 374 375int 376ComputeUnit::ReadyWorkgroup(NDRange *ndr) --- 1417 unchanged lines hidden --- | 357 ++wave_id; 358 } 359 } 360 ++barrier_id; 361} 362 363int 364ComputeUnit::ReadyWorkgroup(NDRange *ndr) --- 1417 unchanged lines hidden --- |