compute_unit.cc (11534:7106f550afad) | compute_unit.cc (11638:b511733958d0) |
---|---|
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: --- 179 unchanged lines hidden (view full) --- 188 w->kernelArgs = ndr->q.args; 189 w->privSizePerItem = ndr->q.privMemPerItem; 190 w->spillSizePerItem = ndr->q.spillMemPerItem; 191 w->roBase = ndr->q.roMemStart; 192 w->roSize = ndr->q.roMemTotal; 193} 194 195void | 1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: --- 179 unchanged lines hidden (view full) --- 188 w->kernelArgs = ndr->q.args; 189 w->privSizePerItem = ndr->q.privMemPerItem; 190 w->spillSizePerItem = ndr->q.spillMemPerItem; 191 w->roBase = ndr->q.roMemStart; 192 w->roSize = ndr->q.roMemTotal; 193} 194 195void |
196ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, 197 int trueWgSize[], int trueWgSizeTotal, 198 LdsChunk *ldsChunk, uint64_t origSpillMemStart) 199{ 200 wfCtx->cnt = cnt; 201 202 VectorMask init_mask; 203 init_mask.reset(); 204 205 for (int k = 0; k < wfSize(); ++k) { 206 if (k + cnt * wfSize() < trueWgSizeTotal) 207 init_mask[k] = 1; 208 } 209 210 wfCtx->init_mask = init_mask.to_ullong(); 211 wfCtx->exec_mask = init_mask.to_ullong(); 212 213 wfCtx->bar_cnt.resize(wfSize(), 0); 214 215 wfCtx->max_bar_cnt = 0; 216 wfCtx->old_barrier_cnt = 0; 217 wfCtx->barrier_cnt = 0; 218 219 wfCtx->privBase = ndr->q.privMemStart; 220 ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); 221 222 wfCtx->spillBase = ndr->q.spillMemStart; 223 ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); 224 225 wfCtx->pc = 0; 226 wfCtx->rpc = UINT32_MAX; 227 228 // set the wavefront context to have a pointer to this section of the LDS 229 wfCtx->ldsChunk = ldsChunk; 230 231 // WG state 232 wfCtx->wg_id = ndr->globalWgId; 233 wfCtx->barrier_id = barrier_id; 234 235 // Kernel wide state 236 wfCtx->ndr = ndr; 237} 238 239void | |
240ComputeUnit::updateEvents() { 241 242 if (!timestampVec.empty()) { 243 uint32_t vecSize = timestampVec.size(); 244 uint32_t i = 0; 245 while (i < vecSize) { 246 if (timestampVec[i] <= shader->tick_cnt) { 247 std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i]; --- 11 unchanged lines hidden (view full) --- 259 260 for (int i = 0; i< numSIMDs; ++i) { 261 vrf[i]->updateEvents(); 262 } 263} 264 265 266void | 196ComputeUnit::updateEvents() { 197 198 if (!timestampVec.empty()) { 199 uint32_t vecSize = timestampVec.size(); 200 uint32_t i = 0; 201 while (i < vecSize) { 202 if (timestampVec[i] <= shader->tick_cnt) { 203 std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i]; --- 11 unchanged lines hidden (view full) --- 215 216 for (int i = 0; i< numSIMDs; ++i) { 217 vrf[i]->updateEvents(); 218 } 219} 220 221 222void |
267ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], 268 int trueWgSizeTotal) | 223ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal, 224 int cnt, LdsChunk *ldsChunk, NDRange *ndr) |
269{ 270 static int _n_wave = 0; | 225{ 226 static int _n_wave = 0; |
271 int cnt = wfCtx->cnt; 272 NDRange *ndr = wfCtx->ndr; | |
273 274 // Fill in Kernel state 275 FillKernelState(w, ndr); 276 | 227 228 // Fill in Kernel state 229 FillKernelState(w, ndr); 230 |
231 VectorMask init_mask; 232 init_mask.reset(); 233 234 for (int k = 0; k < wfSize(); ++k) { 235 if (k + cnt * wfSize() < trueWgSizeTotal) 236 init_mask[k] = 1; 237 } 238 |
|
277 w->kern_id = ndr->dispatchId; 278 w->dynwaveid = cnt; | 239 w->kern_id = ndr->dispatchId; 240 w->dynwaveid = cnt; |
279 w->init_mask = wfCtx->init_mask; | 241 w->init_mask = init_mask.to_ullong(); |
280 281 for (int k = 0; k < wfSize(); ++k) { 282 w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0]; 283 w->workitemid[1][k] = 284 ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1]; 285 w->workitemid[2][k] = 286 (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]); 287 288 w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * 289 trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + 290 w->workitemid[0][k]; 291 } 292 | 242 243 for (int k = 0; k < wfSize(); ++k) { 244 w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0]; 245 w->workitemid[1][k] = 246 ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1]; 247 w->workitemid[2][k] = 248 (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]); 249 250 w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * 251 trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + 252 w->workitemid[0][k]; 253 } 254 |
293 w->old_barrier_cnt = wfCtx->old_barrier_cnt; 294 w->barrier_cnt = wfCtx->barrier_cnt; | |
295 w->barrier_slots = divCeil(trueWgSizeTotal, wfSize()); 296 | 255 w->barrier_slots = divCeil(trueWgSizeTotal, wfSize()); 256 |
297 for (int i = 0; i < wfSize(); ++i) { 298 w->bar_cnt[i] = wfCtx->bar_cnt[i]; 299 } | 257 w->bar_cnt.resize(wfSize(), 0); |
300 | 258 |
301 w->max_bar_cnt = wfCtx->max_bar_cnt; 302 w->privBase = wfCtx->privBase; 303 w->spillBase = wfCtx->spillBase; | 259 w->max_bar_cnt = 0; 260 w->old_barrier_cnt = 0; 261 w->barrier_cnt = 0; |
304 | 262 |
305 w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask); | 263 w->privBase = ndr->q.privMemStart; 264 ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); |
306 | 265 |
266 w->spillBase = ndr->q.spillMemStart; 267 ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); 268 269 w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong()); 270 |
|
307 // WG state | 271 // WG state |
308 w->wg_id = wfCtx->wg_id; 309 w->dispatchid = wfCtx->ndr->dispatchId; | 272 w->wg_id = ndr->globalWgId; 273 w->dispatchid = ndr->dispatchId; |
310 w->workgroupid[0] = w->wg_id % ndr->numWg[0]; 311 w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1]; 312 w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]); 313 | 274 w->workgroupid[0] = w->wg_id % ndr->numWg[0]; 275 w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1]; 276 w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]); 277 |
314 w->barrier_id = wfCtx->barrier_id; | 278 w->barrier_id = barrier_id; |
315 w->stalledAtBarrier = false; 316 | 279 w->stalledAtBarrier = false; 280 |
317 // move this from the context into the actual wavefront 318 w->ldsChunk = wfCtx->ldsChunk; | 281 // set the wavefront context to have a pointer to this section of the LDS 282 w->ldsChunk = ldsChunk; |
319 320 int32_t refCount M5_VAR_USED = 321 lds.increaseRefCounter(w->dispatchid, w->wg_id); 322 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", 323 cu_id, w->wg_id, refCount); 324 325 w->instructionBuffer.clear(); 326 --- 8 unchanged lines hidden (view full) --- 335 } else { 336 w->spillWidth = wfSize(); 337 } 338 339 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " 340 "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); 341 342 w->start(++_n_wave, ndr->q.code_ptr); | 283 284 int32_t refCount M5_VAR_USED = 285 lds.increaseRefCounter(w->dispatchid, w->wg_id); 286 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", 287 cu_id, w->wg_id, refCount); 288 289 w->instructionBuffer.clear(); 290 --- 8 unchanged lines hidden (view full) --- 299 } else { 300 w->spillWidth = wfSize(); 301 } 302 303 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " 304 "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); 305 306 w->start(++_n_wave, ndr->q.code_ptr); |
343 wfCtx->bar_cnt.clear(); | |
344} 345 346void 347ComputeUnit::StartWorkgroup(NDRange *ndr) 348{ 349 // reserve the LDS capacity allocated to the work group 350 // disambiguated by the dispatch ID and workgroup ID, which should be 351 // globally unique --- 19 unchanged lines hidden (view full) --- 371 372 for (int d = 0; d < 3; ++d) { 373 trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - 374 ndr->wgId[d] * ndr->q.wgSize[d]); 375 376 trueWgSizeTotal *= trueWgSize[d]; 377 } 378 | 307} 308 309void 310ComputeUnit::StartWorkgroup(NDRange *ndr) 311{ 312 // reserve the LDS capacity allocated to the work group 313 // disambiguated by the dispatch ID and workgroup ID, which should be 314 // globally unique --- 19 unchanged lines hidden (view full) --- 334 335 for (int d = 0; d < 3; ++d) { 336 trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - 337 ndr->wgId[d] * ndr->q.wgSize[d]); 338 339 trueWgSizeTotal *= trueWgSize[d]; 340 } 341 |
379 uint64_t origSpillMemStart = ndr->q.spillMemStart; | |
380 // calculate the number of 32-bit vector registers required by wavefront 381 int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); 382 int cnt = 0; 383 384 // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time 385 for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { 386 Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; 387 // Check if this wavefront slot is available: --- 10 unchanged lines hidden (view full) --- 398 uint32_t normSize = 0; 399 400 w->startVgprIndex = vrf[m % numSIMDs]->manager-> 401 allocateRegion(vregDemand, &normSize); 402 403 w->reservedVectorRegs = normSize; 404 vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; 405 | 342 // calculate the number of 32-bit vector registers required by wavefront 343 int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); 344 int cnt = 0; 345 346 // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time 347 for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { 348 Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; 349 // Check if this wavefront slot is available: --- 10 unchanged lines hidden (view full) --- 360 uint32_t normSize = 0; 361 362 w->startVgprIndex = vrf[m % numSIMDs]->manager-> 363 allocateRegion(vregDemand, &normSize); 364 365 w->reservedVectorRegs = normSize; 366 vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; 367 |
406 WFContext wfCtx; 407 408 InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal, 409 ldsChunk, origSpillMemStart); 410 411 StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal); | 368 StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr); |
412 ++cnt; 413 } 414 } 415 ++barrier_id; 416} 417 418int 419ComputeUnit::ReadyWorkgroup(NDRange *ndr) --- 1417 unchanged lines hidden --- | 369 ++cnt; 370 } 371 } 372 ++barrier_id; 373} 374 375int 376ComputeUnit::ReadyWorkgroup(NDRange *ndr) --- 1417 unchanged lines hidden --- |