shader.cc revision 11905
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

Shader::Shader(const Params *p) : ClockedObject(p),
    clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
    cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
    hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
    separate_acquire_release(p->separate_acquire_release), coissue_return(1),
    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
    globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
    box_tick_cnt(0), start_tick_cnt(0)
{
    cuList.resize(n_cu);

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
    }
}

Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN\n");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(HSAIL, "GROWS UP\n");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(HSAIL, "Shader::mmap start = %#x, length = %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}
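/*
 * Editor's usage sketch (illustrative only, not part of the simulator):
 * Shader::mmap() always hands back whole pages, so a request smaller than
 * TheISA::PageBytes still consumes a full page. Assuming 4 KiB pages and a
 * hypothetical `shader` object:
 *
 *     // Addr buf = shader->mmap(100);   // 100 B rounds up to 4096 B
 *     // The process's mmap-end pointer then moves by one full page,
 *     // downward or upward depending on Process::mmapGrowsDown().
 */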
void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid) {
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

void
Shader::hostWakeUp(BaseCPU *cpu) {
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure both the dispatcher and the shader are trying to
        // wake up the same host. Hack here to enable kernel launch
        // from multiple CPUs.
        panic("Dispatcher wants to wakeup a different host");
    }
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}

void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

    // apply any scheduled adds whose time has come; erase each applied
    // entry and re-examine the slot that shifted into its place
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the CUs
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}
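/*
 * Editor's sketch (illustrative only): the scheduled-add machinery applied
 * in Shader::exec() above pairs with Shader::ScheduleAdd() (defined below).
 * A caller might register a deferred increment like this (hypothetical
 * values):
 *
 *     // uint32_t barrier_cnt = 0;
 *     // shader->ScheduleAdd(&barrier_cnt, 500, 1);
 *
 * On the first Shader::exec() whose tick_cnt reaches the registration
 * tick + 500, barrier_cnt is incremented by 1 and the entry is erased.
 */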
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch a workgroup iff the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cuList[curCu] for this wg
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}

void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // misaligned access: the request straddles a cache-line boundary, so
    // split it at that boundary and issue the two halves separately
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        // req1 covers the first part of the data buffer, req2 the second
        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}

void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}

Shader::TickEvent::TickEvent(Shader *_shader)
    : Event(CPU_Tick_Pri), shader(_shader)
{
}

void
Shader::TickEvent::process()
{
    if (shader->busy()) {
        shader->exec();
        shader->schedule(this, curTick() + shader->ticks(1));
    }
}

const char*
Shader::TickEvent::description() const
{
    return "Shader tick";
}
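/*
 * Editor's worked example of the split logic in doFunctionalAccess() above
 * (illustrative numbers only): with a 64-byte cache line, an 8-byte access
 * at vaddr 0x103c gives
 *
 *     split_addr = roundDown(0x103c + 8 - 1, 64)
 *                = roundDown(0x1043, 64) = 0x1040 > 0x103c
 *
 * so the request is split into req1 = [0x103c, 0x1040) and
 * req2 = [0x1040, 0x1044), and each half is translated and sent
 * functionally in its own packet.
 */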
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {
        Request *req = new Request(0, gen.addr(), gen.size(), 0,
                                   cuList[0]->masterId(), 0, 0, 0);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
        delete req;
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id == n_cu, this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // even when the perLaneTLB flag is turned on, it's OK to send all
        // accesses through lane 0, since the lane # is not known here.
        // This isn't important since these are functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}
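/*
 * Editor's usage sketch (hypothetical values): reading 200 bytes starting
 * 16 bytes into a 64-byte cache line via ReadMem() makes AccessMem() walk
 * the range with a ChunkGenerator, producing line-bounded requests of 48,
 * 64, 64 and 24 bytes; each becomes one functional access through
 * doFunctionalAccess().
 *
 *     // uint8_t buf[200];
 *     // shader->ReadMem(0x1010, buf, sizeof(buf), 0);
 */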