shader.cc revision 12697:cd71b966be1e
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Steve Reinhardt
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

Shader::Shader(const Params *p)
    : ClockedObject(p), clock(p->clk_domain->clockPeriod()),
      cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer),
      tickEvent([this]{ processTick(); }, "Shader tick",
                false, Event::CPU_Tick_Pri),
      timingSim(p->timing), hsail_mode(SIMT),
      impl_kern_boundary_sync(p->impl_kern_boundary_sync),
      separate_acquire_release(p->separate_acquire_release), coissue_return(1),
      trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
      globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
      box_tick_cnt(0), start_tick_cnt(0)
{
    cuList.resize(n_cu);

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
    }
}

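// Shader::mmap carves a block out of the host process's mmap region so
// GPU-visible allocations share the CPU's address space. The length is
// first rounded up to a whole page; for example, with 4 KiB pages,
// roundUp(5000, 4096) yields 8192, i.e. two pages. Whether the block is
// taken from below or above the current mmap end depends on which way the
// target ISA grows its mmap region (mmapGrowsDown()).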
Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN\n");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(HSAIL, "GROWS UP\n");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(HSAIL, "Shader::mmap start = %#x, length = %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid)
{
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

void
Shader::hostWakeUp(BaseCPU *cpu)
{
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure both the dispatcher and the shader are trying to
        // wake up the same host. Hack here to enable kernel launch
        // from multiple CPUs.
        panic("Dispatcher wants to wake up a different host");
    }
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}

void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

    // apply any scheduled adds whose time has come; each matured entry
    // bumps *sa_val[i] by sa_x[i] and is removed from the pending lists
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the CUs
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}

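// dispatch_workgroups walks the CUs round-robin, starting from nextSchedCu,
// and assigns at most one workgroup per CU that can accept one on this pass.
// The 3-D workgroup ID advances like a mixed-radix odometer: wgId[0] is
// incremented first, and when a dimension is exhausted (wgId[d] * wgSize[d]
// reaches gdSize[d]) it wraps to zero and carries into the next dimension.
// As a sketch, a grid of 2x2x1 workgroups is visited in the order (0,0,0),
// (1,0,0), (0,1,0), (1,1,0), after which wg_disp_rem goes false.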
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch a workgroup iff the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in
        //     the grid
        // (b) there are enough free slots in cuList[curCu] for this wg
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}

void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // Misaligned access: split the request at the cache-line boundary and
    // perform two accesses. splitOnVaddr makes req1 cover [vaddr, split_addr)
    // and req2 cover [split_addr, vaddr + size), so pkt1/new_pkt1 must be
    // built from req1 and pkt2/new_pkt2 from req2 to keep the halves of the
    // data buffer aligned with the address ranges they belong to.
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}

void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    // register a deferred add: exec() applies x to *val once the shader's
    // tick count reaches (tick_cnt + when)
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}

void
Shader::processTick()
{
    if (busy()) {
        exec();
        schedule(tickEvent, curTick() + ticks(1));
    }
}

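// AccessMem breaks an arbitrary virtual-address range into cache-line-sized
// chunks and issues one functional access per chunk, advancing the data
// pointer as it goes. ChunkGenerator never emits a chunk that crosses a
// line boundary; as a sketch with 64-byte lines, a 64-byte access starting
// at 0x70 becomes two chunks: 16 bytes at [0x70, 0x80) and 48 bytes at
// [0x80, 0xB0).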
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {
        Request *req = new Request(0, gen.addr(), gen.size(), 0,
                                   cuList[0]->masterId(), 0, 0, 0);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
        delete req;
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id == n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // Even when the perLaneTLB flag is turned on it is ok to send
        // all accesses through lane 0, since the lane number is not
        // known here. This doesn't matter for functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    // safe_cast the senderState
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}