Cross Reference: /gem5/src/gpu-compute/shader.cc

Deleted Added

sdiff udiff text old ( 11308:7d8836fd043d ) new ( 11386:94c09b607a84 )

full compact

shader.cc (11308:7d8836fd043d)	shader.cc (11386:94c09b607a84)
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Steve Reinhardt 34 / 35 36#include "gpu-compute/shader.hh" 37 38#include <limits> 39 40#include "arch/x86/linux/linux.hh" 41#include "base/chunk_generator.hh" 42#include "debug/GPUDisp.hh" 43#include "debug/GPUMem.hh" 44#include "debug/HSAIL.hh" 45#include "gpu-compute/dispatcher.hh" 46#include "gpu-compute/gpu_static_inst.hh" 47#include "gpu-compute/qstruct.hh" 48#include "gpu-compute/wavefront.hh" 49#include "mem/packet.hh" 50#include "mem/ruby/system/RubySystem.hh" 51#include "sim/sim_exit.hh" 52 53Shader::Shader(const Params p) : SimObject(p), 54 clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr), 55 cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing), 56 hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync), 57 separate_acquire_release(p->separate_acquire_release), coissue_return(1), 58 trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), 59 globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), 60 box_tick_cnt(0), start_tick_cnt(0) 61{ 62 63 cuList.resize(n_cu); 64 65 for (int i = 0; i < n_cu; ++i) { 66 cuList[i] = p->CUs[i]; 67 assert(i == cuList[i]->cu_id); 68 cuList[i]->shader = this; 69 } 70} 71 72Addr 73Shader::mmap(int length) 74{ 75 76 Addr start; 77 78 // round up length to the next page 79 length = roundUp(length, TheISA::PageBytes); 80	1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Steve Reinhardt 34 / 35 36#include "gpu-compute/shader.hh" 37 38#include <limits> 39 40#include "arch/x86/linux/linux.hh" 41#include "base/chunk_generator.hh" 42#include "debug/GPUDisp.hh" 43#include "debug/GPUMem.hh" 44#include "debug/HSAIL.hh" 45#include "gpu-compute/dispatcher.hh" 46#include "gpu-compute/gpu_static_inst.hh" 47#include "gpu-compute/qstruct.hh" 48#include "gpu-compute/wavefront.hh" 49#include "mem/packet.hh" 50#include "mem/ruby/system/RubySystem.hh" 51#include "sim/sim_exit.hh" 52 53Shader::Shader(const Params p) : SimObject(p), 54 clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr), 55 cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing), 56 hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync), 57 separate_acquire_release(p->separate_acquire_release), coissue_return(1), 58 trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), 59 globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), 60 box_tick_cnt(0), start_tick_cnt(0) 61{ 62 63 cuList.resize(n_cu); 64 65 for (int i = 0; i < n_cu; ++i) { 66 cuList[i] = p->CUs[i]; 67 assert(i == cuList[i]->cu_id); 68 cuList[i]->shader = this; 69 } 70} 71 72Addr 73Shader::mmap(int length) 74{ 75 76 Addr start; 77 78 // round up length to the next page 79 length = roundUp(length, TheISA::PageBytes); 80
81 if (X86Linux64::mmapGrowsDown()) {	81 Process *proc = gpuTc->getProcessPtr(); 82 83 if (proc->mmapGrowsDown()) {
82 DPRINTF(HSAIL, "GROWS DOWN");	84 DPRINTF(HSAIL, "GROWS DOWN");
83 start = gpuTc->getProcessPtr()->mmap_end -length; 84 gpuTc->getProcessPtr()->mmap_end = start;	85 start = proc->mmap_end - length; 86 proc->mmap_end = start;
85 } else { 86 DPRINTF(HSAIL, "GROWS UP");	87 } else { 88 DPRINTF(HSAIL, "GROWS UP");
87 start = gpuTc->getProcessPtr()->mmap_end; 88 gpuTc->getProcessPtr()->mmap_end += length;	89 start = proc->mmap_end; 90 proc->mmap_end += length;
89 90 // assertion to make sure we don't overwrite the stack (it grows down)	91 92 // assertion to make sure we don't overwrite the stack (it grows down)
91 assert(gpuTc->getProcessPtr()->mmap_end < 92 gpuTc->getProcessPtr()->stack_base - 93 gpuTc->getProcessPtr()->max_stack_size); 94	93 assert(proc->mmap_end < proc->stack_base - proc->max_stack_size);
95 } 96 97 DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); 98	94 } 95 96 DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); 97
99 gpuTc->getProcessPtr()->allocateMem(start,length);	98 proc->allocateMem(start, length);
100 101 return start; 102} 103 104void 105Shader::init() 106{ 107 // grab the threadContext of the thread running on the CPU 108 assert(cpuPointer); 109 gpuTc = cpuPointer->getContext(0); 110 assert(gpuTc); 111} 112 113Shader::~Shader() 114{ 115 for (int j = 0; j < n_cu; ++j) 116 delete cuList[j]; 117} 118 119void 120Shader::updateThreadContext(int tid) { 121 // thread context of the thread which dispatched work 122 assert(cpuPointer); 123 gpuTc = cpuPointer->getContext(tid); 124 assert(gpuTc); 125} 126 127void 128Shader::hostWakeUp(BaseCPU cpu) { 129* if (cpuPointer == cpu) { 130 if (gpuTc->status() == ThreadContext::Suspended) 131 cpu->activateContext(gpuTc->threadId()); 132 } else { 133 //Make sure both dispatcher and shader are trying to 134 //wakeup same host. Hack here to enable kernel launch 135 //from multiple CPUs 136 panic("Dispatcher wants to wakeup a different host"); 137 } 138} 139 140Shader* 141ShaderParams::create() 142{ 143 return new Shader(this); 144} 145 146void 147Shader::exec() 148{ 149 tick_cnt = curTick(); 150 box_tick_cnt = curTick() - start_tick_cnt; 151 152 // apply any scheduled adds 153 for (int i = 0; i < sa_n; ++i) { 154 if (sa_when[i] <= tick_cnt) { 155 sa_val[i] += sa_x[i]; 156* sa_val.erase(sa_val.begin() + i); 157 sa_x.erase(sa_x.begin() + i); 158 sa_when.erase(sa_when.begin() + i); 159 --sa_n; 160 --i; 161 } 162 } 163 164 // clock all of the cu's 165 for (int i = 0; i < n_cu; ++i) 166 cuList[i]->exec(); 167} 168 169bool 170Shader::dispatch_workgroups(NDRange ndr) 171{ 172* bool scheduledSomething = false; 173 int cuCount = 0; 174 int curCu = nextSchedCu; 175 176 while (cuCount < n_cu) { 177 //Every time we try a CU, update nextSchedCu 178 nextSchedCu = (nextSchedCu + 1) % n_cu; 179 180 // dispatch workgroup iff the following two conditions are met: 181 // (a) wg_rem is true - there are unassigned workgroups in the grid 182 // (b) there are enough free slots in cu cuList[i] for this wg 183 if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { 184 scheduledSomething = true; 185 DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); 186 187 // ticks() member function translates cycles to simulation ticks. 188 if (!tickEvent.scheduled()) { 189 schedule(tickEvent, curTick() + this->ticks(1)); 190 } 191 192 cuList[curCu]->StartWorkgroup(ndr); 193 ndr->wgId[0]++; 194 ndr->globalWgId++; 195 if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { 196 ndr->wgId[0] = 0; 197 ndr->wgId[1]++; 198 199 if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { 200 ndr->wgId[1] = 0; 201 ndr->wgId[2]++; 202 203 if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { 204 ndr->wg_disp_rem = false; 205 break; 206 } 207 } 208 } 209 } 210 211 ++cuCount; 212 curCu = nextSchedCu; 213 } 214 215 return scheduledSomething; 216} 217 218void 219Shader::handshake(GpuDispatcher _dispatcher) 220{ 221* dispatcher = _dispatcher; 222} 223 224void 225Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void data, 226* bool suppress_func_errors, int cu_id) 227{ 228 unsigned block_size = RubySystem::getBlockSizeBytes(); 229 unsigned size = req->getSize(); 230 231 Addr tmp_addr; 232 BaseTLB::Mode trans_mode; 233 234 if (cmd == MemCmd::ReadReq) { 235 trans_mode = BaseTLB::Read; 236 } else if (cmd == MemCmd::WriteReq) { 237 trans_mode = BaseTLB::Write; 238 } else { 239 fatal("unexcepted MemCmd\n"); 240 } 241 242 tmp_addr = req->getVaddr(); 243 Addr split_addr = roundDown(tmp_addr + size - 1, block_size); 244 245 assert(split_addr <= tmp_addr \|\| split_addr - tmp_addr < block_size); 246 247 // Misaligned access 248 if (split_addr > tmp_addr) { 249 RequestPtr req1, req2; 250 req->splitOnVaddr(split_addr, req1, req2); 251 252 253 PacketPtr pkt1 = new Packet(req2, cmd); 254 PacketPtr pkt2 = new Packet(req1, cmd); 255 256 functionalTLBAccess(pkt1, cu_id, trans_mode); 257 functionalTLBAccess(pkt2, cu_id, trans_mode); 258 259 PacketPtr new_pkt1 = new Packet(pkt1->req, cmd); 260 PacketPtr new_pkt2 = new Packet(pkt2->req, cmd); 261 262 new_pkt1->dataStatic(data); 263 new_pkt2->dataStatic((uint8_t)data + req1->getSize()); 264* 265 if (suppress_func_errors) { 266 new_pkt1->setSuppressFuncError(); 267 new_pkt2->setSuppressFuncError(); 268 } 269 270 // fixme: this should be cuList[cu_id] if cu_id != n_cu 271 // The latter requires a memPort in the dispatcher 272 cuList[0]->memPort[0]->sendFunctional(new_pkt1); 273 cuList[0]->memPort[0]->sendFunctional(new_pkt2); 274 275 delete new_pkt1; 276 delete new_pkt2; 277 delete pkt1; 278 delete pkt2; 279 } else { 280 PacketPtr pkt = new Packet(req, cmd); 281 functionalTLBAccess(pkt, cu_id, trans_mode); 282 PacketPtr new_pkt = new Packet(pkt->req, cmd); 283 new_pkt->dataStatic(data); 284 285 if (suppress_func_errors) { 286 new_pkt->setSuppressFuncError(); 287 }; 288 289 // fixme: this should be cuList[cu_id] if cu_id != n_cu 290 // The latter requires a memPort in the dispatcher 291 cuList[0]->memPort[0]->sendFunctional(new_pkt); 292 293 delete new_pkt; 294 delete pkt; 295 } 296} 297 298bool 299Shader::busy() 300{ 301 for (int i_cu = 0; i_cu < n_cu; ++i_cu) { 302 if (!cuList[i_cu]->isDone()) { 303 return true; 304 } 305 } 306 307 return false; 308} 309 310void 311Shader::ScheduleAdd(uint32_t val,Tick when,int x) 312{ 313* sa_val.push_back(val); 314 sa_when.push_back(tick_cnt + when); 315 sa_x.push_back(x); 316 ++sa_n; 317} 318 319Shader::TickEvent::TickEvent(Shader _shader) 320* : Event(CPU_Tick_Pri), shader(_shader) 321{ 322} 323 324 325void 326Shader::TickEvent::process() 327{ 328 if (shader->busy()) { 329 shader->exec(); 330 shader->schedule(this, curTick() + shader->ticks(1)); 331 } 332} 333 334const char* 335Shader::TickEvent::description() const 336{ 337 return "Shader tick"; 338} 339 340void 341Shader::AccessMem(uint64_t address, void ptr, uint32_t size, int cu_id, 342* MemCmd cmd, bool suppress_func_errors) 343{ 344 uint8_t data_buf = (uint8_t)ptr; 345 346 for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes()); 347 !gen.done(); gen.next()) { 348 Request req = new Request(0, gen.addr(), gen.size(), 0, 349* cuList[0]->masterId(), 0, 0, 0); 350 351 doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id); 352 data_buf += gen.size(); 353 delete req; 354 } 355} 356 357void 358Shader::ReadMem(uint64_t address, void ptr, uint32_t size, int cu_id) 359{ 360* AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false); 361} 362 363void 364Shader::ReadMem(uint64_t address, void ptr, uint32_t size, int cu_id, 365* bool suppress_func_errors) 366{ 367 AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); 368} 369 370void 371Shader::WriteMem(uint64_t address, void ptr,uint32_t size, int cu_id) 372{ 373* AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false); 374} 375 376void 377Shader::WriteMem(uint64_t address, void ptr, uint32_t size, int cu_id, 378* bool suppress_func_errors) 379{ 380 AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, 381 suppress_func_errors); 382} 383 384/* 385 * Send a packet through the appropriate TLB functional port. 386 * If cu_id=n_cu, then this is the dispatcher's TLB. 387 * Otherwise it's the TLB of the cu_id compute unit. 388 / 389void 390Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) 391{ 392* // update senderState. Need to know the gpuTc and the TLB mode 393 pkt->senderState = 394 new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); 395 396 if (cu_id == n_cu) { 397 dispatcher->tlbPort->sendFunctional(pkt); 398 } else { 399 // even when the perLaneTLB flag is turned on 400 // it's ok tp send all accesses through lane 0 401 // since the lane # is not known here, 402 // This isn't important since these are functional accesses. 403 cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); 404 } 405 406 /* safe_cast the senderState / 407* TheISA::GpuTLB::TranslationState sender_state = 408* safe_cast<TheISA::GpuTLB::TranslationState>(pkt->senderState); 409* 410 delete sender_state->tlbEntry; 411 delete pkt->senderState; 412}	99 100 return start; 101} 102 103void 104Shader::init() 105{ 106 // grab the threadContext of the thread running on the CPU 107 assert(cpuPointer); 108 gpuTc = cpuPointer->getContext(0); 109 assert(gpuTc); 110} 111 112Shader::~Shader() 113{ 114 for (int j = 0; j < n_cu; ++j) 115 delete cuList[j]; 116} 117 118void 119Shader::updateThreadContext(int tid) { 120 // thread context of the thread which dispatched work 121 assert(cpuPointer); 122 gpuTc = cpuPointer->getContext(tid); 123 assert(gpuTc); 124} 125 126void 127Shader::hostWakeUp(BaseCPU cpu) { 128* if (cpuPointer == cpu) { 129 if (gpuTc->status() == ThreadContext::Suspended) 130 cpu->activateContext(gpuTc->threadId()); 131 } else { 132 //Make sure both dispatcher and shader are trying to 133 //wakeup same host. Hack here to enable kernel launch 134 //from multiple CPUs 135 panic("Dispatcher wants to wakeup a different host"); 136 } 137} 138 139Shader* 140ShaderParams::create() 141{ 142 return new Shader(this); 143} 144 145void 146Shader::exec() 147{ 148 tick_cnt = curTick(); 149 box_tick_cnt = curTick() - start_tick_cnt; 150 151 // apply any scheduled adds 152 for (int i = 0; i < sa_n; ++i) { 153 if (sa_when[i] <= tick_cnt) { 154 sa_val[i] += sa_x[i]; 155* sa_val.erase(sa_val.begin() + i); 156 sa_x.erase(sa_x.begin() + i); 157 sa_when.erase(sa_when.begin() + i); 158 --sa_n; 159 --i; 160 } 161 } 162 163 // clock all of the cu's 164 for (int i = 0; i < n_cu; ++i) 165 cuList[i]->exec(); 166} 167 168bool 169Shader::dispatch_workgroups(NDRange ndr) 170{ 171* bool scheduledSomething = false; 172 int cuCount = 0; 173 int curCu = nextSchedCu; 174 175 while (cuCount < n_cu) { 176 //Every time we try a CU, update nextSchedCu 177 nextSchedCu = (nextSchedCu + 1) % n_cu; 178 179 // dispatch workgroup iff the following two conditions are met: 180 // (a) wg_rem is true - there are unassigned workgroups in the grid 181 // (b) there are enough free slots in cu cuList[i] for this wg 182 if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { 183 scheduledSomething = true; 184 DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); 185 186 // ticks() member function translates cycles to simulation ticks. 187 if (!tickEvent.scheduled()) { 188 schedule(tickEvent, curTick() + this->ticks(1)); 189 } 190 191 cuList[curCu]->StartWorkgroup(ndr); 192 ndr->wgId[0]++; 193 ndr->globalWgId++; 194 if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { 195 ndr->wgId[0] = 0; 196 ndr->wgId[1]++; 197 198 if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { 199 ndr->wgId[1] = 0; 200 ndr->wgId[2]++; 201 202 if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { 203 ndr->wg_disp_rem = false; 204 break; 205 } 206 } 207 } 208 } 209 210 ++cuCount; 211 curCu = nextSchedCu; 212 } 213 214 return scheduledSomething; 215} 216 217void 218Shader::handshake(GpuDispatcher _dispatcher) 219{ 220* dispatcher = _dispatcher; 221} 222 223void 224Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void data, 225* bool suppress_func_errors, int cu_id) 226{ 227 unsigned block_size = RubySystem::getBlockSizeBytes(); 228 unsigned size = req->getSize(); 229 230 Addr tmp_addr; 231 BaseTLB::Mode trans_mode; 232 233 if (cmd == MemCmd::ReadReq) { 234 trans_mode = BaseTLB::Read; 235 } else if (cmd == MemCmd::WriteReq) { 236 trans_mode = BaseTLB::Write; 237 } else { 238 fatal("unexcepted MemCmd\n"); 239 } 240 241 tmp_addr = req->getVaddr(); 242 Addr split_addr = roundDown(tmp_addr + size - 1, block_size); 243 244 assert(split_addr <= tmp_addr \|\| split_addr - tmp_addr < block_size); 245 246 // Misaligned access 247 if (split_addr > tmp_addr) { 248 RequestPtr req1, req2; 249 req->splitOnVaddr(split_addr, req1, req2); 250 251 252 PacketPtr pkt1 = new Packet(req2, cmd); 253 PacketPtr pkt2 = new Packet(req1, cmd); 254 255 functionalTLBAccess(pkt1, cu_id, trans_mode); 256 functionalTLBAccess(pkt2, cu_id, trans_mode); 257 258 PacketPtr new_pkt1 = new Packet(pkt1->req, cmd); 259 PacketPtr new_pkt2 = new Packet(pkt2->req, cmd); 260 261 new_pkt1->dataStatic(data); 262 new_pkt2->dataStatic((uint8_t)data + req1->getSize()); 263* 264 if (suppress_func_errors) { 265 new_pkt1->setSuppressFuncError(); 266 new_pkt2->setSuppressFuncError(); 267 } 268 269 // fixme: this should be cuList[cu_id] if cu_id != n_cu 270 // The latter requires a memPort in the dispatcher 271 cuList[0]->memPort[0]->sendFunctional(new_pkt1); 272 cuList[0]->memPort[0]->sendFunctional(new_pkt2); 273 274 delete new_pkt1; 275 delete new_pkt2; 276 delete pkt1; 277 delete pkt2; 278 } else { 279 PacketPtr pkt = new Packet(req, cmd); 280 functionalTLBAccess(pkt, cu_id, trans_mode); 281 PacketPtr new_pkt = new Packet(pkt->req, cmd); 282 new_pkt->dataStatic(data); 283 284 if (suppress_func_errors) { 285 new_pkt->setSuppressFuncError(); 286 }; 287 288 // fixme: this should be cuList[cu_id] if cu_id != n_cu 289 // The latter requires a memPort in the dispatcher 290 cuList[0]->memPort[0]->sendFunctional(new_pkt); 291 292 delete new_pkt; 293 delete pkt; 294 } 295} 296 297bool 298Shader::busy() 299{ 300 for (int i_cu = 0; i_cu < n_cu; ++i_cu) { 301 if (!cuList[i_cu]->isDone()) { 302 return true; 303 } 304 } 305 306 return false; 307} 308 309void 310Shader::ScheduleAdd(uint32_t val,Tick when,int x) 311{ 312* sa_val.push_back(val); 313 sa_when.push_back(tick_cnt + when); 314 sa_x.push_back(x); 315 ++sa_n; 316} 317 318Shader::TickEvent::TickEvent(Shader _shader) 319* : Event(CPU_Tick_Pri), shader(_shader) 320{ 321} 322 323 324void 325Shader::TickEvent::process() 326{ 327 if (shader->busy()) { 328 shader->exec(); 329 shader->schedule(this, curTick() + shader->ticks(1)); 330 } 331} 332 333const char* 334Shader::TickEvent::description() const 335{ 336 return "Shader tick"; 337} 338 339void 340Shader::AccessMem(uint64_t address, void ptr, uint32_t size, int cu_id, 341* MemCmd cmd, bool suppress_func_errors) 342{ 343 uint8_t data_buf = (uint8_t)ptr; 344 345 for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes()); 346 !gen.done(); gen.next()) { 347 Request req = new Request(0, gen.addr(), gen.size(), 0, 348* cuList[0]->masterId(), 0, 0, 0); 349 350 doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id); 351 data_buf += gen.size(); 352 delete req; 353 } 354} 355 356void 357Shader::ReadMem(uint64_t address, void ptr, uint32_t size, int cu_id) 358{ 359* AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false); 360} 361 362void 363Shader::ReadMem(uint64_t address, void ptr, uint32_t size, int cu_id, 364* bool suppress_func_errors) 365{ 366 AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); 367} 368 369void 370Shader::WriteMem(uint64_t address, void ptr,uint32_t size, int cu_id) 371{ 372* AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false); 373} 374 375void 376Shader::WriteMem(uint64_t address, void ptr, uint32_t size, int cu_id, 377* bool suppress_func_errors) 378{ 379 AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, 380 suppress_func_errors); 381} 382 383/* 384 * Send a packet through the appropriate TLB functional port. 385 * If cu_id=n_cu, then this is the dispatcher's TLB. 386 * Otherwise it's the TLB of the cu_id compute unit. 387 / 388void 389Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) 390{ 391* // update senderState. Need to know the gpuTc and the TLB mode 392 pkt->senderState = 393 new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); 394 395 if (cu_id == n_cu) { 396 dispatcher->tlbPort->sendFunctional(pkt); 397 } else { 398 // even when the perLaneTLB flag is turned on 399 // it's ok tp send all accesses through lane 0 400 // since the lane # is not known here, 401 // This isn't important since these are functional accesses. 402 cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); 403 } 404 405 /* safe_cast the senderState / 406* TheISA::GpuTLB::TranslationState sender_state = 407* safe_cast<TheISA::GpuTLB::TranslationState>(pkt->senderState); 408* 409 delete sender_state->tlbEntry; 410 delete pkt->senderState; 411}