shader.cc revision 11386:94c09b607a84
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

Shader::Shader(const Params *p) : SimObject(p),
    clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
    cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
    hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
    separate_acquire_release(p->separate_acquire_release), coissue_return(1),
    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
    globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
    box_tick_cnt(0), start_tick_cnt(0)
{

    cuList.resize(n_cu);

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
    }
}

Addr
Shader::mmap(int length)
{

    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();

    if (proc->mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN");
        start = proc->mmap_end - length;
        proc->mmap_end = start;
    } else {
        DPRINTF(HSAIL, "GROWS UP");
        start = proc->mmap_end;
        proc->mmap_end += length;

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(proc->mmap_end < proc->stack_base - proc->max_stack_size);
    }

    DPRINTF(HSAIL, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateThreadContext(int tid) {
    // thread context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(tid);
    assert(gpuTc);
}

void
Shader::hostWakeUp(BaseCPU *cpu) {
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure both the dispatcher and the shader are trying to wake up
        // the same host. Hack here to enable kernel launches from multiple
        // CPUs.
        panic("Dispatcher wants to wakeup a different host");
    }
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}

void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the CUs
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}

bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch a workgroup iff the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cuList[curCu] for this wg
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}

void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    unsigned block_size = RubySystem::getBlockSizeBytes();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // Misaligned access: split the request at the block boundary so that
    // each fragment touches a single cache block.
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        // req1 covers [vaddr, split_addr) and req2 covers the remainder,
        // so pair them with the matching halves of the data buffer below.
        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}

// Schedule a deferred add: *val will be incremented by x once 'when' ticks
// have elapsed (applied lazily in Shader::exec()).
void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}

Shader::TickEvent::TickEvent(Shader *_shader)
    : Event(CPU_Tick_Pri), shader(_shader)
{
}

void
Shader::TickEvent::process()
{
    if (shader->busy()) {
        shader->exec();
        shader->schedule(this, curTick() + shader->ticks(1));
    }
}

const char*
Shader::TickEvent::description() const
{
    return "Shader tick";
}

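/*
 * Functional access helper used by the ReadMem/WriteMem wrappers below:
 * walks the [address, address + size) range in Ruby cache-block-sized
 * chunks and issues a functional read or write for each chunk through
 * doFunctionalAccess(), advancing the host-side data buffer as it goes.
 */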
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
         !gen.done(); gen.next()) {
        Request *req = new Request(0, gen.addr(), gen.size(), 0,
                                   cuList[0]->masterId(), 0, 0, 0);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
        delete req;
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id == n_cu, this is the dispatcher's TLB.
 * Otherwise it is the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // Even when the perLaneTLB flag is turned on, it is ok to send all
        // accesses through lane 0, since the lane # is not known here.
        // This isn't important since these are functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}
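
The misaligned-access check in Shader::doFunctionalAccess() above hinges on roundDown(tmp_addr + size - 1, block_size): if the last byte of the access rounds down to a block boundary that lies beyond the first byte, the access straddles two Ruby blocks and is split at that boundary. Below is a minimal standalone sketch of that arithmetic, not part of shader.cc; roundDown is reimplemented here for illustration, and the address, size, and 64-byte block size are assumed example values.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Same result as gem5's roundDown() for power-of-two alignments.
static uint64_t roundDown(uint64_t val, uint64_t align)
{
    return val & ~(align - 1);
}

int main()
{
    const uint64_t block_size = 64; // stand-in for RubySystem::getBlockSizeBytes()

    // Example: an 8-byte access starting 4 bytes before a block boundary.
    uint64_t tmp_addr = 0x1003c;
    uint64_t size = 8;
    uint64_t split_addr = roundDown(tmp_addr + size - 1, block_size);

    // Mirrors the assertion in Shader::doFunctionalAccess().
    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    if (split_addr > tmp_addr) {
        // Fragment 1 covers [tmp_addr, split_addr), fragment 2 covers
        // [split_addr, tmp_addr + size) -- the roles of req1 and req2 above.
        std::printf("split: %lu bytes + %lu bytes\n",
                    (unsigned long)(split_addr - tmp_addr),
                    (unsigned long)(tmp_addr + size - split_addr));
    } else {
        std::printf("access fits within a single %lu-byte block\n",
                    (unsigned long)block_size);
    }
    return 0;
}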