/*
 * Copyright (c) 2002-2005 The Regents of The University of Michigan
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Steve Reinhardt
 */

#include "arch/locked_mem.hh"
#include "arch/mmaped_ipr.hh"
#include "arch/utility.hh"
#include "base/bigint.hh"
#include "config/the_isa.hh"
#include "cpu/exetrace.hh"
#include "cpu/simple/atomic.hh"
#include "mem/packet.hh"
#include "mem/packet_access.hh"
#include "params/AtomicSimpleCPU.hh"
#include "sim/system.hh"

using namespace std;
using namespace TheISA;

AtomicSimpleCPU::TickEvent::TickEvent(AtomicSimpleCPU *c)
    : Event(CPU_Tick_Pri), cpu(c)
{
}

void
AtomicSimpleCPU::TickEvent::process()
{
    cpu->tick();
}

const char *
AtomicSimpleCPU::TickEvent::description() const
{
    return "AtomicSimpleCPU tick";
}

Port *
AtomicSimpleCPU::getPort(const string &if_name, int idx)
{
    if (if_name == "dcache_port")
        return &dcachePort;
    else if (if_name == "icache_port")
        return &icachePort;
    else if (if_name == "physmem_port") {
        hasPhysMemPort = true;
        return &physmemPort;
    }
    else
        panic("No Such Port\n");
}

void
AtomicSimpleCPU::init()
{
    BaseCPU::init();
#if FULL_SYSTEM
    ThreadID size = threadContexts.size();
    for (ThreadID i = 0; i < size; ++i) {
        ThreadContext *tc = threadContexts[i];

        // initialize CPU, including PC
        TheISA::initCPU(tc, tc->contextId());
    }
#endif
    if (hasPhysMemPort) {
        bool snoop = false;
        AddrRangeList pmAddrList;
        physmemPort.getPeerAddressRanges(pmAddrList, snoop);
        physMemAddr = *pmAddrList.begin();
    }
    // Atomic doesn't do MT right now, so contextId == threadId
    ifetch_req.setThreadContext(_cpuId, 0); // Add thread ID if we add MT
    data_read_req.setThreadContext(_cpuId, 0); // Add thread ID here too
    data_write_req.setThreadContext(_cpuId, 0); // Add thread ID here too
}
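
// The atomic CPU drives all of its memory traffic itself through
// sendAtomic(), so its ports should never see inbound timing traffic.
// The callbacks below therefore either panic (timing/retry/status) or
// degenerate to no-ops (atomic snoops and functional accesses).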
bool
AtomicSimpleCPU::CpuPort::recvTiming(PacketPtr pkt)
{
    panic("AtomicSimpleCPU doesn't expect recvTiming callback!");
    return true;
}

Tick
AtomicSimpleCPU::CpuPort::recvAtomic(PacketPtr pkt)
{
    //Snooping a coherence request, just return
    return 0;
}

void
AtomicSimpleCPU::CpuPort::recvFunctional(PacketPtr pkt)
{
    //No internal storage to update, just return
    return;
}

void
AtomicSimpleCPU::CpuPort::recvStatusChange(Status status)
{
    if (status == RangeChange) {
        // Forward the initial address-range change exactly once so our
        // peer learns about us; any repeats are ignored.
        if (!snoopRangeSent) {
            snoopRangeSent = true;
            sendStatusChange(Port::RangeChange);
        }
        return;
    }

    panic("AtomicSimpleCPU doesn't expect recvStatusChange callback!");
}

void
AtomicSimpleCPU::CpuPort::recvRetry()
{
    panic("AtomicSimpleCPU doesn't expect recvRetry callback!");
}

void
AtomicSimpleCPU::DcachePort::setPeer(Port *port)
{
    Port::setPeer(port);

#if FULL_SYSTEM
    // Update the ThreadContext's memory ports (Functional/Virtual
    // Ports)
    cpu->tcBase()->connectMemPorts(cpu->tcBase());
#endif
}

AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p)
    : BaseSimpleCPU(p), tickEvent(this), width(p->width), locked(false),
      simulate_data_stalls(p->simulate_data_stalls),
      simulate_inst_stalls(p->simulate_inst_stalls),
      icachePort(name() + "-iport", this),
      dcachePort(name() + "-dport", this),
      physmemPort(name() + "-pport", this),
      hasPhysMemPort(false)
{
    _status = Idle;

    icachePort.snoopRangeSent = false;
    dcachePort.snoopRangeSent = false;
}

AtomicSimpleCPU::~AtomicSimpleCPU()
{
}

void
AtomicSimpleCPU::serialize(ostream &os)
{
    SimObject::State so_state = SimObject::getState();
    SERIALIZE_ENUM(so_state);
    SERIALIZE_SCALAR(locked);
    BaseSimpleCPU::serialize(os);
    nameOut(os, csprintf("%s.tickEvent", name()));
    tickEvent.serialize(os);
}

void
AtomicSimpleCPU::unserialize(Checkpoint *cp, const string &section)
{
    SimObject::State so_state;
    UNSERIALIZE_ENUM(so_state);
    UNSERIALIZE_SCALAR(locked);
    BaseSimpleCPU::unserialize(cp, section);
    tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
}

void
AtomicSimpleCPU::resume()
{
    if (_status == Idle || _status == SwitchedOut)
        return;

    DPRINTF(SimpleCPU, "Resume\n");
    assert(system->getMemoryMode() == Enums::atomic);

    changeState(SimObject::Running);
    if (thread->status() == ThreadContext::Active) {
        if (!tickEvent.scheduled())
            schedule(tickEvent, nextCycle());
    }
}

void
AtomicSimpleCPU::switchOut()
{
    assert(_status == Running || _status == Idle);
    _status = SwitchedOut;

    tickEvent.squash();
}

void
AtomicSimpleCPU::takeOverFrom(BaseCPU *oldCPU)
{
    BaseCPU::takeOverFrom(oldCPU, &icachePort, &dcachePort);

    assert(!tickEvent.scheduled());

    // if any of this CPU's ThreadContexts are active, mark the CPU as
    // running and schedule its tick event.
    ThreadID size = threadContexts.size();
    for (ThreadID i = 0; i < size; ++i) {
        ThreadContext *tc = threadContexts[i];
        if (tc->status() == ThreadContext::Active && _status != Running) {
            _status = Running;
            schedule(tickEvent, nextCycle());
            break;
        }
    }
    if (_status != Running) {
        _status = Idle;
    }
    assert(threadContexts.size() == 1);
    // This CPU's ID may differ from the old CPU's, so re-initialize the
    // cached requests just as init() does.
    ifetch_req.setThreadContext(_cpuId, 0); // Add thread ID if we add MT
    data_read_req.setThreadContext(_cpuId, 0); // Add thread ID here too
    data_write_req.setThreadContext(_cpuId, 0); // Add thread ID here too
}

void
AtomicSimpleCPU::activateContext(int thread_num, int delay)
{
    DPRINTF(SimpleCPU, "ActivateContext %d (%d cycles)\n", thread_num, delay);

    assert(thread_num == 0);
    assert(thread);

    assert(_status == Idle);
    assert(!tickEvent.scheduled());

    notIdleFraction++;
    numCycles += tickToCycles(thread->lastActivate - thread->lastSuspend);

    //Make sure ticks are still on multiples of cycles
    schedule(tickEvent, nextCycle(curTick + ticks(delay)));
    _status = Running;
}

void
AtomicSimpleCPU::suspendContext(int thread_num)
{
    DPRINTF(SimpleCPU, "SuspendContext %d\n", thread_num);

    assert(thread_num == 0);
    assert(thread);

    if (_status == Idle)
        return;

    assert(_status == Running);

    // tick event may not be scheduled if this gets called from inside
    // an instruction's execution, e.g. "quiesce"
    if (tickEvent.scheduled())
        deschedule(tickEvent);

    notIdleFraction--;
    _status = Idle;
}
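
// read() below performs an access in at most two pieces: if the request
// crosses a cache-line boundary, the first loop iteration covers the
// bytes up to the end of the line and the second covers the remainder.
// Illustrative arithmetic (assuming 64-byte lines): an 8-byte read at
// addr 0x3e gives secondAddr = roundDown(0x3e + 8 - 1, 64) = 0x40, so
// the first piece is 2 bytes at 0x3e and the second is 6 bytes at 0x40.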
template <class T>
Fault
AtomicSimpleCPU::read(Addr addr, T &data, unsigned flags)
{
    // use the CPU's statically allocated read request and packet objects
    Request *req = &data_read_req;

    if (traceData) {
        traceData->setAddr(addr);
    }

    //The block size of our peer.
    unsigned blockSize = dcachePort.peerBlockSize();
    //The size of the data we're trying to read.
    int dataSize = sizeof(T);

    uint8_t *dataPtr = (uint8_t *)&data;

    //The address of the second part of this access if it needs to be split
    //across a cache line boundary.
    Addr secondAddr = roundDown(addr + dataSize - 1, blockSize);

    if (secondAddr > addr)
        dataSize = secondAddr - addr;

    dcache_latency = 0;

    while (1) {
        req->setVirt(0, addr, dataSize, flags, thread->readPC());

        // translate to physical address
        Fault fault = thread->dtb->translateAtomic(req, tc, BaseTLB::Read);

        // Now do the access.
        if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) {
            Packet pkt = Packet(req,
                    req->isLLSC() ? MemCmd::LoadLockedReq : MemCmd::ReadReq,
                    Packet::Broadcast);
            pkt.dataStatic(dataPtr);

            if (req->isMmapedIpr())
                dcache_latency += TheISA::handleIprRead(thread->getTC(), &pkt);
            else {
                if (hasPhysMemPort && pkt.getAddr() == physMemAddr)
                    dcache_latency += physmemPort.sendAtomic(&pkt);
                else
                    dcache_latency += dcachePort.sendAtomic(&pkt);
            }
            dcache_access = true;

            assert(!pkt.isError());

            if (req->isLLSC()) {
                TheISA::handleLockedRead(thread, req);
            }
        }

        // This will need a new way to tell if it has a dcache attached.
        if (req->isUncacheable())
            recordEvent("Uncached Read");

        //If there's a fault, return it
        if (fault != NoFault) {
            if (req->isPrefetch()) {
                return NoFault;
            } else {
                return fault;
            }
        }

        //If we don't need to access a second cache line, stop now.
        if (secondAddr <= addr) {
            data = gtoh(data);
            if (traceData) {
                traceData->setData(data);
            }
            if (req->isLocked() && fault == NoFault) {
                assert(!locked);
                locked = true;
            }
            return fault;
        }

        /*
         * Set up for accessing the second cache line.
         */

        //Move the pointer we're reading into to the correct location.
        dataPtr += dataSize;
        //Adjust the size to get the remaining bytes.
        dataSize = addr + sizeof(T) - secondAddr;
        //And access the right address.
        addr = secondAddr;
    }
}
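
// Explicit instantiations for every access width the ISA-independent
// code may request; the guard below only hides this boilerplate from
// Doxygen. The float/double/int32_t specializations funnel into the
// unsigned-integer versions of the same width.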
#ifndef DOXYGEN_SHOULD_SKIP_THIS

template
Fault
AtomicSimpleCPU::read(Addr addr, Twin32_t &data, unsigned flags);

template
Fault
AtomicSimpleCPU::read(Addr addr, Twin64_t &data, unsigned flags);

template
Fault
AtomicSimpleCPU::read(Addr addr, uint64_t &data, unsigned flags);

template
Fault
AtomicSimpleCPU::read(Addr addr, uint32_t &data, unsigned flags);

template
Fault
AtomicSimpleCPU::read(Addr addr, uint16_t &data, unsigned flags);

template
Fault
AtomicSimpleCPU::read(Addr addr, uint8_t &data, unsigned flags);

#endif //DOXYGEN_SHOULD_SKIP_THIS

template<>
Fault
AtomicSimpleCPU::read(Addr addr, double &data, unsigned flags)
{
    return read(addr, *(uint64_t*)&data, flags);
}

template<>
Fault
AtomicSimpleCPU::read(Addr addr, float &data, unsigned flags)
{
    return read(addr, *(uint32_t*)&data, flags);
}

template<>
Fault
AtomicSimpleCPU::read(Addr addr, int32_t &data, unsigned flags)
{
    return read(addr, (uint32_t&)data, flags);
}

template <class T>
Fault
AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
{
    // use the CPU's statically allocated write request and packet objects
    Request *req = &data_write_req;

    if (traceData) {
        traceData->setAddr(addr);
    }

    //The block size of our peer.
    unsigned blockSize = dcachePort.peerBlockSize();
    //The size of the data we're trying to write.
    int dataSize = sizeof(T);

    uint8_t *dataPtr = (uint8_t *)&data;

    //The address of the second part of this access if it needs to be split
    //across a cache line boundary.
    Addr secondAddr = roundDown(addr + dataSize - 1, blockSize);

    if (secondAddr > addr)
        dataSize = secondAddr - addr;

    dcache_latency = 0;

    while (1) {
        req->setVirt(0, addr, dataSize, flags, thread->readPC());

        // translate to physical address
        Fault fault = thread->dtb->translateAtomic(req, tc, BaseTLB::Write);

        // Now do the access.
        if (fault == NoFault) {
            MemCmd cmd = MemCmd::WriteReq; // default
            bool do_access = true;  // flag to suppress cache access

            if (req->isLLSC()) {
                // store conditional: only access memory if the lock from
                // the matching load-locked is still intact
                cmd = MemCmd::StoreCondReq;
                do_access = TheISA::handleLockedWrite(thread, req);
            } else if (req->isSwap()) {
                cmd = MemCmd::SwapReq;
                if (req->isCondSwap()) {
                    assert(res);
                    req->setExtraData(*res);
                }
            }

            if (do_access && !req->getFlags().isSet(Request::NO_ACCESS)) {
                Packet pkt = Packet(req, cmd, Packet::Broadcast);
                pkt.dataStatic(dataPtr);

                if (req->isMmapedIpr()) {
                    dcache_latency +=
                        TheISA::handleIprWrite(thread->getTC(), &pkt);
                } else {
                    //XXX This needs to be outside of the loop in order to
                    //work properly for cache line boundary crossing
                    //accesses in transendian simulations.
                    data = htog(data);
                    if (hasPhysMemPort && pkt.getAddr() == physMemAddr)
                        dcache_latency += physmemPort.sendAtomic(&pkt);
                    else
                        dcache_latency += dcachePort.sendAtomic(&pkt);
                }
                dcache_access = true;
                assert(!pkt.isError());

                if (req->isSwap()) {
                    assert(res);
                    *res = pkt.get<T>();
                }
            }

            if (res && !req->isSwap()) {
                *res = req->getExtraData();
            }
        }

        // This will need a new way to tell if it's hooked up to a cache or not.
        if (req->isUncacheable())
            recordEvent("Uncached Write");

        //If there's a fault or we don't need to access a second cache line,
        //stop now.
        if (fault != NoFault || secondAddr <= addr) {
            // If the write needs to have a fault on the access, consider
            // calling changeStatus() and changing it to "bad addr write"
            // or something.
            if (traceData) {
                traceData->setData(gtoh(data));
            }
            if (req->isLocked() && fault == NoFault) {
                assert(locked);
                locked = false;
            }
            if (fault != NoFault && req->isPrefetch()) {
                return NoFault;
            } else {
                return fault;
            }
        }

        /*
         * Set up for accessing the second cache line.
         */

        //Move the pointer we're writing from to the correct location.
        dataPtr += dataSize;
        //Adjust the size to get the remaining bytes.
        dataSize = addr + sizeof(T) - secondAddr;
        //And access the right address.
        addr = secondAddr;
    }
}

#ifndef DOXYGEN_SHOULD_SKIP_THIS

template
Fault
AtomicSimpleCPU::write(Twin32_t data, Addr addr,
                       unsigned flags, uint64_t *res);

template
Fault
AtomicSimpleCPU::write(Twin64_t data, Addr addr,
                       unsigned flags, uint64_t *res);

template
Fault
AtomicSimpleCPU::write(uint64_t data, Addr addr,
                       unsigned flags, uint64_t *res);

template
Fault
AtomicSimpleCPU::write(uint32_t data, Addr addr,
                       unsigned flags, uint64_t *res);

template
Fault
AtomicSimpleCPU::write(uint16_t data, Addr addr,
                       unsigned flags, uint64_t *res);

template
Fault
AtomicSimpleCPU::write(uint8_t data, Addr addr,
                       unsigned flags, uint64_t *res);

#endif //DOXYGEN_SHOULD_SKIP_THIS

template<>
Fault
AtomicSimpleCPU::write(double data, Addr addr, unsigned flags, uint64_t *res)
{
    return write(*(uint64_t*)&data, addr, flags, res);
}

template<>
Fault
AtomicSimpleCPU::write(float data, Addr addr, unsigned flags, uint64_t *res)
{
    return write(*(uint32_t*)&data, addr, flags, res);
}

template<>
Fault
AtomicSimpleCPU::write(int32_t data, Addr addr, unsigned flags, uint64_t *res)
{
    return write((uint32_t)data, addr, flags, res);
}
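
// tick() is the whole execution model: each invocation fetches,
// executes, and retires up to 'width' instructions back-to-back
// (continuing past 'width' while an LL/SC lock is held), then schedules
// the next tick after the modeled latency: at least one cycle, plus any
// icache/dcache stall time the configuration asks us to simulate.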
void
AtomicSimpleCPU::tick()
{
    DPRINTF(SimpleCPU, "Tick\n");

    Tick latency = 0;

    for (int i = 0; i < width || locked; ++i) {
        numCycles++;

        if (!curStaticInst || !curStaticInst->isDelayedCommit())
            checkForInterrupts();

        checkPcEventQueue();

        Fault fault = NoFault;

        bool fromRom = isRomMicroPC(thread->readMicroPC());
        if (!fromRom && !curMacroStaticInst) {
            setupFetchRequest(&ifetch_req);
            fault = thread->itb->translateAtomic(&ifetch_req, tc,
                                                 BaseTLB::Execute);
        }

        if (fault == NoFault) {
            Tick icache_latency = 0;
            bool icache_access = false;
            dcache_access = false; // assume no dcache access

            if (!fromRom && !curMacroStaticInst) {
                // This is commented out because the predecoder would act like
                // a tiny cache otherwise. It wouldn't be flushed when needed
                // like the I cache. It should be flushed, and when that works
                // this code should be uncommented.
                //Fetch more instruction memory if necessary
                //if (predecoder.needMoreBytes())
                //{
                icache_access = true;
                Packet ifetch_pkt = Packet(&ifetch_req, MemCmd::ReadReq,
                                           Packet::Broadcast);
                ifetch_pkt.dataStatic(&inst);

                if (hasPhysMemPort && ifetch_pkt.getAddr() == physMemAddr)
                    icache_latency = physmemPort.sendAtomic(&ifetch_pkt);
                else
                    icache_latency = icachePort.sendAtomic(&ifetch_pkt);

                assert(!ifetch_pkt.isError());

                // ifetch_req is initialized to read the instruction directly
                // into the CPU object's inst field.
                //}
            }

            preExecute();

            if (curStaticInst) {
                fault = curStaticInst->execute(this, traceData);

                // keep an instruction count
                if (fault == NoFault)
                    countInst();
                else if (traceData) {
                    // If there was a fault, we shouldn't trace this
                    // instruction.
                    delete traceData;
                    traceData = NULL;
                }

                postExecute();
            }

            // @todo remove me after debugging with legion done
            if (curStaticInst && (!curStaticInst->isMicroop() ||
                        curStaticInst->isFirstMicroop()))
                instCnt++;
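
            // Charge any modeled stall time, rounded up to a whole
            // number of cycles so the tick event stays cycle-aligned.
            // Illustrative numbers: with a 500-tick clock, a 1200-tick
            // stall is charged as ticks(3) = 1500 ticks.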
            Tick stall_ticks = 0;
            if (simulate_inst_stalls && icache_access)
                stall_ticks += icache_latency;

            if (simulate_data_stalls && dcache_access)
                stall_ticks += dcache_latency;

            if (stall_ticks) {
                Tick stall_cycles = stall_ticks / ticks(1);
                Tick aligned_stall_ticks = ticks(stall_cycles);

                // round a partial cycle up to the next cycle boundary
                if (aligned_stall_ticks < stall_ticks)
                    aligned_stall_ticks += ticks(1);

                latency += aligned_stall_ticks;
            }
        }

        if (fault != NoFault || !stayAtPC)
            advancePC(fault);
    }

    // instruction takes at least one cycle
    if (latency < ticks(1))
        latency = ticks(1);

    if (_status != Idle)
        schedule(tickEvent, curTick + latency);
}

void
AtomicSimpleCPU::printAddr(Addr a)
{
    dcachePort.printAddr(a);
}

////////////////////////////////////////////////////////////////////////
//
//  AtomicSimpleCPU Simulation Object
//
AtomicSimpleCPU *
AtomicSimpleCPUParams::create()
{
    // this model is single-threaded, so syscall-emulation builds must
    // supply exactly one workload process
    numThreads = 1;
#if !FULL_SYSTEM
    if (workload.size() != 1)
        panic("only one workload allowed");
#endif
    return new AtomicSimpleCPU(this);
}