PerformanceModel::queueInstruction()
はひたすらm_instruction_queue
に命令をPushしていく。ENABLE_PERF_MODEL_OWN_THREAD
は定義されていない気がする。
// Hand a dynamic instruction to the performance model for later timing
// simulation by iterate(). Ownership of `ins` transfers to this object:
// it is either queued or deleted immediately.
void PerformanceModel::queueInstruction(DynamicInstruction *ins)
{
   const bool discard = m_fastforward || !m_enabled;
   if (discard)
   {
      // Some threads may not have switched instrumentation mode yet even though we have left ROI
      // Ignore the instructions they send to avoid overflowing buffers
      delete ins;
      return;
   }
#ifdef ENABLE_PERF_MODEL_OWN_THREAD
   // Dedicated performance-model thread: block if the queue is full.
   m_instruction_queue.push_wait(ins);
#else
   m_instruction_queue.push(ins);
#endif
}
いよいよiterate()
によりシミュレーションを行う。
PerformanceModel::iterate()
では、handleInstruction()
によりm_instruction_queue
から命令を取り出して実行する。これはROBなわけではないので、命令を取り出してCPUに流し込んでいくだけだと思う。
void PerformanceModel::iterate() { while (m_instruction_queue.size() > 0) { // While the functional thread is waiting because of clock skew minimization, wait here as well #ifdef ENABLE_PERF_MODEL_OWN_THREAD while(m_hold) sched_yield(); #endif DynamicInstruction *ins = m_instruction_queue.front(); LOG_ASSERT_ERROR(!ins->instruction->isIdle(), "Idle instructions should not make it here!"); if (!m_fastforward && m_enabled) handleInstruction(ins); delete ins; m_instruction_queue.pop(); } synchronize(); }
おそらくhandleInstruction()
の呼び出し先はMicroOpPerformanceModel::handleInstruction()
だと思う。
handleInstruction()
では、まず命令のMicroOpの展開を行っている?
// Timing-simulate one dynamic instruction: first expand the static
// instruction's micro-op templates into per-execution DynamicMicroOps.
// (Excerpt: the function continues beyond this fragment.)
void MicroOpPerformanceModel::handleInstruction(DynamicInstruction *dynins)
{
   /* ... intermediate code omitted ... */
   // If the instruction was decoded into micro-ops, clone each static
   // MicroOp into a DynamicMicroOp (priced at insn_period) and collect
   // them in m_current_uops for this instruction.
   if (dynins->instruction->getMicroOps())
   {
      for(std::vector<const MicroOp*>::const_iterator it = dynins->instruction->getMicroOps()->begin(); it != dynins->instruction->getMicroOps()->end(); it++)
      {
         m_current_uops.push_back(m_core_model->createDynamicMicroOp(m_allocator, *it, insn_period));
      }
   }
各uopに応じて、exec_base_index / store_base_index / load_base_index
の計算を行っているが、これはよくわからない。
試してみると、これはmicroopのなかでどの操作がどの位置で初めて登場するかを記録しているらしい。
MicroOpPerformanceModel::handleInstruction() MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called exec_base_index = 16 load_base_index = 0 store_base_index = -1
つぎにキャッシュ操作について、複数のメモリアクセスが1つの命令内で存在している場合、これをSquashするようなコードが埋め込まれているらしい。
// If we haven't gotten all of our read or write data yet, iterate over the operands for (size_t i = 0 ; i < ops.size() ; ++i) { const Operand &o = ops[i]; if (o.m_type == Operand::MEMORY) { LOG_ASSERT_ERROR(dynins->num_memory > memidx, "Did not get enough memory_info objects"); DynamicInstruction::MemoryInfo &info = dynins->memory_info[memidx++]; LOG_ASSERT_ERROR(info.dir == o.m_direction, "Expected memory %d info, got: %d.", o.m_direction, info.dir); // Because the interval model is currently in cycles, convert the data to cycles here before using it // Force the latencies into cycles for use in the original interval model // FIXME Update the Interval Timer to use SubsecondTime UInt64 memory_cycle_latency = SubsecondTime::divideRounded(info.latency, insn_period); // Optimize multiple accesses to the same cache line by one instruction (vscatter/vgather) // For simplicity, vgather/vscatter have 16 load/store microops, one for each address. // Here, we squash microops that touch a given cache line a second time // FIXME: although the microop is squashed and its latency ignored, the cache still sees the access IntPtr cache_line = info.addr & ~63; // FIXME: hard-coded cache line size
実際のSquashが発生するコードは以下のようになっている。同一キャッシュラインへのアクセスを探索する。
// READ operand: attach the measured load latency and address to the
// corresponding load micro-op, squashing repeat accesses to a cache line
// already read by this same instruction.
if (o.m_direction == Operand::READ)
{
   // Operand::READ
   if (load_base_index != SIZE_MAX)
   {
      // The num_reads_done-th read operand maps onto the uop at
      // load_base_index + num_reads_done (loads appear consecutively
      // from load_base_index — see the base-index computation earlier).
      size_t load_index = load_base_index + num_reads_done;
      LOG_ASSERT_ERROR(load_index < m_current_uops.size(), "Expected load_index(%x) to be less than uops.size()(%d).", load_index, m_current_uops.size());
      LOG_ASSERT_ERROR(m_current_uops[load_index]->getMicroOp()->isLoad(), "Expected uop %d to be a load.", load_index);
      // Squash this load if an earlier operand of this instruction
      // already touched the same cache line.
      if (std::find(m_cache_lines_read.begin(), m_cache_lines_read.end(), cache_line) != m_cache_lines_read.end())
      {
         m_current_uops[load_index]->squash(&m_current_uops);
         do_squashing = true;
      }
      m_cache_lines_read.push_back(cache_line);
      // Update this uop with load latencies
      // NOTE: latency/address are set even on a squashed uop; presumably
      // the squash flag makes the timer ignore them — confirm in the timer.
      UInt64 bypass_latency = m_core_model->getBypassLatency(m_current_uops[load_index]);
      m_current_uops[load_index]->setExecLatency(memory_cycle_latency + bypass_latency);
      Memory::Access addr;
      addr.set(info.addr);
      m_current_uops[load_index]->setAddress(addr);
      m_current_uops[load_index]->setDCacheHitWhere(info.hit_where);
      ++num_reads_done;
   }
   else
   {
      // A read operand exists but no load uop was found for this instruction.
      LOG_PRINT_ERROR("Read operand count mismatch");
   }
}
次に、命令自体のコストを設定する。分岐命令とそうでない命令でコストが異なる。
// Cost assignment: for branches, record prediction outcome on the branch
// uop (latency is left for the interval model to compute); for all other
// instructions, take a cost from getCost().
if (dynins->instruction->getType() == INST_BRANCH)
{
   bool is_mispredict;
   dynins->getBranchCost(getCore(), &is_mispredict);
   // Set whether the branch was mispredicted or not
   LOG_ASSERT_ERROR(m_current_uops[exec_base_index]->getMicroOp()->isBranch(), "Expected to find a branch here.");
   m_current_uops[exec_base_index]->setBranchMispredicted(is_mispredict);
   m_current_uops[exec_base_index]->setBranchTaken(dynins->branch_info.taken);
   m_current_uops[exec_base_index]->setBranchTarget(dynins->branch_info.target);
   // Do not update the execution latency of a branch instruction
   // The interval model will calculate the branch latency
}
else
{
   insn_cost = dynins->getCost(getCore());
#if DEBUG_INSN_LOG
   // Debug-only trace of expensive (cost > 17) instructions, tagged with
   // which uop kinds the instruction contains: L(oad)/S(tore)/X(exec).
   if (insn_cost > 17)
   {
      fprintf(m_insn_log, "[%llu] ", (long long unsigned int)m_cycle_count);
      if (load_base_index != SIZE_MAX)
      {
         fprintf(m_insn_log, "L");
      }
      if (store_base_index != SIZE_MAX)
      {
         fprintf(m_insn_log, "S");
      }
      if (exec_base_index != SIZE_MAX)
      {
         fprintf(m_insn_log, "X");
#ifdef ENABLE_MICROOP_STRINGS
         // Disassembly strings are only compiled in with ENABLE_MICROOP_STRINGS.
         fprintf(m_insn_log, "-%s:%s", dynins->instruction->getDisassembly().c_str(), dynins->instruction->getTypeName().c_str());
         fflush(m_insn_log);
#endif
      }
      fprintf(m_insn_log, "approx cost = %llu\n", (long long unsigned int)insn_cost);
   }
#endif
}