FPGA開発日記

カテゴリ別記事インデックス https://msyksphinz.github.io/github_pages , English Version https://fpgadevdiary.hatenadiary.com/

Sniperの動作原理をトレースする (2. Performance Modelの概要)

PerformanceModel::queueInstruction()はひたすらm_instruction_queueに命令をPushしていく。ENABLE_PERF_MODEL_OWN_THREADは定義されていない気がする。

/// Enqueue a dynamic instruction for later simulation by iterate().
/// Instructions arriving while the model is disabled or fast-forwarding
/// are discarded (and freed) instead of queued: some threads may not have
/// switched instrumentation mode yet after leaving ROI, and accepting
/// their instructions would overflow the buffers.
void PerformanceModel::queueInstruction(DynamicInstruction *ins)
{
   const bool accepting = m_enabled && !m_fastforward;
   if (!accepting)
   {
      // Not simulating right now: take ownership and drop the instruction.
      delete ins;
      return;
   }

#ifdef ENABLE_PERF_MODEL_OWN_THREAD
   // Dedicated performance-model thread: block if the queue is full.
   m_instruction_queue.push_wait(ins);
#else
   m_instruction_queue.push(ins);
#endif
}

いよいよiterate()によりシミュレーションを行う。

PerformanceModel::iterate()では、handleInstruction()によりm_instruction_queueから命令を取り出して実行する。これはROBなわけではないので、命令を取り出してCPUに流し込んでいくだけだと思う。

void PerformanceModel::iterate()
{
   while (m_instruction_queue.size() > 0)
   {
      // While the functional thread is waiting because of clock skew minimization, wait here as well
      #ifdef ENABLE_PERF_MODEL_OWN_THREAD
      while(m_hold)
         sched_yield();
      #endif

      DynamicInstruction *ins = m_instruction_queue.front();

      LOG_ASSERT_ERROR(!ins->instruction->isIdle(), "Idle instructions should not make it here!");

      if (!m_fastforward && m_enabled)
         handleInstruction(ins);

      delete ins;

      m_instruction_queue.pop();
   }

   synchronize();
}

おそらくhandleInstructionの飛び先はMicroOpPerformanceModel::handleInstruction()に飛ぶんだと思う。

handleInstruction()では、まず命令のMicroOpの展開を行っている?

void MicroOpPerformanceModel::handleInstruction(DynamicInstruction *dynins)
{
   /* ... 途中省略 ... */

   if (dynins->instruction->getMicroOps())
   {
      for(std::vector<const MicroOp*>::const_iterator it = dynins->instruction->getMicroOps()->begin(); it != dynins->instruction->getMicroOps()->end(); it++)
      {
         m_current_uops.push_back(m_core_model->createDynamicMicroOp(m_allocator, *it, insn_period));
      }
   }

各uopに応じて、exec_base_index / store_base_index / load_base_index の計算を行っているが、これはよくわからない。

試してみると、これはmicroopのなかでどの操作がどの位置で初めて登場するかを記録しているらしい。

MicroOpPerformanceModel::handleInstruction()
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
MicroOpPerformanceModel::handleInstruction createDynamicMicroOp called
exec_base_index  = 16
load_base_index  = 0
store_base_index = -1

つぎにキャッシュ操作について、複数のメモリアクセスが1つの命令内で存在している場合、これをSquashするようなコードが埋め込まれているらしい。

    // If we haven't gotten all of our read or write data yet, iterate over the operands
   for (size_t i = 0 ; i < ops.size() ; ++i)
   {
      const Operand &o = ops[i];

      if (o.m_type == Operand::MEMORY)
      {
         LOG_ASSERT_ERROR(dynins->num_memory > memidx, "Did not get enough memory_info objects");
         DynamicInstruction::MemoryInfo &info = dynins->memory_info[memidx++];
         LOG_ASSERT_ERROR(info.dir == o.m_direction,
                          "Expected memory %d info, got: %d.", o.m_direction, info.dir);

         // Because the interval model is currently in cycles, convert the data to cycles here before using it
         // Force the latencies into cycles for use in the original interval model
         // FIXME Update the Interval Timer to use SubsecondTime
         UInt64 memory_cycle_latency = SubsecondTime::divideRounded(info.latency, insn_period);

         // Optimize multiple accesses to the same cache line by one instruction (vscatter/vgather)
         //   For simplicity, vgather/vscatter have 16 load/store microops, one for each address.
         //   Here, we squash microops that touch a given cache line a second time
         //   FIXME: although the microop is squashed and its latency ignored, the cache still sees the access
         IntPtr cache_line = info.addr & ~63; // FIXME: hard-coded cache line size

実際にSquashが発生するコードは以下のようになっている。同一キャッシュラインのアクセスを探索する。

         if (o.m_direction == Operand::READ)
         {
            // Operand::READ

            if (load_base_index != SIZE_MAX)
            {
               size_t load_index = load_base_index + num_reads_done;

               LOG_ASSERT_ERROR(load_index < m_current_uops.size(),
                                "Expected load_index(%x) to be less than uops.size()(%d).", load_index, m_current_uops.size());
               LOG_ASSERT_ERROR(m_current_uops[load_index]->getMicroOp()->isLoad(),
                                "Expected uop %d to be a load.", load_index);

               if (std::find(m_cache_lines_read.begin(), m_cache_lines_read.end(), cache_line) != m_cache_lines_read.end())
               {
                  m_current_uops[load_index]->squash(&m_current_uops);
                  do_squashing = true;
               }
               m_cache_lines_read.push_back(cache_line);

               // Update this uop with load latencies
               UInt64 bypass_latency = m_core_model->getBypassLatency(m_current_uops[load_index]);
               m_current_uops[load_index]->setExecLatency(memory_cycle_latency + bypass_latency);
               Memory::Access addr;
               addr.set(info.addr);
               m_current_uops[load_index]->setAddress(addr);
               m_current_uops[load_index]->setDCacheHitWhere(info.hit_where);
               ++num_reads_done;
            }
            else
            {
               LOG_PRINT_ERROR("Read operand count mismatch");
            }

         }

次に、命令自体のコストを設定する。分岐命令とそうでない命令でコストが異なる。

   if (dynins->instruction->getType() == INST_BRANCH)
   {
      bool is_mispredict;
      dynins->getBranchCost(getCore(), &is_mispredict);

      // Set whether the branch was mispredicted or not
      LOG_ASSERT_ERROR(m_current_uops[exec_base_index]->getMicroOp()->isBranch(), "Expected to find a branch here.");
      m_current_uops[exec_base_index]->setBranchMispredicted(is_mispredict);
      m_current_uops[exec_base_index]->setBranchTaken(dynins->branch_info.taken);
      m_current_uops[exec_base_index]->setBranchTarget(dynins->branch_info.target);
      // Do not update the execution latency of a branch instruction
      // The interval model will calculate the branch latency
   }
   else
   {
      insn_cost = dynins->getCost(getCore());

   #if DEBUG_INSN_LOG
      if (insn_cost > 17)
      {
         fprintf(m_insn_log, "[%llu] ", (long long unsigned int)m_cycle_count);
         if (load_base_index != SIZE_MAX) {
            fprintf(m_insn_log, "L");
         }
         if (store_base_index != SIZE_MAX) {
            fprintf(m_insn_log, "S");
         }
         if (exec_base_index != SIZE_MAX) {
            fprintf(m_insn_log, "X");
   #ifdef ENABLE_MICROOP_STRINGS
            fprintf(m_insn_log, "-%s:%s", dynins->instruction->getDisassembly().c_str(), dynins->instruction->getTypeName().c_str());
            fflush(m_insn_log);
   #endif
         }
         fprintf(m_insn_log, "approx cost = %llu\n", (long long unsigned int)insn_cost);
      }
   #endif
   }