QEMUのTCG(Tiny Code Generator)を読み解く

QEMUは高速なエミュレーションが可能な理由の一つに、TCGを使った高速なバイナリ変換機構がある。TCGの役割は、ターゲットバイナリからTCG(Tiny Code Generator)を用いた中間表現に変換し、ホスト形式のバイナリに変換する2つの機構が存在している。

f:id:msyksphinz:20200727012347p:plain:w500

TCGがどのようにして高速なエミュレーションを実現しているのかについて調査している。ターゲットコードをTCGに変換する手法については、target/riscv/translate.cを解析してきたので何となくわかるとして、TCGからどのようにホストのバイナリに変換する方法について見ていこうと思う。

まずはTCGについてはtcg/tcg.cに多くのオペレーションの定義がなされている。一方で、各ホストのTCGからの変換ポリシについてはtcg/[target-arch]に定義がある。一番安定してそうなtcg/i386/tcg-target.inc.cを見てみることにする。

qemu/tcg/i386/tcg-target.inc.c

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
...

x86の命令定義についてはあまり詳しくないのだが、おそらくこの辺がx86の機械語に相当しているのだろう。

#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb (0x83)
#define OPC_ARITH_GvEv (0x03)     /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv   (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
...

その証拠に、RISC-V側の定義は機械語に一致している（RISC-Vの機械語なら一瞬で理解できるのだ）。

typedef enum {
    OPC_ADD = 0x33,
    OPC_ADDI = 0x13,
    OPC_AND = 0x7033,
    OPC_ANDI = 0x7013,
    OPC_AUIPC = 0x17,
    OPC_BEQ = 0x63,
    OPC_BGE = 0x5063,
    OPC_BGEU = 0x7063,
...

この知識を前提にして、TCGからバイナリを生成するフローを追いかけていく。

qemu/tcg/tcg.c

int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
{
#ifdef CONFIG_PROFILER
    TCGProfile *prof = &s->prof;
#endif
    int i, num_insns;
...
    QTAILQ_FOREACH(op, &s->ops, link) {
        TCGOpcode opc = op->opc;

#ifdef CONFIG_PROFILER
        atomic_set(&prof->table_op_count[opc], prof->table_op_count[opc] + 1);
#endif

        switch (opc) {
        case INDEX_op_mov_i32:
        case INDEX_op_mov_i64:
        case INDEX_op_mov_vec:
...
        default:
            /* Sanity check that we've not introduced any unhandled opcodes. */
            tcg_debug_assert(tcg_op_supported(opc));
            /* Note: in order to speed up the code, it would be much
               faster to have specialized register allocator functions for
               some common argument patterns */
            tcg_reg_alloc_op(s, op);
            break;
        }
#ifdef CONFIG_DEBUG_TCG
...

まあ最初はtcg_reg_alloc_op()から読んでいけばよかろう。これはおそらくレジスタに書き込みを行う命令だ（reg_alloc_opなので）。

で、tcg_reg_alloc()のコードの大半はとりあえず無視して、実際に命令を生成するとところだけ取り出すとここだ。

static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
{
    const TCGLifeData arg_life = op->life;
    const TCGOpDef * const def = &tcg_op_defs[op->opc];
    TCGRegSet i_allocated_regs;
...
    /* emit instruction */
    if (def->flags & TCG_OPF_VECTOR) {
        tcg_out_vec_op(s, op->opc, TCGOP_VECL(op), TCGOP_VECE(op),
                       new_args, const_args);
    } else {
        tcg_out_op(s, op->opc, new_args, const_args);
    }

tcg_out_op()は各アーキテクチャで実装が委ねられている。

qemu/tcg/i386/tcg-target.inc.c

static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                              const TCGArg *args, const int *const_args)
{
    TCGArg a0, a1, a2;
    int c, const_a2, vexop, rexw = 0;

#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i64): \
            rexw = P_REXW; /* FALLTHRU */    \
        case glue(glue(INDEX_op_, x), _i32)
...
                
    OP_32_64(add2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
        }
        if (const_args[5]) {

っとなんだかややこしいのでRISC-V側を見てみる。

qemu/tcg/i386/tcg-target.inc.c

static void tcg_out_op(TCGContext *s, TCGOpcode opc,
                       const TCGArg *args, const int *const_args)
{
    TCGArg a0 = args[0];
    TCGArg a1 = args[1];
    TCGArg a2 = args[2];
...
    switch (opc) {
            ...
    case INDEX_op_add_i64:
        if (c2) {
            tcg_out_opc_imm(s, OPC_ADDI, a0, a1, a2);
        } else {
            tcg_out_opc_reg(s, OPC_ADD, a0, a1, a2);
        }
        break;

このtcg_out_opc_reg()というのがまさにRISC-V向けバイナリを生成するルーチンだ。

/*
 * RISC-V instruction emitters
 */

static void tcg_out_opc_reg(TCGContext *s, RISCVInsn opc,
                            TCGReg rd, TCGReg rs1, TCGReg rs2)
{
    tcg_out32(s, encode_r(opc, rd, rs1, rs2));
}

このencode_r()が機械語エンコーディングを作り上げている。

/*
 * RISC-V immediate and instruction encoders (excludes 16-bit RVC)
 */

/* Type-R */

static int32_t encode_r(RISCVInsn opc, TCGReg rd, TCGReg rs1, TCGReg rs2)
{
    return opc | (rd & 0x1f) << 7 | (rs1 & 0x1f) << 15 | (rs2 & 0x1f) << 20;
}

これにより機械語バイナリが作られるという訳だ。つまり、このTCGから機械語への変換ルーチンは魔法でもなんでもなく、TCGのオペレーションをパタンマッチングさせているということだ。

FPGA開発日記

カテゴリ別記事インデックス https://msyksphinz.github.io/github_pages , English Version https://fpgadevdiary.hatenadiary.com/

QEMUのTCG(Tiny Code Generator)を読み解く