QEMUのTCG(Tiny Code Generator)を読み解く(6. QEMUのTCGが生成される仕組み)

QEMUはTCG（Tiny Code Generator）と呼ばれる仕組みを使ってゲストマシンの機械語をホストマシンの機械語に変換している。ゲストマシンの機械語は、QEMU実行中にいったんTCGの中間表現（TCG Op）に変換され、これをなるべく少ない数のホスト機械語に変換することで、ネイティブに近い形でホスト上で実行することができる。

では、具体的にどのような仕組みでTCGが生成されているのかを観察する。MYRISCVXのADDI命令を見ていこうと思う。自動生成によって作られたデコーダがADDI命令を検出すると、trans_addi()関数が呼び出される。

qemu/build-myriscvx/myriscvx64-softmmu/target/myriscvx/decode_insn32.inc.c

static bool decode_insn32(DisasContext *ctx, uint32_t insn)
{
    union {
        arg_atomic f_atomic;
        arg_b f_b;
        arg_decode_insn329 f_decode_insn329;
...
        return false;
    case 0x00000013:
        /* ........ ........ ........ .0010011 */
        switch ((insn >> 12) & 0x7) {
        case 0x0:
            /* ........ ........ .000.... .0010011 */
            /* /home/msyksphinz/work/riscv/qemu/target/myriscvx/insn32.decode:54 */
            decode_insn32_extract_i(ctx, &u.f_i, insn);
            if (trans_addi(ctx, &u.f_i)) return true;
            return false;
        case 0x1:
            /* ........ ........ .001.... .0010011 */
            decode_insn32_extract_sh(ctx, &u.f_shift, insn);
...

qemu/target/myriscvx/insn_trans/trans_rvi.inc.c

/*
 * Translate ADDI: emit TCG ops that read rs1 into a temporary, add the
 * immediate to it, and write the result to rd.  Always returns true.
 */
static bool trans_addi(DisasContext *ctx, arg_addi *a)
{
  TCGv sum = tcg_temp_new();

  gen_get_gpr(sum, a->rs1);
  tcg_gen_addi_tl(sum, sum, a->imm);
  gen_set_gpr(a->rd, sum);

  /* The temporary is only needed while emitting this instruction. */
  tcg_temp_free(sum);
  return true;
}

さて問題はこのtrans_addi()の実装だ。ここでは、

tcg_temp_new()
gen_get_gpr()
tcg_gen_addi_tl()
gen_set_gpr()
tcg_temp_free()

が呼び出されている。これらの仕組みについて見ていこうと思う。

`tcg_temp_new()`

tcg_temp_new()はマクロとして定義されている。

qemu/include/tcg/tcg-op.h

/*
 * tcg_temp_new() expands to tcg_temp_new_i64().
 * NOTE(review): presumably this is the definition selected for 64-bit
 * targets -- confirm against the surrounding #if in the header.
 */
#define tcg_temp_new() tcg_temp_new_i64()

qemu/include/tcg/tcg.h

/*
 * Allocate a non-local temporary of type TCG_TYPE_I64 and return it as a
 * TCGv_i64 handle.
 */
static inline TCGv_i64 tcg_temp_new_i64(void)
{
    return temp_tcgv_i64(tcg_temp_new_internal(TCG_TYPE_I64, false));
}

tcg_temp_new_internal()は以下のような定義だ。

qemu/tcg/tcg.c

/*
 * Hand out a TCG temporary of the given type.
 *
 * A previously freed temporary recorded in the matching free_temps bitmap
 * is reused when one exists; otherwise a new slot is obtained from
 * tcg_temp_alloc() and initialised.  @temp_local selects which group of
 * bitmaps is consulted and is recorded in the temporary.
 *
 * Returns the TCGTemp that has been marked as allocated.
 */
TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
{
    TCGContext *s = tcg_ctx;
    TCGTemp *ts;
    int idx, k;

    /* Bitmap index: one bitmap per type, local temps offset by TCG_TYPE_COUNT. */
    k = type + (temp_local ? TCG_TYPE_COUNT : 0);
    /* An index below TCG_MAX_TEMPS means a set bit (a free temp) was found. */
    idx = find_first_bit(s->free_temps[k].l, TCG_MAX_TEMPS);
    if (idx < TCG_MAX_TEMPS) {
        /* There is already an available temp with the right type.  */
        clear_bit(idx, s->free_temps[k].l);

        ts = &s->temps[idx];
        ts->temp_allocated = 1;
        tcg_debug_assert(ts->base_type == type);
        tcg_debug_assert(ts->temp_local == temp_local);
    } else {
        /* Nothing to reuse: take a new slot. */
        ts = tcg_temp_alloc(s);
        if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
            /*
             * With TCG_TARGET_REG_BITS == 32, an I64 temp is represented by
             * two adjacent I32 slots that share base_type I64.
             */
            TCGTemp *ts2 = tcg_temp_alloc(s);

            ts->base_type = type;
            ts->type = TCG_TYPE_I32;
            ts->temp_allocated = 1;
            ts->temp_local = temp_local;

            /* The second half must directly follow the first one. */
            tcg_debug_assert(ts2 == ts + 1);
            ts2->base_type = TCG_TYPE_I64;
            ts2->type = TCG_TYPE_I32;
            ts2->temp_allocated = 1;
            ts2->temp_local = temp_local;
        } else {
            ts->base_type = type;
            ts->type = type;
            ts->temp_allocated = 1;
            ts->temp_local = temp_local;
        }
    }

#if defined(CONFIG_DEBUG_TCG)
    /* Debug-only count of temporaries currently handed out. */
    s->temps_in_use++;
#endif
    return ts;
}

変数の割り当ての方法を見てみる。

find_first_bit()によって空いているtempsビットの位置を検索する。

    k = type + (temp_local ? TCG_TYPE_COUNT : 0);
    idx = find_first_bit(s->free_temps[k].l, TCG_MAX_TEMPS);

空きビットが見つかった場合（idx < TCG_MAX_TEMPS）は、以前に解放された同じ型のテンポラリ変数をそのまま再利用する。見つからなかった場合は、tcg_temp_alloc()でテーブル上に新しい変数を用意する。

    if (idx < TCG_MAX_TEMPS) {
        /* There is already an available temp with the right type.  */
        clear_bit(idx, s->free_temps[k].l);

        ts = &s->temps[idx];
        ts->temp_allocated = 1;
        tcg_debug_assert(ts->base_type == type);
        tcg_debug_assert(ts->temp_local == temp_local);
...
#if defined(CONFIG_DEBUG_TCG)
    s->temps_in_use++;
#endif
    return ts;
}

変数tsには割り当てられたtemps[]の要素のアドレスが入っており、これが関数の戻り値として返される。

struct TCGContext {
    uint8_t *pool_cur, *pool_end;
    TCGPool *pool_first, *pool_current, *pool_first_large;
    int nb_labels;
    int nb_globals;
...
    
    TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
    TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */

    QTAILQ_HEAD(, TCGOp) ops, free_ops;
    QSIMPLEQ_HEAD(, TCGLabel) labels;
...

これにより、tcg_temp_new()はフリーな変数リストの中から新しい変数のアドレスを取得する。

`gen_get_gpr()`

gen_get_gpr()はRISC-Vの汎用レジスタから値を取得する関数だ。RISC-V依存なのでtranslate側に実装されている。

/*
 * Emit code that loads guest register reg_num into t.
 * cpu_gpr[0] is not actually allocated, so register 0 is handled
 * separately: t is loaded with the constant 0 instead.
 */
static inline void gen_get_gpr(TCGv t, int reg_num)
{
  if (reg_num != 0) {
    tcg_gen_mov_tl(t, cpu_gpr[reg_num]);
    return;
  }
  tcg_gen_movi_tl(t, 0);
}

tcg_gen_movi_tl()やtcg_gen_mov_tl()はTCGを生成するコード。先ほどのtcg_temp_new()で確保した変数に値を格納する。ここではtcg_gen_mov_tl()の64ビット版であるtcg_gen_mov_i64()を見てみる。

/*
 * Emit a 64-bit move op from arg to ret.  Nothing is emitted when source
 * and destination are the same value.
 */
static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
{
    if (ret == arg) {
        return;
    }
    tcg_gen_op2_i64(INDEX_op_mov_i64, ret, arg);
}

これは、2オペランドの操作命令で、オペレーションとしてはデータ移動である。tcg_gen_op2_i64()を見てみると、

qemu/include/tcg/tcg-op.h

/*
 * Emit a two-operand op on 64-bit TCG values by converting both handles
 * with tcgv_i64_arg() and forwarding them to tcg_gen_op2().
 *
 * a1: variable that designates the destination register.
 * a2: data that designates the source.
 */
static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2)
{
    tcg_gen_op2(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2));
}

qemu/tcg/tcg-op.c

/*
 * Emit a new op with opcode opc via tcg_emit_op() and store a1 and a2 as
 * its first two arguments.
 */
void tcg_gen_op2(TCGOpcode opc, TCGArg a1, TCGArg a2)
{
    TCGOp *op = tcg_emit_op(opc);
    op->args[0] = a1;
    op->args[1] = a2;
}

ここでtcg_emit_op()で新しいTCGOpを確保する。このTCGOp *opはtcg_op_alloc()で確保されるのだが、この確保されたアドレスはどのようにして使われるのだろう？

/*
 * Allocate an op for opcode opc, insert it at the tail of the ops queue of
 * tcg_ctx, and return it so the caller can fill in its arguments.
 */
TCGOp *tcg_emit_op(TCGOpcode opc)
{
    TCGOp *op = tcg_op_alloc(opc);
    QTAILQ_INSERT_TAIL(&tcg_ctx->ops, op, link);
    return op;
}

/*
 * Obtain a TCGOp for opcode opc.
 *
 * When free_ops is empty, storage comes from tcg_malloc(); otherwise the
 * first entry of free_ops is removed from that list and recycled.  All
 * bytes that precede the 'link' member are zeroed, the opcode is stored,
 * and the context's nb_ops counter is incremented.
 */
static TCGOp *tcg_op_alloc(TCGOpcode opc)
{
    TCGContext *ctx = tcg_ctx;
    TCGOp *node;

    if (likely(QTAILQ_EMPTY(&ctx->free_ops))) {
        node = tcg_malloc(sizeof(*node));
    } else {
        node = QTAILQ_FIRST(&ctx->free_ops);
        QTAILQ_REMOVE(&ctx->free_ops, node, link);
    }

    /* Only the part before 'link' is cleared. */
    memset(node, 0, offsetof(TCGOp, link));
    node->opc = opc;
    ctx->nb_ops++;

    return node;
}

tcg_malloc()はコンテキストの現在のプール領域から新たなデータ領域を取得している。

qemu/include/tcg/tcg.h

/*
 * Take size bytes (after QEMU_ALIGN_UP(size, 8)) from the current pool of
 * tcg_ctx by advancing pool_cur.  If the request would run past pool_end,
 * it is handed to tcg_malloc_internal() instead.
 *
 * user-mode: Called with mmap_lock held.
 */
static inline void *tcg_malloc(int size)
{
    TCGContext *s = tcg_ctx;
    uint8_t *start, *limit;

    /* ??? This is a weak placeholder for minimum malloc alignment.  */
    size = QEMU_ALIGN_UP(size, 8);

    start = s->pool_cur;
    limit = start + size;
    if (unlikely(limit > s->pool_end)) {
        return tcg_malloc_internal(tcg_ctx, size);
    }

    s->pool_cur = limit;
    return start;
}

QTAILQ_INSERT_TAIL()により確保したOpを挿入する。

/*
 * Insert elm at the tail of the queue rooted at head, linking through the
 * member named field.
 *
 * In order: elm's next pointer is cleared, elm's prev link takes the
 * head's current tail link, the previous tail's next is pointed at elm,
 * and the head's tail link is updated to refer to elm.
 * head and elm are expanded more than once.
 */
#define QTAILQ_INSERT_TAIL(head, elm, field) do {                       \
        (elm)->field.tqe_next = NULL;                                   \
        (elm)->field.tqe_circ.tql_prev = (head)->tqh_circ.tql_prev;     \
        (head)->tqh_circ.tql_prev->tql_next = (elm);                    \
        (head)->tqh_circ.tql_prev = &(elm)->field.tqe_circ;             \
} while (/*CONSTCOND*/0)

`tcg_gen_addi_tl()`

これは加算をするOpを挿入する。

qemu/tcg/tcg-op.c

/*
 * Emit ret = arg1 + arg2 where arg2 is a constant.
 *
 * A zero constant is emitted as a plain move.  Otherwise the constant is
 * placed in a temporary obtained from tcg_const_i64(), an add op is
 * emitted, and the temporary is released again.
 */
void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
{
    TCGv_i64 imm;

    /* some cases can be optimized here */
    if (arg2 == 0) {
        tcg_gen_mov_i64(ret, arg1);
        return;
    }

    imm = tcg_const_i64(arg2);
    tcg_gen_add_i64(ret, arg1, imm);
    tcg_temp_free_i64(imm);
}

定数格納のために、tcg_const_i64()を呼んでいる。tcg_gen_add_i64()は加算のTCG Opを挿入する関数だろう。

qemu/tcg/tcg-op.c

/*
 * Emit a three-operand INDEX_op_add_i64 op with ret, arg1 and arg2 as its
 * operands.
 */
static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
{
    tcg_gen_op3_i64(INDEX_op_add_i64, ret, arg1, arg2);
}
...
    
/*
 * Emit a new op with opcode opc via tcg_emit_op() and store a1, a2 and a3
 * as its first three arguments.
 */
void tcg_gen_op3(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3)
{
    TCGOp *emitted = tcg_emit_op(opc);

    emitted->args[0] = a1;
    emitted->args[1] = a2;
    emitted->args[2] = a3;
}

`gen_set_gpr()`

これは汎用レジスタに値を設定するTCGを生成する。

qemu/target/myriscvx/translate.c

/*
 * Emit code that stores t into guest register reg_num_dst.
 * cpu_gpr[0] is not actually allocated, so a write to register 0 ($zero)
 * emits nothing.  This is mostly a safety net: callers usually avoid
 * calling the OP_TYPE_gen function when the destination is $zero.
 */
static inline void gen_set_gpr(int reg_num_dst, TCGv t)
{
  if (reg_num_dst == 0) {
    return;
  }
  tcg_gen_mov_tl(cpu_gpr[reg_num_dst], t);
}

`tcg_temp_free()`

tcg_temp_free()は確保したTemp変数を解放する。

qemu/include/tcg/tcg.h

/*
 * Release a 64-bit temporary: convert the handle with tcgv_i64_temp() and
 * pass the result to tcg_temp_free_internal().
 */
static inline void tcg_temp_free_i64(TCGv_i64 arg)
{
    tcg_temp_free_internal(tcgv_i64_temp(arg));
}

qemu/tcg/tcg.c

/*
 * Return the temporary ts to the free set of tcg_ctx.
 *
 * The temporary is marked as no longer allocated and its index bit is set
 * in the free_temps bitmap chosen by its base type and its local flag.
 * With CONFIG_DEBUG_TCG, the in-use counter is decremented and a message
 * is printed to stderr if it drops below zero.
 */
void tcg_temp_free_internal(TCGTemp *ts)
{
    TCGContext *ctx = tcg_ctx;
    int set;

#if defined(CONFIG_DEBUG_TCG)
    if (--ctx->temps_in_use < 0) {
        fprintf(stderr, "More temporaries freed than allocated!\n");
    }
#endif

    /* Only allocated, non-global temporaries may be freed. */
    tcg_debug_assert(ts->temp_global == 0);
    tcg_debug_assert(ts->temp_allocated != 0);
    ts->temp_allocated = 0;

    /* Same bitmap selection as used when handing temporaries out. */
    set = ts->base_type + (ts->temp_local ? TCG_TYPE_COUNT : 0);
    set_bit(temp_idx(ts), ctx->free_temps[set].l);
}