TCGの続き。TCGによるエンコードをもっと詳しく見るために、最適化を抑制すべくいろいろ変更してみた。
git diff tcg/i386/tcg-target.inc.c diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c index ec083bddcf..d86ccdf05b 100644 --- a/tcg/i386/tcg-target.inc.c +++ b/tcg/i386/tcg-target.inc.c @@ -40,6 +40,8 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { }; #endif +void tgen_arithr(TCGContext *s, int subop, int dest, int src); + static const int tcg_target_reg_alloc_order[] = { #if TCG_TARGET_REG_BITS == 64 TCG_REG_RBP, @@ -820,7 +822,7 @@ static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) } /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ -static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) +void tgen_arithr(TCGContext *s, int subop, int dest, int src) { /* Propagate an opcode prefix, such as P_REXW. */ int ext = subop & ~0x7;
i386
ターゲットのtgen_arithr()
のstatic inline
を削除して、単体の関数としてオブジェクトコードに残るようにした。QEMUはWarningが出るとビルドが即座に失敗する(-Werror)ので、プロトタイプ宣言も追加しつつ慎重に作業する。
これでビルドしてtgen_arithr()
のオブジェクトコードを見てみる。
000000000041f421 <tgen_arithr>: 41f421: 55 push %rbp 41f422: 48 89 e5 mov %rsp,%rbp 41f425: 48 83 ec 30 sub $0x30,%rsp 41f429: 48 89 7d e8 mov %rdi,-0x18(%rbp) 41f42d: 89 75 e4 mov %esi,-0x1c(%rbp) 41f430: 89 55 e0 mov %edx,-0x20(%rbp) 41f433: 89 4d dc mov %ecx,-0x24(%rbp) 41f436: 8b 45 e4 mov -0x1c(%rbp),%eax 41f439: 83 e0 f8 and $0xfffffff8,%eax 41f43c: 89 45 fc mov %eax,-0x4(%rbp) 41f43f: 83 65 e4 07 andl $0x7,-0x1c(%rbp) 41f443: 8b 45 e4 mov -0x1c(%rbp),%eax 41f446: 8d 14 c5 00 00 00 00 lea 0x0(,%rax,8),%edx 41f44d: 8b 45 fc mov -0x4(%rbp),%eax 41f450: 01 d0 add %edx,%eax 41f452: 8d 70 03 lea 0x3(%rax),%esi 41f455: 8b 4d dc mov -0x24(%rbp),%ecx 41f458: 8b 55 e0 mov -0x20(%rbp),%edx 41f45b: 48 8b 45 e8 mov -0x18(%rbp),%rax 41f45f: 48 89 c7 mov %rax,%rdi 41f462: e8 ec f8 ff ff callq 41ed53 <tcg_out_modrm> 41f467: 90 nop 41f468: c9 leaveq 41f469: c3 retq
なるほど、さっぱりわからんぞ。というかhelp
を見ていたらいろいろトレースオプションがあることに気が付いた。
$ qemu-system-riscv64 --machine virt --d help --nographic --trace enable=myriscvx_trap --kernel rv64ui-p-simple
out_asm show generated host assembly code for each compiled TB in_asm show target assembly code for each compiled TB op show micro ops for each compiled TB op_opt show micro ops after optimization op_ind show micro ops before indirect lowering int show interrupts/exceptions in short format exec show trace before each executed TB (lots of logs) cpu show CPU registers before entering a TB (lots of logs) fpu include FPU registers in the 'cpu' logging mmu log MMU-related activities pcall x86 only: show protected mode far calls/returns/exceptions cpu_reset show CPU state before CPU resets unimp log unimplemented functionality guest_errors log when the guest OS does something invalid (eg accessing a non-existent register) page dump pages at beginning of user mode emulation nochain do not chain compiled TBs so that "exec" and "cpu" show complete traces strace log every user-mode syscall, its input, and its result trace:PATTERN enable trace events
なるほど、ここでは、in_asm,op,op_opt,op_ind,out_asm
が有益そうだ。
$ qemu-system-riscv64 --machine virt --d in_asm,op,op_opt,op_ind,out_asm --nographic \ --trace enable=myriscvx_trap --kernel rv64ui-p-simple 2>&1 | tee qemu.myriscvx64.log
IN: Priv: 3; Virt: 0 0x0000000000001000: 00000297 auipc t0,0 # 0x1000 0x0000000000001004: 02028593 addi a1,t0,32 0x0000000000001008: f1402573 csrrs a0,mhartid,zero OUT: [size=112] 0x7fdeec000100: 8b 5d f0 movl -0x10(%rbp), %ebx 0x7fdeec000103: 85 db testl %ebx, %ebx 0x7fdeec000105: 0f 8c 4b 00 00 00 jl 0x7fdeec000156 0x7fdeec00010b: 48 c7 45 28 00 10 00 00 movq $0x1000, 0x28(%rbp) 0x7fdeec000113: 48 c7 45 58 20 10 00 00 movq $0x1020, 0x58(%rbp) 0x7fdeec00011b: 48 c7 85 00 01 00 00 08 movq $0x1008, 0x100(%rbp) 0x7fdeec000123: 10 00 00 0x7fdeec000126: c7 85 8c f8 ff ff 01 00 movl $1, -0x774(%rbp) 0x7fdeec00012e: 00 00 0x7fdeec000130: 48 8b fd movq %rbp, %rdi 0x7fdeec000133: 33 f6 xorl %esi, %esi 0x7fdeec000135: ba 14 0f 00 00 movl $0xf14, %edx 0x7fdeec00013a: 33 c9 xorl %ecx, %ecx 0x7fdeec00013c: ff 15 26 00 00 00 callq *0x26(%rip) 0x7fdeec000142: 48 89 45 50 movq %rax, 0x50(%rbp) 0x7fdeec000146: 48 c7 85 00 01 00 00 0c movq $0x100c, 0x100(%rbp) 0x7fdeec00014e: 10 00 00 0x7fdeec000151: e9 c0 fe ff ff jmp 0x7fdeec000016 0x7fdeec000156: 48 8d 05 e6 fe ff ff leaq -0x11a(%rip), %rax 0x7fdeec00015d: e9 b6 fe ff ff jmp 0x7fdeec000018 0x7fdeec000162: 90 nop 0x7fdeec000163: 90 nop 0x7fdeec000164: 90 nop 0x7fdeec000165: 90 nop 0x7fdeec000166: 90 nop 0x7fdeec000167: 90 nop 0x7fdeec000168: .quad 0x00000000004c30fe
なるほど、上記のRISC-Vの3命令が、このx86命令列に変換されるらしい。これはさっぱりわからんぞ。
という訳でもう少し簡単なプログラムを作ってみようと思った。
simple_asm.S
.section .text _start: addi x1, x0, 10 addi x2, x1, 11 addi x3, x2, 12 addi x4, x3, 13 addi x5, x4, 14 addi x6, x5, 15 addi x7, x6, 16 addi x8, x7, 17 addi x9, x8, 18 addi x10, x9, 19 addi x11, x10, 20 addi x12, x11, 21 addi x13, x12, 22 addi x14, x13, 23 addi x15, x14, 24 addi x16, x15, 25 addi x17, x16, 26 addi x18, x17, 27 addi x19, x18, 28 addi x20, x20, 29 main: finish_loop: j finish_loop
ひたすら加算を行う。これをアセンブルしてQEMUで実行すれば、TCGがひたすらx86の加算命令を生成するはずだ。
$ riscv64-unknown-elf-as simple_asm.S -o simple_asm.o $ riscv64-unknown-elf-ld simple_asm.o -o simple_asm -T ../../riscv-tools/riscv-tests/benchmarks/common/test.ld
実行してみる。
$ qemu-system-riscv64 --machine virt --d in_asm,op,op_opt,op_ind,out_asm \ --nographic --trace enable=myriscvx_trap --kernel simple_asm 2>&1 | tee qemu.myriscvx64.log
- これが入力アセンブリコード。
---------------- IN: Priv: 3; Virt: 0 0x0000000080000000: 00a00093 addi ra,zero,10 0x0000000080000004: 00b08113 addi sp,ra,11 0x0000000080000008: 00c10193 addi gp,sp,12 0x000000008000000c: 00d18213 addi tp,gp,13 0x0000000080000010: 00e20293 addi t0,tp,14 0x0000000080000014: 00f28313 addi t1,t0,15 0x0000000080000018: 01030393 addi t2,t1,16 0x000000008000001c: 01138413 addi s0,t2,17 0x0000000080000020: 01240493 addi s1,s0,18 0x0000000080000024: 01348513 addi a0,s1,19 0x0000000080000028: 01450593 addi a1,a0,20 0x000000008000002c: 01558613 addi a2,a1,21 0x0000000080000030: 01660693 addi a3,a2,22 0x0000000080000034: 01768713 addi a4,a3,23 0x0000000080000038: 01870793 addi a5,a4,24 0x000000008000003c: 01978813 addi a6,a5,25 0x0000000080000040: 01a80893 addi a7,a6,26 0x0000000080000044: 01b88913 addi s2,a7,27 0x0000000080000048: 01c90993 addi s3,s2,28 0x000000008000004c: 01da0a13 addi s4,s4,29 0x0000000080000050: 0000006f j 0 # 0x80000050
- 中間のTiny Code。なるほど、新しい変数を定義しては定数加算を繰り返していることが分かる。
OP: ld_i32 tmp0,env,$0xfffffffffffffff0 movi_i32 tmp1,$0x0 brcond_i32 tmp0,tmp1,lt,$L0 ---- 0000000080000000 movi_i64 tmp2,$0x0 movi_i64 tmp3,$0xa add_i64 tmp2,tmp2,tmp3 mov_i64 x1/ra,tmp2 ---- 0000000080000004 mov_i64 tmp2,x1/ra movi_i64 tmp3,$0xb add_i64 tmp2,tmp2,tmp3 mov_i64 x2/sp,tmp2 ---- 0000000080000008 mov_i64 tmp2,x2/sp movi_i64 tmp3,$0xc add_i64 tmp2,tmp2,tmp3 mov_i64 x3/gp,tmp2 ---- 000000008000000c mov_i64 tmp2,x3/gp movi_i64 tmp3,$0xd add_i64 tmp2,tmp2,tmp3 ...
- 驚いたことに、この後Tiny Codeに対して最適化が実行される。定数伝搬と定数畳み込みが行われ、レジスタへの代入がすべて即値の代入に置き換えられる。
OP after optimization and liveness analysis: ld_i32 tmp0,env,$0xfffffffffffffff0 dead: 1 pref=0xffff movi_i32 tmp1,$0x0 pref=0xffff brcond_i32 tmp0,tmp1,lt,$L0 dead: 0 1 ---- 0000000080000000 movi_i64 tmp2,$0xa pref=0xffff mov_i64 x1/ra,tmp2 sync: 0 dead: 0 1 pref=0xffff ---- 0000000080000004 movi_i64 tmp2,$0x15 pref=0xffff mov_i64 x2/sp,tmp2 sync: 0 dead: 0 1 pref=0xffff ---- 0000000080000008 movi_i64 tmp2,$0x21 pref=0xffff mov_i64 x3/gp,tmp2 sync: 0 dead: 0 1 pref=0xffff ---- 000000008000000c ...
- 生成されたx86機械語。
movq
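は64ビット(クワッドワード)幅のmov命令で、ここでは即値をメモリへ書き込んでいる(x86なので即値以外の転送もできる)。定数畳み込みの結果、命令間の依存関係が綺麗に消されている。少し悔しい。なお、最後のaddi s4,s4,29だけはs4の値が事前に分からないため、ロード・加算(addq)・ストアの3命令として残っている。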
OUT: [size=236] 0x7fbd1c0003c0: 8b 5d f0 movl -0x10(%rbp), %ebx 0x7fbd1c0003c3: 85 db testl %ebx, %ebx 0x7fbd1c0003c5: 0f 8c d5 00 00 00 jl 0x7fbd1c0004a0 0x7fbd1c0003cb: 48 c7 45 08 0a 00 00 00 movq $0xa, 8(%rbp) 0x7fbd1c0003d3: 48 c7 45 10 15 00 00 00 movq $0x15, 0x10(%rbp) 0x7fbd1c0003db: 48 c7 45 18 21 00 00 00 movq $0x21, 0x18(%rbp) 0x7fbd1c0003e3: 48 c7 45 20 2e 00 00 00 movq $0x2e, 0x20(%rbp) 0x7fbd1c0003eb: 48 c7 45 28 3c 00 00 00 movq $0x3c, 0x28(%rbp) 0x7fbd1c0003f3: 48 c7 45 30 4b 00 00 00 movq $0x4b, 0x30(%rbp) 0x7fbd1c0003fb: 48 c7 45 38 5b 00 00 00 movq $0x5b, 0x38(%rbp) 0x7fbd1c000403: 48 c7 45 40 6c 00 00 00 movq $0x6c, 0x40(%rbp) 0x7fbd1c00040b: 48 c7 45 48 7e 00 00 00 movq $0x7e, 0x48(%rbp) 0x7fbd1c000413: 48 c7 45 50 91 00 00 00 movq $0x91, 0x50(%rbp) 0x7fbd1c00041b: 48 c7 45 58 a5 00 00 00 movq $0xa5, 0x58(%rbp) 0x7fbd1c000423: 48 c7 45 60 ba 00 00 00 movq $0xba, 0x60(%rbp) 0x7fbd1c00042b: 48 c7 45 68 d0 00 00 00 movq $0xd0, 0x68(%rbp) 0x7fbd1c000433: 48 c7 45 70 e7 00 00 00 movq $0xe7, 0x70(%rbp) 0x7fbd1c00043b: 48 c7 45 78 ff 00 00 00 movq $0xff, 0x78(%rbp) 0x7fbd1c000443: 48 c7 85 80 00 00 00 18 movq $0x118, 0x80(%rbp) 0x7fbd1c00044b: 01 00 00 0x7fbd1c00044e: 48 c7 85 88 00 00 00 32 movq $0x132, 0x88(%rbp) 0x7fbd1c000456: 01 00 00 0x7fbd1c000459: 48 c7 85 90 00 00 00 4d movq $0x14d, 0x90(%rbp) 0x7fbd1c000461: 01 00 00 0x7fbd1c000464: 48 c7 85 98 00 00 00 69 movq $0x169, 0x98(%rbp) 0x7fbd1c00046c: 01 00 00 0x7fbd1c00046f: 48 8b 9d a0 00 00 00 movq 0xa0(%rbp), %rbx 0x7fbd1c000476: 48 83 c3 1d addq $0x1d, %rbx 0x7fbd1c00047a: 48 89 9d a0 00 00 00 movq %rbx, 0xa0(%rbp) 0x7fbd1c000481: 66 90 nop 0x7fbd1c000483: e9 00 00 00 00 jmp 0x7fbd1c000488 0x7fbd1c000488: bb 50 00 00 80 movl $0x80000050, %ebx 0x7fbd1c00048d: 48 89 9d 00 02 00 00 movq %rbx, 0x200(%rbp) 0x7fbd1c000494: 48 8d 05 65 fe ff ff leaq -0x19b(%rip), %rax 0x7fbd1c00049b: e9 78 fb ff ff jmp 0x7fbd1c000018 0x7fbd1c0004a0: 48 8d 05 5c fe ff ff leaq -0x1a4(%rip), %rax 0x7fbd1c0004a7: e9 6c 
fb ff ff jmp 0x7fbd1c000018