[TCG] 01.QEMU TCG 概览

// include/exec/translation-block.h
/*
 * A TranslationBlock (TB) describes one translated run of guest code.
 * The leading fields (pc, cs_base, flags, cflags) are the ones used for
 * comparing TBs; the remaining fields record the guest size of the block,
 * the pages it intersects, and the bookkeeping used to chain TBs together.
 */
struct TranslationBlock {
    /*
     * Guest PC corresponding to this block.  This must be the true
     * virtual address.  Therefore e.g. x86 stores EIP + CS_BASE, and
     * targets like Arm, MIPS, HP-PA, which reuse low bits for ISA or
     * privilege, must store those bits elsewhere.
     *
     * If CF_PCREL, the opcodes for the TranslationBlock are written
     * such that the TB is associated only with the physical page and
     * may be run in any virtual address context.  In this case, PC
     * must always be taken from ENV in a target-specific manner.
     * Unwind information is taken as offsets from the page, to be
     * deposited into the "current" PC.
     */
    vaddr pc;

    /*
     * Target-specific data associated with the TranslationBlock, e.g.:
     * x86: the original user, the Code Segment virtual base,
     * arm: an extension of tb->flags,
     * s390x: instruction data for EXECUTE,
     * sparc: the next pc of the instruction queue (for delay slots).
     */
    uint64_t cs_base;

    uint32_t flags; /* flags defining in which context the code was generated */
    uint32_t cflags;    /* compile flags */

/*
 * Bit layout of cflags: the low 9 bits (CF_COUNT_MASK) carry an instruction
 * count, the top 8 bits (CF_CLUSTER_MASK) carry the cluster ID, and the
 * single-bit flags below sit in between.
 */
/* Note that TCG_MAX_INSNS is 512; we validate this match elsewhere. */
#define CF_COUNT_MASK    0x000001ff
#define CF_NO_GOTO_TB    0x00000200 /* Do not chain with goto_tb */
#define CF_NO_GOTO_PTR   0x00000400 /* Do not chain with goto_ptr */
#define CF_SINGLE_STEP   0x00000800 /* gdbstub single-step in effect */
#define CF_MEMI_ONLY     0x00001000 /* Only instrument memory ops */
#define CF_USE_ICOUNT    0x00002000
#define CF_INVALID       0x00004000 /* TB is stale. Set with @jmp_lock held */
#define CF_PARALLEL      0x00008000 /* Generate code for a parallel context */
#define CF_NOIRQ         0x00010000 /* Generate an uninterruptible TB */
#define CF_PCREL         0x00020000 /* Opcodes in TB are PC-relative */
#define CF_CLUSTER_MASK  0xff000000 /* Top 8 bits are cluster ID */
#define CF_CLUSTER_SHIFT 24

    /*
     * Above fields used for comparing
     */

    /* size of target code for this block (1 <= size <= TARGET_PAGE_SIZE) */
    uint16_t size;
    /* number of guest instructions translated into this block */
    uint16_t icount;

    /*
     * NOTE(review): struct tb_tc is declared elsewhere; presumably it
     * describes the generated host code for this TB -- confirm.
     */
    struct tb_tc tc;

    /*
     * Track tb_page_addr_t intervals that intersect this TB.
     * For user-only, the virtual addresses are always contiguous,
     * and we use a unified interval tree.  For system, we use a
     * linked list headed in each PageDesc.  Within the list, the lsb
     * of the previous pointer tells the index of page_next[], and the
     * list is protected by the PageDesc lock(s).
     */
#ifdef CONFIG_USER_ONLY
    IntervalTreeNode itree;
#else
    uintptr_t page_next[2];
    tb_page_addr_t page_addr[2];
#endif

    /* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
    QemuSpin jmp_lock;

    /* The following data are used to directly call another TB from
     * the code of this one. This can be done either by emitting direct or
     * indirect native jump instructions. These jumps are reset so that the TB
     * just continues its execution. The TB can be linked to another one by
     * setting one of the jump targets (or patching the jump instruction). Only
     * two of such jumps are supported.
     */
#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
    uint16_t jmp_reset_offset[2]; /* offset of original jump target */
    uint16_t jmp_insn_offset[2];  /* offset of direct jump insn */
    uintptr_t jmp_target_addr[2]; /* target address */

    /*
     * Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
     * Each TB can have two outgoing jumps, and therefore can participate
     * in two lists. The list entries are kept in jmp_list_next[2]. The least
     * significant bit (LSB) of the pointers in these lists is used to encode
     * which of the two list entries is to be used in the pointed TB.
     *
     * List traversals are protected by jmp_lock. The destination TB of each
     * outgoing jump is kept in jmp_dest[] so that the appropriate jmp_lock
     * can be acquired from any origin TB.
     *
     * jmp_dest[] are tagged pointers as well. The LSB is set when the TB is
     * being invalidated, so that no further outgoing jumps from it can be set.
     *
     * jmp_lock also protects the CF_INVALID cflag; a jump must not be chained
     * to a destination TB that has CF_INVALID set.
     */
    uintptr_t jmp_list_head;
    uintptr_t jmp_list_next[2];
    uintptr_t jmp_dest[2];
};

2.2.加速器 - AccelClass

AccelClass 是加速器（Accelerator）的 QOM 类，描述 TCG、KVM、XEN 等加速器共用的接口（名称、初始化方法、CPU realize/unrealize 钩子等）

// include/qemu/accel.h
/*
 * QOM class structure for an accelerator.  An accelerator (TCG, for
 * instance) fills in the name and the hooks below from its class_init
 * function.
 */
typedef struct AccelClass {
    /*< private >*/
    ObjectClass parent_class;
    /*< public >*/

    /* accelerator name; the TCG accelerator sets this to "tcg" */
    const char *name; 
    /* machine-level initialisation hook; TCG installs tcg_init_machine */
    int (*init_machine)(MachineState *ms);
#ifndef CONFIG_USER_ONLY
    /* hooks that only exist in system builds (not CONFIG_USER_ONLY) */
    void (*setup_post)(MachineState *ms, AccelState *accel);
    bool (*has_memory)(MachineState *ms, AddressSpace *as,
                       hwaddr start_addr, hwaddr size);
#endif
    /* per-CPU realize/unrealize hooks; errors are reported through errp */
    bool (*cpu_common_realize)(CPUState *cpu, Error **errp);
    void (*cpu_common_unrealize)(CPUState *cpu);

    /* gdbstub related hooks */
    int (*gdbstub_supported_sstep_flags)(void);

    /*
     * Points at a flag owned by the accelerator (TCG passes &tcg_allowed).
     * NOTE(review): presumably set once the accelerator is selected --
     * confirm against the accel core.
     */
    bool *allowed;
    /*
     * Array of global properties that would be applied when specific
     * accelerator is chosen. It works like MachineClass.compat_props
     * but it's for accelerators not machines. Accelerator-provided
     * global properties may be overridden by machine-type
     * compat_props or user-provided global properties.
     */
    GPtrArray *compat_props;
} AccelClass;

2.3.虚拟 CPU - TCGContext

TCGContext 保存指令翻译时的上下文信息，包括 CPU、当前指令块的指针、指令缓冲区等，指令块翻译时 CPU 被抽象为 TCGContext

// include/tcg/tcg.h
/*
 * State used while generating code for a translation block: the TB being
 * built (gen_tb), its ops, labels and temps, and the code buffer that the
 * generated output is written to.
 */
struct TCGContext {
    uint8_t *pool_cur, *pool_end;
    TCGPool *pool_first, *pool_current, *pool_first_large;
    int nb_labels;
    int nb_globals;
    int nb_temps;
    int nb_indirects;
    int nb_ops;
    TCGType addr_type;            /* TCG_TYPE_I32 or TCG_TYPE_I64 */

    int page_mask;
    uint8_t page_bits;
    uint8_t tlb_dyn_max_bits;
    uint8_t insn_start_words;
    TCGBar guest_mo;

    TCGRegSet reserved_regs;
    intptr_t current_frame_offset;
    intptr_t frame_start;
    intptr_t frame_end;
    TCGTemp *frame_temp;

    TranslationBlock *gen_tb;     /* tb for which code is being generated */
    tcg_insn_unit *code_buf;      /* pointer for start of tb */
    tcg_insn_unit *code_ptr;      /* pointer for running end of tb */

#ifdef CONFIG_DEBUG_TCG
    int goto_tb_issue_mask;
    const TCGOpcode *vecop_list;
#endif

    /* Code generation.  Note that we specifically do not use tcg_insn_unit
       here, because there's too much arithmetic throughout that relies
       on addition and subtraction working on bytes.  Rely on the GCC
       extension that allows arithmetic on void*.  */
    /*
     * tcg_region_assign() sets code_gen_buffer and code_gen_ptr to the start
     * of the assigned region, code_gen_buffer_size to the region length, and
     * code_gen_highwater to the region end minus TCG_HIGHWATER.
     */
    void *code_gen_buffer;
    size_t code_gen_buffer_size;
    void *code_gen_ptr;
    void *data_gen_ptr;

    /* Threshold to flush the translated code buffer.  */
    void *code_gen_highwater;

    /* Track which vCPU triggers events */
    CPUState *cpu;                      /* *_trans */

    /* These structures are private to tcg-target.c.inc.  */
#ifdef TCG_TARGET_NEED_LDST_LABELS
    QSIMPLEQ_HEAD(, TCGLabelQemuLdst) ldst_labels;
#endif
#ifdef TCG_TARGET_NEED_POOL_LABELS
    struct TCGLabelPoolData *pool_labels;
#endif

    TCGLabel *exitreq_label;

#ifdef CONFIG_PLUGIN
    /*
     * We keep one plugin_tb struct per TCGContext. Note that on every TB
     * translation we clear but do not free its contents; this way we
     * avoid a lot of malloc/free churn, since after a few TB's it's
     * unlikely that we'll need to allocate either more instructions or more
     * space for instructions (for variable-instruction-length ISAs).
     */
    struct qemu_plugin_tb *plugin_tb;

    /* descriptor of the instruction being translated */
    struct qemu_plugin_insn *plugin_insn;
#endif

    GHashTable *const_table[TCG_TYPE_COUNT];
    TCGTempSet free_temps[TCG_TYPE_COUNT];
    TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */

    QTAILQ_HEAD(, TCGOp) ops, free_ops;
    QSIMPLEQ_HEAD(, TCGLabel) labels;

    /*
     * When clear, new ops are added to the tail of @ops.
     * When set, new ops are added in front of @emit_before_op.
     */
    TCGOp *emit_before_op;

    /* Tells which temporary holds a given register.
       It does not take into account fixed registers */
    TCGTemp *reg_to_temp[TCG_TARGET_NB_REGS];

    uint16_t gen_insn_end_off[TCG_MAX_INSNS];
    uint64_t *gen_insn_data;

    /* Exit to translator on overflow. */
    /* setjmp_gen_code() arms this buffer with sigsetjmp() before translating */
    sigjmp_buf jmp_trans;
};

2.4.临时变量类型 - TCGTempKind

一般 Guest 的通用寄存器（General Purpose Register，GPR）被定义为 IR 这一层的 Global 寄存器

中间码计算时会用到一些临时变量，这些临时变量保存在 Local Temp 或 Normal Temp 这样的寄存器里，计算要用到一些常量时需要定义一个 TCG 寄存器，然后创建一个常量并将它赋给 TCG 寄存器，Global、Local Temp、Normal Temp 和 Const 这些类型的 TCG 寄存器在前端翻译时经常用到，Fixed 和 EBB 直接使用的情况不多

// include/tcg/tcg.h
/*
 * Classifies a TCG temporary by how long its value stays live and where
 * it is held; each enumerator is described individually below.
 */
typedef enum TCGTempKind {
    /*
     * Temp is dead at the end of the extended basic block (EBB),
     * the single-entry multiple-exit region that falls through
     * conditional branches.
     */
    TEMP_EBB,
    /* Temp is live across the entire translation block, but dead at end. */
    TEMP_TB,
    /* Temp is live across the entire translation block, and between them. */
    TEMP_GLOBAL,
    /* Temp is in a fixed register. */
    TEMP_FIXED,
    /* Temp is a fixed constant. */
    TEMP_CONST,
} TCGTempKind;

2.5.当前正在翻译的指令块 - DisasContextBase

DisasContextBase 保存当前正在翻译的指令块的相关信息，包括 Guest 的 PC 指针、Host 的地址等

// include/exec/translator.h
/**
 * DisasContextBase:
 * @tb: Translation block for this disassembly.
 * @pc_first: Address of first guest instruction in this TB.
 * @pc_next: Address of next guest instruction in this TB (current during
 *           disassembly).
 * @is_jmp: What instruction to disassemble next.
 * @num_insns: Number of translated instructions (including current).
 * @max_insns: Maximum number of instructions to be translated in this TB.
 * @singlestep_enabled: "Hardware" single stepping enabled.
 * @saved_can_do_io: Known value of cpu->neg.can_do_io, or -1 for unknown.
 * @plugin_enabled: TCG plugin enabled in this TB.
 * @insn_start: The last op emitted by the insn_start hook,
 *              which is expected to be INDEX_op_insn_start.
 * @host_addr: NOTE(review): not documented in this excerpt; presumably the
 *             host addresses backing the guest page(s) being translated --
 *             confirm.
 *
 * Architecture-agnostic disassembly context.
 *
 * NOTE(review): @saved_can_do_io has no matching member in the struct as
 * quoted below -- confirm against the QEMU version this excerpt comes from.
 *
 * Target translators embed this as the first member ("base") of their own
 * DisasContext and recover the outer structure with container_of().
 */
typedef struct DisasContextBase {
    TranslationBlock *tb;
    vaddr pc_first;
    vaddr pc_next;
    DisasJumpType is_jmp;
    int num_insns;
    int max_insns;
    bool singlestep_enabled;
    bool plugin_enabled;
    struct TCGOp *insn_start;
    void *host_addr[2];
} DisasContextBase;

2.6.翻译块的上下文 - DisasContext

DisasContext 保存指令块翻译时的上下文信息，包括寄存器和临时变量，类似于一个包含所有寄存器的虚拟 CPU

// target/i386/tcg/translate.c
typedef struct DisasContext {
    DisasContextBase base;

    target_ulong pc;       /* pc = eip + cs_base */
    target_ulong cs_base;  /* base of CS segment */
    target_ulong pc_save;
    ...

    /* TCG local temps */
    TCGv cc_srcT;
    TCGv A0;
    TCGv T0;
    TCGv T1;

    /* TCG local register indexes (only used inside old micro ops) */
    TCGv tmp0;
    TCGv tmp4;
    TCGv_i32 tmp2_i32;
    ...

    sigjmp_buf jmpbuf;
    TCGOp *prev_insn_start;
    TCGOp *prev_insn_end;
} DisasContext;

3.TCG 初始化

对全局 TCG 模块初始化，函数入口为 configure_accelerators()，其将 CPU 抽象为 TCGContext，定义翻译块的拆分方法和保存方式（树状保存各个 TCG 输出的翻译块），最后注册 JIT，组织 ELF 结构并构建 ELF

3.1.实例化 TCG 对象

instance_init() 为 TCG 对象的实例化方法

// accel/tcg/tcg-all.c
/*
 * QOM type description of the TCG accelerator: a subtype of TYPE_ACCEL
 * whose instances are TCGState-sized objects, with per-instance and
 * per-class initialisers supplied below.
 */
static const TypeInfo tcg_accel_type = {
    .name = TYPE_TCG_ACCEL,
    .parent = TYPE_ACCEL,
    .instance_init = tcg_accel_instance_init,
    .class_init = tcg_accel_class_init,
    .instance_size = sizeof(TCGState),
};
module_obj(TYPE_TCG_ACCEL);
-------------------------------------------------------

// accel/tcg/tcg-all.c
/*
 * Class initialiser for the TCG accelerator type.
 *
 * @oc:   the class being initialised; viewed as an AccelClass.
 * @data: not used by this function.
 *
 * Fills in the AccelClass hooks with the TCG implementations and registers
 * the class properties "thread", "tb-size", "split-wx" and
 * "one-insn-per-tb".
 */
static void tcg_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    /* accelerator identity and hooks */
    ac->name = "tcg";
    ac->init_machine = tcg_init_machine;
    ac->cpu_common_realize = tcg_exec_realizefn;
    ac->cpu_common_unrealize = tcg_exec_unrealizefn;
    ac->allowed = &tcg_allowed;
    ac->gdbstub_supported_sstep_flags = tcg_gdbstub_supported_sstep_flags;

    /* "thread": string property backed by tcg_get_thread/tcg_set_thread */
    object_class_property_add_str(oc, "thread",
                                  tcg_get_thread,
                                  tcg_set_thread);

    /* "tb-size": integer property with custom getter and setter */
    object_class_property_add(oc, "tb-size", "int",
        tcg_get_tb_size, tcg_set_tb_size,
        NULL, NULL);
    object_class_property_set_description(oc, "tb-size",
        "TCG translation block cache size");

    /* "split-wx": boolean property */
    object_class_property_add_bool(oc, "split-wx",
        tcg_get_splitwx, tcg_set_splitwx);
    object_class_property_set_description(oc, "split-wx",
        "Map jit pages into separate RW and RX regions");

    /* "one-insn-per-tb": boolean property */
    object_class_property_add_bool(oc, "one-insn-per-tb",
                                   tcg_get_one_insn_per_tb,
                                   tcg_set_one_insn_per_tb);
    object_class_property_set_description(oc, "one-insn-per-tb",
        "Only put one guest insn in each translation block");
}

QEMU 初始化时为虚拟机创建加速器 Accelerator，同时执行 Accelerator 的初始化方法

// system/vl.c
qemu_init()
    |--> configure_accelerators(argv[0]);
        |--> // 查找accelerate：accelerators = "tcg:kvm";
        |--> do_configure_accelerator() // 初始化
            |--> const char *acc = qemu_opt_get(opts, "accel"); // acc = tcg
            |--> AccelClass *ac = accel_find(acc);
            |--> AccelState *accel; // Object accel: Class ac --> Object accel
            |--> accel = ACCEL(object_new_with_class(OBJECT_CLASS(ac)));  // 创建Object accel
            |--> // properties setting
            |--> ret = accel_init_machine(accel, current_machine);
                |--> ms->accelerator = accel;
                |--> ret = acc->init_machine(ms);
                    |--> tcg_init_machine()
                |--> object_set_accelerator_compat_props(acc->compat_props);
                    |--> object_compat_props[0] = compat_props; // 设置accel全局properties

3.2.开发板创建时绑定 TCG 加速器 - tcg_init_machine()

tcg_init_machine() 初始化翻译块使用的内存页、哈希表和上下文

// accel/tcg/tcg-all.c
static int tcg_init_machine(MachineState *ms)
{
    TCGState *s = TCG_STATE(current_accel());
    ...
    unsigned max_cpus = ms->smp.max_cpus; // 可以通过-smp指定，默认为1
    ...

    page_init(); // 翻译块页表初始化
        |--> page_table_config_init();
    tb_htable_init(); // 翻译块哈希表初始化，用于搜索翻译块
        |--> qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode);
    tcg_init(s->tb_size * MiB, s->splitwx_enabled, max_cpus); // 上下文初始化
        |--> tcg_context_init(max_cpus);
        |--> tcg_region_init(tb_size, splitwx, max_cpus);

3.3.虚拟 CPU 初始化

每个虚拟的 CPU 对应一个 TCGContext，即在 IR 层 CPU 被抽象为 TCGContext

// tcg/tcg.c
static void tcg_context_init(unsigned max_cpus)
{
    TCGContext *s = &tcg_init_ctx;
    int op, total_args, n, i;
    TCGOpDef *def;
    TCGArgConstraint *args_ct;
    TCGTemp *ts;

    // 基本初始化
    memset(s, 0, sizeof(*s));
    ...
    init_call_layout(&info_helper_ld32_mmu);
    ...
    // 初始化IR转换所使用的寄存器
    tcg_target_init(s);

    // 操作码定义
    process_op_defs(s);

    /* Reverse the order of the saved registers, assuming they're all at
       the start of tcg_target_reg_alloc_order.  */
    for (n = 0; n < ARRAY_SIZE(tcg_target_reg_alloc_order); ++n) {
        int r = tcg_target_reg_alloc_order[n];
        if (tcg_regset_test_reg(tcg_target_call_clobber_regs, r)) {
            break;
        }
    }
    for (i = 0; i < n; ++i) {
        indirect_reg_alloc_order[i] = tcg_target_reg_alloc_order[n - 1 - i];
    }
    for (; i < ARRAY_SIZE(tcg_target_reg_alloc_order); ++i) {
        indirect_reg_alloc_order[i] = tcg_target_reg_alloc_order[i];
    }

    alloc_tcg_plugin_context(s);

    tcg_ctx = s;
    /*
     * In user-mode we simply share the init context among threads, since we
     * use a single region. See the documentation tcg_region_init() for the
     * reasoning behind this.
     * In system-mode we will have at most max_cpus TCG threads.
     */
#ifdef CONFIG_USER_ONLY
    tcg_ctxs = &tcg_ctx;
    tcg_cur_ctxs = 1;
    tcg_max_ctxs = 1;
#else
    tcg_max_ctxs = max_cpus; // 每个虚拟的CPU对应一个TCG Context
    tcg_ctxs = g_new0(TCGContext *, max_cpus); // 为TCG Context分配内存
#endif

    tcg_debug_assert(!tcg_regset_test_reg(s->reserved_regs, TCG_AREG0));
    ts = tcg_global_reg_new_internal(s, TCG_TYPE_PTR, TCG_AREG0, "env");
    tcg_env = temp_tcgv_ptr(ts);
}

3.4.内存页初始化 - tcg_region_init()

将 alloc_code_gen_buffer() 分配的 Buffer 拆分成多块（Region），使 CPU 可以并行处理，同时，把第一块 Region 分配给最初始的 TCGContext tcg_init_ctx，user-mode 中 tcg_init_ctx 即为唯一的 TCGContext，system-mode 中 TCGContext 的数量与 vCPU 的个数相关

// tcg/region.c
void tcg_region_init(size_t tb_size, int splitwx, unsigned max_cpus)
{
    const size_t page_size = qemu_real_host_page_size();
    size_t region_size;
    int have_prot, need_prot;

    /* Size the buffer.  */
    if (tb_size == 0) { // 获取内存页大小，宿主机为Linux，一个内存页大小为4KB
        ...

    // 分配实际的内存空间
    have_prot = alloc_code_gen_buffer(tb_size, splitwx, &error_fatal);
        |--> alloc_code_gen_buffer_anon()
            |--> buf = mmap(NULL, size, prot, flags, -1, 0);
            |--> region.start_aligned = buf;
            |--> region.total_size = size;
    ...

    // 建议内核为该 Buffer 使用大页（Huge Page），以减少 TLB 未命中带来的开销
    /* Request large pages for the buffer and the splitwx.  */
    qemu_madvise(region.start_aligned, region.total_size, QEMU_MADV_HUGEPAGE);
    ...

    /* A region must have at least 2 pages; one code, one guard */
    g_assert(region_size >= 2 * page_size);
    region.stride = region_size;

    // 为保护页（Guard Page）预留空间：每个 Region 末尾留出一页
    /* Reserve space for guard pages. */
    region.size = region_size - page_size;
    region.total_size -= page_size;

    /*
     * The first region will be smaller than the others, via the prologue,
     * which has yet to be allocated.  For now, the first region begins at
     * the page boundary.
     */
    region.after_prologue = region.start_aligned;

    /* init the region struct */
    qemu_mutex_init(&region.lock);

    /*
     * Set guard pages in the rw buffer, as that's the one into which
     * buffer overruns could occur.  Do not set guard pages in the rx
     * buffer -- let that one use hugepages throughout.
     * Work with the page protections set up with the initial mapping.
     */
    need_prot = PROT_READ | PROT_WRITE;
    ...

    // 将TCG Region Memory以树状保存，同时定义了释放每个Region Memory的方法
    tcg_region_trees_init();

    /*
     * Leave the initial context initialized to the first region.
     * This will be the context into which we generate the prologue.
     * It is also the only context for CONFIG_USER_ONLY.
     */
    tcg_region_initial_alloc__locked(&tcg_init_ctx);
        |--> bool err = tcg_region_alloc__locked(s);
            |--> tcg_region_assign(s, region.current);
                |--> tcg_region_bounds(curr_region, &start, &end);
                |--> s->code_gen_buffer = start;
                |--> s->code_gen_ptr = start;
                |--> s->code_gen_buffer_size = end - start;
                |--> s->code_gen_highwater = end - TCG_HIGHWATER;
}

3.5.翻译准备 - tcg_prologue_init()

Prologue 保存主函数的 Frame Pointer，用于在子函数调用结束后恢复主函数的栈帧，同时为子函数准备栈帧，其主要操作包括：

保存主函数的 Frame Pointer，将保存在 ebp 寄存器的 Frame Pointer 压栈，退出子函数时可以从栈中恢复主函数的 Frame Pointer；
将 esp 赋值给 ebp，即将子函数的 Frame Pointer 指向主函数的栈顶；
修改栈顶指针 esp，为子函数的本地变量分配栈空间；

main 函数并不是程序第一个调用的函数，main 函数也是一个被调用的函数，其也有栈帧，程序第一个被调用的函数 _start 也会模拟一个栈帧

// tcg/tcg.c
__thread TCGContext *tcg_ctx; // 每个线程中都有一个tcg_ctx实例，且互不干扰
----------------------------------------------------------

// tcg/tcg.c
void tcg_prologue_init(void)
{
    TCGContext *s = tcg_ctx;
    size_t prologue_size;

    s->code_ptr = s->code_gen_ptr;
    s->code_buf = s->code_gen_ptr;
    s->data_gen_ptr = NULL;
    ...

    /* Generate the prologue.  */
    tcg_target_qemu_prologue(s); // 需要区分平台（i386、ARM、MIPS等）
    ...

    // 注册JIT（即时翻译/动态翻译）
    tcg_region_prologue_set(s);
        |--> tcg_register_jit()
            |--> tcg_register_jit_int()
                |--> // 构建内存中的 ELF 镜像（见下文 tcg_register_jit_int() 的定义）
}
----------------------------------------------------------

// tcg/tcg.c
static void tcg_register_jit_int(const void *buf_ptr, size_t buf_size,
                                 const void *debug_frame,
                                 size_t debug_frame_size)
{
    ...
    struct ElfImage {
        ElfW(Ehdr) ehdr; // ELF Header
        ElfW(Phdr) phdr; // Program Header
        ElfW(Shdr) shdr[7]; // Section Header
        ElfW(Sym)  sym[2]; // String and Symbol Tables
        struct DebugInfo di;
        uint8_t    da[24];
        char       str[80];
    };
    
    struct ElfImage *img;
    
    // 创建ELF模板
    static const struct ElfImage img_template = {
        .ehdr = {
            .e_ident[EI_MAG0] = ELFMAG0,
        ...
        
        .shdr = {
        ...

        .sym = {
            [1] = { /* code_gen_buffer */
                .st_info = ELF_ST_INFO(STB_GLOBAL, STT_FUNC),
                .st_shndx = 1,
            }
        },
        ...
        
        .str = "\0" ".text\0" ".debug_info\0" ".debug_abbrev\0"
               ".debug_frame\0" ".symtab\0" ".strtab\0" "code_gen_buffer",
    };
    
    /* We only need a single jit entry; statically allocate it.  */
    static struct jit_code_entry one_entry;

    uintptr_t buf = (uintptr_t)buf_ptr;
    size_t img_size = sizeof(struct ElfImage) + debug_frame_size;
    ...

    img = g_malloc(img_size);
    *img = img_template;
    
    img->phdr.p_vaddr = buf;
    ...

3.6.虚拟 CPU 操作函数 - TCGCPUOps

TCGCPUOps 保存虚拟 CPU 的相关接口，包括中断入口、指令块同步、虚拟 CPU 退出等接口

// target/i386/tcg/tcg-cpu.c
static const TCGCPUOps x86_tcg_ops = {
    .initialize = tcg_x86_init,
    .synchronize_from_tb = x86_cpu_synchronize_from_tb,
    .restore_state_to_opc = x86_restore_state_to_opc,
    .cpu_exec_enter = x86_cpu_exec_enter,
    .cpu_exec_exit = x86_cpu_exec_exit,
    ...
    .tlb_fill = x86_cpu_tlb_fill,
    .do_interrupt = x86_cpu_do_interrupt,
    .cpu_exec_halt = x86_cpu_exec_halt,
    .cpu_exec_interrupt = x86_cpu_exec_interrupt,
    .do_unaligned_access = x86_cpu_do_unaligned_access,
    .debug_excp_handler = breakpoint_handler,
    .debug_check_breakpoint = x86_debug_check_breakpoint,
    .need_replay_interrupt = x86_need_replay_interrupt,
};

主板初始化时会绑定 CPU Class 和 TCG 的 tcg_ops

// system/vl.c
qemu_init()
    |--> configure_accelerators() // TCG初始化
    |--> ...
    |--> machine_run_board_init()
            |--> accel_init_interfaces(ACCEL_GET_CLASS(machine->accelerator));
                |--> accel_system_init_ops_interfaces(ac); // ac.name = tcg
                    |--> AccelOpsClass *ops;
                    |--> ops->ops_init(ops);
                        |--> tcg_accel_ops_init() // TCG Operation相关方法绑定
                            |--> ops->create_vcpu_thread = mttcg_start_vcpu_thread;
                            |--> ...
                    |--> cpus_register_accel(ops);
                        |--> cpus_accel = ops;
                |--> accel_init_cpu_interfaces(ac);
                    |--> acc = object_class_by_name(acc_name); // ac_name = tcg-accel
                    |--> accel_init_cpu_int_aux()
                        |--> accel_cpu->cpu_class_init(cc);
                            |--> x86_tcg_cpu_class_init()
                                |--> acc->cpu_target_realize = tcg_cpu_realizefn;
                                |--> ...
                        |--> cc->init_accel_cpu(accel_cpu, cc);
                            |--> x86_tcg_cpu_class_init()
                                |--> cc->init_accel_cpu = x86_tcg_cpu_init_ops;

4.TCG 线程

x86_cpu_new() 创建 CPU 并执行 realize 方法，对于 i386 平台，Object CPU 的 name 为 qemu64-x86_64-cpu，其 Parent 为 Class CPU，对应的 realize 方法为 x86_cpu_realizefn()，执行该方法时初始化 vCPU 线程

// target/i386/cpu.c
static const TypeInfo x86_cpu_type_info = {
    .name = TYPE_X86_CPU,
    .parent = TYPE_CPU,
    ...
    
    .class_init = x86_cpu_common_class_init,
        |--> device_class_set_parent_realize(dc, x86_cpu_realizefn, &xcc->parent_realize);
        |--> ...
};
---------------------------------------------------------------------------

// hw/core/machine.c
machine_run_board_init()
|--> machine_class->init(machine);
    |--> pc_init1()
        |--> x86_cpus_init(x86ms, pcmc->default_cpu_version);
            |--> x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
                |--> Object *cpu = object_new(MACHINE(x86ms)->cpu_type); // name = qemu64-x86_64-cpu
                |--> qdev_realize(DEVICE(cpu), NULL, errp);
                    |--> x86_cpu_realizefn()
                        |--> qemu_init_vcpu(cs);
                            |--> cpus_accel->create_vcpu_thread(cpu);

CPU Accelerator 调用 create_vcpu_thread() 方法创建线程，在 TCG 初始化时绑定对应的 Operation

// accel/tcg/tcg-accel-ops.c
static void tcg_accel_ops_init(AccelOpsClass *ops)
{
    if (qemu_tcg_mttcg_enabled()) {
        ops->create_vcpu_thread = mttcg_start_vcpu_thread;
    ...
---------------------------------------------------------------------------

// accel/tcg/tcg-accel-ops-mttcg.c
/*
 * Create and start the dedicated thread for one vCPU (MTTCG mode).
 *
 * @cpu: the vCPU to start; its thread and halt_cond members are allocated
 *       here and the CPUState is handed to the new thread as its argument.
 *
 * The new thread runs mttcg_cpu_thread_fn() and is created joinable.
 */
void mttcg_start_vcpu_thread(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    g_assert(tcg_enabled());
    /*
     * The second argument is true when the machine allows more than one CPU.
     * NOTE(review): presumably this selects code generation for a parallel
     * context (CF_PARALLEL) -- confirm in tcg_cpu_init_cflags().
     */
    tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1);

    /* zero-initialised thread handle and halt condition for this vCPU */
    cpu->thread = g_new0(QemuThread, 1);
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    /* create a thread per vCPU with TCG (MTTCG) */
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
             cpu->cpu_index);

    qemu_thread_create(cpu->thread, thread_name, mttcg_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
}

TCG 线程创建后运行该线程，线程自身执行的函数为 mttcg_cpu_thread_fn()

// util/qemu-thread-posix.c
qemu_thread_create()
|--> err = pthread_create(&thread->thread, &attr, qemu_thread_start, qemu_thread_args);
    |--> void *(*start_routine)(void *) = qemu_thread_args->start_routine;
    |--> r = start_routine(arg); // start_routine --> mttcg_cpu_thread_fn
        |--> mttcg_cpu_thread_fn()

4.1.TCG 线程启动 - mttcg_cpu_thread_fn()

tcg_region_init() 为最初始的 TCGContext tcg_init_ctx 分配空间，user-mode 使用这唯一的一个，system-mode 中会重新创建新的 ctx

由于使用了直接赋值 *s = tcg_init_ctx，因此新创建的 ctx 和 tcg_init_ctx 的 mem_base 指针指向同一块内存，因此需要将 ctx 自身的 mem_base 指针指向自身所属的内存

CPU 线程执行是在一个循环中，如果没有中断/调试等信号，则会持续不断地执行，qemu_wait_io_event() 会响应一些外部中断

// accel/tcg/tcg-accel-ops-mttcg.c
static void *mttcg_cpu_thread_fn(void *arg)
{
    MttcgForceRcuNotifier force_rcu;
    CPUState *cpu = arg;
    ...
    
    // RCU Config
    rcu_register_thread();
    ...
    tcg_register_thread();
    |--> TCGContext *s = g_malloc(sizeof(*s));
    |--> *s = tcg_init_ctx;
    |--> /* Relink mem_base.  */
    |--> tcg_ctx = s;
    ...

    // CPU Thread Run
    do {
        if (cpu_can_run(cpu)) {
            ...
            r = tcg_cpu_exec(cpu);
            |--> ret = cpu_exec(cpu);
                |--> ret = cpu_exec_setjmp(cpu, &sc);
                    |--> return cpu_exec_loop(cpu, sc);
 
            ...
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                ...
                break;
            case EXCP_ATOMIC:
                ...
                cpu_exec_step_atomic(cpu);
                ...
        }
        ...
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

4.2.虚拟 CPU 运行 - cpu_exec_loop()

EIP (Extended Instruction Pointer) 寄存器始终指向 CPU 中的下一条指令，当 CPU 执行完当前指令后，EIP 将自动递增，指向下一条待执行的指令

PC (Program Counter) 与 IP/EIP 本质上是同一类寄存器在不同架构中的称呼，都用于定位待执行的指令；在 QEMU 的 x86 前端中，两者的区别在于是否包含段基址：EIP 是相对于 CS 段基址的偏移，而翻译块使用的 pc 是线性地址，即 pc = cs_base + eip

cpu_exec_loop() 循环获取当前的 PC 指针，翻译该 PC 指向的 Guest Code，同时检测是否有中断或异常产生

// accel/tcg/cpu-exec.c
/* main execution loop */
static int __attribute__((noinline))
cpu_exec_loop(CPUState *cpu, SyncClocks *sc)
{
    ...
    /* if an exception is pending, we execute it here */
    while (!cpu_handle_exception(cpu, &ret)) { // 异常处理
        TranslationBlock *last_tb = NULL;
        ...

        while (!cpu_handle_interrupt(cpu, &last_tb)) { // 中断处理
            TranslationBlock *tb;
            vaddr pc;
            uint64_t cs_base;
            uint32_t flags, cflags;

            // 获取cs_base指针和pc指针
            cpu_get_tb_cpu_state(cpu_env(cpu), &pc, &cs_base, &flags);
                // Code Segment Base：0xffff0000
                |--> *cs_base = env->segs[R_CS].base;
                // Program Counter：pc = cs_base + eip
                |--> *pc = (uint32_t)(*cs_base + env->eip);

            // #define CF_PCREL         0x00020000 /* Opcodes in TB are PC-relative */
            // #define CF_PARALLEL      0x00008000 /* Generate code for a parallel context */
            cflags = cpu->cflags_next_tb;
            if (cflags == -1) {
                cflags = curr_cflags(cpu);
            ...

            // CPU线程刚开始运行时还不存在TranslationBlock，因此需要重新创建
            tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
            if (tb == NULL) {
                ...
                tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
            ...

            // 运行翻译块
            cpu_loop_exec_tb(cpu, tb, pc, &last_tb, &tb_exit);

            ...
            align_clocks(sc, cpu);
        }
    }

    return ret;
}

4.2.1.指令翻译准备 - tb_gen_code()

tb_gen_code() 为后续指令翻译准备内存页，并分配 TB 块

// accel/tcg/translate-all.c
TranslationBlock *tb_gen_code(CPUState *cpu,
                              vaddr pc, uint64_t cs_base,
                              uint32_t flags, int cflags)
{
    CPUArchState *env = cpu_env(cpu);
    TranslationBlock *tb, *existing_tb;
    tb_page_addr_t phys_pc, phys_p2;
    tcg_insn_unit *gen_code_buf;
    ...
    void *host_pc;

    ...
    // 从Host中获取内存页
    phys_pc = get_page_addr_code_hostp(env, pc, &host_pc);
    ...
    
    // 计算每个TB块中存储多少条翻译指令：max instruction = 512
    max_insns = cflags & CF_COUNT_MASK;
    if (max_insns == 0) {
        max_insns = TCG_MAX_INSNS;
    }
    
    tb = tcg_tb_alloc(tcg_ctx);
        |--> tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align);
    ...
    gen_code_buf = tcg_ctx->code_gen_ptr;
    ...
    tcg_ctx->gen_tb = tb;
    ...
    
    // 循环逐条翻译指令
    gen_code_size = setjmp_gen_code(env, tb, pc, host_pc, &max_insns, &ti);
        |--> int ret = sigsetjmp(tcg_ctx->jmp_trans, 0); // 保存当前堆栈环境
        |--> gen_intermediate_code(env_cpu(env), tb, max_insns, pc, host_pc);
            |--> translator_loop(cpu, tb, max_insns, pc, host_pc, &i386_tr_ops, &dc.base);
        |--> return tcg_gen_code(tcg_ctx, tb, pc);
    ...

    search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
    ...
    
    /*
     * Insert TB into the corresponding region tree before publishing it
     * through QHT. Otherwise rewinding happened in the TB might fail to
     * lookup itself using host PC.
     */
    tcg_tb_insert(tb); // 将TB插入TCG Region Tree
    ...
    
    // 关联TB和虚拟机的物理内存页（GPA）
    existing_tb = tb_link_page(tb);
    ...
    
    return tb;
}

4.2.2.指令翻译 - translator_loop() & disas_insn()

translator_loop() 调用架构对应的指令翻译接口 disas_insn()，将 Guest Code 翻译为中间码

// accel/tcg/translator.c
void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
                     vaddr pc, void *host_pc, const TranslatorOps *ops,
                     DisasContextBase *db)
{
    ...
    /* Initialize DisasContext */
    db->tb = tb;
    ...
    
    /* Start translating.  */
    icount_start_insn = gen_tb_start(db, cflags);
    ...
    while (true) {
        ...
        ops->insn_start(db, cpu);
            |--> i386_tr_insn_start()
                |--> tcg_gen_insn_start(pc_arg, dc->cc_op);
                    |--> TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 64 / TCG_TARGET_REG_BITS);
                        |--> TCGOp *op = tcg_op_alloc(opc, nargs);
                    |--> tcg_set_insn_start_param(op, 0, pc); // op->args[0] = pc
                    |--> tcg_set_insn_start_param(op, 1, a1); // op->args[1] = a1
        ...
        ops->translate_insn(db, cpu);
            |--> i386_tr_translate_insn()
                |--> DisasContext *dc = container_of(dcbase, DisasContext, base);
                |--> disas_insn(dc, cpu) // 逐条翻译指令，保存在DisasContext的临时变量中
        ...
    }

    /* Emit code to exit the TB, as indicated by db->is_jmp.  */
    ops->tb_stop(db, cpu);
        |--> i386_tr_tb_stop()

    gen_tb_end(tb, cflags, icount_start_insn, db->num_insns);
    ...

    /* The disas_log hook may use these values rather than recompute.  */
    tb->size = db->pc_next - db->pc_first;
    tb->icount = db->num_insns;
    ...
}
----------------------------------------------------------------------------------

// target/i386/tcg/translate.c
/* convert one instruction. s->base.is_jmp is set if the translation must
   be stopped. Return the next pc value */
static bool disas_insn(DisasContext *s, CPUState *cpu)
{
    CPUX86State *env = cpu_env(cpu);
    int b, prefixes;
    int shift;
    MemOp ot, aflag, dflag;
    int modrm, reg, rm, mod, op, opreg, val;
    bool orig_cc_op_dirty = s->cc_op_dirty;
    CCOp orig_cc_op = s->cc_op;
    target_ulong orig_pc_save = s->pc_save;

    s->pc = s->base.pc_next;
    s->override = -1;
    ...
    
 next_byte:
    s->prefix = prefixes;
    b = x86_ldub_code(env, s);
    /* Collect prefixes.  */
    switch (b) {
    default:
        break;
    case 0x0f:
        b = x86_ldub_code(env, s) + 0x100;
    ...

4.3.指令执行 - cpu_loop_exec_tb()

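TCG 中定义了全局的函数指针 tcg_qemu_tb_exec，它在 tcg_prologue_init() 中被设置为 Prologue 代码在可执行（RX）映射中的地址；tcg_splitwx_to_rx() 并不修改内存权限，只是把可写（RW）映射中的地址加上 tcg_splitwx_diff，换算成 RX 映射中的对应地址。调用该函数指针并传入 tb_ptr，Host CPU 就会经由 Prologue 执行翻译块中保存的指令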

// include/tcg/tcg.h
typedef uintptr_t tcg_prologue_fn(CPUArchState *env, const void *tb_ptr);
extern tcg_prologue_fn *tcg_qemu_tb_exec;
----------------------------------------------------------------------

// tcg/tcg.c
void tcg_prologue_init(void)
{
    TCGContext *s = tcg_ctx;
    ...
    
    tcg_qemu_tb_exec = (tcg_prologue_fn *)tcg_splitwx_to_rx(s->code_ptr);
    ...


// accel/tcg/cpu-exec.c
static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
                                    vaddr pc, TranslationBlock **last_tb,
                                    int *tb_exit)
{
    ...
    // 函数指针tcg_qemu_tb_exec指向Prologue的可执行（RX）地址，调用时传入tb_ptr
    // Host CPU随即执行该翻译块中保存的指令
    tb = cpu_tb_exec(cpu, tb, tb_exit);
        |--> ret = tcg_qemu_tb_exec(cpu_env(cpu), tb_ptr);
            |--> tcg_splitwx_to_rx()
                |--> return rw ? rw + tcg_splitwx_diff : NULL;
        
    ...

    /* Ensure global icount has gone forward */
    icount_update(cpu);
    ...