From 6a342f2c64f93d3d795c3fe26ba5b8895e6291c9 Mon Sep 17 00:00:00 2001
From: Rui Ueyama <ruiu@cs.stanford.edu>
Date: Thu, 2 Jan 2025 22:44:58 +0900
Subject: [PATCH] Refactor

---
 src/arch-arm32.cc | 46 +++++++++++++++++++++++-----------------------
 src/mold.h        | 29 ++++++++++++++++-------------
 src/thunks.cc     | 40 +++++++++++++++++++++-------------------
 3 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/src/arch-arm32.cc b/src/arch-arm32.cc
index 03fff7e8f8..82646c3a9b 100644
--- a/src/arch-arm32.cc
+++ b/src/arch-arm32.cc
@@ -243,25 +243,21 @@ void EhFrameSection<E>::apply_eh_reloc(Context<E> &ctx, const ElfRel<E> &rel,
   }
 }
 
-// ARM and Thumb branch instructions can jump within ±16 MiB.
-static bool is_jump_reachable(i64 val) {
-  return sign_extend(val, 24) == val;
+static bool is_reachable(i64 disp) {
+  return -branch_distance<E> <= disp && disp < branch_distance<E>;
+}
+
+static Thunk<E> &get_reachable_thunk(OutputSection<E> &osec, u64 addr) {
+  for (std::unique_ptr<Thunk<E>> &thunk : osec.thunks)
+    if (is_reachable(thunk->get_addr() - addr))
+      return *thunk;
+  abort();
 }
 
 template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  auto get_tls_trampoline_addr = [&](u64 addr) {
-    for (i64 i = 0; i < output_section->thunks.size(); i++) {
-      i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
-                 addr;
-      if (-branch_distance<E> <= disp && disp < branch_distance<E>)
-        return disp;
-    }
-    abort();
-  };
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX)
@@ -287,6 +283,10 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     auto get_thumb_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); };
     auto get_arm_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 4; };
 
+    auto get_tlsdesc_trampoline_addr = [&] {
+      return get_reachable_thunk(*output_section, P).get_addr();
+    };
+
     switch (rel.r_type) {
     case R_ARM_ABS32:
       break;
@@ -305,7 +305,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       // They are different in only one bit. We need to use BL if
       // the jump target is Thumb. Otherwise, use BLX.
       i64 val = S + A - P;
-      if (is_jump_reachable(val)) {
+      if (is_reachable(val)) {
         if (T) {
           write_thm_b_imm(loc, val);
           *(ul16 *)(loc + 2) |= 0x1000; // rewrite to BL
@@ -345,8 +345,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       if (!is_bl && !is_blx)
         Fatal(ctx) << *this << ": R_ARM_CALL refers to neither BL nor BLX";
 
-      u64 val = S + A - P;
-      if (is_jump_reachable(val)) {
+      i64 val = S + A - P;
+      if (is_reachable(val)) {
         if (T) {
           *(ul32 *)loc = 0xfa00'0000; // BLX
           *(ul32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2);
@@ -372,8 +372,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       // immediate; it takes only a register. So if mode switch is
       // required, we jump to a linker-synthesized thunk which does the
       // job with a longer code sequence.
-      u64 val = S + A - P;
-      if (!is_jump_reachable(val) || T)
+      i64 val = S + A - P;
+      if (!is_reachable(val) || T)
         val = get_arm_thunk_addr() + A - P;
       *(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
       break;
@@ -418,8 +418,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
 
       // Just like R_ARM_JUMP24, we need to jump to a thunk if we need to
      // switch processor mode.
-      u64 val = S + A - P;
-      if (!is_reachable(val) || !T)
+      i64 val = S + A - P;
+      if (!is_reachable(val) || !T)
        val = get_thumb_thunk_addr() + A - P;
       write_thm_b_imm(loc, val);
       break;
@@ -504,8 +504,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       break;
     case R_ARM_TLS_CALL:
       if (sym.has_tlsdesc(ctx)) {
-        // BL
-        *(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2);
+        *(ul32 *)loc = 0xeb00'0000; // bl 0
+        *(ul32 *)loc |= bits(get_tlsdesc_trampoline_addr() - P - 8, 25, 2);
       } else if (sym.has_gottp(ctx)) {
         *(ul32 *)loc = 0xe79f'0000; // ldr r0, [pc, r0]
       } else {
@@ -514,7 +514,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       break;
     case R_ARM_THM_TLS_CALL:
       if (sym.has_tlsdesc(ctx)) {
-        u64 val = align_to(get_tls_trampoline_addr(P + 4), 4);
+        u64 val = align_to(get_tlsdesc_trampoline_addr() - P - 4, 4);
         write_thm_b_imm(loc, val);
         *(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX
       } else if (sym.has_gottp(ctx)) {
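
Note on the arch-arm32.cc half of the patch: replacing is_jump_reachable()
with is_reachable() swaps a sign-extension trick for an explicit window test
against branch_distance, and the remaining `u64 val = S + A - P` declarations
become `i64 val` so the displacement is visibly signed before it is compared
against the window's negative lower bound. The two predicates are not
bit-for-bit identical: branch_distance already subtracts a 16-byte safety
margin, so the new test is slightly narrower than the old ±16 MiB
sign_extend() check. Here is a standalone sketch contrasting the two; it is
demo code, not mold's (the sign_extend helper is modeled on mold's and the
ARM32 branch_distance value is assumed):

    #include <cassert>
    #include <cstdint>

    using i64 = int64_t;
    using u64 = uint64_t;

    // Assumed ARM32 value: +-16 MiB minus the 16-byte safety margin.
    constexpr i64 branch_distance = (1 << 24) - 16;

    // Modeled on mold's helper: treat bit `size` of `val` as the sign bit.
    inline i64 sign_extend(u64 val, i64 size) {
      return (i64)(val << (63 - size)) >> (63 - size);
    }

    // Old predicate: disp survives sign-extension from bit 24, i.e. it
    // fits in 25 signed bits (+-16 MiB exactly).
    bool old_check(i64 disp) { return sign_extend(disp, 24) == disp; }

    // New predicate: an explicit half-open window, 16 bytes narrower.
    bool new_check(i64 disp) {
      return -branch_distance <= disp && disp < branch_distance;
    }

    int main() {
      assert(old_check(0) && new_check(0));
      assert(old_check(100) == new_check(100));
      // Only displacements inside the safety margin disagree; the new
      // check conservatively routes them through a thunk instead.
      i64 edge = -(i64(1) << 24); // exactly -16 MiB
      assert(old_check(edge) && !new_check(edge));
      return 0;
    }

The narrowing is harmless: at worst a branch that the hardware could barely
encode is sent through a thunk it did not strictly need.
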
diff --git a/src/mold.h b/src/mold.h
index 70fabc38da..cf68ca1242 100644
--- a/src/mold.h
+++ b/src/mold.h
@@ -106,9 +106,12 @@ class Thunk {
   i64 size() const { return E::thunk_hdr_size + symbols.size() * E::thunk_size; }
   void copy_buf(Context<E> &ctx);
 
-  u64 get_addr(i64 idx) const {
-    return output_section.shdr.sh_addr + offset + E::thunk_hdr_size +
-           idx * E::thunk_size;
+  u64 get_addr() const {
+    return output_section.shdr.sh_addr + offset;
+  }
+
+  u64 get_addr(i64 i) const {
+    return get_addr() + E::thunk_hdr_size + E::thunk_size * i;
   }
 
   OutputSection<E> &output_section;
@@ -116,8 +119,6 @@ class Thunk {
   std::vector<Symbol<E> *> symbols;
 };
 
-template <typename E> void gather_thunk_addresses(Context<E> &);
-
 template <typename E>
 static consteval i64 get_branch_distance() {
   // ARM64's branch has 26 bits immediate. The immediate is padded with
@@ -149,10 +150,13 @@ static consteval i64 get_branch_distance() {
 // on the target architecture. For example, ARM32's B instruction jumps to
 // the branch's address + immediate + 4 (i.e., B with offset 0 jumps to
 // the next instruction), while RISC-V has no such implicit bias. Here, we
-// subtract 16 as a safety margin.
+// subtract 16 as a safety margin that is large enough for all targets.
 template <typename E>
 static constexpr i64 branch_distance = get_branch_distance<E>() - 16;
 
+template <typename E>
+void gather_thunk_addresses(Context<E> &ctx);
+
 //
 // input-sections.cc
 //
@@ -2946,16 +2950,15 @@ inline void Symbol<E>::set_djb_hash(Context<E> &ctx, u32 hash) {
 }
 
 template <typename E>
-u64
+inline u64
 Symbol<E>::get_thunk_addr(Context<E> &ctx, u64 P) const requires needs_thunk<E> {
-  assert(aux_idx != -1);
   std::span<u64> vec = ctx.symbol_aux[aux_idx].thunk_addrs;
-  u64 min = (P < branch_distance<E>) ? 0 : P - branch_distance<E>;
-  auto it = std::lower_bound(vec.begin(), vec.end(), min);
-  assert(it != vec.end());
-  assert(*it < (P + branch_distance<E> < P) ? UINT64_MAX : P + branch_distance<E>);
-  return *it;
+  u64 lo = (P < branch_distance<E>) ? 0 : P - branch_distance<E>;
+  u64 val = *std::lower_bound(vec.begin(), vec.end(), lo);
+  assert(-branch_distance<E> <= (i64)(val - P) &&
+         (i64)(val - P) < branch_distance<E>);
+  return val;
 }
 
 template <typename E>
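
Note on the mold.h half: get_addr() is split so that a Thunk can report both
its base address (where the thunk's header code sits, which is what the new
get_tlsdesc_trampoline_addr() wants) and the address of the i-th symbol entry
after the header. Symbol::get_thunk_addr() now relies on
ctx.symbol_aux[aux_idx].thunk_addrs being sorted, which gather_thunk_addresses()
guarantees: std::lower_bound returns the first thunk address at or above
P - branch_distance, i.e. the farthest-back entry a branch at P can still
reach, and the assert documents the invariant that thunk placement left at
least one entry inside the window. A standalone sketch of that lookup under an
assumed ARM32 branch_distance (simplified types, not mold's):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    using u64 = uint64_t;
    using i64 = int64_t;

    // Assumed ARM32 value, as in the previous sketch.
    constexpr i64 branch_distance = (1 << 24) - 16;

    // thunk_addrs: sorted thunk addresses for one symbol.
    // P: the address of the branch being relocated.
    u64 get_thunk_addr(const std::vector<u64> &thunk_addrs, u64 P) {
      u64 lo = (P < (u64)branch_distance) ? 0 : P - branch_distance;
      auto it = std::lower_bound(thunk_addrs.begin(), thunk_addrs.end(), lo);
      assert(it != thunk_addrs.end()); // placement guarantees a hit
      i64 disp = (i64)(*it - P);
      assert(-branch_distance <= disp && disp < branch_distance);
      return *it;
    }

    int main() {
      // Hypothetical per-symbol entries, roughly one per 3.2 MiB batch.
      std::vector<u64> addrs = {0x0400000, 0x0720000, 0x0a40000};

      // The lookup picks the lowest entry that is still in range,
      // not necessarily the nearest one.
      assert(get_thunk_addr(addrs, 0x0500000) == 0x0400000);
      assert(get_thunk_addr(addrs, 0x1100000) == 0x0400000);
      return 0;
    }

Taking the lowest in-range entry rather than the nearest keeps the lookup to a
single lower_bound; any reachable thunk is as good as any other.
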
diff --git a/src/thunks.cc b/src/thunks.cc
index 514f19dd8e..11e26452c8 100644
--- a/src/thunks.cc
+++ b/src/thunks.cc
@@ -32,14 +32,16 @@ namespace mold {
 
 using E = MOLD_TARGET;
 
-// We create thunks for each 12.8/1.6/3.2 MiB code block for
+// We create thunks for each 25.6/3.2/6.4 MiB code block for
 // ARM64/ARM32/PPC, respectively.
-static constexpr i64 batch_size = branch_distance<E> / 10;
+static constexpr i64 batch_size = branch_distance<E> / 5;
 
 // We assume that a single thunk group is smaller than 1 MiB.
 static constexpr i64 max_thunk_size = 1024 * 1024;
 
-// Thunks are aligned to 16 byte boundaries.
+// We align thunks to 16 byte boundaries because many processor vendors
+// recommend we align branch targets to 16 byte boundaries for performance
+// reasons.
 static constexpr i64 thunk_align = 16;
 
 template <typename E>
@@ -221,11 +223,12 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
   while (t < thunks.size())
     reset(*thunks[t++]);
 
-  this->shdr.sh_size = offset;
-
+  u32 p2align = 0;
   for (InputSection<E> *isec : members)
-    this->shdr.sh_addralign =
-      std::max<i64>(this->shdr.sh_addralign, 1 << isec->p2align);
+    p2align = std::max<u32>(p2align, isec->p2align);
+
+  this->shdr.sh_size = offset;
+  this->shdr.sh_addralign = 1 << p2align;
 }
 
 // When applying relocations, we want to know the address in a reachable
@@ -241,24 +244,23 @@ void gather_thunk_addresses(Context<E> &ctx) {
   std::vector<Symbol<E> *> syms;
 
   for (Chunk<E> *chunk : ctx.chunks) {
-    OutputSection<E> *osec = chunk->to_osec();
-    if (!osec || !(osec->shdr.sh_flags & SHF_EXECINSTR))
-      continue;
-
-    for (std::unique_ptr<Thunk<E>> &thunk : osec->thunks) {
-      for (i64 i = 0; i < thunk->symbols.size(); i++) {
-        Symbol<E> &sym = *thunk->symbols[i];
-        sym.add_aux(ctx);
-        ctx.symbol_aux[sym.aux_idx].thunk_addrs.push_back(thunk->get_addr(i));
-        if (!sym.flags.test_and_set())
-          syms.push_back(&sym);
+    if (OutputSection<E> *osec = chunk->to_osec()) {
+      for (std::unique_ptr<Thunk<E>> &thunk : osec->thunks) {
+        for (i64 i = 0; i < thunk->symbols.size(); i++) {
+          Symbol<E> &sym = *thunk->symbols[i];
+          sym.add_aux(ctx);
+
+          std::vector<u64> &vec = ctx.symbol_aux[sym.aux_idx].thunk_addrs;
+          if (vec.empty())
+            syms.push_back(&sym);
+          vec.push_back(thunk->get_addr(i));
+        }
       }
     }
   }
 
   tbb::parallel_for_each(syms, [&](Symbol<E> *sym) {
     sort(ctx.symbol_aux[sym->aux_idx].thunk_addrs);
-    sym->flags = 0;
   });
 }
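
Note on the thunks.cc half: gather_thunk_addresses() collects addresses on a
single thread, so the first-touch test no longer needs the atomic
sym->flags.test_and_set() or the matching sym->flags = 0 reset afterwards;
whether a symbol has been seen falls out of vec.empty(). The explicit
SHF_EXECINSTR filter also becomes unnecessary, since non-executable output
sections simply have no thunks and their loop bodies never run. Only the
per-symbol sorts, which are independent of each other, remain parallel. A
standalone sketch of that collect-then-sort shape (stand-in types, not
mold's; mold runs the sorts with tbb::parallel_for_each):

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using u64 = uint64_t;

    struct SymbolAux { std::vector<u64> thunk_addrs; };

    int main() {
      // Pretend the thunk walk produced (symbol index, address) pairs in
      // thunk layout order, which is not sorted per symbol.
      std::vector<std::pair<int, u64>> entries = {
          {0, 0x0720000}, {1, 0x0720010}, {0, 0x0400000}, {1, 0x0400010}};

      std::vector<SymbolAux> aux(2);
      std::vector<int> seen; // symbols that received at least one address

      for (auto [sym, addr] : entries) {
        std::vector<u64> &vec = aux[sym].thunk_addrs;
        if (vec.empty()) // single-threaded first-touch test
          seen.push_back(sym);
        vec.push_back(addr);
      }

      // Each symbol's list is sorted independently, so the sorts can run
      // in parallel (tbb::parallel_for_each in mold); a plain loop here.
      for (int sym : seen)
        std::sort(aux[sym].thunk_addrs.begin(), aux[sym].thunk_addrs.end());
      return 0;
    }

The sort is what makes the lower_bound in Symbol::get_thunk_addr() valid.
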