Skip to content

Commit

Permalink
Refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
rui314 committed Jan 3, 2025
1 parent d0bcd35 commit 6a342f2
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 56 deletions.
46 changes: 23 additions & 23 deletions src/arch-arm32.cc
Original file line number Diff line number Diff line change
Expand Up @@ -243,25 +243,21 @@ void EhFrameSection<E>::apply_eh_reloc(Context<E> &ctx, const ElfRel<E> &rel,
}
}

// ARM and Thumb branch instructions can jump within ±16 MiB.
static bool is_jump_reachable(i64 val) {
return sign_extend(val, 24) == val;
// Returns true if a branch displacement `disp` (target address minus
// branch address) fits within the direct-branch range of the target
// architecture. branch_distance<E> already includes a safety margin,
// hence the asymmetric half-open interval [-dist, dist).
static bool is_reachable(i64 disp) {
return -branch_distance<E> <= disp && disp < branch_distance<E>;
}

// Finds a thunk in `osec` that is within direct-branch range of `addr`.
// Thunks are spaced so that at least one is always reachable from any
// instruction in the section; if none is found, that's a linker bug,
// so we abort.
static Thunk<E> &get_reachable_thunk(OutputSection<E> &osec, u64 addr) {
  for (i64 i = 0; i < (i64)osec.thunks.size(); i++) {
    Thunk<E> &t = *osec.thunks[i];
    if (is_reachable(t.get_addr() - addr))
      return t;
  }
  abort();
}

template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);

auto get_tls_trampoline_addr = [&](u64 addr) {
for (i64 i = 0; i < output_section->thunks.size(); i++) {
i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
addr;
if (-branch_distance<E> <= disp && disp < branch_distance<E>)
return disp;
}
abort();
};

for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX)
Expand All @@ -287,6 +283,10 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
auto get_thumb_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); };
auto get_arm_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 4; };

auto get_tlsdesc_trampoline_addr = [&] {
return get_reachable_thunk(*output_section, P).get_addr();
};

switch (rel.r_type) {
case R_ARM_ABS32:
break;
Expand All @@ -305,7 +305,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
// They are different in only one bit. We need to use BL if
// the jump target is Thumb. Otherwise, use BLX.
i64 val = S + A - P;
if (is_jump_reachable(val)) {
if (is_reachable(val)) {
if (T) {
write_thm_b_imm(loc, val);
*(ul16 *)(loc + 2) |= 0x1000; // rewrite to BL
Expand Down Expand Up @@ -345,8 +345,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
if (!is_bl && !is_blx)
Fatal(ctx) << *this << ": R_ARM_CALL refers to neither BL nor BLX";

u64 val = S + A - P;
if (is_jump_reachable(val)) {
i64 val = S + A - P;
if (is_reachable(val)) {
if (T) {
*(ul32 *)loc = 0xfa00'0000; // BLX
*(ul32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2);
Expand All @@ -372,8 +372,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
// immediate; it takes only a register. So if mode switch is
// required, we jump to a linker-synthesized thunk which does the
// job with a longer code sequence.
u64 val = S + A - P;
if (!is_jump_reachable(val) || T)
i64 val = S + A - P;
if (!is_reachable(val) || T)
val = get_arm_thunk_addr() + A - P;
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
break;
Expand Down Expand Up @@ -418,8 +418,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {

// Just like R_ARM_JUMP24, we need to jump to a thunk if we need to
// switch processor mode.
u64 val = S + A - P;
if (!is_jump_reachable(val) || !T)
i64 val = S + A - P;
if (!is_reachable(val) || !T)
val = get_thumb_thunk_addr() + A - P;
write_thm_b_imm(loc, val);
break;
Expand Down Expand Up @@ -504,8 +504,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
break;
case R_ARM_TLS_CALL:
if (sym.has_tlsdesc(ctx)) {
// BL <tls_trampoline>
*(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2);
*(ul32 *)loc = 0xeb00'0000; // bl 0
*(ul32 *)loc |= bits(get_tlsdesc_trampoline_addr() - P - 8, 25, 2);
} else if (sym.has_gottp(ctx)) {
*(ul32 *)loc = 0xe79f'0000; // ldr r0, [pc, r0]
} else {
Expand All @@ -514,7 +514,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
break;
case R_ARM_THM_TLS_CALL:
if (sym.has_tlsdesc(ctx)) {
u64 val = align_to(get_tls_trampoline_addr(P + 4), 4);
u64 val = align_to(get_tlsdesc_trampoline_addr() - P - 4, 4);
write_thm_b_imm(loc, val);
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX
} else if (sym.has_gottp(ctx)) {
Expand Down
30 changes: 16 additions & 14 deletions src/mold.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,19 @@ class Thunk<E> {
i64 size() const { return E::thunk_hdr_size + symbols.size() * E::thunk_size; }
void copy_buf(Context<E> &ctx);

u64 get_addr(i64 idx) const {
return output_section.shdr.sh_addr + offset + E::thunk_hdr_size +
idx * E::thunk_size;
// Returns the virtual address of the beginning of this thunk
// (i.e., of its header) in the output file.
u64 get_addr() const {
return output_section.shdr.sh_addr + offset;
}

// Returns the virtual address of the i'th thunk entry, which follows
// the fixed-size thunk header.
u64 get_addr(i64 i) const {
return get_addr() + E::thunk_hdr_size + E::thunk_size * i;
}

OutputSection<E> &output_section;
i64 offset;
std::vector<Symbol<E> *> symbols;
};

template <needs_thunk E> void gather_thunk_addresses(Context<E> &);

template <needs_thunk E>
static consteval i64 get_branch_distance() {
// ARM64's branch has 26 bits immediate. The immediate is padded with
Expand Down Expand Up @@ -149,10 +150,13 @@ static consteval i64 get_branch_distance() {
// on the target architecture. For example, ARM32's B instruction jumps to
// the branch's address + immediate + 4 (i.e., B with offset 0 jumps to
// the next instruction), while RISC-V has no such implicit bias. Here, we
// subtract 16 as a safety margin.
// subtract 16 as a safety margin that is large enough for all targets.
template <needs_thunk E>
static constexpr i64 branch_distance = get_branch_distance<E>() - 16;

template <needs_thunk E>
void gather_thunk_addresses(Context<E> &ctx);

//
// input-sections.cc
//
Expand Down Expand Up @@ -2946,16 +2950,14 @@ inline void Symbol<E>::set_djb_hash(Context<E> &ctx, u32 hash) {
}

template <typename E>
u64
inline u64
Symbol<E>::get_thunk_addr(Context<E> &ctx, u64 P) const requires needs_thunk<E> {
assert(aux_idx != -1);

std::span<u64> vec = ctx.symbol_aux[aux_idx].thunk_addrs;
u64 min = (P < branch_distance<E>) ? 0 : P - branch_distance<E>;
auto it = std::lower_bound(vec.begin(), vec.end(), min);
assert(it != vec.end());
assert(*it < (P + branch_distance<E> < P) ? UINT64_MAX : P + branch_distance<E>);
return *it;
u64 lo = (P < branch_distance<E>) ? 0 : P - branch_distance<E>;
u64 val = *std::lower_bound(vec.begin(), vec.end(), lo);
assert(-branch_distance<E> <= (i64)(val - P) &&
(i64)(val - P) < branch_distance<E>);
return val;
}

template <typename E>
Expand Down
40 changes: 21 additions & 19 deletions src/thunks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,16 @@ namespace mold {

using E = MOLD_TARGET;

// We create thunks for each 12.8/1.6/3.2 MiB code block for
// We create thunks for each 25.6/3.2/6.4 MiB code block for
// ARM64/ARM32/PPC, respectively.
static constexpr i64 batch_size = branch_distance<E> / 10;
static constexpr i64 batch_size = branch_distance<E> / 5;

// We assume that a single thunk group is smaller than 1 MiB.
static constexpr i64 max_thunk_size = 1024 * 1024;

// Thunks are aligned to 16 byte boundaries.
// We align thunks to 16 byte boundaries because many processor vendors
// recommend we align branch targets to 16 byte boundaries for performance
// reasons.
static constexpr i64 thunk_align = 16;

template <typename E>
Expand Down Expand Up @@ -221,11 +223,12 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
while (t < thunks.size())
reset(*thunks[t++]);

this->shdr.sh_size = offset;

u32 p2align = 0;
for (InputSection<E> *isec : members)
this->shdr.sh_addralign =
std::max<u32>(this->shdr.sh_addralign, 1 << isec->p2align);
p2align = std::max<u32>(p2align, isec->p2align);

this->shdr.sh_size = offset;
this->shdr.sh_addralign = 1 << p2align;
}

// When applying relocations, we want to know the address in a reachable
Expand All @@ -241,24 +244,23 @@ void gather_thunk_addresses(Context<E> &ctx) {
std::vector<Symbol<E> *> syms;

for (Chunk<E> *chunk : ctx.chunks) {
OutputSection<E> *osec = chunk->to_osec();
if (!osec || !(osec->shdr.sh_flags & SHF_EXECINSTR))
continue;

for (std::unique_ptr<Thunk<E>> &thunk : osec->thunks) {
for (i64 i = 0; i < thunk->symbols.size(); i++) {
Symbol<E> &sym = *thunk->symbols[i];
sym.add_aux(ctx);
ctx.symbol_aux[sym.aux_idx].thunk_addrs.push_back(thunk->get_addr(i));
if (!sym.flags.test_and_set())
syms.push_back(&sym);
if (OutputSection<E> *osec = chunk->to_osec()) {
for (std::unique_ptr<Thunk<E>> &thunk : osec->thunks) {
for (i64 i = 0; i < thunk->symbols.size(); i++) {
Symbol<E> &sym = *thunk->symbols[i];
sym.add_aux(ctx);

std::vector<u64> &vec = ctx.symbol_aux[sym.aux_idx].thunk_addrs;
if (vec.empty())
syms.push_back(&sym);
vec.push_back(thunk->get_addr(i));
}
}
}
}

tbb::parallel_for_each(syms, [&](Symbol<E> *sym) {
sort(ctx.symbol_aux[sym->aux_idx].thunk_addrs);
sym->flags = 0;
});
}

Expand Down

0 comments on commit 6a342f2

Please sign in to comment.