From 6a342f2c64f93d3d795c3fe26ba5b8895e6291c9 Mon Sep 17 00:00:00 2001
From: Rui Ueyama <ruiu@cs.stanford.edu>
Date: Thu, 2 Jan 2025 22:44:58 +0900
Subject: [PATCH] Refactor

---
 src/arch-arm32.cc | 46 +++++++++++++++++++++++-----------------------
 src/mold.h        | 29 ++++++++++++++++-------------
 src/thunks.cc     | 40 +++++++++++++++++++++-------------------
 3 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/src/arch-arm32.cc b/src/arch-arm32.cc
index 03fff7e8f8..82646c3a9b 100644
--- a/src/arch-arm32.cc
+++ b/src/arch-arm32.cc
@@ -243,25 +243,21 @@ void EhFrameSection<E>::apply_eh_reloc(Context<E> &ctx, const ElfRel<E> &rel,
   }
 }
 
-// ARM and Thumb branch instructions can jump within ±16 MiB.
-static bool is_jump_reachable(i64 val) {
-  return sign_extend(val, 24) == val;
+static bool is_reachable(i64 disp) {
+  return -branch_distance<E> <= disp && disp < branch_distance<E>;
+}
+
+static Thunk<E> &get_reachable_thunk(OutputSection<E> &osec, u64 addr) {
+  for (std::unique_ptr<Thunk<E>> &thunk : osec.thunks)
+    if (is_reachable(thunk->get_addr() - addr))
+      return *thunk;
+  abort();
 }
 
 template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  auto get_tls_trampoline_addr = [&](u64 addr) {
-    for (i64 i = 0; i < output_section->thunks.size(); i++) {
-      i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
-                 addr;
-      if (-branch_distance<E> <= disp && disp < branch_distance<E>)
-        return disp;
-    }
-    abort();
-  };
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX)
@@ -287,6 +283,10 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     auto get_thumb_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); };
     auto get_arm_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 4; };
 
+    auto get_tlsdesc_trampoline_addr = [&] {
+      return get_reachable_thunk(*output_section, P).get_addr();
+    };
+
     switch (rel.r_type) {
     case R_ARM_ABS32:
       break;
@@ -305,7 +305,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       // They are different in only one bit. We need to use BL if
       // the jump target is Thumb. Otherwise, use BLX.
       i64 val = S + A - P;
-      if (is_jump_reachable(val)) {
+      if (is_reachable(val)) {
         if (T) {
           write_thm_b_imm(loc, val);
           *(ul16 *)(loc + 2) |= 0x1000; // rewrite to BL
@@ -345,8 +345,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       if (!is_bl && !is_blx)
         Fatal(ctx) << *this << ": R_ARM_CALL refers to neither BL nor BLX";
 
-      u64 val = S + A - P;
-      if (is_jump_reachable(val)) {
+      i64 val = S + A - P;
+      if (is_reachable(val)) {
         if (T) {
           *(ul32 *)loc = 0xfa00'0000; // BLX
           *(ul32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2);
@@ -372,8 +372,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       // immediate; it takes only a register. So if mode switch is
       // required, we jump to a linker-synthesized thunk which does the
       // job with a longer code sequence.
-      u64 val = S + A - P;
-      if (!is_jump_reachable(val) || T)
+      i64 val = S + A - P;
+      if (!is_reachable(val) || T)
         val = get_arm_thunk_addr() + A - P;
       *(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
       break;
@@ -418,8 +418,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
 
       // Just like R_ARM_JUMP24, we need to jump to a thunk if we need to
      // switch processor mode.
-      u64 val = S + A - P;
-      if (!is_reachable(val) || !T)
+      i64 val = S + A - P;
+      if (!is_reachable(val) || !T)
        val = get_thumb_thunk_addr() + A - P;
       write_thm_b_imm(loc, val);
       break;
@@ -504,8 +504,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       break;
     case R_ARM_TLS_CALL:
       if (sym.has_tlsdesc(ctx)) {
-        // BL
-        *(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2);
+        *(ul32 *)loc = 0xeb00'0000; // bl 0
+        *(ul32 *)loc |= bits(get_tlsdesc_trampoline_addr() - P - 8, 25, 2);
       } else if (sym.has_gottp(ctx)) {
         *(ul32 *)loc = 0xe79f'0000; // ldr r0, [pc, r0]
       } else {
@@ -514,7 +514,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       break;
     case R_ARM_THM_TLS_CALL:
       if (sym.has_tlsdesc(ctx)) {
-        u64 val = align_to(get_tls_trampoline_addr(P + 4), 4);
+        u64 val = align_to(get_tlsdesc_trampoline_addr() - P - 4, 4);
         write_thm_b_imm(loc, val);
         *(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX
       } else if (sym.has_gottp(ctx)) {
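
Note on the arch-arm32.cc half of the patch: replacing is_jump_reachable()
with is_reachable() swaps a sign-extension trick for an explicit window test
against branch_distance, and the remaining `u64 val = S + A - P` declarations
become `i64 val` so the displacement is visibly signed before it is compared
against the window's negative lower bound. The two predicates are not
bit-for-bit identical: branch_distance already subtracts a 16-byte safety
margin, so the new test is slightly narrower than the old ±16 MiB
sign_extend() check. Here is a standalone sketch contrasting the two; it is
demo code, not mold's (the sign_extend helper is modeled on mold's and the
ARM32 branch_distance value is assumed):

    #include <cassert>
    #include <cstdint>

    using i64 = int64_t;
    using u64 = uint64_t;

    // Assumed ARM32 value: +-16 MiB minus the 16-byte safety margin.
    constexpr i64 branch_distance = (1 << 24) - 16;

    // Modeled on mold's helper: treat bit `size` of `val` as the sign bit.
    inline i64 sign_extend(u64 val, i64 size) {
      return (i64)(val << (63 - size)) >> (63 - size);
    }

    // Old predicate: disp survives sign-extension from bit 24, i.e. it
    // fits in 25 signed bits (+-16 MiB exactly).
    bool old_check(i64 disp) { return sign_extend(disp, 24) == disp; }

    // New predicate: an explicit half-open window, 16 bytes narrower.
    bool new_check(i64 disp) {
      return -branch_distance <= disp && disp < branch_distance;
    }

    int main() {
      assert(old_check(0) && new_check(0));
      assert(old_check(100) == new_check(100));
      // Only displacements inside the safety margin disagree; the new
      // check conservatively routes them through a thunk instead.
      i64 edge = -(i64(1) << 24); // exactly -16 MiB
      assert(old_check(edge) && !new_check(edge));
      return 0;
    }

The narrowing is harmless: at worst a branch that the hardware could barely
encode is sent through a thunk it did not strictly need.
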
diff --git a/src/mold.h b/src/mold.h
index 70fabc38da..cf68ca1242 100644
--- a/src/mold.h
+++ b/src/mold.h
@@ -106,9 +106,12 @@ class Thunk {
   i64 size() const { return E::thunk_hdr_size + symbols.size() * E::thunk_size; }
   void copy_buf(Context<E> &ctx);
 
-  u64 get_addr(i64 idx) const {
-    return output_section.shdr.sh_addr + offset + E::thunk_hdr_size +
-           idx * E::thunk_size;
+  u64 get_addr() const {
+    return output_section.shdr.sh_addr + offset;
+  }
+
+  u64 get_addr(i64 i) const {
+    return get_addr() + E::thunk_hdr_size + E::thunk_size * i;
   }
 
   OutputSection<E> &output_section;
@@ -116,8 +119,6 @@ class Thunk {
   std::vector<Symbol<E> *> symbols;
 };
 
-template <typename E> void gather_thunk_addresses(Context<E> &);
-
 template <typename E>
 static consteval i64 get_branch_distance() {
   // ARM64's branch has 26 bits immediate. The immediate is padded with
@@ -149,10 +150,13 @@ static consteval i64 get_branch_distance() {
 // on the target architecture. For example, ARM32's B instruction jumps to
 // the branch's address + immediate + 4 (i.e., B with offset 0 jumps to
 // the next instruction), while RISC-V has no such implicit bias. Here, we
-// subtract 16 as a safety margin.
+// subtract 16 as a safety margin that is large enough for all targets.
 template <typename E>
 static constexpr i64 branch_distance = get_branch_distance<E>() - 16;
 
+template <typename E>
+void gather_thunk_addresses(Context<E> &ctx);
+
 //
 // input-sections.cc
 //
@@ -2946,16 +2950,15 @@ inline void Symbol<E>::set_djb_hash(Context<E> &ctx, u32 hash) {
 }
 
 template <typename E>
-u64
+inline u64
 Symbol<E>::get_thunk_addr(Context<E> &ctx, u64 P) const requires needs_thunk<E> {
-  assert(aux_idx != -1);
   std::span<u64> vec = ctx.symbol_aux[aux_idx].thunk_addrs;
-  u64 min = (P < branch_distance<E>) ? 0 : P - branch_distance<E>;
-  auto it = std::lower_bound(vec.begin(), vec.end(), min);
-  assert(it != vec.end());
-  assert(*it < (P + branch_distance<E> < P) ? UINT64_MAX : P + branch_distance<E>);
-  return *it;
+  u64 lo = (P < branch_distance<E>) ? 0 : P - branch_distance<E>;
+  u64 val = *std::lower_bound(vec.begin(), vec.end(), lo);
+  assert(-branch_distance<E> <= (i64)(val - P) &&
+         (i64)(val - P) < branch_distance<E>);
+  return val;
 }
 
 template <typename E>
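
Note on the mold.h half: get_addr() is split so that a Thunk can report both
its base address (where the thunk's header code sits, which is what the new
get_tlsdesc_trampoline_addr() wants) and the address of the i-th symbol entry
after the header. Symbol::get_thunk_addr() now relies on
ctx.symbol_aux[aux_idx].thunk_addrs being sorted, which gather_thunk_addresses()
guarantees: std::lower_bound returns the first thunk address at or above
P - branch_distance, i.e. the farthest-back entry a branch at P can still
reach, and the assert documents the invariant that thunk placement left at
least one entry inside the window. A standalone sketch of that lookup under an
assumed ARM32 branch_distance (simplified types, not mold's):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    using u64 = uint64_t;
    using i64 = int64_t;

    // Assumed ARM32 value, as in the previous sketch.
    constexpr i64 branch_distance = (1 << 24) - 16;

    // thunk_addrs: sorted thunk addresses for one symbol.
    // P: the address of the branch being relocated.
    u64 get_thunk_addr(const std::vector<u64> &thunk_addrs, u64 P) {
      u64 lo = (P < (u64)branch_distance) ? 0 : P - branch_distance;
      auto it = std::lower_bound(thunk_addrs.begin(), thunk_addrs.end(), lo);
      assert(it != thunk_addrs.end()); // placement guarantees a hit
      i64 disp = (i64)(*it - P);
      assert(-branch_distance <= disp && disp < branch_distance);
      return *it;
    }

    int main() {
      // Hypothetical per-symbol entries, roughly one per 3.2 MiB batch.
      std::vector<u64> addrs = {0x0400000, 0x0720000, 0x0a40000};

      // The lookup picks the lowest entry that is still in range,
      // not necessarily the nearest one.
      assert(get_thunk_addr(addrs, 0x0500000) == 0x0400000);
      assert(get_thunk_addr(addrs, 0x1100000) == 0x0400000);
      return 0;
    }

Taking the lowest in-range entry rather than the nearest keeps the lookup to a
single lower_bound; any reachable thunk is as good as any other.
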
diff --git a/src/thunks.cc b/src/thunks.cc
index 514f19dd8e..11e26452c8 100644
--- a/src/thunks.cc
+++ b/src/thunks.cc
@@ -32,14 +32,16 @@ namespace mold {
 
 using E = MOLD_TARGET;
 
-// We create thunks for each 12.8/1.6/3.2 MiB code block for
+// We create thunks for each 25.6/3.2/6.4 MiB code block for
 // ARM64/ARM32/PPC, respectively.
-static constexpr i64 batch_size = branch_distance<E> / 10;
+static constexpr i64 batch_size = branch_distance<E> / 5;
 
 // We assume that a single thunk group is smaller than 1 MiB.
 static constexpr i64 max_thunk_size = 1024 * 1024;
 
-// Thunks are aligned to 16 byte boundaries.
+// We align thunks to 16 byte boundaries because many processor vendors
+// recommend we align branch targets to 16 byte boundaries for performance
+// reasons.
 static constexpr i64 thunk_align = 16;
 
 template <typename E>
@@ -221,11 +223,12 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
   while (t < thunks.size())
     reset(*thunks[t++]);
 
-  this->shdr.sh_size = offset;
-
+  u32 p2align = 0;
   for (InputSection<E> *isec : members)
-    this->shdr.sh_addralign =
-      std::max<i64>(this->shdr.sh_addralign, 1 << isec->p2align);
+    p2align = std::max<u32>(p2align, isec->p2align);
+
+  this->shdr.sh_size = offset;
+  this->shdr.sh_addralign = 1 << p2align;
 }
 
 // When applying relocations, we want to know the address in a reachable
@@ -241,24 +244,23 @@ void gather_thunk_addresses(Context<E> &ctx) {
   std::vector<Symbol<E> *> syms;
 
   for (Chunk<E> *chunk : ctx.chunks) {
-    OutputSection<E> *osec = chunk->to_osec();
-    if (!osec || !(osec->shdr.sh_flags & SHF_EXECINSTR))
-      continue;
-
-    for (std::unique_ptr<Thunk<E>> &thunk : osec->thunks) {
-      for (i64 i = 0; i < thunk->symbols.size(); i++) {
-        Symbol<E> &sym = *thunk->symbols[i];
-        sym.add_aux(ctx);
-        ctx.symbol_aux[sym.aux_idx].thunk_addrs.push_back(thunk->get_addr(i));
-        if (!sym.flags.test_and_set())
-          syms.push_back(&sym);
+    if (OutputSection<E> *osec = chunk->to_osec()) {
+      for (std::unique_ptr<Thunk<E>> &thunk : osec->thunks) {
+        for (i64 i = 0; i < thunk->symbols.size(); i++) {
+          Symbol<E> &sym = *thunk->symbols[i];
+          sym.add_aux(ctx);
+
+          std::vector<u64> &vec = ctx.symbol_aux[sym.aux_idx].thunk_addrs;
+          if (vec.empty())
+            syms.push_back(&sym);
+          vec.push_back(thunk->get_addr(i));
+        }
       }
     }
   }
 
   tbb::parallel_for_each(syms, [&](Symbol<E> *sym) {
     sort(ctx.symbol_aux[sym->aux_idx].thunk_addrs);
-    sym->flags = 0;
   });
 }
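
Note on the thunks.cc half: gather_thunk_addresses() collects addresses on a
single thread, so the first-touch test no longer needs the atomic
sym->flags.test_and_set() or the matching sym->flags = 0 reset afterwards;
whether a symbol has been seen falls out of vec.empty(). The explicit
SHF_EXECINSTR filter also becomes unnecessary, since non-executable output
sections simply have no thunks and their loop bodies never run. Only the
per-symbol sorts, which are independent of each other, remain parallel. A
standalone sketch of that collect-then-sort shape (stand-in types, not
mold's; mold runs the sorts with tbb::parallel_for_each):

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using u64 = uint64_t;

    struct SymbolAux { std::vector<u64> thunk_addrs; };

    int main() {
      // Pretend the thunk walk produced (symbol index, address) pairs in
      // thunk layout order, which is not sorted per symbol.
      std::vector<std::pair<int, u64>> entries = {
          {0, 0x0720000}, {1, 0x0720010}, {0, 0x0400000}, {1, 0x0400010}};

      std::vector<SymbolAux> aux(2);
      std::vector<int> seen; // symbols that received at least one address

      for (auto [sym, addr] : entries) {
        std::vector<u64> &vec = aux[sym].thunk_addrs;
        if (vec.empty()) // single-threaded first-touch test
          seen.push_back(sym);
        vec.push_back(addr);
      }

      // Each symbol's list is sorted independently, so the sorts can run
      // in parallel (tbb::parallel_for_each in mold); a plain loop here.
      for (int sym : seen)
        std::sort(aux[sym].thunk_addrs.begin(), aux[sym].thunk_addrs.end());
      return 0;
    }

The sort is what makes the lower_bound in Symbol::get_thunk_addr() valid.
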