Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[clang] Missed optimization: Failure to remove useless allocations / deallocations #122567

Open
davidstone opened this issue Jan 11, 2025 · 5 comments

Comments

@davidstone
Copy link
Contributor

The following translation unit:

#include <memory>

struct Container {
	~Container() {
		if (m_pointer) {
			std::allocator<int>().deallocate(m_pointer, 1);
		}
	}

	int * m_pointer = nullptr;
};

Container bad() {
	auto container = Container{};
	auto const original_pointer = std::allocator<int>().allocate(1);
	container.m_pointer = original_pointer;
	container.m_pointer = std::allocator<int>().allocate(1);

	std::allocator<int>().deallocate(
		original_pointer,
		1
	);

	return container;
}

when compiled with -O3 generates the following output:

%struct.Container = type { ptr }

define dso_local void @bad()(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.Container) align 8 initializes((0, 8)) %agg.result) local_unnamed_addr #0 personality ptr @__gxx_personality_v0 !dbg !394 {
entry:
    #dbg_value(ptr undef, !401, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !407)
    #dbg_value(ptr undef, !401, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !409)
    #dbg_value(ptr undef, !411, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !416)
    #dbg_value(ptr %agg.result, !398, !DIExpression(DW_OP_deref), !418)
    #dbg_value(ptr undef, !401, !DIExpression(), !407)
    #dbg_value(i64 1, !404, !DIExpression(), !407)
    #dbg_value(ptr null, !405, !DIExpression(), !407)
  %call5.i14 = tail call noalias noundef nonnull dereferenceable(4) ptr @operator new(unsigned long)(i64 noundef 4) #3, !dbg !419
    #dbg_value(ptr %call5.i14, !399, !DIExpression(), !418)
  store ptr %call5.i14, ptr %agg.result, align 8, !dbg !420
    #dbg_value(ptr undef, !401, !DIExpression(), !409)
    #dbg_value(i64 1, !404, !DIExpression(), !409)
    #dbg_value(ptr null, !405, !DIExpression(), !409)
  %call5.i15 = invoke noalias noundef nonnull dereferenceable(4) ptr @operator new(unsigned long)(i64 noundef 4) #3
          to label %invoke.cont4 unwind label %if.then.i, !dbg !427

invoke.cont4:
  store ptr %call5.i15, ptr %agg.result, align 8, !dbg !428
    #dbg_value(ptr undef, !411, !DIExpression(), !416)
    #dbg_value(ptr %call5.i14, !414, !DIExpression(), !416)
    #dbg_value(i64 1, !415, !DIExpression(), !416)
  tail call void @operator delete(void*, unsigned long)(ptr noundef nonnull %call5.i14, i64 noundef 4) #4, !dbg !429
  ret void, !dbg !430

if.then.i:
  %0 = landingpad { ptr, i32 }
          cleanup, !dbg !430
    #dbg_value(ptr undef, !411, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !431)
    #dbg_value(ptr %agg.result, !438, !DIExpression(), !441)
    #dbg_value(ptr undef, !411, !DIExpression(), !431)
    #dbg_value(ptr %call5.i14, !414, !DIExpression(), !431)
    #dbg_value(i64 1, !415, !DIExpression(), !431)
  tail call void @operator delete(void*, unsigned long)(ptr noundef nonnull %call5.i14, i64 noundef 4) #4, !dbg !442
  resume { ptr, i32 } %0, !dbg !430
}

declare i32 @__gxx_personality_v0(...)

declare void @operator delete(void*, unsigned long)(ptr noundef, i64 noundef) local_unnamed_addr #1

declare noundef nonnull ptr @operator new(unsigned long)(i64 noundef) local_unnamed_addr #2

attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { nobuiltin nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { nobuiltin allocsize(0) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #3 = { builtin allocsize(0) }
attributes #4 = { builtin nounwind }

which corresponds to the assembly

bad():
        push    r14
        push    rbx
        push    rax
        mov     rbx, rdi
        mov     edi, 4
        call    operator new(unsigned long)@PLT
        mov     r14, rax
        mov     qword ptr [rbx], rax
        mov     edi, 4
        call    operator new(unsigned long)@PLT
        mov     qword ptr [rbx], rax
        mov     esi, 4
        mov     rdi, r14
        call    operator delete(void*, unsigned long)@PLT
        mov     rax, rbx
        add     rsp, 8
        pop     rbx
        pop     r14
        ret
        mov     rbx, rax
        mov     esi, 4
        mov     rdi, r14
        call    operator delete(void*, unsigned long)@PLT
        mov     rdi, rbx
        call    _Unwind_Resume@PLT

DW.ref.__gxx_personality_v0:
        .quad   __gxx_personality_v0

I want it to generate

%struct.Container = type { ptr }

define dso_local void @good()(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.Container) align 8 initializes((0, 8)) %agg.result) local_unnamed_addr #0 personality ptr @__gxx_personality_v0 !dbg !394 {
entry:
    #dbg_value(ptr undef, !399, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !405)
    #dbg_value(ptr %agg.result, !398, !DIExpression(DW_OP_deref), !407)
    #dbg_value(ptr undef, !399, !DIExpression(), !405)
    #dbg_value(i64 1, !402, !DIExpression(), !405)
    #dbg_value(ptr null, !403, !DIExpression(), !405)
  %call5.i3 = tail call noalias noundef nonnull dereferenceable(4) ptr @operator new(unsigned long)(i64 noundef 4) #2, !dbg !408
  store ptr %call5.i3, ptr %agg.result, align 8, !dbg !409
  ret void, !dbg !416
}

declare i32 @__gxx_personality_v0(...)

declare noundef nonnull ptr @operator new(unsigned long)(i64 noundef) local_unnamed_addr #1

attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { nobuiltin allocsize(0) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { builtin allocsize(0) }

which corresponds to

good():
        push    rbx
        mov     rbx, rdi
        mov     edi, 4
        call    operator new(unsigned long)@PLT
        mov     qword ptr [rbx], rax
        mov     rax, rbx
        pop     rbx
        ret

It looks like if an optimization pass implemented #68365 that would be sufficient to solve this problem, but there are other possible optimization paths for this.

See it live: https://godbolt.org/z/r87Ezb8b5

This bug prevents the optimization of the following code with any standard library implementation:

auto container = vector<int>();
container.push_back(1);
container.push_back(1);
return container;
@llvmbot llvmbot added the clang Clang issues not falling into any other category label Jan 11, 2025
@EugeneZelenko EugeneZelenko removed the clang Clang issues not falling into any other category label Jan 11, 2025
@dtcxzyw
Copy link
Member

dtcxzyw commented Jan 11, 2025

When OOM occurs at container.m_pointer = std::allocator<int>().allocate(1);, we should call the dtor of Container:

define dso_local void @_Z3badv(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.Container) align 8 initializes((0, 8)) %agg.result) local_unnamed_addr #0 personality ptr @__gxx_personality_v0 {
entry:
  %call5.i14 = tail call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4) #3
  store ptr %call5.i14, ptr %agg.result, align 8
  %call5.i15 = invoke noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4) #3
          to label %invoke.cont4 unwind label %if.then.i

invoke.cont4:
  store ptr %call5.i15, ptr %agg.result, align 8
  tail call void @_ZdlPvm(ptr noundef nonnull %call5.i14, i64 noundef 4) #4
  ret void

if.then.i:
  %0 = landingpad { ptr, i32 }
          cleanup
  tail call void @_ZdlPvm(ptr noundef nonnull %call5.i14, i64 noundef 4) #4 ; <----- HERE
  resume { ptr, i32 } %0
}

If we ignore this call, we will get an expected result: https://godbolt.org/z/K1r97GPxj

define dso_local void @_Z3badv2(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.Container) align 8 initializes((0, 8)) %agg.result) local_unnamed_addr #0 personality ptr @__gxx_personality_v0 {
  %call5.i15 = tail call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4) #3
  store ptr %call5.i15, ptr %agg.result, align 8
  ret void
}

But SimplifyCFG is not allowed to sink the call to operator delete() across the exception handling boundary :(

@dtcxzyw
Copy link
Member

dtcxzyw commented Jan 11, 2025

auto container = vector<int>();
container.push_back(1);
container.push_back(1);
return container;

Similarly, OOM may occur at the second push_back.

@davidstone
Copy link
Contributor Author

Moving it is one way to do this, and would be allowed as long as we're smart about not calling it twice. The observable behavior remains the same, because the standard allows the implementation to change the when and where std::allocator<T>::allocate and std::allocator<T>::deallocate are called, as long as the application doesn't use more memory than the naive ordering.

Another option is to recognize that if an exception is thrown, the memory is deallocated immediately after the call, and if an exception is not thrown, the memory is deallocated immediately after the call. So the first allocation cannot escape the function and can thus be elided entirely so long as we can use other optimizations to do writes to the final memory destination.

It's also interesting to note that removing the line container.m_pointer = original_pointer; causes the "good" codegen to occur. In the event of an exception, the memory associated with the first allocation would be leaked, but instead the initial allocation is optimized out.

@davidstone
Copy link
Contributor Author

One last thing to note about the existing code gen for bad is that it is obviously wrong without any advanced analysis. There are 6 instructions after a ret with no label to let other code jump into there. Unless I'm severely misunderstanding this, it looks like 23% of the generated instructions are unreachable.

@dtcxzyw
Copy link
Member

dtcxzyw commented Jan 11, 2025

There are 6 instructions after a ret with no label to let other code jump into there.

They are referenced by the section .gcc_except_table: https://godbolt.org/z/Gahh5zfMn

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

5 participants