Skip to content

Commit

Permalink
Support using more than 64 cores on Windows (#440)
Browse files Browse the repository at this point in the history
  • Loading branch information
solidpixel authored Nov 24, 2023
1 parent 35f89db commit 9372b7c
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 4 deletions.
11 changes: 11 additions & 0 deletions Docs/ChangeLog-4x.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@ release of the 4.x series.
All performance data on this page is measured on an Intel Core i5-9600K
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.

<!-- ---------------------------------------------------------------------- -->
## 4.7.0

**Status:** TBD

The 4.7.0 release is a maintenance release with minor improvements and fixes.

* **General:**
* **Optimization:** Windows builds of the `astcenc` command line tool can now
use more than 64 cores on large core count systems.

<!-- ---------------------------------------------------------------------- -->
## 4.6.0

Expand Down
58 changes: 54 additions & 4 deletions Source/astcenccli_platform_dependents.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
Expand Down Expand Up @@ -61,6 +61,51 @@ static int pthread_create(
return 0;
}

/**
 * @brief Manually set CPU group and thread affinity.
 *
 * This is needed on Windows 10 or older to let the process benefit from large
 * core count systems with more than 64 logical CPUs. The assignment is skipped
 * on systems with a single processor group, as it is not necessary.
 *
 * Threads are mapped onto processor groups in index order: the thread index is
 * wrapped to the total CPU count, and the thread is then allowed to run on any
 * CPU of the group that contains that wrapped index.
 *
 * The assignment is best effort; a failure to set the affinity is ignored and
 * leaves the thread with its default affinity.
 *
 * @param thread         The handle of the thread to assign.
 * @param thread_index   The index of the thread in the launch order.
 */
static void set_group_affinity(
    pthread_t thread,
    int thread_index
) {
    // Skip thread assignment for hardware with a single CPU group
    int group_count = GetActiveProcessorGroupCount();
    if (group_count == 1)
    {
        return;
    }

    // Skip assignment if the CPU count is unavailable; avoids a modulo by zero
    int cpu_count = get_cpu_count();
    if (cpu_count <= 0)
    {
        return;
    }

    // Ensure we have a valid assign if user creates more threads than cores
    int assign_index = thread_index % cpu_count;
    int assign_group { 0 };
    int assign_group_cpu_count { 0 };

    // Determine which core group to use for this thread
    int group_cpu_count_sum { 0 };
    for (int group = 0; group < group_count; group++)
    {
        int group_cpu_count = static_cast<int>(GetMaximumProcessorCount(static_cast<WORD>(group)));
        group_cpu_count_sum += group_cpu_count;

        if (assign_index < group_cpu_count_sum)
        {
            assign_group = group;
            assign_group_cpu_count = group_cpu_count;
            break;
        }
    }

    // No usable group found; an empty affinity mask would be rejected anyway
    if (assign_group_cpu_count <= 0)
    {
        return;
    }

    // Set the affinity to the assigned group, and all supported cores. The
    // mask must be built at KAFFINITY width, and a shift by the full width of
    // the type is undefined, so a completely full group is handled explicitly.
    constexpr int mask_bits = static_cast<int>(sizeof(KAFFINITY) * 8);

    GROUP_AFFINITY affinity {};
    if (assign_group_cpu_count >= mask_bits)
    {
        affinity.Mask = ~static_cast<KAFFINITY>(0);
    }
    else
    {
        affinity.Mask = (static_cast<KAFFINITY>(1) << assign_group_cpu_count) - 1;
    }

    affinity.Group = static_cast<WORD>(assign_group);
    SetThreadGroupAffinity(thread, &affinity, nullptr);
}

/**
* @brief Proxy Windows @c WaitForSingleObject underneath a pthreads-like wrapper.
*/
Expand All @@ -76,9 +121,8 @@ static int pthread_join(
/* See header for documentation */
int get_cpu_count()
{
    // Count the active logical processors across all processor groups. The
    // legacy GetSystemInfo() query only reports the processors in the current
    // processor group, which caps the result at 64 on large systems.
    DWORD cpu_count = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
    if (cpu_count == 0)
    {
        // The all-groups query failed; fall back to the single-group count
        SYSTEM_INFO sysinfo;
        GetSystemInfo(&sysinfo);
        cpu_count = sysinfo.dwNumberOfProcessors;
    }

    return static_cast<int>(cpu_count);
}

/* See header for documentation */
Expand Down Expand Up @@ -173,6 +217,12 @@ void launch_threads(

pthread_create(&(thread_descs[i].thread_handle), nullptr,
launch_threads_helper, reinterpret_cast<void*>(thread_descs + i));

// Windows 10 needs explicit thread assignment to handle large core count systems
// TODO: Add check to skip on Windows 11 or newer
#if defined(_WIN32) && !defined(__CYGWIN__)
set_group_affinity(thread_descs[i].thread_handle, i);
#endif
}

// ... and then wait for them to complete
Expand Down

0 comments on commit 9372b7c

Please sign in to comment.