MINOR: cpu-topo: add a new "group-by-ccx" CPU policy

This cpu-policy will only consider CCX and not clusters. This makes
a difference on machines with heterogeneous CPUs that generally share
the same L3 cache, where it is not desirable to create multiple groups
based on the CPU types, but instead to create a single one spanning all
the CPU types. The variants "group-by-2/3/4-ccx" have also been added.

Let's also add some text explaining the difference between cluster
and CCX.
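
As an illustration of the intended use, a minimal "global" section
enabling the new policy could look like the sketch below (the policy
name comes from this patch; everything else is a hypothetical example):

    global
        # one thread group per CCX (last-level cache domain), regardless
        # of how clusters split heterogeneous cores sharing that cache
        cpu-policy group-by-ccx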
Willy Tarreau, 2025-05-13 15:39:35 +02:00
parent 33d8b006d4
commit 01df98adad
2 changed files with 168 additions and 0 deletions

@@ -2013,6 +2013,14 @@ cpu-policy <policy>
the limit of 32 or 64 depending on the system. This is
the default policy.
- group-by-2-ccx same as "group-by-ccx" below but creates a group every
two CCX. This can make sense on CPUs having many CCX of
few cores each, to avoid creating many groups, or to
smooth the distribution a little bit when not all cores
are in use. Please note that it can have very bad
performance effects when the communication between CCX
is slow. This is generally recommended against.
- group-by-2-clusters same as "group-by-cluster" but creates a group every
two clusters. This can make sense on CPUs having many
clusters of few cores each, to avoid creating many
@@ -2022,6 +2030,14 @@ cpu-policy <policy>
between clusters is slow. This is generally recommended
against.
- group-by-3-ccx same as "group-by-ccx" below but creates a group every
three CCX. This can make sense on CPUs having many CCX
of few cores each, to avoid creating many groups, or to
smooth the distribution a little bit when not all cores
are in use. Please note that it can have very bad
performance effects when the communication between CCX
is slow. This is generally recommended against.
- group-by-3-clusters same as "group-by-cluster" but creates a group every
three clusters. This can make sense on CPUs having many
clusters of few cores each, to avoid creating many
@@ -2031,6 +2047,14 @@ cpu-policy <policy>
between clusters is slow. This is generally recommended
against.
- group-by-4-ccx same as "group-by-ccx" below but creates a group every
four CCX. This can make sense on CPUs having many CCX
of few cores each, to avoid creating many groups, or to
smooth the distribution a little bit when not all cores
are in use. Please note that it can have very bad
performance effects when the communication between CCX
is slow. This is generally recommended against.
- group-by-4-clusters same as "group-by-cluster" but creates a group every
four clusters. This can make sense on CPUs having many
clusters of few cores each, to avoid creating many
@@ -2040,6 +2064,21 @@ cpu-policy <policy>
between clusters is slow. This is generally recommended
against.
- group-by-ccx if neither "nbthread" nor "thread-groups" were set, then
one thread group is created for each CPU core complex
("CCX") with available CPUs, each with as many threads
as CPUs. A CCX groups CPUs having a similarly fast
access to the last level cache ("LLC"), typically the
L3 cache. On most modern machines, it is critical for
performance not to mix CPUs from distant CCX in the
same thread group. All threads of a group are then
bound to all CPUs of the CCX so that intra-group
communications remain local to the CCX without
enforcing too strong a binding. The per-group thread
limits and thread-group limits are respected. This is
recommended on multi-socket and NUMA systems, as well
as CPUs with bad inter-CCX latencies.
- group-by-cluster if neither "nbthread" nor "thread-groups" were set, then
one thread group is created for each CPU cluster with
available CPUs, each with as many threads as CPUs. All
@@ -2049,6 +2088,14 @@ cpu-policy <policy>
per-group thread limits and thread-group limits are
respected. This is recommended on multi-socket and NUMA
systems, as well as CPUs with bad inter-CCX latencies.
On most server machines, clusters and CCX are the same,
but on heterogeneous machines ("performance" vs
"efficiency" or "big" vs "little"), a cluster will
generally cover only a part of a CCX and be composed
only of very similar CPUs (same type, at most +/- 5%
frequency difference). The difference is visible on
modern laptops and desktop machines used by developers
and admins to validate setups (see the worked example
after this excerpt).
- performance exactly like group-by-cluster above, except that CPU
clusters whose performance is less than half of the

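To make the cluster/CCX distinction above concrete, consider a
hypothetical heterogeneous desktop CPU with 8 "performance" cores and
8 "efficiency" cores all sharing a single L3 cache: that is one CCX,
but two clusters. The two policies would then split the threads as
follows (a made-up machine, counts for illustration only):

    cpu-policy group-by-cluster  => 2 groups: 8 threads on the P cores,
                                    8 threads on the E cores
    cpu-policy group-by-ccx      => 1 group:  16 threads bound to all
                                    CPUs of the shared CCX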

@@ -52,6 +52,7 @@ static int cpu_policy = 1; // "first-usable-node"
/* list of CPU policies for "cpu-policy". The default one is the first one. */
static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_efficiency(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
@@ -60,6 +61,10 @@ static int cpu_policy_resource(int policy, int tmin, int tmax, int gmin, int gma
static struct ha_cpu_policy ha_cpu_policy[] = {
{ .name = "none", .desc = "use all available CPUs", .fct = NULL },
{ .name = "first-usable-node", .desc = "use only first usable node if nbthreads not set", .fct = cpu_policy_first_usable_node, .arg = 0 },
{ .name = "group-by-ccx", .desc = "make one thread group per CCX", .fct = cpu_policy_group_by_ccx , .arg = 1 },
{ .name = "group-by-2-ccx", .desc = "make one thread group per 2 CCX", .fct = cpu_policy_group_by_ccx , .arg = 2 },
{ .name = "group-by-3-ccx", .desc = "make one thread group per 3 CCX", .fct = cpu_policy_group_by_ccx , .arg = 3 },
{ .name = "group-by-4-ccx", .desc = "make one thread group per 4 CCX", .fct = cpu_policy_group_by_ccx , .arg = 4 },
{ .name = "group-by-cluster", .desc = "make one thread group per core cluster", .fct = cpu_policy_group_by_cluster , .arg = 1 },
{ .name = "group-by-2-clusters",.desc = "make one thread group per 2 core clusters", .fct = cpu_policy_group_by_cluster , .arg = 2 },
{ .name = "group-by-3-clusters",.desc = "make one thread group per 3 core clusters", .fct = cpu_policy_group_by_cluster , .arg = 3 },
@@ -1179,6 +1184,122 @@ static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin,
	return 0;
}

/* the "group-by-ccx" cpu-policy:
 *  - does nothing if nbthread or thread-groups are set
 *  - otherwise tries to create one thread-group per CCX (defined as the ID of
 *    the last level cache), with as many threads as CPUs in the CCX, and
 *    binds all the threads of this group to all the CPUs of the CCX. In
 *    practice, an ID of layer3 will have been assigned so we'll use this.
 * Also implements the variants "group-by-2-ccx", "group-by-3-ccx" and
 * "group-by-4-ccx".
 */
static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
{
	struct hap_cpuset visited_ccx_set;
	struct hap_cpuset node_cpu_set;
	int cpu, cpu_start;
	int cpu_count;
	int l3id;
	int thr_per_grp, nb_grp;
	int thr;
	int div;

	if (global.nbthread)
		return 0;

	if (global.nbtgroups)
		return 0;

	ha_cpuset_zero(&visited_ccx_set);

	/* iterate over each new CCX */
	cpu_start = 0;

	/* used as a divisor of the CCX number (for the 2/3/4-ccx variants) */
	div = ha_cpu_policy[policy].arg;
	div = div ? div : 1;

	while (global.nbtgroups < MAX_TGROUPS && global.nbthread < MAX_THREADS) {
		ha_cpuset_zero(&node_cpu_set);
		l3id = -1; cpu_count = 0;

		for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) {
			/* skip disabled and already visited CPUs */
			if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
				continue;

			if (ha_cpuset_isset(&visited_ccx_set, ha_cpu_topo[cpu].ca_id[3] / div))
				continue;

			if (l3id < 0) {
				l3id = ha_cpu_topo[cpu].ca_id[3] / div;
				cpu_start = cpu + 1;
			}
			else if (l3id != ha_cpu_topo[cpu].ca_id[3] / div)
				continue;

			/* make a mask of all of this CCX's CPUs */
			ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
			cpu_count++;
		}

		/* now l3id = next L3 ID or -1 if none; cpu_count is the
		 * number of CPUs in this CCX, and cpu_start is the next
		 * cpu to restart from to scan for new CCX.
		 */
		if (l3id < 0 || !cpu_count)
			break;

		ha_cpuset_set(&visited_ccx_set, l3id);

		/* check that we're still within limits. If there are too many
		 * CPUs but enough groups left, we'll try to make more smaller
		 * groups, of the closest size each.
		 */
		nb_grp = (cpu_count + MAX_THREADS_PER_GROUP - 1) / MAX_THREADS_PER_GROUP;
		if (nb_grp > MAX_TGROUPS - global.nbtgroups)
			nb_grp = MAX_TGROUPS - global.nbtgroups;

		thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;
		if (thr_per_grp > MAX_THREADS_PER_GROUP)
			thr_per_grp = MAX_THREADS_PER_GROUP;

		while (nb_grp && cpu_count > 0) {
			/* create at most thr_per_grp threads */
			if (thr_per_grp > cpu_count)
				thr_per_grp = cpu_count;

			if (thr_per_grp + global.nbthread > MAX_THREADS)
				thr_per_grp = MAX_THREADS - global.nbthread;

			/* let's create the new thread group */
			ha_tgroup_info[global.nbtgroups].base  = global.nbthread;
			ha_tgroup_info[global.nbtgroups].count = thr_per_grp;

			/* assign to this group the required number of threads */
			for (thr = 0; thr < thr_per_grp; thr++) {
				ha_thread_info[thr + global.nbthread].tgid = global.nbtgroups + 1;
				ha_thread_info[thr + global.nbthread].tg = &ha_tgroup_info[global.nbtgroups];
				ha_thread_info[thr + global.nbthread].tg_ctx = &ha_tgroup_ctx[global.nbtgroups];

				/* map these threads to all the CPUs */
				ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set);
			}

			cpu_count -= thr_per_grp;
			global.nbthread += thr_per_grp;
			global.nbtgroups++;
			if (global.nbtgroups >= MAX_TGROUPS || global.nbthread >= MAX_THREADS)
				break;
		}
	}

	if (global.nbthread)
		ha_diag_warning("Created %d threads split into %d groups\n", global.nbthread, global.nbtgroups);
	else
		ha_diag_warning("Could not determine any CPU cluster\n");
	return 0;
}
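
The group-sizing arithmetic in the function above can be checked in
isolation. Below is a small standalone sketch (function and constant
names are local to this example, not HAProxy's) reproducing the two
ceiling divisions used to split one CCX into evenly sized groups:

    #include <stdio.h>

    /* assumption for the example: a typical build-time limit */
    #define MAX_THREADS_PER_GROUP 64

    /* split cpu_count CPUs of one CCX into at most grp_left groups:
     * ceil(cpus/max) groups, each of at most ceil(cpus/groups) threads.
     */
    static void split_ccx(int cpu_count, int grp_left)
    {
        int nb_grp, thr_per_grp;

        nb_grp = (cpu_count + MAX_THREADS_PER_GROUP - 1) / MAX_THREADS_PER_GROUP;
        if (nb_grp > grp_left)
            nb_grp = grp_left;

        thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;
        if (thr_per_grp > MAX_THREADS_PER_GROUP)
            thr_per_grp = MAX_THREADS_PER_GROUP;

        while (nb_grp-- && cpu_count > 0) {
            int thr = thr_per_grp > cpu_count ? cpu_count : thr_per_grp;

            printf("group of %d threads\n", thr);
            cpu_count -= thr;
        }
    }

    int main(void)
    {
        split_ccx(96, 16); /* a hypothetical 96-CPU CCX => 2 groups of 48 */
        return 0;
    }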
/* the "performance" cpu-policy:
* - does nothing if nbthread or thread-groups are set
* - eliminates clusters whose total capacity is below half of others