diff --git a/doc/configuration.txt b/doc/configuration.txt
index 89cededa0..430ab4f82 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -2013,6 +2013,14 @@ cpu-policy
                        the limit of 32 or 64 depending on the system. This is
                        the default policy.
 
+      - group-by-2-ccx same as "group-by-ccx" below but create a group every
+                       two CCX. This can make sense on CPUs having many CCX of
+                       few cores each, to avoid creating many groups, or to
+                       smooth the distribution a little bit when not all cores
+                       are in use. Please note that it can have very bad
+                       performance effects when the communication between CCX
+                       is slow. This is generally recommended against.
+
       - group-by-2-clusters same as "group-by-cluster" but create a group every
                        two clusters. This can make sense on CPUs having many
                        clusters of few cores each, to avoid creating many
@@ -2022,6 +2030,14 @@ cpu-policy
                        between clusters is slow. This is generally recommended
                        against.
 
+      - group-by-3-ccx same as "group-by-ccx" below but create a group every
+                       three CCX. This can make sense on CPUs having many CCX
+                       of few cores each, to avoid creating many groups, or to
+                       smooth the distribution a little bit when not all cores
+                       are in use. Please note that it can have very bad
+                       performance effects when the communication between CCX
+                       is slow. This is generally recommended against.
+
       - group-by-3-clusters same as "group-by-cluster" but create a group every
                        three clusters. This can make sense on CPUs having many
                        clusters of few cores each, to avoid creating many
@@ -2031,6 +2047,14 @@ cpu-policy
                        between clusters is slow. This is generally recommended
                        against.
 
+      - group-by-4-ccx same as "group-by-ccx" below but create a group every
+                       four CCX. This can make sense on CPUs having many CCX
+                       of few cores each, to avoid creating many groups, or to
+                       smooth the distribution a little bit when not all cores
+                       are in use. Please note that it can have very bad
+                       performance effects when the communication between CCX
+                       is slow. This is generally recommended against.
+
       - group-by-4-clusters same as "group-by-cluster" but create a group every
                        four clusters. This can make sense on CPUs having many
                        clusters of few cores each, to avoid creating many
@@ -2040,6 +2064,21 @@ cpu-policy
                        between clusters is slow. This is generally recommended
                        against.
 
+      - group-by-ccx   if neither "nbthread" nor "nbtgroups" were set, then
+                       one thread group is created for each CPU core complex
+                       ("CCX") with available CPUs, each with as many threads
+                       as CPUs. A CCX groups CPUs having a similarly fast
+                       access to the last level cache ("LLC"), typically the
+                       L3 cache. On most modern machines, it is critical for
+                       performance not to mix CPUs from distant CCX in the
+                       same thread group. All threads of a group are then
+                       bound to all CPUs of the CCX so that intra-group
+                       communications remain local to the CCX without
+                       enforcing too strong a binding. The per-group thread
+                       limits and thread-group limits are respected. This is
+                       recommended on multi-socket and NUMA systems, as well
+                       as CPUs with bad inter-CCX latencies.
+
       - group-by-cluster if neither "nbthread" not "nbtgroups" were set, then
                        one thread group is created for each CPU cluster with
                        available CPUs, each with as many threads as CPUs. All
@@ -2049,6 +2088,14 @@ cpu-policy
                        per-group thread limits and thread-group limits are
                        respected. This is recommended on multi-socket and NUMA
                        systems, as well as CPUs with bad inter-CCX latencies.
+                       On most server machines, clusters and CCX are the same,
+                       but on heterogeneous machines ("performance" vs
+                       "efficiency" or "big" vs "little"), a cluster will
+                       generally be made of only a part of a CCX, composed only
+                       of very similar CPUs (same type, +/-5% frequency
+                       difference max). The difference is visible on modern
+                       laptops and desktop machines used by developers and
+                       admins to validate setups.
 
       - performance    exactly like group-by-cluster above, except that CPU
                        clusters whose performance is less than half of the
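For readers validating the documentation above, this is what selecting the new
policy looks like in a configuration file (a minimal sketch; everything around
the "global" section is left out):

    global
        # one thread group per core complex (CCX, i.e. shared L3 cache);
        # all threads of a group are bound to all CPUs of their CCX
        cpu-policy group-by-ccx

On a two-CCX part this yields two thread groups; "group-by-2-ccx" would fold
both CCX into a single group instead.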
diff --git a/src/cpu_topo.c b/src/cpu_topo.c
index 42e44ecb2..f80a01f9b 100644
--- a/src/cpu_topo.c
+++ b/src/cpu_topo.c
@@ -52,6 +52,7 @@ static int cpu_policy = 1; // "first-usable-node"
 
 /* list of CPU policies for "cpu-policy". The default one is the first one. */
 static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
+static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 static int cpu_policy_efficiency(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
@@ -60,6 +61,10 @@ static int cpu_policy_resource(int policy, int tmin, int tmax, int gmin, int gma
 static struct ha_cpu_policy ha_cpu_policy[] = {
 	{ .name = "none",               .desc = "use all available CPUs",                           .fct = NULL   },
 	{ .name = "first-usable-node",  .desc = "use only first usable node if nbthreads not set",  .fct = cpu_policy_first_usable_node, .arg = 0 },
+	{ .name = "group-by-ccx",       .desc = "make one thread group per CCX",                    .fct = cpu_policy_group_by_ccx     , .arg = 1 },
+	{ .name = "group-by-2-ccx",     .desc = "make one thread group per 2 CCX",                  .fct = cpu_policy_group_by_ccx     , .arg = 2 },
+	{ .name = "group-by-3-ccx",     .desc = "make one thread group per 3 CCX",                  .fct = cpu_policy_group_by_ccx     , .arg = 3 },
+	{ .name = "group-by-4-ccx",     .desc = "make one thread group per 4 CCX",                  .fct = cpu_policy_group_by_ccx     , .arg = 4 },
 	{ .name = "group-by-cluster",   .desc = "make one thread group per core cluster",           .fct = cpu_policy_group_by_cluster , .arg = 1 },
 	{ .name = "group-by-2-clusters",.desc = "make one thread group per 2 core clusters",        .fct = cpu_policy_group_by_cluster , .arg = 2 },
 	{ .name = "group-by-3-clusters",.desc = "make one thread group per 3 core clusters",        .fct = cpu_policy_group_by_cluster , .arg = 3 },
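Note how the four *-ccx entries share a single callback and differ only by
.arg, which the function in the next hunk uses as a divisor of the L3 cache ID
so that consecutive CCX fold into the same group key. A standalone C
illustration of that folding (not HAProxy code, just the arithmetic):

	#include <stdio.h>

	int main(void)
	{
		int l3id, div = 2;   /* div mirrors ha_cpu_policy[policy].arg ("group-by-2-ccx") */

		/* CCX 0 and 1 fold into key 0, CCX 2 and 3 into key 1, etc. */
		for (l3id = 0; l3id < 6; l3id++)
			printf("ccx %d -> group key %d\n", l3id, l3id / div);
		return 0;
	}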
@@ -1179,6 +1184,122 @@ static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin,
 	return 0;
 }
 
+/* the "group-by-ccx" cpu-policy:
+ *  - does nothing if nbthread or thread-groups are set
+ *  - otherwise tries to create one thread-group per CCX (defined as the ID of
+ *    the last level cache), with as many threads as CPUs in the CCX, and bind
+ *    all the threads of this group to all the CPUs of the CCX. In practice, an
+ *    ID of layer3 will have been assigned so we'll use this.
+ *  Also implements the variants "group-by-2-ccx", "group-by-3-ccx" and
+ *  "group-by-4-ccx".
+ */
+static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
+{
+	struct hap_cpuset visited_ccx_set;
+	struct hap_cpuset node_cpu_set;
+	int cpu, cpu_start;
+	int cpu_count;
+	int l3id;
+	int thr_per_grp, nb_grp;
+	int thr;
+	int div;
+
+	if (global.nbthread)
+		return 0;
+
+	if (global.nbtgroups)
+		return 0;
+
+	ha_cpuset_zero(&visited_ccx_set);
+
+	/* iterate over each new ccx */
+	cpu_start = 0;
+
+	/* used as a divisor of ccx */
+	div = ha_cpu_policy[policy].arg;
+	div = div ? div : 1;
+
+	while (global.nbtgroups < MAX_TGROUPS && global.nbthread < MAX_THREADS) {
+		ha_cpuset_zero(&node_cpu_set);
+		l3id = -1; cpu_count = 0;
+
+		for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) {
+			/* skip disabled and already visited CPUs */
+			if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
+				continue;
+
+			if (ha_cpuset_isset(&visited_ccx_set, ha_cpu_topo[cpu].ca_id[3] / div))
+				continue;
+
+			if (l3id < 0) {
+				l3id = ha_cpu_topo[cpu].ca_id[3] / div;
+				cpu_start = cpu + 1;
+			}
+			else if (l3id != ha_cpu_topo[cpu].ca_id[3] / div)
+				continue;
+
+			/* make a mask of all of this CCX's CPUs */
+			ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
+			cpu_count++;
+		}
+
+		/* now l3id = next L3 ID or -1 if none; cpu_count is the
+		 * number of CPUs in this CCX, and cpu_start is the next
+		 * cpu to restart from to scan for new CCX.
+		 */
+		if (l3id < 0 || !cpu_count)
+			break;
+
+		ha_cpuset_set(&visited_ccx_set, l3id);
+
+		/* check that we're still within limits. If there are too many
+		 * CPUs but enough groups left, we'll try to make more smaller
+		 * groups, of the closest size each.
+		 */
+		nb_grp = (cpu_count + MAX_THREADS_PER_GROUP - 1) / MAX_THREADS_PER_GROUP;
+		if (nb_grp > MAX_TGROUPS - global.nbtgroups)
+			nb_grp = MAX_TGROUPS - global.nbtgroups;
+		thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;
+		if (thr_per_grp > MAX_THREADS_PER_GROUP)
+			thr_per_grp = MAX_THREADS_PER_GROUP;
+
+		while (nb_grp && cpu_count > 0) {
+			/* create at most thr_per_grp threads */
+			if (thr_per_grp > cpu_count)
+				thr_per_grp = cpu_count;
+
+			if (thr_per_grp + global.nbthread > MAX_THREADS)
+				thr_per_grp = MAX_THREADS - global.nbthread;
+
+			/* let's create the new thread group */
+			ha_tgroup_info[global.nbtgroups].base  = global.nbthread;
+			ha_tgroup_info[global.nbtgroups].count = thr_per_grp;
+
+			/* assign to this group the required number of threads */
+			for (thr = 0; thr < thr_per_grp; thr++) {
+				ha_thread_info[thr + global.nbthread].tgid = global.nbtgroups + 1;
+				ha_thread_info[thr + global.nbthread].tg = &ha_tgroup_info[global.nbtgroups];
+				ha_thread_info[thr + global.nbthread].tg_ctx = &ha_tgroup_ctx[global.nbtgroups];
+				/* map these threads to all the CPUs of the CCX */
+				ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set);
+			}
+
+			cpu_count -= thr_per_grp;
+			global.nbthread += thr_per_grp;
+			global.nbtgroups++;
+			if (global.nbtgroups >= MAX_TGROUPS || global.nbthread >= MAX_THREADS)
+				break;
+		}
+	}
+
+	if (global.nbthread)
+		ha_diag_warning("Created %d threads split into %d groups\n", global.nbthread, global.nbtgroups);
+	else
+		ha_diag_warning("Could not determine any CPU core complex (CCX)\n");
+
+	return 0;
+}
+
 /* the "performance" cpu-policy:
  *  - does nothing if nbthread or thread-groups are set
  *  - eliminates clusters whose total capacity is below half of others
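One part of the new function worth a worked example is the splitting of an
oversized CCX: the two ceiling divisions aim at groups of even size rather
than one full group plus a small remainder. A self-contained sketch of that
arithmetic (MAX_THREADS_PER_GROUP is assumed to be 64 here, as on 64-bit
builds; illustration only):

	#include <stdio.h>

	#define MAX_THREADS_PER_GROUP 64   /* assumption: value on a 64-bit build */

	int main(void)
	{
		int cpu_count = 96;   /* CPUs found in one CCX (or merged CCX group) */
		int nb_grp, thr_per_grp;

		/* same ceiling divisions as in cpu_policy_group_by_ccx() */
		nb_grp = (cpu_count + MAX_THREADS_PER_GROUP - 1) / MAX_THREADS_PER_GROUP;
		thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;

		/* prints "96 CPUs -> 2 groups of up to 48 threads" (not 64 + 32) */
		printf("%d CPUs -> %d groups of up to %d threads\n",
		       cpu_count, nb_grp, thr_per_grp);
		return 0;
	}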