diff --git a/doc/configuration.txt b/doc/configuration.txt
index fd8af6bfa..9c7ed987a 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -1991,6 +1991,17 @@ cpu-policy
                           respected. This is recommended on multi-socket and
                           NUMA systems, as well as CPUs with bad inter-CCX
                           latencies.
+      - performance       exactly like group-by-cluster above, except that CPU
+                          clusters whose performance is less than half of the
+                          next more performant one are evicted. These are
+                          typically "little" or "efficient" cores, whose addition
+                          generally doesn't bring significant gains and can
+                          easily be counter-productive (e.g. TLS handshakes).
+                          Often, keeping such cores for other tasks such as
+                          network handling is much more effective. On development
+                          systems, these can also be used to run auxiliary tools
+                          such as load generators and monitoring tools.
+
   See also: "cpu-map", "cpu-set", "nbthread"
 
 cpu-set ...
diff --git a/src/cpu_topo.c b/src/cpu_topo.c
index b44c1005b..154763164 100644
--- a/src/cpu_topo.c
+++ b/src/cpu_topo.c
@@ -53,11 +53,13 @@ static int cpu_policy = 1; // "first-usable-node"
 
 /* list of CPU policies for "cpu-policy". The default one is the first one. */
 static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
+static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 
 static struct ha_cpu_policy ha_cpu_policy[] = {
 	{ .name = "none",              .desc = "use all available CPUs",                          .fct = NULL },
 	{ .name = "first-usable-node", .desc = "use only first usable node if nbthreads not set", .fct = cpu_policy_first_usable_node },
 	{ .name = "group-by-cluster",  .desc = "make one thread group per core cluster",          .fct = cpu_policy_group_by_cluster  },
+	{ .name = "performance",       .desc = "make one thread group per perf. core cluster",    .fct = cpu_policy_performance       },
 	{ 0 } /* end */
 };
@@ -531,6 +533,36 @@ void cpu_reorder_by_cluster_capa(struct ha_cpu_topo *topo, int entries)
 	qsort(topo, entries, sizeof(*topo), _cmp_cpu_cluster_capa);
 }
 
+/* functions below act on ha_cpu_cluster structs */
+
+/* function used by qsort to reorder clusters by index */
+int _cmp_cluster_index(const void *a, const void *b)
+{
+	const struct ha_cpu_cluster *l = (const struct ha_cpu_cluster *)a;
+	const struct ha_cpu_cluster *r = (const struct ha_cpu_cluster *)b;
+	return l->idx - r->idx;
+}
+
+/* function used by qsort to order clusters by reverse capacity */
+int _cmp_cluster_capa(const void *a, const void *b)
+{
+	const struct ha_cpu_cluster *l = (const struct ha_cpu_cluster *)a;
+	const struct ha_cpu_cluster *r = (const struct ha_cpu_cluster *)b;
+	return r->capa - l->capa;
+}
+
+/* re-order a cluster array by cluster index only */
+void cpu_cluster_reorder_by_index(struct ha_cpu_cluster *clusters, int entries)
+{
+	qsort(clusters, entries, sizeof(*clusters), _cmp_cluster_index);
+}
+
+/* re-order a cluster array by reverse capacity only */
+void cpu_cluster_reorder_by_capa(struct ha_cpu_cluster *clusters, int entries)
+{
+	qsort(clusters, entries, sizeof(*clusters), _cmp_cluster_capa);
+}
+
 /* returns an optimal maxcpus for the current system. It will take into
  * account what is reported by the OS, if any, otherwise will fall back
  * to the cpuset size, which serves as an upper limit in any case.
@@ -1064,6 +1096,45 @@ static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin,
 	return 0;
 }
 
+/* the "performance" cpu-policy:
+ *  - does nothing if nbthread or thread-groups are set
+ *  - eliminates clusters whose total capacity is below half of others
+ *  - tries to create one thread-group per cluster, with as many
+ *    threads as CPUs in the cluster, and bind all the threads of
+ *    this group to all the CPUs of the cluster.
+ */
+static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
+{
+	int cpu, cluster;
+	int capa;
+
+	if (global.nbthread || global.nbtgroups)
+		return 0;
+
+	/* sort clusters by reverse capacity */
+	cpu_cluster_reorder_by_capa(ha_cpu_clusters, cpu_topo_maxcpus);
+
+	capa = 0;
+	for (cluster = 0; cluster < cpu_topo_maxcpus; cluster++) {
+		if (capa && ha_cpu_clusters[cluster].capa < capa / 2) {
+			/* This cluster is more than twice as slow as the
+			 * previous one, we're not interested in using it.
+			 */
+			for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
+				if (ha_cpu_topo[cpu].cl_gid == ha_cpu_clusters[cluster].idx)
+					ha_cpu_topo[cpu].st |= HA_CPU_F_IGNORED;
+			}
+		}
+		else
+			capa = ha_cpu_clusters[cluster].capa;
+	}
+
+	cpu_cluster_reorder_by_index(ha_cpu_clusters, cpu_topo_maxcpus);
+
+	/* and finish using the group-by-cluster strategy */
+	return cpu_policy_group_by_cluster(policy, tmin, tmax, gmin, gmax, err);
+}
+
 /* apply the chosen CPU policy if no cpu-map was forced. Returns < 0 on failure
  * with a message in *err that must be freed by the caller if non-null.
  */