diff --git a/doc/configuration.txt b/doc/configuration.txt
index 89cededa0..430ab4f82 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -2013,6 +2013,14 @@ cpu-policy
                        the limit of 32 or 64 depending on the system. This is
                        the default policy.
 
+      - group-by-2-ccx same as "group-by-ccx" below but create a group every
+                       two CCX. This can make sense on CPUs having many CCX of
+                       few cores each, to avoid creating many groups, or to
+                       smooth the distribution a little bit when not all cores
+                       are in use. Please note that it can have very bad
+                       performance effects when the communication between CCX
+                       is slow. This is generally recommended against.
+
       - group-by-2-clusters same as "group-by-cluster" but create a group every
                        two clusters. This can make sense on CPUs having many
                        clusters of few cores each, to avoid creating many
@@ -2022,6 +2030,14 @@ cpu-policy
                        between clusters is slow. This is generally recommended
                        against.
 
+      - group-by-3-ccx same as "group-by-ccx" below but create a group every
+                       three CCX. This can make sense on CPUs having many CCX
+                       of few cores each, to avoid creating many groups, or to
+                       smooth the distribution a little bit when not all cores
+                       are in use. Please note that it can have very bad
+                       performance effects when the communication between CCX
+                       is slow. This is generally recommended against.
+
       - group-by-3-clusters same as "group-by-cluster" but create a group every
                        three clusters. This can make sense on CPUs having many
                        clusters of few cores each, to avoid creating many
@@ -2031,6 +2047,14 @@ cpu-policy
                        between clusters is slow. This is generally recommended
                        against.
 
+      - group-by-4-ccx same as "group-by-ccx" below but create a group every
+                       four CCX. This can make sense on CPUs having many CCX
+                       of few cores each, to avoid creating many groups, or to
+                       smooth the distribution a little bit when not all cores
+                       are in use. Please note that it can have very bad
+                       performance effects when the communication between CCX
+                       is slow. This is generally recommended against.
+
       - group-by-4-clusters same as "group-by-cluster" but create a group every
                        four clusters. This can make sense on CPUs having many
                        clusters of few cores each, to avoid creating many
@@ -2040,6 +2064,21 @@ cpu-policy
                        between clusters is slow. This is generally recommended
                        against.
 
+      - group-by-ccx   if neither "nbthread" nor "nbtgroups" were set, then
+                       one thread group is created for each CPU core complex
+                       ("CCX") with available CPUs, each with as many threads
+                       as CPUs. A CCX groups CPUs having a similarly fast
+                       access to the last level cache ("LLC"), typically the
+                       L3 cache. On most modern machines, it is critical for
+                       performance not to mix CPUs from distant CCX in the
+                       same thread group. All threads of a group are then
+                       bound to all CPUs of the CCX so that intra-group
+                       communications remain local to the CCX without
+                       enforcing too strong a binding. The per-group thread
+                       limits and thread-group limits are respected. This is
+                       recommended on multi-socket and NUMA systems, as well
+                       as CPUs with bad inter-CCX latencies.
+
       - group-by-cluster if neither "nbthread" not "nbtgroups" were set, then
                        one thread group is created for each CPU cluster with
                        available CPUs, each with as many threads as CPUs. All
@@ -2049,6 +2088,14 @@ cpu-policy
                        per-group thread limits and thread-group limits are
                        respected. This is recommended on multi-socket and NUMA
                        systems, as well as CPUs with bad inter-CCX latencies.
+                       On most server machines, clusters and CCX are the same,
+                       but on heterogeneous machines ("performance" vs
+                       "efficiency" or "big" vs "little"), a cluster will
+                       generally be made of only a part of a CCX, composed only
+                       of very similar CPUs (same type, +/-5% frequency
+                       difference max). The difference is visible on modern
+                       laptops and desktop machines used by developers and
+                       admins to validate setups.
 
       - performance    exactly like group-by-cluster above, except that CPU
                        clusters whose performance is less than half of the
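For readers validating the documentation above, this is what selecting the new
policy looks like in a configuration file (a minimal sketch; everything around
the "global" section is left out):

    global
        # one thread group per core complex (CCX, i.e. shared L3 cache);
        # all threads of a group are bound to all CPUs of their CCX
        cpu-policy group-by-ccx

On a two-CCX part this yields two thread groups; "group-by-2-ccx" would fold
both CCX into a single group instead.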
diff --git a/src/cpu_topo.c b/src/cpu_topo.c
index 42e44ecb2..f80a01f9b 100644
--- a/src/cpu_topo.c
+++ b/src/cpu_topo.c
@@ -52,6 +52,7 @@ static int cpu_policy = 1; // "first-usable-node"
 
 /* list of CPU policies for "cpu-policy". The default one is the first one. */
 static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
+static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 static int cpu_policy_efficiency(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
@@ -60,6 +61,10 @@ static int cpu_policy_resource(int policy, int tmin, int tmax, int gmin, int gma
 static struct ha_cpu_policy ha_cpu_policy[] = {
 	{ .name = "none",               .desc = "use all available CPUs",                           .fct = NULL   },
 	{ .name = "first-usable-node",  .desc = "use only first usable node if nbthreads not set",  .fct = cpu_policy_first_usable_node, .arg = 0 },
+	{ .name = "group-by-ccx",       .desc = "make one thread group per CCX",                    .fct = cpu_policy_group_by_ccx     , .arg = 1 },
+	{ .name = "group-by-2-ccx",     .desc = "make one thread group per 2 CCX",                  .fct = cpu_policy_group_by_ccx     , .arg = 2 },
+	{ .name = "group-by-3-ccx",     .desc = "make one thread group per 3 CCX",                  .fct = cpu_policy_group_by_ccx     , .arg = 3 },
+	{ .name = "group-by-4-ccx",     .desc = "make one thread group per 4 CCX",                  .fct = cpu_policy_group_by_ccx     , .arg = 4 },
 	{ .name = "group-by-cluster",   .desc = "make one thread group per core cluster",           .fct = cpu_policy_group_by_cluster , .arg = 1 },
 	{ .name = "group-by-2-clusters",.desc = "make one thread group per 2 core clusters",        .fct = cpu_policy_group_by_cluster , .arg = 2 },
 	{ .name = "group-by-3-clusters",.desc = "make one thread group per 3 core clusters",        .fct = cpu_policy_group_by_cluster , .arg = 3 },
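Note how the four *-ccx entries share a single callback and differ only by
.arg, which the function in the next hunk uses as a divisor of the L3 cache ID
so that consecutive CCX fold into the same group key. A standalone C
illustration of that folding (not HAProxy code, just the arithmetic):

	#include <stdio.h>

	int main(void)
	{
		int l3id, div = 2;   /* div mirrors ha_cpu_policy[policy].arg ("group-by-2-ccx") */

		/* CCX 0 and 1 fold into key 0, CCX 2 and 3 into key 1, etc. */
		for (l3id = 0; l3id < 6; l3id++)
			printf("ccx %d -> group key %d\n", l3id, l3id / div);
		return 0;
	}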
@@ -1179,6 +1184,122 @@ static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin,
 	return 0;
 }
 
+/* the "group-by-ccx" cpu-policy:
+ *  - does nothing if nbthread or thread-groups are set
+ *  - otherwise tries to create one thread-group per CCX (defined as the ID of
+ *    the last level cache), with as many threads as CPUs in the CCX, and bind
+ *    all the threads of this group to all the CPUs of the CCX. In practice, an
+ *    ID of layer3 will have been assigned so we'll use this.
+ *  Also implements the variants "group-by-2-ccx", "group-by-3-ccx" and
+ *  "group-by-4-ccx".
+ */
+static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
+{
+	struct hap_cpuset visited_ccx_set;
+	struct hap_cpuset node_cpu_set;
+	int cpu, cpu_start;
+	int cpu_count;
+	int l3id;
+	int thr_per_grp, nb_grp;
+	int thr;
+	int div;
+
+	if (global.nbthread)
+		return 0;
+
+	if (global.nbtgroups)
+		return 0;
+
+	ha_cpuset_zero(&visited_ccx_set);
+
+	/* iterate over each new ccx */
+	cpu_start = 0;
+
+	/* used as a divisor of ccx */
+	div = ha_cpu_policy[policy].arg;
+	div = div ? div : 1;
+
+	while (global.nbtgroups < MAX_TGROUPS && global.nbthread < MAX_THREADS) {
+		ha_cpuset_zero(&node_cpu_set);
+		l3id = -1; cpu_count = 0;
+
+		for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) {
+			/* skip disabled and already visited CPUs */
+			if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
+				continue;
+
+			if (ha_cpuset_isset(&visited_ccx_set, ha_cpu_topo[cpu].ca_id[3] / div))
+				continue;
+
+			if (l3id < 0) {
+				l3id = ha_cpu_topo[cpu].ca_id[3] / div;
+				cpu_start = cpu + 1;
+			}
+			else if (l3id != ha_cpu_topo[cpu].ca_id[3] / div)
+				continue;
+
+			/* make a mask of all of this CCX's CPUs */
+			ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
+			cpu_count++;
+		}
+
+		/* now l3id = next L3 ID or -1 if none; cpu_count is the
+		 * number of CPUs in this CCX, and cpu_start is the next
+		 * cpu to restart from to scan for new CCX.
+		 */
+		if (l3id < 0 || !cpu_count)
+			break;
+
+		ha_cpuset_set(&visited_ccx_set, l3id);
+
+		/* check that we're still within limits. If there are too many
+		 * CPUs but enough groups left, we'll try to make more smaller
+		 * groups, of the closest size each.
+		 */
+		nb_grp = (cpu_count + MAX_THREADS_PER_GROUP - 1) / MAX_THREADS_PER_GROUP;
+		if (nb_grp > MAX_TGROUPS - global.nbtgroups)
+			nb_grp = MAX_TGROUPS - global.nbtgroups;
+		thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;
+		if (thr_per_grp > MAX_THREADS_PER_GROUP)
+			thr_per_grp = MAX_THREADS_PER_GROUP;
+
+		while (nb_grp && cpu_count > 0) {
+			/* create at most thr_per_grp threads */
+			if (thr_per_grp > cpu_count)
+				thr_per_grp = cpu_count;
+
+			if (thr_per_grp + global.nbthread > MAX_THREADS)
+				thr_per_grp = MAX_THREADS - global.nbthread;
+
+			/* let's create the new thread group */
+			ha_tgroup_info[global.nbtgroups].base  = global.nbthread;
+			ha_tgroup_info[global.nbtgroups].count = thr_per_grp;
+
+			/* assign to this group the required number of threads */
+			for (thr = 0; thr < thr_per_grp; thr++) {
+				ha_thread_info[thr + global.nbthread].tgid = global.nbtgroups + 1;
+				ha_thread_info[thr + global.nbthread].tg = &ha_tgroup_info[global.nbtgroups];
+				ha_thread_info[thr + global.nbthread].tg_ctx = &ha_tgroup_ctx[global.nbtgroups];
+				/* map these threads to all the CPUs of the CCX */
+				ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set);
+			}
+
+			cpu_count -= thr_per_grp;
+			global.nbthread += thr_per_grp;
+			global.nbtgroups++;
+			if (global.nbtgroups >= MAX_TGROUPS || global.nbthread >= MAX_THREADS)
+				break;
+		}
+	}
+
+	if (global.nbthread)
+		ha_diag_warning("Created %d threads split into %d groups\n", global.nbthread, global.nbtgroups);
+	else
+		ha_diag_warning("Could not determine any CPU core complex (CCX)\n");
+
+	return 0;
+}
+
 /* the "performance" cpu-policy:
  *  - does nothing if nbthread or thread-groups are set
  *  - eliminates clusters whose total capacity is below half of others
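One part of the new function worth a worked example is the splitting of an
oversized CCX: the two ceiling divisions aim at groups of even size rather
than one full group plus a small remainder. A self-contained sketch of that
arithmetic (MAX_THREADS_PER_GROUP is assumed to be 64 here, as on 64-bit
builds; illustration only):

	#include <stdio.h>

	#define MAX_THREADS_PER_GROUP 64   /* assumption: value on a 64-bit build */

	int main(void)
	{
		int cpu_count = 96;   /* CPUs found in one CCX (or merged CCX group) */
		int nb_grp, thr_per_grp;

		/* same ceiling divisions as in cpu_policy_group_by_ccx() */
		nb_grp = (cpu_count + MAX_THREADS_PER_GROUP - 1) / MAX_THREADS_PER_GROUP;
		thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;

		/* prints "96 CPUs -> 2 groups of up to 48 threads" (not 64 + 32) */
		printf("%d CPUs -> %d groups of up to %d threads\n",
		       cpu_count, nb_grp, thr_per_grp);
		return 0;
	}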