diff --git a/include/haproxy/cpu_topo.h b/include/haproxy/cpu_topo.h
index d80244079..5271af3ec 100644
--- a/include/haproxy/cpu_topo.h
+++ b/include/haproxy/cpu_topo.h
@@ -43,6 +43,9 @@ int cpu_map_configured(void);
  */
 void cpu_dump_topology(const struct ha_cpu_topo *topo);
 
+/* re-order the <entries> CPUs of array <topo> by locality to help form groups. */
+void cpu_reorder_by_locality(struct ha_cpu_topo *topo, int entries);
+
 /* re-order a CPU topology array by CPU index only, to undo the function above,
  * in case other calls need to be made on top of this.
  */
@@ -52,5 +55,6 @@ void cpu_reorder_by_index(struct ha_cpu_topo *topo, int entries);
  * outside cpu_topo).
  */
 int _cmp_cpu_index(const void *a, const void *b);
+int _cmp_cpu_locality(const void *a, const void *b);
 
 #endif /* _HAPROXY_CPU_TOPO_H */
diff --git a/src/cpu_topo.c b/src/cpu_topo.c
index 363ab8707..d0bf266e5 100644
--- a/src/cpu_topo.c
+++ b/src/cpu_topo.c
@@ -239,6 +239,89 @@ int _cmp_cpu_index(const void *a, const void *b)
 	return 0;
 }
 
+/* qsort comparator arranging two hwcpus by vicinity only: -1 says a<b, 1 says
+ * a>b, 0 says no level could tell them apart. The goal is to arrange the
+ * closest CPUs together, preferring locality over performance to keep latency
+ * as low as possible, so that when picking a fixed number of threads, the
+ * closest ones are used in priority. It also helps arranging groups at the
+ * end. A level where either ID is negative is a tie: the next one decides.
+ */
+int _cmp_cpu_locality(const void *a, const void *b)
+{
+	const struct ha_cpu_topo *l = (const struct ha_cpu_topo *)a;
+	const struct ha_cpu_topo *r = (const struct ha_cpu_topo *)b;
+
+	/* first, online vs offline: CPUs with no HA_CPU_F_EXCL_MASK bit go first */
+	if (!(l->st & HA_CPU_F_EXCL_MASK) && (r->st & HA_CPU_F_EXCL_MASK))
+		return -1;
+
+	if (!(r->st & HA_CPU_F_EXCL_MASK) && (l->st & HA_CPU_F_EXCL_MASK))
+		return 1;
+
+	/* next, package ID (lowest non-negative ID first, as for all levels below) */
+	if (l->pk_id >= 0 && l->pk_id < r->pk_id)
+		return -1;
+	if (l->pk_id > r->pk_id && r->pk_id >= 0)
+		return  1;
+
+	/* next, node ID */
+	if (l->no_id >= 0 && l->no_id < r->no_id)
+		return -1;
+	if (l->no_id > r->no_id && r->no_id >= 0)
+		return  1;
+
+	/* next, L4 */
+	if (l->ca_id[4] >= 0 && l->ca_id[4] < r->ca_id[4])
+		return -1;
+	if (l->ca_id[4] > r->ca_id[4] && r->ca_id[4] >= 0)
+		return  1;
+
+	/* next, L3 */
+	if (l->ca_id[3] >= 0 && l->ca_id[3] < r->ca_id[3])
+		return -1;
+	if (l->ca_id[3] > r->ca_id[3] && r->ca_id[3] >= 0)
+		return  1;
+
+	/* next, cluster (compared on ->cl_gid) */
+	if (l->cl_gid >= 0 && l->cl_gid < r->cl_gid)
+		return -1;
+	if (l->cl_gid > r->cl_gid && r->cl_gid >= 0)
+		return  1;
+
+	/* next, L2 */
+	if (l->ca_id[2] >= 0 && l->ca_id[2] < r->ca_id[2])
+		return -1;
+	if (l->ca_id[2] > r->ca_id[2] && r->ca_id[2] >= 0)
+		return  1;
+
+	/* next, thread set */
+	if (l->ts_id >= 0 && l->ts_id < r->ts_id)
+		return -1;
+	if (l->ts_id > r->ts_id && r->ts_id >= 0)
+		return  1;
+
+	/* next, L1 */
+	if (l->ca_id[1] >= 0 && l->ca_id[1] < r->ca_id[1])
+		return -1;
+	if (l->ca_id[1] > r->ca_id[1] && r->ca_id[1] >= 0)
+		return  1;
+
+	/* next, L0 */
+	if (l->ca_id[0] >= 0 && l->ca_id[0] < r->ca_id[0])
+		return -1;
+	if (l->ca_id[0] > r->ca_id[0] && r->ca_id[0] >= 0)
+		return  1;
+
+	/* next, IDX, so that SMT ordering is preserved */
+	if (l->idx >= 0 && l->idx < r->idx)
+		return -1;
+	if (l->idx > r->idx && r->idx >= 0)
+		return  1;
+
+	/* no level could distinguish them (same or absent IDs everywhere) */
+	return 0;
+}
+
 /* re-order a CPU topology array by CPU index only. This is mostly used before
  * listing CPUs regardless of their characteristics.
  */
@@ -247,6 +330,12 @@ void cpu_reorder_by_index(struct ha_cpu_topo *topo, int entries)
 	qsort(topo, entries, sizeof(*topo), _cmp_cpu_index);
 }
 
+/* qsort <entries> CPUs of <topo> via _cmp_cpu_locality() to help form groups */
+void cpu_reorder_by_locality(struct ha_cpu_topo *topo, int entries)
+{
+	qsort(topo, entries, sizeof(*topo), _cmp_cpu_locality);
+}
+
 /* returns an optimal maxcpus for the current system. It will take into
  * account what is reported by the OS, if any, otherwise will fall back
  * to the cpuset size, which serves as an upper limit in any case.