From 74743b0d88139015d4037f207442dfad850b6c2a Mon Sep 17 00:00:00 2001 From: Sergei Golubchik Date: Mon, 11 Nov 2024 19:53:41 +0100 Subject: [PATCH] fix test failures on x86, gcc -O1 x86 builds don't use SIMD; fast math and inlining cause distances to be quite unstable, and 1) comparison with the threshold no longer works: the distance calculated twice between the same two vectors comes out differently 2) a bunch of identical vectors get a non-zero distance between them and HNSW cross-links them with no outbound links (if there are more than 2M identical vectors). Let's strengthen the select_neighbors heuristic to skip neighbors that are too close to each other. MDEV-35418 suggests a better solution for this. --- sql/vector_mhnsw.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc index 2669706f47e..bee74c4d13a 100644 --- a/sql/vector_mhnsw.cc +++ b/sql/vector_mhnsw.cc @@ -908,7 +908,7 @@ static int select_neighbors(MHNSW_Share *ctx, TABLE *graph, size_t layer, { Visited *vec= pq.pop(); FVectorNode * const node= vec->node; - const float target_dista= vec->distance_to_target / alpha; + const float target_dista= std::max(32*FLT_EPSILON, vec->distance_to_target / alpha); bool discard= false; for (size_t i=0; i < neighbors.num; i++) if ((discard= node->distance_to(neighbors.links[i]->vec) <= target_dista)) @@ -1348,7 +1348,7 @@ int mhnsw_read_next(TABLE *table) } ctx->release(false, table->s); // release shared ctx result->ctx= trx; // replace it with trx - result->ctx_version= trx->version; + result->ctx_version= trx->version; std::swap(trx, ctx); // free shared ctx in this scope, keep trx } @@ -1358,7 +1358,7 @@ int mhnsw_read_next(TABLE *table) static_cast(result->pos), 0, &result->found, false)) return err; result->pos= 0; - result->threshold= new_threshold; + result->threshold= new_threshold + FLT_EPSILON; return mhnsw_read_next(table); }