MDEV-35897 vector index search allocates too much memory for large ef_search

Never estimate that a graph search will visit more nodes than there
are in the graph. In fact, let's cap the estimate at the graph size
divided by 1.3 (that is, the graph may be 30% larger than the
estimate): this increases the false positive rate of the bloom filter
by 2% when visiting the whole graph, which doesn't affect recall
noticeably.

We need to read the shared graph size under a lock, so let's store it
in the thread-local, otherwise unused TABLE::used_stat_records member.
This commit is contained in:
Sergei Golubchik 2025-04-20 10:42:53 +02:00
parent 395db6f1d5
commit 82867e07e3

View File

@ -1,5 +1,5 @@
/*
Copyright (c) 2024, MariaDB plc
Copyright (c) 2024, 2025, MariaDB plc
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -753,6 +753,8 @@ MHNSW_Share *MHNSW_Share::get_from_share(TABLE_SHARE *share, TABLE *table)
}
if (ctx)
ctx->refcnt++;
if (table) // hijack TABLE::used_stat_records
table->hlindex->used_stat_records= ctx->node_cache.size();
share->unlock_share();
return ctx;
}
@ -1144,8 +1146,9 @@ static int search_layer(MHNSW_Share *ctx, TABLE *graph, const FVector *target,
// WARNING! heuristic here
const double est_heuristic= 8 * std::sqrt(ctx->max_neighbors(layer));
const uint est_size= static_cast<uint>(est_heuristic * std::pow(ef, ctx->ef_power));
VisitedSet visited(root, target, est_size);
double est_size= est_heuristic * std::pow(ef, ctx->ef_power);
set_if_smaller(est_size, graph->used_stat_records/1.3);
VisitedSet visited(root, target, static_cast<uint>(est_size));
candidates.init(max_ef, false, Visited::cmp);
best.init(ef, true, Visited::cmp);
@ -1213,9 +1216,9 @@ static int search_layer(MHNSW_Share *ctx, TABLE *graph, const FVector *target,
}
}
set_if_bigger(ctx->diameter, max_distance); // not atomic, but it's ok
if (ef > 1 && visited.count*2 > est_size)
if (ef > 1 && visited.count > est_size)
{
double ef_power= std::log(visited.count*2/est_heuristic) / std::log(ef);
double ef_power= std::log(visited.count/est_heuristic) / std::log(ef);
set_if_bigger(ctx->ef_power, ef_power); // not atomic, but it's ok
}