BUG/MEDIUM: event_hdl: clean soft-stop handling

soft-stop was not explicitly handled in event_hdl API.

Because of this, event_hdl was causing some leaks on deinit paths.
Moreover, a task responsible for handling events could require some
additional cleanups (ie: advanced async task), and as the task was not
protected against abort when soft-stopping, such cleanup could not be
performed unless the task itself implements the required protections,
which is not optimal.

Consider this new approach:
 'jobs' global variable is incremented whenever an async subscription is
 created to prevent the related task from being aborted before the task
 acknowledges the final END event.

 Once the END event is acknowledged and freed by the task, the 'jobs'
 variable is decremented, and the deinit process may continue (including
 aborting the remaining tasks not guarded by the 'jobs' variable).

To do this, a new global mt_list is required: known_event_hdl_sub_list
This list tracks the known (initialized) subscription lists within the
process.

sub_lists are automatically added to the "known" list when calling
event_hdl_sub_list_init(), and are removed from the list with
event_hdl_sub_list_destroy().

This allows us to implement a global thread-safe event_hdl deinit()
function that is automatically called on soft-stop thanks to signal(0).
When event_hdl deinit() is initiated, we simply iterate against the known
subscription lists to destroy them.

event_hdl_subscribe_ptr() was slightly modified to make sure that a sub_list
may not accept new subscriptions once it is destroyed (removed from the
known list)
This can occur between the time the soft-stop is initiated (signal(0)) and
the time haproxy actually enters the deinit() function (once tasks are either
finished or aborted and other threads have already joined).

It is safe to destroy() the subscription list multiple times as long
as the pointer is still valid (ie: first on soft-stop when handling
the '0' signal, then from regular deinit() path): the function does
nothing if the subscription list is already removed.

We partially reverted "BUG/MINOR: event_hdl: make event_hdl_subscribe thread-safe"
since we can use parent mt_list locking instead of a dedicated lock to make
the check against duplicate subscription IDs.
(insert_lock is not useful anymore)

The check in itself is not changed, only the locking method.

sizeof(event_hdl_sub_list) slightly increases: from 24 bytes to 32 bytes due
to the additional mt_list struct within it.

With that said, having a thread-safe list to store known subscription lists
is a good thing: it could help to implement additional management
logic for subscription lists and could be useful to add some stats or
debugging tools in the future.

If 68e692da0 ("MINOR: event_hdl: add event handler base api")
is being backported, then this commit should be backported with it.
This commit is contained in:
Aurelien DARRAGON 2023-03-16 10:54:24 +01:00 committed by Christopher Faulet
parent 3a81e997ac
commit ef6ca67176
4 changed files with 65 additions and 19 deletions

View File

@ -25,7 +25,6 @@
#include <stdint.h>
#include <haproxy/api-t.h>
#include <haproxy/thread-t.h>
/* event data struct are defined as followed */
struct event_hdl_cb_data_template {
@ -75,7 +74,7 @@ struct event_hdl_sub_type
struct event_hdl_sub_list_head {
struct mt_list head;
__decl_thread(HA_SPINLOCK_T insert_lock);
struct mt_list known; /* api uses this to track known subscription lists */
};
/* event_hdl_sub_list is an alias (please use this for portability) */

View File

@ -425,7 +425,6 @@ enum lock_label {
IDLE_CONNS_LOCK,
QUIC_LOCK,
OCSP_LOCK,
EHDL_LOCK,
OTHER_LOCK,
/* WT: make sure never to use these ones outside of development,
* we need them for lock profiling!

View File

@ -16,6 +16,7 @@
#include <haproxy/task.h>
#include <haproxy/tools.h>
#include <haproxy/errors.h>
#include <haproxy/signal.h>
#include <haproxy/xxhash.h>
/* event types changes in event_hdl-t.h file should be reflected in the
@ -51,10 +52,31 @@ static int event_hdl_async_max_notif_at_once = 10;
/* global subscription list (implicit where NULL is used as sublist argument) */
static event_hdl_sub_list global_event_hdl_sub_list;
/* every known subscription lists are tracked in this list (including the global one) */
static struct mt_list known_event_hdl_sub_list = MT_LIST_HEAD_INIT(known_event_hdl_sub_list);
static void _event_hdl_sub_list_destroy(event_hdl_sub_list *sub_list);
static void event_hdl_deinit(struct sig_handler *sh)
{
event_hdl_sub_list *cur_list;
struct mt_list *elt1, elt2;
/* destroy all known subscription lists */
mt_list_for_each_entry_safe(cur_list, &known_event_hdl_sub_list, known, elt1, elt2) {
/* remove cur elem from list */
MT_LIST_DELETE_SAFE(elt1);
/* then destroy it */
_event_hdl_sub_list_destroy(cur_list);
}
}
static void event_hdl_init(void)
{
/* initialize global subscription list */
event_hdl_sub_list_init(&global_event_hdl_sub_list);
/* register the deinit function, will be called on soft-stop */
signal_register_fct(0, event_hdl_deinit, 0);
}
/* general purpose hashing function when you want to compute
@ -159,6 +181,7 @@ void event_hdl_async_free_event(struct event_hdl_async_event *e)
* (it is already removed from mt_list, no race can occur)
*/
event_hdl_drop(e->sub_mgmt.this);
HA_ATOMIC_DEC(&jobs);
}
else if (e->_data &&
HA_ATOMIC_SUB_FETCH(&e->_data->refcount, 1) == 0) {
@ -381,6 +404,7 @@ struct event_hdl_sub *event_hdl_subscribe_ptr(event_hdl_sub_list *sub_list,
struct event_hdl_sub *new_sub = NULL;
struct mt_list *elt1, elt2;
struct event_hdl_async_task_default_ctx *task_ctx = NULL;
struct mt_list lock;
if (!sub_list)
sub_list = &global_event_hdl_sub_list; /* fall back to global list */
@ -453,12 +477,13 @@ struct event_hdl_sub *event_hdl_subscribe_ptr(event_hdl_sub_list *sub_list,
/* ready for registration */
MT_LIST_INIT(&new_sub->mt_list);
lock = MT_LIST_LOCK_ELT(&sub_list->known);
/* check if such identified hdl is not already registered */
if (hdl.id) {
struct event_hdl_sub *cur_sub;
uint8_t found = 0;
HA_SPIN_LOCK(EHDL_LOCK, &sub_list->insert_lock);
mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) {
if (hdl.id == cur_sub->hdl.id) {
/* we found matching registered hdl */
@ -468,17 +493,35 @@ struct event_hdl_sub *event_hdl_subscribe_ptr(event_hdl_sub_list *sub_list,
}
if (found) {
/* error already registered */
HA_SPIN_UNLOCK(EHDL_LOCK, &sub_list->insert_lock);
MT_LIST_UNLOCK_ELT(&sub_list->known, lock);
event_hdl_report_hdl_state(ha_alert, &hdl, "SUB", "could not subscribe: subscription with this id already exists");
goto cleanup;
}
}
if (lock.next == &sub_list->known) {
/* this is an expected corner case on de-init path, a subscribe attempt
* was made but the subscription list is already destroyed, we pretend
* it is a memory/IO error since it should not be long before haproxy
* enters the deinit() function anyway
*/
MT_LIST_UNLOCK_ELT(&sub_list->known, lock);
goto cleanup;
}
/* Append in list (global or user specified list).
* For now, append when sync mode, and insert when async mode
* so that async handlers are executed first
*/
if (hdl.async) {
/* Prevent the task from being aborted on soft-stop: let's wait
* until the END event is acknowledged by the task.
* (decrease is performed in event_hdl_async_free_event())
*
* If we don't do this, event_hdl API will leak and we won't give
* a chance to the event-handling task to perform cleanup
*/
HA_ATOMIC_INC(&jobs);
/* async mode, insert at the beginning of the list */
MT_LIST_INSERT(&sub_list->head, &new_sub->mt_list);
} else {
@ -486,8 +529,7 @@ struct event_hdl_sub *event_hdl_subscribe_ptr(event_hdl_sub_list *sub_list,
MT_LIST_APPEND(&sub_list->head, &new_sub->mt_list);
}
if (hdl.id)
HA_SPIN_UNLOCK(EHDL_LOCK, &sub_list->insert_lock);
MT_LIST_UNLOCK_ELT(&sub_list->known, lock);
return new_sub;
@ -763,7 +805,21 @@ void event_hdl_sub_list_init(event_hdl_sub_list *sub_list)
{
BUG_ON(!sub_list); /* unexpected, global sublist is managed internally */
MT_LIST_INIT(&sub_list->head);
HA_SPIN_INIT(&sub_list->insert_lock);
MT_LIST_APPEND(&known_event_hdl_sub_list, &sub_list->known);
}
/* internal function, assumes that sub_list ptr is always valid */
static void _event_hdl_sub_list_destroy(event_hdl_sub_list *sub_list)
{
struct event_hdl_sub *cur_sub;
struct mt_list *elt1, elt2;
mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) {
/* remove cur elem from list */
MT_LIST_DELETE_SAFE(elt1);
/* then free it */
_event_hdl_unsubscribe(cur_sub);
}
}
/* when a subscription list is no longer used, call this
@ -772,17 +828,10 @@ void event_hdl_sub_list_init(event_hdl_sub_list *sub_list)
*/
void event_hdl_sub_list_destroy(event_hdl_sub_list *sub_list)
{
struct event_hdl_sub *cur_sub;
struct mt_list *elt1, elt2;
BUG_ON(!sub_list); /* unexpected, global sublist is managed internally */
mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) {
/* remove cur elem from list */
MT_LIST_DELETE_SAFE(elt1);
/* then free it */
_event_hdl_unsubscribe(cur_sub);
}
HA_SPIN_DESTROY(&sub_list->insert_lock);
if (!MT_LIST_DELETE(&sub_list->known))
return; /* already destroyed */
_event_hdl_sub_list_destroy(sub_list);
}
INITCALL0(STG_INIT, event_hdl_init);

View File

@ -444,7 +444,6 @@ static const char *lock_label(enum lock_label label)
case IDLE_CONNS_LOCK: return "IDLE_CONNS";
case QUIC_LOCK: return "QUIC";
case OCSP_LOCK: return "OCSP";
case EHDL_LOCK: return "EVENT_HDL";
case OTHER_LOCK: return "OTHER";
case DEBUG1_LOCK: return "DEBUG1";
case DEBUG2_LOCK: return "DEBUG2";