From a8ea6627a4d3a099e69a88962b71649da8dadfa4 Mon Sep 17 00:00:00 2001
From: Kristian Nielsen <knielsen@knielsen-hq.org>
Date: Sun, 11 Jun 2023 17:44:58 +0200
Subject: [PATCH] MDEV-31448: Killing a replica thread awaiting its GCO can
 hang/crash a parallel replica

The problem was an incorrect unmark_start_commit() in
signal_error_to_sql_driver_thread(). If an event group gets an error, this
unmark could run after the following GCO started, and the subsequent
re-marking could access de-allocated GCO.

The offending unmark_start_commit() looks obviously incorrect, and the fix
is to just remove it. It was introduced in the MDEV-8302 patch, the commit
message of which suggests it was added there solely to satisfy an assertion
in ha_rollback_trans(). So update this assertion instead to not trigger for
event groups that experienced an error (rgi->worker_error). When an error
occurs in an event group, all following event groups are skipped anyway, so
the unmark should never be needed in this case.

Reviewed-by: Andrei Elkin <andrei.elkin@mariadb.com>
Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
---
 sql/handler.cc      | 13 +++++++++++--
 sql/rpl_parallel.cc |  5 -----
 sql/rpl_parallel.h  |  4 ++++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/sql/handler.cc b/sql/handler.cc
index f7d781eb82f..48ce7b3f1f8 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -1898,13 +1898,22 @@ int ha_rollback_trans(THD *thd, bool all)
       attempt. Otherwise those following transactions can run too early, and
       possibly cause replication to fail. See comments in retry_event_group().
 
+      (This concerns rollbacks due to temporary errors where the transaction
+      will be retried afterwards. For non-recoverable errors, following
+      transactions will not start but just be skipped as the worker threads
+      perform the error stop).
+
       There were several bugs with this in the past that were very hard to
       track down (MDEV-7458, MDEV-8302). So we add here an assertion for
       rollback without signalling following transactions. And in release
       builds, we explicitly do the signalling before rolling back.
     */
-    DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit));
-    if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)
+    DBUG_ASSERT( !(thd->rgi_slave &&
+                   !thd->rgi_slave->worker_error &&
+                   thd->rgi_slave->did_mark_start_commit));
+    if (thd->rgi_slave &&
+        !thd->rgi_slave->worker_error &&
+        thd->rgi_slave->did_mark_start_commit)
       thd->rgi_slave->unmark_start_commit();
   }
 #endif
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 98256118909..1dab1224286 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -286,16 +286,11 @@ static void
 signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
 {
   rgi->worker_error= err;
-  /*
-    In case we get an error during commit, inform following transactions that
-    we aborted our commit.
-  */
   DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
       if (rgi->current_gtid.seq_no == 2002) {
         debug_sync_set_action(thd, STRING_WITH_LEN("now WAIT_FOR cont_worker2"));
       }});
 
-  rgi->unmark_start_commit();
   rgi->cleanup_context(thd, true);
   rgi->rli->abort_slave= true;
   rgi->rli->stop_for_until= false;
diff --git a/sql/rpl_parallel.h b/sql/rpl_parallel.h
index b7304d204ee..6b03306692b 100644
--- a/sql/rpl_parallel.h
+++ b/sql/rpl_parallel.h
@@ -91,6 +91,10 @@ struct group_commit_orderer {
   };
   uint8 flags;
 #ifndef DBUG_OFF
+  /*
+    Flag set when the GCO has been freed and entered the free list, to catch
+    (in debug) errors in the complex lifetime of this object.
+  */
   bool gc_done;
 #endif
 };