From b4ace139a11a312ea4d32509295b45030ff8f119 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= <marko.makela@mariadb.com>
Date: Tue, 15 Aug 2023 12:14:31 +0300
Subject: [PATCH 1/3] Remove the often-hanging test innodb.alter_rename_files

The test innodb.alter_rename_files rather frequently hangs in
checkpoint_set_now. The test was removed in MariaDB Server 10.5
commit 37e7bde12abddcda4d5505450e39a739561bd4d5 when the code that
it aimed to cover was simplified. Starting with MariaDB Server 10.5
the page flushing and log checkpointing is much simpler, handled
by the single buf_flush_page_cleaner() thread.

Let us remove the test to avoid occasional failures. We are not going
to fix the cause of the failure in MariaDB Server 10.4.
---
 .../suite/innodb/r/alter_rename_files.result  | 19 ------------
 .../suite/innodb/t/alter_rename_files.test    | 31 -------------------
 2 files changed, 50 deletions(-)
 delete mode 100644 mysql-test/suite/innodb/r/alter_rename_files.result
 delete mode 100644 mysql-test/suite/innodb/t/alter_rename_files.test

diff --git a/mysql-test/suite/innodb/r/alter_rename_files.result b/mysql-test/suite/innodb/r/alter_rename_files.result
deleted file mode 100644
index 4e373d95fad..00000000000
--- a/mysql-test/suite/innodb/r/alter_rename_files.result
+++ /dev/null
@@ -1,19 +0,0 @@
-CREATE TABLE t1 (x INT NOT NULL UNIQUE KEY) ENGINE=InnoDB;
-INSERT INTO t1 VALUES(5);
-SET GLOBAL innodb_log_checkpoint_now=TRUE;
-SET DEBUG_SYNC='commit_cache_rebuild SIGNAL ready WAIT_FOR finish';
-ALTER TABLE t1 FORCE;;
-connect  con1,localhost,root,,;
-SET DEBUG_SYNC='now WAIT_FOR ready';
-SET GLOBAL innodb_log_checkpoint_now=TRUE;
-SET DEBUG_SYNC='now SIGNAL finish';
-disconnect con1;
-connection default;
-SHOW CREATE TABLE t1;
-Table	Create Table
-t1	CREATE TABLE `t1` (
-  `x` int(11) NOT NULL,
-  UNIQUE KEY `x` (`x`)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci
-DROP TABLE t1;
-SET DEBUG_SYNC='RESET';
diff --git a/mysql-test/suite/innodb/t/alter_rename_files.test b/mysql-test/suite/innodb/t/alter_rename_files.test
deleted file mode 100644
index 27408320f7d..00000000000
--- a/mysql-test/suite/innodb/t/alter_rename_files.test
+++ /dev/null
@@ -1,31 +0,0 @@
---source include/have_debug.inc
---source include/have_debug_sync.inc
---source include/have_innodb.inc
---source include/count_sessions.inc
-
-CREATE TABLE t1 (x INT NOT NULL UNIQUE KEY) ENGINE=InnoDB;
-INSERT INTO t1 VALUES(5);
-
-SET GLOBAL innodb_log_checkpoint_now=TRUE;
-
-# Start an ALTER TABLE and stop it before renaming the files
-SET DEBUG_SYNC='commit_cache_rebuild SIGNAL ready WAIT_FOR finish';
-
---send ALTER TABLE t1 FORCE;
-
-connect (con1,localhost,root,,);
-
-SET DEBUG_SYNC='now WAIT_FOR ready';
-
-SET GLOBAL innodb_log_checkpoint_now=TRUE;
-
-SET DEBUG_SYNC='now SIGNAL finish';
-
-disconnect con1;
-connection default;
-reap;
-SHOW CREATE TABLE t1;
-DROP TABLE t1;
-SET DEBUG_SYNC='RESET';
-
---source include/wait_until_count_sessions.inc

From 920789e9d413504a137aab486961eac06d02b203 Mon Sep 17 00:00:00 2001
From: Kristian Nielsen <knielsen@knielsen-hq.org>
Date: Sun, 9 Jul 2023 16:45:47 +0200
Subject: [PATCH 2/3] MDEV-31482: Lock wait timeout with INSERT-SELECT,
 autoinc, and statement-based replication

Remove the exception that InnoDB does not report auto-increment locks waits
to the parallel replication.

There was an assumption that these waits could not cause conflicts with
in-order parallel replication and thus need not be reported. However, this
assumption is wrong and it is possible to get conflicts that lead to hangs
for the duration of --innodb-lock-wait-timeout. This can be seen with three
transactions:

1. T1 is waiting for T3 on an autoinc lock
2. T2 is waiting for T1 to commit
3. T3 is waiting on a normal row lock held by T2

Here, T3 needs to be deadlock killed on the wait by T1.

Note: This should be null-merged to 10.6, as a different fix is needed
there due to InnoDB lock code changes.

Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
---
 .../suite/rpl/r/rpl_parallel_autoinc.result   |  95 ++++++++++++
 .../suite/rpl/t/rpl_parallel_autoinc.test     | 140 ++++++++++++++++++
 sql/sql_class.cc                              |   6 -
 storage/innobase/lock/lock0lock.cc            |   8 +-
 4 files changed, 236 insertions(+), 13 deletions(-)
 create mode 100644 mysql-test/suite/rpl/r/rpl_parallel_autoinc.result
 create mode 100644 mysql-test/suite/rpl/t/rpl_parallel_autoinc.test

diff --git a/mysql-test/suite/rpl/r/rpl_parallel_autoinc.result b/mysql-test/suite/rpl/r/rpl_parallel_autoinc.result
new file mode 100644
index 00000000000..c1829bafa1a
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_parallel_autoinc.result
@@ -0,0 +1,95 @@
+include/master-slave.inc
+[connection master]
+MDEV-31482: Lock wait timeout with INSERT-SELECT, autoinc, and statement-based replication
+include/rpl_connect.inc [creating slave2]
+include/rpl_connect.inc [creating slave3]
+connection master;
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a INT PRIMARY KEY AUTO_INCREMENT, b INT, c INT, INDEX (c)) ENGINE=InnoDB;
+INSERT INTO t1 (b,c) VALUES (0, 1), (0, 1), (0, 2), (0,3), (0, 5), (0, 7), (0, 8);
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (10,1), (20,2), (30,3), (40,4), (50,5);
+CREATE TABLE t3 (a VARCHAR(20) PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t3 VALUES ('row for T1', 0), ('row for T2', 0), ('row for T3', 0);
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+set @@global.innodb_lock_wait_timeout= 20;
+connection master;
+BEGIN;
+UPDATE t3 SET b=b+1 where a="row for T1";
+INSERT INTO t1(b, c) SELECT 1, t2.b FROM t2 WHERE a=10;
+Warnings:
+Note	1592	Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT. Statements writing to a table with an auto-increment column after selecting from another table are unsafe because the order in which rows are retrieved determines what (if any) rows will be written. This order cannot be predicted and may differ on master and the slave
+COMMIT;
+DELETE FROM t1 WHERE c >= 4 and c < 6;
+BEGIN;
+UPDATE t3 SET b=b+1 where a="row for T3";
+INSERT INTO t1(b, c) SELECT 3, t2.b FROM t2 WHERE a >= 20 AND a <= 40;
+Warnings:
+Note	1592	Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT. Statements writing to a table with an auto-increment column after selecting from another table are unsafe because the order in which rows are retrieved determines what (if any) rows will be written. This order cannot be predicted and may differ on master and the slave
+COMMIT;
+include/save_master_gtid.inc
+connection slave1;
+BEGIN;
+SELECT * FROM t3 WHERE a="row for T1" FOR UPDATE;
+a	b
+row for T1	0
+connection slave2;
+BEGIN;
+SELECT * FROM t3 WHERE a="row for T3" FOR UPDATE;
+a	b
+row for T3	0
+connection slave3;
+BEGIN;
+DELETE FROM t2 WHERE a=30;
+connection slave;
+include/start_slave.inc
+connection slave2;
+ROLLBACK;
+connection slave1;
+ROLLBACK;
+connection slave3;
+ROLLBACK;
+connection slave;
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a	b	c
+1	0	1
+2	0	1
+3	0	2
+4	0	3
+6	0	7
+7	0	8
+8	1	1
+9	3	2
+10	3	3
+11	3	4
+SELECT * FROM t2 ORDER BY a;
+a	b
+10	1
+20	2
+30	3
+40	4
+50	5
+SELECT * FROM t3 ORDER BY a;
+a	b
+row for T1	1
+row for T2	0
+row for T3	1
+connection master;
+CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format");
+DROP TABLE t1, t2, t3;
+connection slave;
+include/stop_slave.inc
+SET @@global.slave_parallel_threads= 0;
+SET @@global.slave_parallel_mode= conservative;
+SET @@global.innodb_lock_wait_timeout= 50;
+include/start_slave.inc
+SELECT @@GLOBAL.innodb_autoinc_lock_mode;
+@@GLOBAL.innodb_autoinc_lock_mode
+1
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_parallel_autoinc.test b/mysql-test/suite/rpl/t/rpl_parallel_autoinc.test
new file mode 100644
index 00000000000..0e96b4dfb80
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_parallel_autoinc.test
@@ -0,0 +1,140 @@
+--source include/have_binlog_format_statement.inc
+--source include/have_innodb.inc
+--source include/master-slave.inc
+
+--echo MDEV-31482: Lock wait timeout with INSERT-SELECT, autoinc, and statement-based replication
+
+# The scenario is transactions T1, T2, T3:
+#
+#   T1 is waiting for T3 on an autoinc lock
+#   T2 is waiting for T1 to commit
+#   T3 is waiting on a normal row lock held by T2
+#
+# This caused a hang until innodb_lock_wait_timeout, because autoinc
+# locks were not reported to the in-order parallel replication, so T3
+# was not deadlock killed.
+
+--let $lock_wait_timeout=20
+
+--let $rpl_connection_name= slave2
+--let $rpl_server_number= 2
+--source include/rpl_connect.inc
+
+--let $rpl_connection_name= slave3
+--let $rpl_server_number= 2
+--source include/rpl_connect.inc
+
+--connection master
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+
+# A table as destination for INSERT-SELECT
+CREATE TABLE t1 (a INT PRIMARY KEY AUTO_INCREMENT, b INT, c INT, INDEX (c)) ENGINE=InnoDB;
+INSERT INTO t1 (b,c) VALUES (0, 1), (0, 1), (0, 2), (0,3), (0, 5), (0, 7), (0, 8);
+
+# A table as source for INSERT-SELECT.
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (10,1), (20,2), (30,3), (40,4), (50,5);
+
+# A table to help order slave worker threads to setup the desired scenario.
+CREATE TABLE t3 (a VARCHAR(20) PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t3 VALUES ('row for T1', 0), ('row for T2', 0), ('row for T3', 0);
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+--let $save_innodb_lock_wait_timeout= `SELECT @@global.innodb_lock_wait_timeout`
+--let $save_slave_parallel_threads= `SELECT @@global.slave_parallel_threads`
+--let $save_slave_parallel_mode= `SELECT @@global.slave_parallel_mode`
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+eval set @@global.innodb_lock_wait_timeout= $lock_wait_timeout;
+
+--connection master
+# Transaction T1.
+BEGIN;
+UPDATE t3 SET b=b+1 where a="row for T1";
+INSERT INTO t1(b, c) SELECT 1, t2.b FROM t2 WHERE a=10;
+COMMIT;
+
+# Transaction T2.
+DELETE FROM t1 WHERE c >= 4 and c < 6;
+
+# Transaction T3.
+BEGIN;
+UPDATE t3 SET b=b+1 where a="row for T3";
+INSERT INTO t1(b, c) SELECT 3, t2.b FROM t2 WHERE a >= 20 AND a <= 40;
+COMMIT;
+
+--source include/save_master_gtid.inc
+
+--connection slave1
+# Temporarily block T1 to create the scheduling that triggers the bug.
+BEGIN;
+SELECT * FROM t3 WHERE a="row for T1" FOR UPDATE;
+
+--connection slave2
+# Temporarily block T3 from starting (so T2 can reach commit).
+BEGIN;
+SELECT * FROM t3 WHERE a="row for T3" FOR UPDATE;
+
+--connection slave3
+# This critical step blocks T3 after it has inserted its first row,
+# and thus taken the auto-increment lock, but before it has reached
+# the point where it gets a row lock wait on T2. Even though
+# auto-increment lock waits were not reported due to the bug,
+# transitive lock waits (T1 waits on autoinc of T3 which waits on row
+# on T2) _were_ reported as T1 waiting on T2, and thus a deadlock kill
+# happened and the bug was not triggered.
+BEGIN;
+DELETE FROM t2 WHERE a=30;
+
+--connection slave
+--source include/start_slave.inc
+
+# First let T2 complete until it is waiting for T1 to commit.
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state='Waiting for prior transaction to commit' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+# Then let T3 reach the point where it has obtained the autoinc lock,
+# but it is not yet waiting for a row lock held by T2.
+--connection slave2
+ROLLBACK;
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state='Sending data' and info LIKE 'INSERT INTO t1(b, c) SELECT 3, t2.b%' and time_ms > 500 and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+# Now let T1 continue, while T3 is holding the autoinc lock but before
+# it is waiting for T2. Wait a short while to give the hang a chance to
+# happen; T1 needs to get to request the autoinc lock before we let T3
+# continue. (There's a small chance the sleep will be too small, which will
+# let the test occasionally pass on non-fixed server).
+--connection slave1
+ROLLBACK;
+--sleep 0.5
+
+# Now let T3 continue; the bug was that this lead to an undetected
+# deadlock that remained until innodb lock wait timeout.
+--connection slave3
+ROLLBACK;
+
+--connection slave
+--let $slave_timeout= `SELECT $lock_wait_timeout/2`
+--source include/sync_with_master_gtid.inc
+--let $slave_timeout=
+SELECT * FROM t1 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+SELECT * FROM t3 ORDER BY a;
+
+# Cleanup.
+--connection master
+CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format");
+DROP TABLE t1, t2, t3;
+
+--connection slave
+--source include/stop_slave.inc
+eval SET @@global.slave_parallel_threads= $save_slave_parallel_threads;
+eval SET @@global.slave_parallel_mode= $save_slave_parallel_mode;
+eval SET @@global.innodb_lock_wait_timeout= $save_innodb_lock_wait_timeout;
+--source include/start_slave.inc
+SELECT @@GLOBAL.innodb_autoinc_lock_mode;
+--source include/rpl_end.inc
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index d2a3cbcf613..ae388515961 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -5144,12 +5144,6 @@ thd_need_wait_reports(const MYSQL_THD thd)
   deadlock with the pre-determined commit order, we kill the later
   transaction, and later re-try it, to resolve the deadlock.
 
-  This call need only receive reports about waits for locks that will remain
-  until the holding transaction commits. InnoDB auto-increment locks,
-  for example, are released earlier, and so need not be reported. (Such false
-  positives are not harmful, but could lead to unnecessary kill and retry, so
-  best avoided).
-
   Returns 1 if the OTHER_THD will be killed to resolve deadlock, 0 if not. The
   actual kill will happen later, asynchronously from another thread. The
   caller does not need to take any actions on the return value if the
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 8e75955c8af..b31e8ba5b56 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -6941,13 +6941,7 @@ DeadlockChecker::search()
 			return m_start;
 		}
 
-		/* We do not need to report autoinc locks to the upper
-		layer. These locks are released before commit, so they
-		can not cause deadlocks with binlog-fixed commit
-		order. */
-		if (m_report_waiters
-		    && (lock_get_type_low(lock) != LOCK_TABLE
-			|| lock_get_mode(lock) != LOCK_AUTO_INC)) {
+		if (m_report_waiters) {
 			thd_rpl_deadlock_check(m_start->mysql_thd,
 					       lock->trx->mysql_thd);
 		}

From 900c4d692073ae51413d8f739977216a56663cbf Mon Sep 17 00:00:00 2001
From: Kristian Nielsen <knielsen@knielsen-hq.org>
Date: Tue, 11 Jul 2023 00:31:29 +0200
Subject: [PATCH 3/3] MDEV-31655: Parallel replication deadlock victim
 preference code errorneously removed

Restore code to make InnoDB choose the second transaction as a deadlock
victim if two transactions deadlock that need to commit in-order for
parallel replication. This code was erroneously removed when VATS was
implemented in InnoDB.

Also add a test case for InnoDB choosing the right deadlock victim.
Also fixes this bug, with testcase that reliably reproduces:

MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master

Note: This should be null-merged to 10.6, as a different fix is needed
there due to InnoDB locking code changes.

Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
---
 .../binlog_encryption/rpl_parallel.result     | 42 +++++++++-
 mysql-test/suite/rpl/r/rpl_parallel.result    | 42 +++++++++-
 .../r/rpl_parallel_deadlock_victim2.result    | 50 +++++++++++
 mysql-test/suite/rpl/t/rpl_parallel.test      | 71 +++++++++++++++-
 .../rpl/t/rpl_parallel_deadlock_victim2.test  | 83 +++++++++++++++++++
 sql/rpl_parallel.cc                           | 12 ++-
 sql/slave.cc                                  |  1 +
 sql/sql_class.cc                              | 43 ++++++++++
 sql/sql_insert.cc                             |  7 ++
 storage/innobase/lock/lock0lock.cc            | 17 ++++
 storage/innobase/lock/lock0wait.cc            | 15 +++-
 storage/innobase/trx/trx0trx.cc               | 15 ++++
 12 files changed, 393 insertions(+), 5 deletions(-)
 create mode 100644 mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim2.result
 create mode 100644 mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim2.test

diff --git a/mysql-test/suite/binlog_encryption/rpl_parallel.result b/mysql-test/suite/binlog_encryption/rpl_parallel.result
index b75a66a634a..b24ff7ba53d 100644
--- a/mysql-test/suite/binlog_encryption/rpl_parallel.result
+++ b/mysql-test/suite/binlog_encryption/rpl_parallel.result
@@ -2,6 +2,7 @@ include/master-slave.inc
 [connection master]
 connection server_2;
 SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
 SET GLOBAL slave_parallel_threads=10;
 ERROR HY000: This operation cannot be performed as you have a running slave ''; run STOP SLAVE '' first
 include/stop_slave.inc
@@ -1680,13 +1681,52 @@ a
 2000
 SELECT * FROM t2 WHERE a>=2000 ORDER BY a;
 a
+MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
+connection server_1;
+CREATE TABLE t7 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+BEGIN;
+COMMIT;
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 5;
+set @@global.slave_parallel_mode= conservative;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries";
+connection master;
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+SET @commit_id= 1+1000;
+SET @commit_id= 2+1000;
+SET @commit_id= 3+1000;
+SET @commit_id= 4+1000;
+SET @commit_id= 5+1000;
+SET @commit_id= 6+1000;
+SET @commit_id= 7+1000;
+SET @commit_id= 8+1000;
+SET @commit_id= 9+1000;
+SET @commit_id= 10+1000;
+SET SESSION debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+COUNT(*)	SUM(a*100*b)
+10	225000
+include/save_master_gtid.inc
+connection server_2;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SET GLOBAL debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+COUNT(*)	SUM(a*100*b)
+10	225000
 connection server_2;
 include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL slave_parallel_mode=@old_parallel_mode;
 include/start_slave.inc
 SET DEBUG_SYNC= 'RESET';
 connection server_1;
 DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7;
 SET DEBUG_SYNC= 'RESET';
 include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result
index 9b2e68d366e..ef89d954faa 100644
--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -2,6 +2,7 @@ include/master-slave.inc
 [connection master]
 connection server_2;
 SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
 SET GLOBAL slave_parallel_threads=10;
 ERROR HY000: This operation cannot be performed as you have a running slave ''; run STOP SLAVE '' first
 include/stop_slave.inc
@@ -1679,13 +1680,52 @@ a
 2000
 SELECT * FROM t2 WHERE a>=2000 ORDER BY a;
 a
+MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
+connection server_1;
+CREATE TABLE t7 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+BEGIN;
+COMMIT;
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 5;
+set @@global.slave_parallel_mode= conservative;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries";
+connection master;
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+SET @commit_id= 1+1000;
+SET @commit_id= 2+1000;
+SET @commit_id= 3+1000;
+SET @commit_id= 4+1000;
+SET @commit_id= 5+1000;
+SET @commit_id= 6+1000;
+SET @commit_id= 7+1000;
+SET @commit_id= 8+1000;
+SET @commit_id= 9+1000;
+SET @commit_id= 10+1000;
+SET SESSION debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+COUNT(*)	SUM(a*100*b)
+10	225000
+include/save_master_gtid.inc
+connection server_2;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SET GLOBAL debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+COUNT(*)	SUM(a*100*b)
+10	225000
 connection server_2;
 include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL slave_parallel_mode=@old_parallel_mode;
 include/start_slave.inc
 SET DEBUG_SYNC= 'RESET';
 connection server_1;
 DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7;
 SET DEBUG_SYNC= 'RESET';
 include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim2.result b/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim2.result
new file mode 100644
index 00000000000..f154f37a057
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim2.result
@@ -0,0 +1,50 @@
+include/master-slave.inc
+[connection master]
+connection master;
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1(a INT) ENGINE=INNODB;
+INSERT INTO t1 VALUES(1);
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 2;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+set @@global.slave_transaction_retries= 2;
+*** MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master
+connection master;
+SET @@gtid_seq_no= 100;
+INSERT INTO t1  SELECT 1+a FROM t1;
+SET @@gtid_seq_no= 200;
+INSERT INTO t1  SELECT 2+a FROM t1;
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+4
+include/save_master_gtid.inc
+connection slave;
+SET @save_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,rpl_parallel_delay_gtid_0_x_100_start";
+SET GLOBAL debug_dbug="+d,rpl_write_record_small_sleep_gtid_100_200";
+SET GLOBAL debug_dbug="+d,small_sleep_after_lock_wait";
+SET GLOBAL debug_dbug="+d,rpl_delay_deadlock_kill";
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SET GLOBAL debug_dbug= @save_dbug;
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+4
+connection slave;
+include/stop_slave.inc
+SET @@global.slave_parallel_threads= 0;
+SET @@global.slave_parallel_mode= conservative;
+SET @@global.slave_transaction_retries= 10;
+include/start_slave.inc
+connection master;
+DROP TABLE t1;
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test
index 9ba7a30f2eb..d43cec4df34 100644
--- a/mysql-test/suite/rpl/t/rpl_parallel.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel.test
@@ -13,6 +13,7 @@
 
 --connection server_2
 SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
 --error ER_SLAVE_MUST_STOP
 SET GLOBAL slave_parallel_threads=10;
 --source include/stop_slave.inc
@@ -2203,16 +2204,84 @@ SELECT * FROM t1 WHERE a>=2000 ORDER BY a;
 SELECT * FROM t2 WHERE a>=2000 ORDER BY a;
 
 
+--echo MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
+# The problem was that InnoDB would choose the wrong deadlock victim.
+# Create a lot of transactions that can cause deadlocks, and use error
+# injection to check that the first transactions in each group is never
+# selected as deadlock victim.
+--let $rows= 10
+--let $transactions= 5
+--let $gcos= 10
+
+--connection server_1
+CREATE TABLE t7 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+BEGIN;
+--disable_query_log
+--let $i= 0
+while ($i < 10) {
+  eval INSERT INTO t7 VALUES ($i, 0);
+  inc $i;
+}
+--enable_query_log
+COMMIT;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+eval set @@global.slave_parallel_threads= $transactions;
+set @@global.slave_parallel_mode= conservative;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+# This error injection will allow no retries for GTIDs divisible by 1000.
+SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries";
+
+--connection master
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+--let $j= 1
+while ($j <= $gcos) {
+  eval SET @commit_id= $j+1000;
+  --let $i= 0
+  while ($i < $transactions) {
+    --disable_query_log
+    eval SET SESSION gtid_seq_no= 1000 + 1000*$j + $i;
+    BEGIN;
+    --let $k= 0
+    while ($k < $rows) {
+      eval UPDATE t7 SET b=b+1 WHERE a=(($i+$k) MOD $rows);
+      inc $k;
+    }
+    COMMIT;
+    --enable_query_log
+    inc $i;
+  }
+  inc $j;
+}
+
+SET SESSION debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SET GLOBAL debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+
+
 # Clean up.
 --connection server_2
 --source include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL slave_parallel_mode=@old_parallel_mode;
 --source include/start_slave.inc
 SET DEBUG_SYNC= 'RESET';
 
 --connection server_1
 DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7;
 SET DEBUG_SYNC= 'RESET';
 
 --source include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim2.test b/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim2.test
new file mode 100644
index 00000000000..522cec18bbc
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim2.test
@@ -0,0 +1,83 @@
+--source include/master-slave.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+--source include/have_binlog_format_statement.inc
+
+--connection master
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1(a INT) ENGINE=INNODB;
+INSERT INTO t1 VALUES(1);
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+--let $save_transaction_retries= `SELECT @@global.slave_transaction_retries`
+--let $save_slave_parallel_threads= `SELECT @@global.slave_parallel_threads`
+--let $save_slave_parallel_mode= `SELECT @@global.slave_parallel_mode`
+set @@global.slave_parallel_threads= 2;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+set @@global.slave_transaction_retries= 2;
+
+--echo *** MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master
+# This was a failure where a transaction T1 could deadlock multiple times
+# with T2, eventually exceeding the default --slave-transaction-retries=10.
+# Root cause was MDEV-31655, causing InnoDB to wrongly choose T1 as deadlock
+# victim over T2. If thread scheduling is right, it was possible for T1 to
+# repeatedly deadlock, roll back, and have time to grab an S lock again before
+# T2 woke up and got its waiting X lock, thus repeating the same deadlock over
+# and over.
+# Once the bug is fixed, it is not possible to re-create the same execution
+# and thread scheduling. Instead we inject small sleeps in a way that
+# triggered the problem when the bug was there, to demonstrate that the
+# problem no longer occurs.
+
+--connection master
+# T1
+SET @@gtid_seq_no= 100;
+INSERT INTO t1  SELECT 1+a FROM t1;
+# T2
+SET @@gtid_seq_no= 200;
+INSERT INTO t1  SELECT 2+a FROM t1;
+
+SELECT * FROM t1 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection slave
+SET @save_dbug= @@GLOBAL.debug_dbug;
+
+# Inject various delays to hint thread scheduling to happen in the way that
+# triggered MDEV-28776.
+
+# Small delay starting T1 so it will be the youngest trx and be chosen over
+# T2 as the deadlock victim by default in InnoDB.
+SET GLOBAL debug_dbug="+d,rpl_parallel_delay_gtid_0_x_100_start";
+
+# Small delay before taking insert X lock to give time for both T1 and T2 to
+# get the S lock first and cause a deadlock.
+SET GLOBAL debug_dbug="+d,rpl_write_record_small_sleep_gtid_100_200";
+
+# Small delay after T2's wait on the X lock, to give time for T1 retry to
+# re-aquire the T1 S lock first.
+SET GLOBAL debug_dbug="+d,small_sleep_after_lock_wait";
+
+# Delay deadlock kill of T2.
+SET GLOBAL debug_dbug="+d,rpl_delay_deadlock_kill";
+
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SET GLOBAL debug_dbug= @save_dbug;
+SELECT * FROM t1 ORDER BY a;
+
+# Cleanup.
+--connection slave
+--source include/stop_slave.inc
+eval SET @@global.slave_parallel_threads= $save_slave_parallel_threads;
+eval SET @@global.slave_parallel_mode= $save_slave_parallel_mode;
+eval SET @@global.slave_transaction_retries= $save_transaction_retries;
+--source include/start_slave.inc
+
+--connection master
+DROP TABLE t1;
+
+--source include/rpl_end.inc
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index ad9c2ac802c..ba5cf54e673 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -1284,6 +1284,11 @@ handle_rpl_parallel_thread(void *arg)
         bool did_enter_cond= false;
         PSI_stage_info old_stage;
 
+        DBUG_EXECUTE_IF("rpl_parallel_delay_gtid_0_x_100_start", {
+            if (rgi->current_gtid.domain_id==0 &&
+                rgi->current_gtid.seq_no == 100)
+              my_sleep(10000);
+          });
 #ifdef ENABLED_DEBUG_SYNC
         DBUG_EXECUTE_IF("hold_worker_on_schedule", {
             if (rgi->current_gtid.domain_id == 0 &&
@@ -1463,8 +1468,13 @@ handle_rpl_parallel_thread(void *arg)
                         err= dbug_simulate_tmp_error(rgi, thd););
         if (unlikely(err))
         {
+          ulong max_retries= slave_trans_retries;
           convert_kill_to_deadlock_error(rgi);
-          if (has_temporary_error(thd) && slave_trans_retries > 0)
+          DBUG_EXECUTE_IF("rpl_mdev31655_zero_retries",
+                          if ((rgi->current_gtid.seq_no % 1000) == 0)
+                            max_retries= 0;
+                          );
+          if (has_temporary_error(thd) && max_retries > 0)
             err= retry_event_group(rgi, rpt, qev);
         }
       }
diff --git a/sql/slave.cc b/sql/slave.cc
index 3fe89a95e5e..cdb16f873c3 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -502,6 +502,7 @@ static void bg_rpl_load_gtid_slave_state(void *)
 static void bg_slave_kill(void *victim)
 {
   THD *to_kill= (THD *)victim;
+  DBUG_EXECUTE_IF("rpl_delay_deadlock_kill", my_sleep(1500000););
   to_kill->awake(KILL_CONNECTION);
   mysql_mutex_lock(&to_kill->LOCK_wakeup_ready);
   to_kill->rgi_slave->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED;
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index ae388515961..e7e27401d61 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -5272,6 +5272,49 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
   return 0;
 }
 
+
+/*
+  If the storage engine detects a deadlock, and needs to choose a victim
+  transaction to roll back, it can call this function to ask the upper
+  server layer for which of two possible transactions is prefered to be
+  aborted and rolled back.
+
+  In parallel replication, if two transactions are running in parallel and
+  one is fixed to commit before the other, then the one that commits later
+  will be prefered as the victim - chosing the early transaction as a victim
+  will not resolve the deadlock anyway, as the later transaction still needs
+  to wait for the earlier to commit.
+
+  The return value is -1 if the first transaction is prefered as a deadlock
+  victim, 1 if the second transaction is prefered, or 0 for no preference (in
+  which case the storage engine can make the choice as it prefers).
+*/
+extern "C" int
+thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2)
+{
+  rpl_group_info *rgi1, *rgi2;
+
+  if (!thd1 || !thd2)
+    return 0;
+
+  /*
+    If the transactions are participating in the same replication domain in
+    parallel replication, then request to select the one that will commit
+    later (in the fixed commit order from the master) as the deadlock victim.
+  */
+  rgi1= thd1->rgi_slave;
+  rgi2= thd2->rgi_slave;
+  if (rgi1 && rgi2 &&
+      rgi1->is_parallel_exec &&
+      rgi1->rli == rgi2->rli &&
+      rgi1->current_gtid.domain_id == rgi2->current_gtid.domain_id)
+    return rgi1->gtid_sub_id < rgi2->gtid_sub_id ? 1 : -1;
+
+  /* No preferences, let the storage engine decide. */
+  return 0;
+}
+
+
 extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
 {
   return(thd->transaction.all.modified_non_trans_table);
diff --git a/sql/sql_insert.cc b/sql/sql_insert.cc
index a2c571373cb..a64314814dd 100644
--- a/sql/sql_insert.cc
+++ b/sql/sql_insert.cc
@@ -78,6 +78,7 @@
 #include "sql_audit.h"
 #include "sql_derived.h"                        // mysql_handle_derived
 #include "sql_prepare.h"
+#include "rpl_rli.h"
 #include <my_bit.h>
 
 #include "debug_sync.h"
@@ -1753,6 +1754,12 @@ int write_record(THD *thd, TABLE *table,COPY_INFO *info)
   save_read_set=  table->read_set;
   save_write_set= table->write_set;
 
+  DBUG_EXECUTE_IF("rpl_write_record_small_sleep_gtid_100_200",
+    {
+      if (thd->rgi_slave && (thd->rgi_slave->current_gtid.seq_no == 100 ||
+                             thd->rgi_slave->current_gtid.seq_no == 200))
+        my_sleep(20000);
+    });
   if (info->handle_duplicates == DUP_REPLACE ||
       info->handle_duplicates == DUP_UPDATE)
   {
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index b31e8ba5b56..cc20ab95930 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -71,6 +71,9 @@ static void lock_grant_after_reset(lock_t* lock);
 extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
 extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
 extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
+#ifdef HAVE_REPLICATION
+extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
+#endif
 
 /** Pretty-print a table lock.
 @param[in,out]	file	output stream
@@ -1546,6 +1549,20 @@ static bool has_higher_priority(lock_t *lock1, lock_t *lock2)
 	} else if (!lock_get_wait(lock2)) {
 		return false;
 	}
+
+#ifdef HAVE_REPLICATION
+	// Ask the upper server layer if any of the two trx should be prefered.
+	int preference = thd_deadlock_victim_preference(lock1->trx->mysql_thd,
+							lock2->trx->mysql_thd);
+	if (preference == -1) {
+		// lock1 is preferred as a victim, so lock2 has higher priority
+		return false;
+	} else if (preference == 1) {
+		// lock2 is preferred as a victim, so lock1 has higher priority
+		return true;
+	}
+#endif
+
 	return lock1->trx->start_time_micro <= lock2->trx->start_time_micro;
 }
 
diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc
index 5eb03f668b3..a26b3a6efaa 100644
--- a/storage/innobase/lock/lock0wait.cc
+++ b/storage/innobase/lock/lock0wait.cc
@@ -278,7 +278,9 @@ lock_wait_suspend_thread(
 	}
 
 	ulint	lock_type = ULINT_UNDEFINED;
-
+#ifndef DBUG_OFF
+	ulint	lock_mode = LOCK_NONE;
+#endif
 	/* The wait_lock can be cleared by another thread when the
 	lock is released. But the wait can only be initiated by the
 	current thread which owns the transaction. Only acquire the
@@ -288,6 +290,9 @@ lock_wait_suspend_thread(
 		wait_lock = trx->lock.wait_lock;
 		if (wait_lock) {
 			lock_type = lock_get_type_low(wait_lock);
+#ifndef DBUG_OFF
+			lock_mode = lock_get_mode(wait_lock);
+#endif
 		}
 		lock_mutex_exit();
 	}
@@ -336,6 +341,14 @@ lock_wait_suspend_thread(
 	}
 
 	os_event_wait(slot->event);
+	DBUG_EXECUTE_IF("small_sleep_after_lock_wait",
+		{
+			if (lock_type == LOCK_REC && lock_mode == LOCK_X &&
+			    trx->error_state != DB_DEADLOCK &&
+			    !trx_is_interrupted(trx)) {
+				my_sleep(20000);
+			}
+		});
 
 	thd_wait_end(trx->mysql_thd);
 
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 7cd95878b0c..b742d1c3686 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -52,6 +52,11 @@ Created 3/26/1996 Heikki Tuuri
 #include <set>
 #include <new>
 
+#ifdef HAVE_REPLICATION
+extern "C"
+int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
+#endif
+
 /** The bit pattern corresponding to TRX_ID_MAX */
 const byte trx_id_max_bytes[8] = {
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
@@ -1907,6 +1912,16 @@ trx_weight_ge(
 	ibool	a_notrans_edit;
 	ibool	b_notrans_edit;
 
+#ifdef HAVE_REPLICATION
+	/* First ask the upper server layer if it has any preference for which
+	to prefer as a deadlock victim. */
+	int pref= thd_deadlock_victim_preference(a->mysql_thd, b->mysql_thd);
+	if (pref < 0)
+		return FALSE;
+	else if (pref > 0)
+		return TRUE;
+#endif
+
 	/* If mysql_thd is NULL for a transaction we assume that it has
 	not edited non-transactional tables. */