MDEV-33798: ROW base optimistic deadlock with concurrent writes on same table

One case is conflicting transactions T1 and T2 with different domain id, in
optimistic parallel replication in non-GTID mode. Then T2 will
wait_for_prior_commit on T1; and if T1 got a row lock wait on T2 it would
hang, as different domains caused the deadlock kill to be skipped in
thd_rpl_deadlock_check().

More generally, if we have transactions T1 and T2 in one domain/master
connection, and independent transactions U in another, then we can
still deadlock like this:

  T1 row low wait on U
  U row lock wait on T2
  T2 wait_for_prior_commit on T1

This commit enforces the deadlock kill in these cases. If the waited-for
transaction is speculatively applied, then it will be deadlock killed in
case of a conflict, even if the two transactions are in different domains
or master connections.

Reviewed-by: Andrei Elkin <andrei.elkin@mariadb.com>
Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
This commit is contained in:
Kristian Nielsen 2024-04-30 14:51:50 +02:00
parent 90b95c6149
commit e365877bae
4 changed files with 373 additions and 8 deletions

View File

@ -0,0 +1,143 @@
include/rpl_init.inc [topology=1->2,1->3]
connect server_2b,127.0.0.1,root,,,$SERVER_MYPORT_2;
connection server_2;
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode;
SET @old_timeout= @@GLOBAL.lock_wait_timeout;
SET @old_innodb_timeout= @@GLOBAL.innodb_lock_wait_timeout;
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=5;
set global slave_parallel_mode= aggressive;
SET GLOBAL lock_wait_timeout= 86400;
SET GLOBAL innodb_lock_wait_timeout= 86400;
SET STATEMENT sql_log_bin=0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
include/start_slave.inc
connection server_1;
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0);
connection server_2;
include/stop_slave.inc
connection server_2b;
BEGIN;
SELECT * FROM t1 WHERE a=1 FOR UPDATE;
a b
1 0
SELECT * FROM t1 WHERE a=5 FOR UPDATE;
a b
5 0
connection server_1;
SET SESSION gtid_domain_id= 1;
BEGIN;
UPDATE t1 SET b=1 WHERE a=1;
UPDATE t1 SET b=1 WHERE a=7;
COMMIT;
UPDATE t1 SET b=2 WHERE a=3;
SET SESSION gtid_domain_id=2;
BEGIN;
UPDATE t1 SET b=3 WHERE a=5;
UPDATE t1 SET b=3 WHERE a=3;
COMMIT;
UPDATE t1 SET b=4 WHERE a=7;
SET SESSION gtid_domain_id= 0;
include/save_master_gtid.inc
connection server_2;
include/start_slave.inc
connection server_2b;
ROLLBACK;
connection server_2;
include/sync_with_master_gtid.inc
SELECT a, (
(a=1 AND b=1) OR
(a=3 AND (b=2 OR b=3)) OR
(a=5 AND b=3) OR
(a=7 AND (b=1 OR b=4)) OR
((a MOD 2)=0 AND b=0)) AS `ok`
FROM t1
ORDER BY a;
a ok
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
connection server_3;
include/sync_with_master_gtid.inc
include/stop_slave.inc
connection server_2;
include/stop_slave.inc
CHANGE MASTER 'm2' to master_port=MYPORT_3 , master_host='127.0.0.1', master_user='root', master_use_gtid=slave_pos;
connection server_1;
SET SESSION gtid_domain_id= 1;
BEGIN;
UPDATE t1 SET b=11 WHERE a=1;
UPDATE t1 SET b=11 WHERE a=7;
COMMIT;
UPDATE t1 SET b=12 WHERE a=3;
SET SESSION gtid_domain_id= 1;
connection server_3;
SET SESSION gtid_domain_id=3;
BEGIN;
UPDATE t1 SET b=13 WHERE a=5;
UPDATE t1 SET b=13 WHERE a=3;
COMMIT;
UPDATE t1 SET b=14 WHERE a=7;
include/save_master_gtid.inc
connection server_2b;
BEGIN;
SELECT * FROM t1 WHERE a=1 FOR UPDATE;
a b
1 1
SELECT * FROM t1 WHERE a=5 FOR UPDATE;
a b
5 3
START ALL SLAVES;
Warnings:
Note 1937 SLAVE 'm2' started
Note 1937 SLAVE '' started
connection server_2b;
ROLLBACK;
connection server_1;
include/save_master_gtid.inc
connection server_2;
include/sync_with_master_gtid.inc
connection server_3;
include/save_master_gtid.inc
connection server_2;
include/sync_with_master_gtid.inc
SELECT a, (
(a=1 AND b=11) OR
(a=3 AND (b=12 OR b=13)) OR
(a=5 AND b=13) OR
(a=7 AND (b=11 OR b=14)) OR
((a MOD 2)=0 AND b=0)) AS `ok`
FROM t1
ORDER BY a;
a ok
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
SET default_master_connection = 'm2';
include/stop_slave.inc
RESET SLAVE 'm2' ALL;
SET default_master_connection = '';
connection server_3;
include/start_slave.inc
disconnect server_2b;
connection server_1;
DROP TABLE t1;
connection server_2;
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
set global slave_parallel_mode= @old_parallel_mode;
SET GLOBAL lock_wait_timeout= @old_timeout;
SET GLOBAL innodb_lock_wait_timeout= @old_innodb_timeout;
include/start_slave.inc
include/rpl_end.inc

View File

@ -0,0 +1,17 @@
!include suite/rpl/my.cnf
[mysqld.1]
log-slave-updates
loose-innodb
[mysqld.2]
log-slave-updates
loose-innodb
[mysqld.3]
log-slave-updates
loose-innodb
[ENV]
SERVER_MYPORT_3= @mysqld.3.port
SERVER_MYSOCK_3= @mysqld.3.socket

View File

@ -0,0 +1,182 @@
--source include/have_innodb.inc
--source include/have_log_bin.inc
--let $rpl_topology=1->2,1->3
--source include/rpl_init.inc
--connect (server_2b,127.0.0.1,root,,,$SERVER_MYPORT_2)
--connection server_2
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode;
SET @old_timeout= @@GLOBAL.lock_wait_timeout;
SET @old_innodb_timeout= @@GLOBAL.innodb_lock_wait_timeout;
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=5;
set global slave_parallel_mode= aggressive;
# High timeout so we get replication sync error and test failure if the
# conflict handling is insufficient and lock wait timeout occurs.
SET GLOBAL lock_wait_timeout= 86400;
SET GLOBAL innodb_lock_wait_timeout= 86400;
SET STATEMENT sql_log_bin=0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
--source include/start_slave.inc
--connection server_1
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0);
--save_master_pos
--connection server_2
--sync_with_master
--source include/stop_slave.inc
# Test the following scenario:
#
# Transactions T1, T2 in domain 1, U1, U2 in domain 2.
# Wait cycle T1->U2->U1->T2->T1 as follows:
# T1 row lock wait on U2
# U2 wait_for_prior_commit on U1
# U1 row lock wait on T2
# T2 wait_for_prior_commit on T1
#
# Test that the wait cycle is broken correctly with deadlock kill.
--connection server_2b
# Temporarily block T1 and U1.
BEGIN;
SELECT * FROM t1 WHERE a=1 FOR UPDATE;
SELECT * FROM t1 WHERE a=5 FOR UPDATE;
--connection server_1
SET SESSION gtid_domain_id= 1;
# T1 in domain 1
BEGIN;
UPDATE t1 SET b=1 WHERE a=1;
UPDATE t1 SET b=1 WHERE a=7;
COMMIT;
# T2 in domain 1
UPDATE t1 SET b=2 WHERE a=3;
SET SESSION gtid_domain_id=2;
# U1 in domain 2
BEGIN;
UPDATE t1 SET b=3 WHERE a=5;
UPDATE t1 SET b=3 WHERE a=3;
COMMIT;
# U2 in domain 2
UPDATE t1 SET b=4 WHERE a=7;
SET SESSION gtid_domain_id= 0;
--source include/save_master_gtid.inc
--connection server_2
--source include/start_slave.inc
# Wait until T2, U2 are holding the row locks.
--let $wait_condition= SELECT COUNT(*)=2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state LIKE '%Waiting for prior transaction to commit%'
--source include/wait_condition.inc
# Then let T1, U1 continue to conflict on the row locks, and check that
# replication correctly handles the conflict.
--connection server_2b
ROLLBACK;
--connection server_2
--source include/sync_with_master_gtid.inc
# Allow either domain to "win" on the conflicting updates.
SELECT a, (
(a=1 AND b=1) OR
(a=3 AND (b=2 OR b=3)) OR
(a=5 AND b=3) OR
(a=7 AND (b=1 OR b=4)) OR
((a MOD 2)=0 AND b=0)) AS `ok`
FROM t1
ORDER BY a;
# Now try the same thing with multi-source replication.
# Make server_3 a second master
--connection server_3
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
--connection server_2
--source include/stop_slave.inc
--replace_result $SERVER_MYPORT_3 MYPORT_3
eval CHANGE MASTER 'm2' to master_port=$SERVER_MYPORT_3 , master_host='127.0.0.1', master_user='root', master_use_gtid=slave_pos;
--connection server_1
SET SESSION gtid_domain_id= 1;
# T1 in domain 1
BEGIN;
UPDATE t1 SET b=11 WHERE a=1;
UPDATE t1 SET b=11 WHERE a=7;
COMMIT;
# T2 in domain 1
UPDATE t1 SET b=12 WHERE a=3;
SET SESSION gtid_domain_id= 1;
--connection server_3
SET SESSION gtid_domain_id=3;
# U1 in domain 3
BEGIN;
UPDATE t1 SET b=13 WHERE a=5;
UPDATE t1 SET b=13 WHERE a=3;
COMMIT;
# U2 in domain 3
UPDATE t1 SET b=14 WHERE a=7;
--source include/save_master_gtid.inc
--connection server_2b
# Temporarily block T1 and U1.
BEGIN;
SELECT * FROM t1 WHERE a=1 FOR UPDATE;
SELECT * FROM t1 WHERE a=5 FOR UPDATE;
START ALL SLAVES;
# Wait until T2, U2 are holding the row locks.
--let $wait_condition= SELECT COUNT(*)=2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state LIKE '%Waiting for prior transaction to commit%'
--source include/wait_condition.inc
--connection server_2b
ROLLBACK;
--connection server_1
--source include/save_master_gtid.inc
--connection server_2
--source include/sync_with_master_gtid.inc
--connection server_3
--source include/save_master_gtid.inc
--connection server_2
--source include/sync_with_master_gtid.inc
SELECT a, (
(a=1 AND b=11) OR
(a=3 AND (b=12 OR b=13)) OR
(a=5 AND b=13) OR
(a=7 AND (b=11 OR b=14)) OR
((a MOD 2)=0 AND b=0)) AS `ok`
FROM t1
ORDER BY a;
SET default_master_connection = 'm2';
--source include/stop_slave.inc
RESET SLAVE 'm2' ALL;
SET default_master_connection = '';
--connection server_3
--source include/start_slave.inc
# Cleanup
--disconnect server_2b
--connection server_1
DROP TABLE t1;
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
set global slave_parallel_mode= @old_parallel_mode;
SET GLOBAL lock_wait_timeout= @old_timeout;
SET GLOBAL innodb_lock_wait_timeout= @old_innodb_timeout;
--source include/start_slave.inc
--source include/rpl_end.inc

View File

@ -5400,14 +5400,37 @@ thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd)
return 0;
if (!rgi->is_parallel_exec)
return 0;
if (rgi->rli != other_rgi->rli)
return 0;
if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
return 0;
if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
return 0;
if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
return 0;
if (rgi->rli == other_rgi->rli)
{
/*
Within the same master connection, we can compare transaction order on
the GTID sub_id, and rollback the later transaction to allow the earlier
transaction to commit first.
*/
if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
return 0;
if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
return 0;
}
else
{
/*
Lock conflicts between different master connection should usually not
occur, but could still happen if user is running some special setup that
tolerates conflicting updates (or in case of user error). We do not have a
pre-defined ordering of transactions in this case, but we still need to
handle conflicts in _some_ way to avoid undetected deadlocks and hangs.
We do this by rolling back and retrying any transaction that is being
_optimistically_ applied. This can be overly conservative in some cases,
but should be fine as conflicts between different master connections are
not expected to be common. And it ensures that we won't end up in a
deadlock and hang due to a transaction doing wait_for_prior_commit while
holding locks that block something in another master connection.
*/
if (other_rgi->speculation != rpl_group_info::SPECULATE_OPTIMISTIC)
return 0;
}
/*
This transaction is about to wait for another transaction that is required
by replication binlog order to commit after. This would cause a deadlock.