From b68c100076ceed77e0260e2906f0944d5f696503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 20 Aug 2024 18:00:23 +0300 Subject: [PATCH 01/22] MDEV-34565 MariaDB crashes with SIGILL because the OS does not support AVX512 In commit 232d7a5e2dc50181294b927e2bc4b02e282725a8 we almost got the detection logic right. However, the XGETBV instruction would crash if Linux was started up with the option noxsave. have_vpclmulqdq(): Check for the XSAVE flag at the correct position and also for the AVX flag. This was tested on Ubuntu 22.04 by starting up its Linux 5.15 kernel with and without the noxsave option. --- mysys/crc32/crc32c_x86.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mysys/crc32/crc32c_x86.cc b/mysys/crc32/crc32c_x86.cc index 3ddddf1303c..fb5dc19f7a5 100644 --- a/mysys/crc32/crc32c_x86.cc +++ b/mysys/crc32/crc32c_x86.cc @@ -39,7 +39,7 @@ extern "C" unsigned crc32c_sse42(unsigned crc, const void* buf, size_t size); constexpr uint32_t cpuid_ecx_SSE42= 1U << 20; constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U << 1; -constexpr uint32_t cpuid_ecx_XSAVE= 1U << 26; +constexpr uint32_t cpuid_ecx_AVX_AND_XSAVE= 1U << 28 | 1U << 27; static uint32_t cpuid_ecx() { @@ -395,7 +395,7 @@ static bool os_have_avx512() static ATTRIBUTE_NOINLINE bool have_vpclmulqdq(uint32_t cpuid_ecx) { - if (!(cpuid_ecx & cpuid_ecx_XSAVE) || !os_have_avx512()) + if ((~cpuid_ecx & cpuid_ecx_AVX_AND_XSAVE) || !os_have_avx512()) return false; # ifdef _MSC_VER int regs[4]; From 0b7d19d5002e8b7b2f7c97492dc3da3916d4a0b9 Mon Sep 17 00:00:00 2001 From: Oleksandr Byelkin Date: Tue, 20 Aug 2024 16:12:02 +0200 Subject: [PATCH 02/22] MDEV-34785: Assertion failure in Item_func_or_sum::do_build_clone (Item_func_not_all) Missed method added. 
--- mysql-test/main/item_types.result | 9 +++++++++ mysql-test/main/item_types.test | 9 +++++++++ sql/item_cmpfunc.h | 2 ++ 3 files changed, 20 insertions(+) diff --git a/mysql-test/main/item_types.result b/mysql-test/main/item_types.result index 0193d33be6d..a0068772cea 100644 --- a/mysql-test/main/item_types.result +++ b/mysql-test/main/item_types.result @@ -42,5 +42,14 @@ SELECT * FROM v WHERE f = '10.5.20'; f drop view v; # +# MDEV-34785: Assertion failure in Item_func_or_sum::do_build_clone +# (Item_func_not_all) +# +CREATE VIEW t AS SELECT 0 AS a; +SELECT * FROM t WHERE a=ALL (SELECT 0); +a +0 +DROP VIEW t; +# # End of 10.5 tests # diff --git a/mysql-test/main/item_types.test b/mysql-test/main/item_types.test index 2818ae582af..0a4100e9163 100644 --- a/mysql-test/main/item_types.test +++ b/mysql-test/main/item_types.test @@ -46,6 +46,15 @@ CREATE VIEW v AS SELECT version() AS f; SELECT * FROM v WHERE f = '10.5.20'; drop view v; +--echo # +--echo # MDEV-34785: Assertion failure in Item_func_or_sum::do_build_clone +--echo # (Item_func_not_all) +--echo # + +CREATE VIEW t AS SELECT 0 AS a; +SELECT * FROM t WHERE a=ALL (SELECT 0); +DROP VIEW t; + --echo # --echo # End of 10.5 tests --echo # diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h index adbb00147dc..fd5c2bc9873 100644 --- a/sql/item_cmpfunc.h +++ b/sql/item_cmpfunc.h @@ -711,6 +711,8 @@ public: void set_sub_test(Item_maxmin_subselect *item) { test_sub_item= item; test_sum_item= 0;}; bool empty_underlying_subquery(); Item *neg_transformer(THD *thd) override; + Item *do_get_copy(THD *thd) const override + { return get_item_copy(thd, this); } }; From eadf0f63a284ba4392089d9271eadf1ce2f62f04 Mon Sep 17 00:00:00 2001 From: Oleksandr Byelkin Date: Wed, 21 Aug 2024 15:32:14 +0200 Subject: [PATCH 03/22] fix MDEV-34771 & MDEV-34776 removed duplicated methods --- sql/item.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sql/item.h b/sql/item.h index cddef65f4fd..511f86f5dbc 100644 --- a/sql/item.h +++ 
b/sql/item.h @@ -4706,7 +4706,6 @@ public: } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -4721,7 +4720,6 @@ public: { } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -4738,7 +4736,6 @@ public: { } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -4777,7 +4774,6 @@ public: } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -4797,7 +4793,6 @@ public: } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -4960,7 +4955,6 @@ public: void print(String *str, enum_query_type query_type) override; Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; From 1f040ae0485ba16746229f8db9ffbe1af947aa03 Mon Sep 17 00:00:00 2001 From: Monty Date: Tue, 20 Aug 2024 11:31:58 +0300 Subject: [PATCH 04/22] MDEV-34043 Drastically slower query performance between CentOS (2sec) and Rocky (48sec) One cause of the slowdown is because the ftruncate call can be much slower on some systems. ftruncate() is called by Aria for internal temporary tables, tables created by the optimizer, when the upper level asks Aria to delete the previous result set. This is needed when some content from previous tables changes. I have now changed Aria so that for internal temporary tables we don't call ftruncate() anymore for maria_delete_all_rows(). 
I also had to update the Aria repair code to use the logical datafile size and not the on-disk datafile size, which may contain data from a previous result set. The repair code is called to create indexes for the internal temporary table after it is filled. I also replaced a call to mysql_file_size() with a pwrite() in _ma_bitmap_create_first(). Reviewer: Sergei Petrunia Tester: Dave Gosselin --- storage/maria/ma_bitmap.c | 18 +++++++++++------- storage/maria/ma_check.c | 26 ++++++++++++++++++++++---- storage/maria/ma_delete_all.c | 14 +++++++++++--- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c index 48a0545467c..613dcd18e02 100644 --- a/storage/maria/ma_bitmap.c +++ b/storage/maria/ma_bitmap.c @@ -3070,21 +3070,25 @@ my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info, int _ma_bitmap_create_first(MARIA_SHARE *share) { uint block_size= share->bitmap.block_size; + size_t error; File file= share->bitmap.file.file; - uchar marker[CRC_SIZE]; + uchar *temp_buff; + + if (!(temp_buff= (uchar*) my_alloca(block_size))) + return 1; + bzero(temp_buff, block_size); /* Next write operation of the page will write correct CRC if it is needed */ - int4store(marker, MARIA_NO_CRC_BITMAP_PAGE); + int4store(temp_buff + block_size - CRC_SIZE, MARIA_NO_CRC_BITMAP_PAGE); - if (mysql_file_chsize(file, block_size - sizeof(marker), - 0, MYF(MY_WME)) || - my_pwrite(file, marker, sizeof(marker), - block_size - sizeof(marker), - MYF(MY_NABP | MY_WME))) + error= my_pwrite(file, temp_buff, block_size, 0, MYF(MY_NABP | MY_WME)); + my_afree(temp_buff); + if (error) return 1; + share->state.state.data_file_length= block_size; _ma_bitmap_delete_all(share); return 0; diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c index 0c80b58d7e3..7f104325d00 100644 --- a/storage/maria/ma_check.c +++ b/storage/maria/ma_check.c @@ -418,6 +418,8 @@ int maria_chk_size(HA_CHECK *param, register MARIA_HA *info) /* We cannot check 
file sizes for S3 */ DBUG_RETURN(0); } + /* We should never come here with internal temporary tables */ + DBUG_ASSERT(!share->internal_table); if (!(param->testflag & T_SILENT)) puts("- check file-size"); @@ -713,6 +715,8 @@ static int chk_index_down(HA_CHECK *param, MARIA_HA *info, MARIA_PAGE ma_page; DBUG_ENTER("chk_index_down"); + DBUG_ASSERT(!share->internal_table); + /* Key blocks must lay within the key file length entirely. */ if (page + keyinfo->block_length > share->state.state.key_file_length) { @@ -2464,7 +2468,16 @@ static int initialize_variables_for_repair(HA_CHECK *param, return 1; /* calculate max_records */ - sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + if (!share->internal_table) + { + /* Get real file size */ + sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + } + else + { + /* For internal temporary files we are using the logical file length */ + sort_info->filelength= share->state.state.data_file_length; + } param->max_progress= sort_info->filelength; if ((param->testflag & T_CREATE_MISSING_KEYS) || @@ -2860,7 +2873,8 @@ int maria_repair(HA_CHECK *param, register MARIA_HA *info, { fputs(" \r",stdout); fflush(stdout); } - if (mysql_file_chsize(share->kfile.file, + if (!share->internal_table && + mysql_file_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0))) { _ma_check_print_warning(param, @@ -4168,7 +4182,8 @@ int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info, if (param->testflag & T_CALC_CHECKSUM) share->state.state.checksum=param->glob_crc; - if (mysql_file_chsize(share->kfile.file, + if (!share->internal_table && + mysql_file_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0))) _ma_check_print_warning(param, "Can't change size of indexfile, error: %d", @@ -4706,7 +4721,8 @@ int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info, if (param->testflag & T_CALC_CHECKSUM) share->state.state.checksum=param->glob_crc; - 
if (mysql_file_chsize(share->kfile.file, + if (!share->internal_table && + mysql_file_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0))) _ma_check_print_warning(param, "Can't change size of indexfile, error: %d", @@ -6108,6 +6124,8 @@ int maria_test_if_almost_full(MARIA_HA *info) { MARIA_SHARE *share= info->s; + DBUG_ASSERT(!share->internal_table); + if (share->options & HA_OPTION_COMPRESS_RECORD) return 0; return mysql_file_seek(share->kfile.file, 0L, MY_SEEK_END, diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c index c1019c01c66..d799e213d5f 100644 --- a/storage/maria/ma_delete_all.c +++ b/storage/maria/ma_delete_all.c @@ -103,9 +103,17 @@ int maria_delete_all_rows(MARIA_HA *info) #endif if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX, - FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) || - mysql_file_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || - mysql_file_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME))) + FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED)) + goto err; + /* + Avoid truncate of internal temporary tables as this can have a big + performance overhead when called by mysql_handle_single_derived() + tables in MariaDB as part of split materialization. + */ + if (!share->internal_table && + (mysql_file_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || + mysql_file_chsize(share->kfile.file, share->base.keystart, 0, + MYF(MY_WME)))) goto err; if (_ma_initialize_data_file(share, info->dfile.file)) From 3e5e97b26fd1d2ed31833218bdb568625f2eab3f Mon Sep 17 00:00:00 2001 From: Brandon Nesterenko Date: Thu, 22 Aug 2024 12:59:40 -0600 Subject: [PATCH 05/22] MDEV-34799: "Could not write packet" err message args off by 1 MDEV-33582 (3541bd63f0d) changed the "Could not write packet" error message in net_serv.cc to use the function sql_print_warning(), instead of my_printf_error(). 
The flags argument was not removed in this change though, so the old flags were printed in place of the file descriptor, and all other args are presenting for the wrong field (and length is never showed). This patch removes flags as a parameter to sql_print_warning(). --- sql/net_serv.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/net_serv.cc b/sql/net_serv.cc index 48d523d53ea..283a768dcd3 100644 --- a/sql/net_serv.cc +++ b/sql/net_serv.cc @@ -780,7 +780,6 @@ net_real_write(NET *net,const uchar *packet, size_t len) { sql_print_warning("Could not write packet: fd: %lld state: %d " "errno: %d vio_errno: %d length: %ld", - MYF(ME_ERROR_LOG | ME_WARNING), (longlong) vio_fd(net->vio), (int) net->vio->state, vio_errno(net->vio), net->last_errno, (ulong) (end-pos)); From 9db2b327d41395349681697efd85a700be1b1146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Fri, 23 Aug 2024 13:27:50 +0300 Subject: [PATCH 06/22] MDEV-34759: buf_page_get_low() is unnecessarily acquiring exclusive latch buf_page_ibuf_merge_try(): A new, separate function for invoking ibuf_merge_or_delete_for_page() when needed. Use the already requested page latch for determining if the call is necessary. If it is and if we are currently holding rw_latch==RW_S_LATCH, upgrading to an exclusive latch may involve waiting that another thread acquires and releases a U or X latch on the page. If we have to wait, we must recheck if the call to ibuf_merge_or_delete_for_page() is still needed. If the page turns out to be corrupted, we will release and fail the operation. Finally, the exclusive page latch will be downgraded to the originally requested latch. ssux_lock_impl::rd_u_upgrade_try(): Attempt to upgrade a shared lock to an update lock. sux_lock::s_x_upgrade_try(): Attempt to upgrade a shared lock to exclusive. sux_lock::s_x_upgrade(): Upgrade a shared lock to exclusive. Return whether a wait was elided. 
ssux_lock_impl::u_rd_downgrade(), sux_lock::u_s_downgrade(): Downgrade an update lock to shared. --- storage/innobase/buf/buf0buf.cc | 192 ++++++++++++--------- storage/innobase/include/srw_lock.h | 9 + storage/innobase/include/sux_lock.h | 34 ++++ storage/innobase/unittest/innodb_sync-t.cc | 27 +++ 4 files changed, 183 insertions(+), 79 deletions(-) diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 44b093e93ed..f52ffc9de9c 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2404,6 +2404,78 @@ err_exit: return(FALSE); } +ATTRIBUTE_COLD +/** Try to merge buffered changes to a buffer pool page. +@param block buffer-fixed and latched block +@param rw_latch RW_X_LATCH, RW_SX_LATCH, RW_S_LATCH held on block +@param err error code +@return whether the page is invalid (corrupted) */ +static bool buf_page_ibuf_merge_try(buf_block_t *block, ulint rw_latch, + dberr_t *err) +{ + ut_ad(block->page.lock.have_any()); + ut_ad(block->page.buf_fix_count()); + + if (fil_page_get_type(block->page.frame) != FIL_PAGE_INDEX || + !page_is_leaf(block->page.frame)) + return false; + + if (rw_latch != RW_X_LATCH) + { + if (rw_latch == RW_S_LATCH) + { + if (!block->page.lock.s_x_upgrade()) + { + uint32_t state; + state= block->page.state(); + if (state < buf_page_t::UNFIXED) + { + fail: + block->page.lock.x_unlock(); + return true; + } + ut_ad(state & ~buf_page_t::LRU_MASK); + ut_ad(state < buf_page_t::READ_FIX); + if (state < buf_page_t::IBUF_EXIST || state >= buf_page_t::REINIT) + /* ibuf_merge_or_delete_for_page() was already invoked in + another thread. 
*/ + goto downgrade_to_s; + } + } + else + { + ut_ad(rw_latch == RW_SX_LATCH); + block->page.lock.u_x_upgrade(); + } + } + + ut_ad(block->page.lock.have_x()); + block->page.clear_ibuf_exist(); + if (dberr_t e= ibuf_merge_or_delete_for_page(block, block->page.id(), + block->zip_size())) + { + if (err) + *err= e; + goto fail; + } + + switch (rw_latch) { + default: + ut_ad(rw_latch == RW_X_LATCH); + break; + case RW_SX_LATCH: + block->page.lock.x_u_downgrade(); + break; + case RW_S_LATCH: + downgrade_to_s: + block->page.lock.x_u_downgrade(); + block->page.lock.u_s_downgrade(); + break; + } + + return false; +} + /** Low level function used to get access to a database page. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @@ -2443,6 +2515,7 @@ buf_page_get_low( || (rw_latch == RW_X_LATCH) || (rw_latch == RW_SX_LATCH) || (rw_latch == RW_NO_LATCH)); + ut_ad(rw_latch != RW_NO_LATCH || !allow_ibuf_merge); if (err) { *err = DB_SUCCESS; @@ -2844,89 +2917,50 @@ re_evict_fail: state to FREED). Therefore, after acquiring the page latch we must recheck the state. 
*/ - if (state >= buf_page_t::UNFIXED - && allow_ibuf_merge - && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX - && page_is_leaf(block->page.frame)) { - block->page.lock.x_lock(); - state = block->page.state(); - ut_ad(state < buf_page_t::READ_FIX); - - if (state >= buf_page_t::IBUF_EXIST - && state < buf_page_t::REINIT) { - block->page.clear_ibuf_exist(); - if (dberr_t local_err = - ibuf_merge_or_delete_for_page(block, page_id, - block->zip_size())) { - if (err) { - *err = local_err; - } - goto release_and_ignore_block; - } - } else if (state < buf_page_t::UNFIXED) { -release_and_ignore_block: - block->page.lock.x_unlock(); - goto ignore_block; - } - -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif /* BTR_CUR_HASH_ADAPT */ - - switch (rw_latch) { - case RW_NO_LATCH: - block->page.lock.x_unlock(); - break; - case RW_S_LATCH: - block->page.lock.x_unlock(); - block->page.lock.s_lock(); - break; - case RW_SX_LATCH: - block->page.lock.x_u_downgrade(); - break; - default: - ut_ad(rw_latch == RW_X_LATCH); - } - - mtr->memo_push(block, mtr_memo_type_t(rw_latch)); - } else { - switch (rw_latch) { - case RW_NO_LATCH: - mtr->memo_push(block, MTR_MEMO_BUF_FIX); + switch (rw_latch) { + case RW_NO_LATCH: + ut_ad(!allow_ibuf_merge); + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + return block; + case RW_S_LATCH: + block->page.lock.s_lock(); + break; + case RW_SX_LATCH: + block->page.lock.u_lock(); + ut_ad(!block->page.is_io_fixed()); + break; + default: + ut_ad(rw_latch == RW_X_LATCH); + if (block->page.lock.x_lock_upgraded()) { + ut_ad(block->page.id() == page_id); + block->unfix(); + mtr->page_lock_upgrade(*block); return block; - case RW_S_LATCH: - block->page.lock.s_lock(); - break; - case RW_SX_LATCH: - block->page.lock.u_lock(); - ut_ad(!block->page.is_io_fixed()); - break; - default: - ut_ad(rw_latch == RW_X_LATCH); - if (block->page.lock.x_lock_upgraded()) { - ut_ad(block->page.id() == page_id); - block->unfix(); - 
mtr->page_lock_upgrade(*block); - return block; - } } - - mtr->memo_push(block, mtr_memo_type_t(rw_latch)); - state = block->page.state(); - - if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { - mtr->release_last_page(); - goto ignore_unfixed; - } - - ut_ad(state < buf_page_t::READ_FIX - || state > buf_page_t::WRITE_FIX); - -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif /* BTR_CUR_HASH_ADAPT */ } + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); + state = block->page.state(); + + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { + corrupted: + mtr->release_last_page(); + goto ignore_unfixed; + } + + ut_ad(state < buf_page_t::READ_FIX + || state > buf_page_t::WRITE_FIX); + if (state >= buf_page_t::IBUF_EXIST && state < buf_page_t::REINIT + && allow_ibuf_merge + && buf_page_ibuf_merge_try(block, rw_latch, err)) { + ut_ad(block == mtr->at_savepoint(mtr->get_savepoint() - 1)); + mtr->lock_register(mtr->get_savepoint() - 1, MTR_MEMO_BUF_FIX); + goto corrupted; + } +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(page_id_t(page_get_space_id(block->page.frame), page_get_page_no(block->page.frame)) == page_id); return block; diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h index 72b78df146c..3f0fc997fbf 100644 --- a/storage/innobase/include/srw_lock.h +++ b/storage/innobase/include/srw_lock.h @@ -282,6 +282,8 @@ public: #endif } + bool rd_u_upgrade_try() { return writer.wr_lock_try(); } + void u_wr_upgrade() { DBUG_ASSERT(writer.is_locked()); @@ -296,6 +298,13 @@ public: readers.store(0, std::memory_order_release); /* Note: Any pending rd_lock() will not be woken up until u_unlock() */ } + void u_rd_downgrade() + { + DBUG_ASSERT(writer.is_locked()); + ut_d(uint32_t lk=) readers.fetch_add(1, std::memory_order_relaxed); + ut_ad(lk < WRITER); + u_unlock(); + } void rd_unlock() { diff --git a/storage/innobase/include/sux_lock.h 
b/storage/innobase/include/sux_lock.h index a8f51fdcdba..ea9487b943c 100644 --- a/storage/innobase/include/sux_lock.h +++ b/storage/innobase/include/sux_lock.h @@ -198,6 +198,30 @@ public: /** Upgrade an update lock */ inline void u_x_upgrade(); inline void u_x_upgrade(const char *file, unsigned line); + /** @return whether a shared lock was upgraded to exclusive */ + bool s_x_upgrade_try() + { + ut_ad(have_s()); + ut_ad(!have_u_or_x()); + if (!lock.rd_u_upgrade_try()) + return false; + claim_ownership(); + s_unlock(); + lock.u_wr_upgrade(); + recursive= RECURSIVE_X; + return true; + } + __attribute__((warn_unused_result)) + /** @return whether the operation succeeded without waiting */ + bool s_x_upgrade() + { + if (s_x_upgrade_try()) + return true; + s_unlock(); + x_lock(); + return false; + } + /** Downgrade a single exclusive lock to an update lock */ void x_u_downgrade() { @@ -206,6 +230,16 @@ public: recursive*= RECURSIVE_U; lock.wr_u_downgrade(); } + /** Downgrade a single update lock to a shared lock */ + void u_s_downgrade() + { + ut_ad(have_u_or_x()); + ut_ad(recursive == RECURSIVE_U); + recursive= 0; + set_new_owner(0); + lock.u_rd_downgrade(); + ut_d(s_lock_register()); + } /** Acquire an exclusive lock or upgrade an update lock @return whether U locks were upgraded to X */ diff --git a/storage/innobase/unittest/innodb_sync-t.cc b/storage/innobase/unittest/innodb_sync-t.cc index d0289086b24..5ad726d8429 100644 --- a/storage/innobase/unittest/innodb_sync-t.cc +++ b/storage/innobase/unittest/innodb_sync-t.cc @@ -92,6 +92,25 @@ static void test_ssux_lock() ssux.wr_u_downgrade(); ssux.u_unlock(); } + + for (auto j= M_ROUNDS; j--; ) + { + ssux.rd_lock(); + assert(!critical); + if (ssux.rd_u_upgrade_try()) + { + assert(!critical); + ssux.rd_unlock(); + ssux.u_wr_upgrade(); + assert(!critical); + critical= true; + critical= false; + ssux.wr_u_downgrade(); + ssux.u_rd_downgrade(); + } + assert(!critical); + ssux.rd_unlock(); + } } } @@ -129,6 +148,14 @@ static 
void test_sux_lock() critical= false; sux.x_u_downgrade(); sux.u_unlock(); + sux.s_lock(); + std::ignore= sux.s_x_upgrade(); + assert(!critical); + sux.x_lock(); + critical= true; + sux.x_unlock(); + critical= false; + sux.x_unlock(); } } } From 9020baf126c7d4068b40c57bc2d7dcb747b4b341 Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Sat, 24 Aug 2024 19:01:06 +0300 Subject: [PATCH 07/22] Trivial fix: Make test_if_cheaper_ordering() use actual_rec_per_key() Discovered this while working on MDEV-34720: test_if_cheaper_ordering() uses rec_per_key, while the original estimate for the access method is produced in best_access_path() by using actual_rec_per_key(). Make test_if_cheaper_ordering() also use actual_rec_per_key(). Also make several getter function "const" to make this compile. Also adjusted the testcase to handle this (the change backported from 11.0) --- mysql-test/main/subselect_innodb.result | 1 + mysql-test/main/subselect_innodb.test | 2 ++ sql/sql_select.cc | 3 ++- sql/sql_statistics.h | 2 +- sql/structs.h | 2 +- sql/table.cc | 2 +- 6 files changed, 8 insertions(+), 4 deletions(-) diff --git a/mysql-test/main/subselect_innodb.result b/mysql-test/main/subselect_innodb.result index ea0affd575f..c8c0fd693f2 100644 --- a/mysql-test/main/subselect_innodb.result +++ b/mysql-test/main/subselect_innodb.result @@ -554,6 +554,7 @@ id select_type table type possible_keys key key_len ref rows Extra # # MDEV-6081: ORDER BY+ref(const): selectivity is very incorrect (MySQL Bug#14338686) # +insert into t2 select seq,seq,seq from seq_10000_to_11000; alter table t2 add key2 int; update t2 set key2=key1; alter table t2 add key(key2); diff --git a/mysql-test/main/subselect_innodb.test b/mysql-test/main/subselect_innodb.test index f675dda91b4..12ce5cabbeb 100644 --- a/mysql-test/main/subselect_innodb.test +++ b/mysql-test/main/subselect_innodb.test @@ -558,7 +558,9 @@ from --echo # --echo # MDEV-6081: ORDER BY+ref(const): selectivity is very incorrect (MySQL Bug#14338686) 
--echo # +--source include/have_sequence.inc +insert into t2 select seq,seq,seq from seq_10000_to_11000; alter table t2 add key2 int; update t2 set key2=key1; alter table t2 add key(key2); diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 176310224a2..d8958041160 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -30391,7 +30391,8 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table, else { const KEY *ref_keyinfo= table->key_info + ref_key; - refkey_rows_estimate= ref_keyinfo->rec_per_key[tab->ref.key_parts - 1]; + refkey_rows_estimate= + (ha_rows)ref_keyinfo->actual_rec_per_key(tab->ref.key_parts - 1); } set_if_bigger(refkey_rows_estimate, 1); } diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 166e7a75c79..6d7dd3618ff 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -472,7 +472,7 @@ public: bool avg_frequency_is_inited() { return avg_frequency != NULL; } - double get_avg_frequency(uint i) + double get_avg_frequency(uint i) const { return (double) avg_frequency[i] / Scale_factor_avg_frequency; } diff --git a/sql/structs.h b/sql/structs.h index 1641b8b3cbd..76ae013ca3b 100644 --- a/sql/structs.h +++ b/sql/structs.h @@ -166,7 +166,7 @@ typedef struct st_key { engine_option_value *option_list; ha_index_option_struct *option_struct; /* structure with parsed options */ - double actual_rec_per_key(uint i); + double actual_rec_per_key(uint i) const; bool without_overlaps; /* diff --git a/sql/table.cc b/sql/table.cc index 373c70c5983..60345ac5a21 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -10034,7 +10034,7 @@ uint TABLE_SHARE::actual_n_key_parts(THD *thd) } -double KEY::actual_rec_per_key(uint i) +double KEY::actual_rec_per_key(uint i) const { if (rec_per_key == 0) return 0; From d58734d7812f7fccbb1a2cd802f8ce9cd0c693ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 26 Aug 2024 12:22:44 +0300 Subject: [PATCH 08/22] MDEV-34520 purge_sys_t::wait_FTS sleeps 10ms, even if it 
does not have to There were two separate Atomic_counter, purge_sys.m_SYS_paused and purge_sys.m_FTS_paused. In purge_sys.wait_FTS() we have to read both atomically. We used to use an overkill solution for this, acquiring purge_sys.latch and waiting 10 milliseconds between samples. To make matters worse, the 10-millisecond wait was unconditional, which would unnecessarily suspend the purge_coordinator_task every now and then. It turns out that we can fold both "reference counts" into a single Atomic_relaxed and avoid the purge_sys.latch. To assess whether std::memory_order_relaxed is acceptable, we should consider the operations that read these "reference counts", that is, purge_sys_t::wait_FTS(bool) and purge_sys_t::must_wait_FTS(). Outside debug assertions, purge_sys.must_wait_FTS() is only invoked in trx_purge_table_acquire(), which is covered by a shared dict_sys.latch. We would increment the counter as part of a DDL operation, but before acquiring an exclusive dict_sys.latch. So, a purge_sys_t::close_and_reopen() loop could be triggered slightly prematurely, before a problematic DDL operation is actually executed. Decrementing the counter is less of an issue; purge_sys.resume_FTS() or purge_sys.resume_SYS() would mostly be invoked while holding an exclusive dict_sys.latch; ha_innobase::delete_table() does it outside that critical section. Still, this would only cause some extra wait in the purge_coordinator_task, just like at the start of a DDL operation. There are two calls to purge_sys_t::wait_FTS(bool): in the above mentioned purge_sys_t::close_and_reopen() and in purge_sys_t::clone_oldest_view(), both invoked by the purge_coordinator_task. There is also a purge_sys.clone_oldest_view() call at startup when no DDL operation can be in progress. purge_sys_t::m_SYS_paused: Merged into m_FTS_paused, using a new multiplier PAUSED_SYS = 65536. purge_sys_t::wait_FTS(): Remove an unnecessary sleep as well as the access to purge_sys.latch. 
It suffices to poll purge_sys.m_FTS_paused. purge_sys_t::stop_FTS(): Do not acquire purge_sys.latch. Reviewed by: Debarun Banerjee --- storage/innobase/include/trx0purge.h | 20 +++++++++++++------- storage/innobase/srv/srv0srv.cc | 11 +++++------ storage/innobase/trx/trx0purge.cc | 9 +-------- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index dc28c866c74..a3df8f7bc5c 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -149,10 +149,11 @@ public: private: /** number of pending stop() calls without resume() */ Atomic_counter m_paused; - /** number of stop_SYS() calls without resume_SYS() */ - Atomic_counter m_SYS_paused; - /** number of stop_FTS() calls without resume_FTS() */ - Atomic_counter m_FTS_paused; + /** PAUSED_SYS * number of stop_SYS() calls without resume_SYS() + + number of stop_FTS() calls without resume_FTS() */ + Atomic_relaxed m_FTS_paused; + /** The stop_SYS() multiplier in m_FTS_paused */ + static constexpr const uint32_t PAUSED_SYS= 1U << 16; /** latch protecting end_view */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch; @@ -321,16 +322,21 @@ private: void wait_FTS(bool also_sys); public: /** Suspend purge in data dictionary tables */ - void stop_SYS() { m_SYS_paused++; } + void stop_SYS() + { + ut_d(const auto p=) m_FTS_paused.fetch_add(PAUSED_SYS); + ut_ad(p < p + PAUSED_SYS); + } /** Resume purge in data dictionary tables */ static void resume_SYS(void *); /** Pause purge during a DDL operation that could drop FTS_ tables. */ void stop_FTS(); /** Resume purge after stop_FTS(). 
*/ - void resume_FTS() { ut_d(const auto p=) m_FTS_paused--; ut_ad(p); } + void resume_FTS() + { ut_d(const auto p=) m_FTS_paused.fetch_sub(1); ut_ad(p & ~PAUSED_SYS); } /** @return whether stop_SYS() is in effect */ - bool must_wait_FTS() const { return m_FTS_paused; } + bool must_wait_FTS() const { return m_FTS_paused & ~PAUSED_SYS; } private: /** diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index b29caf255fc..8921f17ef6a 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1298,10 +1298,9 @@ bool purge_sys_t::running() void purge_sys_t::stop_FTS() { - latch.rd_lock(SRW_LOCK_CALL); - m_FTS_paused++; - latch.rd_unlock(); - while (m_active) + ut_d(const auto paused=) m_FTS_paused.fetch_add(1); + ut_ad(paused < PAUSED_SYS); + while (m_active.load(std::memory_order_acquire)) std::this_thread::sleep_for(std::chrono::seconds(1)); } @@ -1335,8 +1334,8 @@ void purge_sys_t::stop() /** Resume purge in data dictionary tables */ void purge_sys_t::resume_SYS(void *) { - ut_d(auto paused=) purge_sys.m_SYS_paused--; - ut_ad(paused); + ut_d(auto paused=) purge_sys.m_FTS_paused.fetch_sub(PAUSED_SYS); + ut_ad(paused >= PAUSED_SYS); } /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */ diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 00cd2b89e3a..c7be15bad20 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -1065,15 +1065,8 @@ static void trx_purge_close_tables(purge_node_t *node, THD *thd) void purge_sys_t::wait_FTS(bool also_sys) { - bool paused; - do - { - latch.wr_lock(SRW_LOCK_CALL); - paused= m_FTS_paused || (also_sys && m_SYS_paused); - latch.wr_unlock(); + for (const uint32_t mask= also_sys ? 
~0U : ~PAUSED_SYS; m_FTS_paused & mask;) std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - while (paused); } __attribute__((nonnull)) From b7b9f3ce825a08f16ad851c16f603365ae473195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 26 Aug 2024 12:23:06 +0300 Subject: [PATCH 09/22] MDEV-34515: Contention between purge and workload In a Sysbench oltp_update_index workload that involves 1 table, a serious contention between the workload and the purge of history was observed. This was the worst when the table contained only 1 record. This turned out to be fixed by setting innodb_purge_batch_size=128, which corresponds to the number of usable persistent rollback segments. When we go above that, there would be contention between row_purge_poss_sec() and the workload, typically on the clustered index page latch, sometimes also on a secondary index page latch. It might be that with smaller batches, trx_sys.history_size() will end up pausing all concurrent transaction start/commit frequently enough so that purge will be able to make some progress, so that there would be less contention on the index page latches between purge and SQL execution. In commit aa719b5010c929132b4460b78113fbd07497d9c8 (part of MDEV-32050) the interpretation of the parameter innodb_purge_batch_size was slightly changed. It would correspond to the maximum desired size of the purge_sys.pages cache. Before that change, the parameter was referring to a number of undo log pages, but the accounting might have been inaccurate. To avoid a regression, we will reduce the default value to innodb_purge_batch_size=127, which will also be compatible with innodb_undo_tablespaces>1 (which will disable rollback segment 0). Additionally, some logic in the purge and MVCC checks is simplified. The purge tasks will make use of purge_sys.pages when accessing undo log pages to find out if a secondary index record can be removed. 
If an undo page needs to be looked up in buf_pool.page_hash, we will
merely buffer-fix it. This is correct, because the undo pages are
append-only in nature. Holding purge_sys.latch or purge_sys.end_latch
or the fact that the current thread is executing as a part of an
in-progress purge batch will prevent the contents of the undo page from
being freed and subsequently reused. The buffer-fix will prevent the
page from being evicted from the buffer pool. Thanks to this logic, we
can refer to the undo log record directly in the buffer pool page and
avoid copying the record.

buf_pool_t::page_fix(): Look up and buffer-fix a page. This is useful
for accessing undo log pages, which are append-only by nature. There
will be no need to deal with change buffer or ROW_FORMAT=COMPRESSED in
that case.

purge_sys_t::view_guard::view_guard(): Allow the type of guard to be
acquired: end_latch, latch, or no latch (in case we are a purge thread).

purge_sys_t::view_guard::get(): Read-only accessor to purge_sys.pages.

purge_sys_t::get_page(): Invoke buf_pool_t::page_fix().

row_vers_old_has_index_entry(): Replaced with row_purge_is_unsafe()
and row_undo_mod_sec_is_unsafe().

trx_undo_get_undo_rec(): Merged to trx_undo_prev_version_build().

row_purge_poss_sec(): Add the parameter mtr and remove redundant or
unused parameters sec_pcur, sec_mtr, is_tree. We will use the caller's
mtr object but release any acquired page latches before returning.

btr_cur_get_page(), page_cur_get_page(): Do not invoke page_align().

row_purge_remove_sec_if_poss_leaf(): Return the value of PAGE_MAX_TRX_ID
to be checked against the page in row_purge_remove_sec_if_poss_tree().
If the secondary index page was not changed meanwhile, it will be
unnecessary to invoke row_purge_poss_sec() again.

trx_undo_prev_version_build(): Access any undo log pages using the
caller's mini-transaction object.

row_purge_vc_matches_cluster(): Moved to the only compilation unit that
needs it.
Reviewed by: Debarun Banerjee --- .../r/innodb_purge_batch_size_basic.result | 8 +- .../suite/sys_vars/r/sysvars_innodb.result | 2 +- storage/innobase/btr/btr0cur.cc | 2 +- storage/innobase/buf/buf0buf.cc | 45 ++ storage/innobase/handler/ha_innodb.cc | 2 +- storage/innobase/include/btr0cur.h | 10 +- storage/innobase/include/btr0cur.inl | 12 - storage/innobase/include/buf0buf.h | 6 + storage/innobase/include/page0cur.h | 12 +- storage/innobase/include/page0cur.inl | 12 - storage/innobase/include/row0purge.h | 27 +- storage/innobase/include/row0vers.h | 61 +- storage/innobase/include/trx0purge.h | 44 +- storage/innobase/include/trx0rec.h | 54 +- storage/innobase/read/read0read.cc | 2 +- storage/innobase/row/row0log.cc | 4 +- storage/innobase/row/row0purge.cc | 611 ++++++++++++++---- storage/innobase/row/row0sel.cc | 2 +- storage/innobase/row/row0umod.cc | 154 ++++- storage/innobase/row/row0upd.cc | 2 +- storage/innobase/row/row0vers.cc | 504 +-------------- storage/innobase/trx/trx0purge.cc | 22 +- storage/innobase/trx/trx0rec.cc | 247 +++---- 23 files changed, 958 insertions(+), 887 deletions(-) diff --git a/mysql-test/suite/sys_vars/r/innodb_purge_batch_size_basic.result b/mysql-test/suite/sys_vars/r/innodb_purge_batch_size_basic.result index 442d44e7fb2..f5b01aa8016 100644 --- a/mysql-test/suite/sys_vars/r/innodb_purge_batch_size_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_purge_batch_size_basic.result @@ -1,19 +1,19 @@ SET @global_start_value = @@global.innodb_purge_batch_size; SELECT @global_start_value; @global_start_value -1000 +127 '#--------------------FN_DYNVARS_046_01------------------------#' SET @@global.innodb_purge_batch_size = 1; SET @@global.innodb_purge_batch_size = DEFAULT; SELECT @@global.innodb_purge_batch_size; @@global.innodb_purge_batch_size -1000 +127 '#---------------------FN_DYNVARS_046_02-------------------------#' SET innodb_purge_batch_size = 1; ERROR HY000: Variable 'innodb_purge_batch_size' is a GLOBAL variable and should be 
set with SET GLOBAL SELECT @@innodb_purge_batch_size; @@innodb_purge_batch_size -1000 +127 SELECT local.innodb_purge_batch_size; ERROR 42S02: Unknown table 'local' in field list SET global innodb_purge_batch_size = 1; @@ -112,4 +112,4 @@ SELECT @@global.innodb_purge_batch_size; SET @@global.innodb_purge_batch_size = @global_start_value; SELECT @@global.innodb_purge_batch_size; @@global.innodb_purge_batch_size -1000 +127 diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 148ec4f9c9b..893761e0ce8 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -1293,7 +1293,7 @@ READ_ONLY NO COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME INNODB_PURGE_BATCH_SIZE SESSION_VALUE NULL -DEFAULT_VALUE 1000 +DEFAULT_VALUE 127 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED VARIABLE_COMMENT Number of UNDO log pages to purge in one batch from the history list. diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index fcfd98ab197..cfbc6532c41 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1277,7 +1277,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); auto& chain = buf_pool.page_hash.cell_get(page_id.fold()); - if (!row_purge_poss_sec(purge_node, index(), tuple)) + if (!row_purge_poss_sec(purge_node, index(), tuple, mtr)) /* The record cannot be purged yet. 
*/ flag= BTR_CUR_DELETE_REF; else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(), diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index f52ffc9de9c..7a466939eae 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2476,6 +2476,51 @@ static bool buf_page_ibuf_merge_try(buf_block_t *block, ulint rw_latch, return false; } +buf_block_t* buf_pool_t::page_fix(const page_id_t id) +{ + ha_handler_stats *const stats= mariadb_stats; + buf_inc_get(stats); + auto& chain= page_hash.cell_get(id.fold()); + page_hash_latch &hash_lock= page_hash.lock_get(chain); + for (;;) + { + hash_lock.lock_shared(); + buf_page_t *b= page_hash.get(id, chain); + if (b) + { + uint32_t state= b->fix(); + hash_lock.unlock_shared(); + ut_ad(!b->in_zip_hash); + ut_ad(b->frame); + ut_ad(state >= buf_page_t::FREED); + if (state >= buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) + { + b->lock.s_lock(); + state= b->state(); + ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX); + b->lock.s_unlock(); + } + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) + { + /* The page was marked as freed or corrupted. */ + b->unfix(); + b= nullptr; + } + return reinterpret_cast(b); + } + + hash_lock.unlock_shared(); + switch (buf_read_page(id, 0)) { + default: + return nullptr; + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(stats); + buf_read_ahead_random(id, 0, false); + } + } +} + /** Low level function used to get access to a database page. 
@param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 57bb38f2df3..f569f4ba6d1 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -18946,7 +18946,7 @@ static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size, PLUGIN_VAR_OPCMDARG, "Number of UNDO log pages to purge in one batch from the history list.", NULL, NULL, - 1000, /* Default setting */ + 127, /* Default setting */ 1, /* Minimum value */ innodb_purge_batch_size_MAX, 0); diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index f6abc9f5e52..96b6d212168 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -78,14 +78,10 @@ page_zip_des_t* btr_cur_get_page_zip( /*=================*/ btr_cur_t* cursor);/*!< in: tree cursor */ -/*********************************************************//** -Returns the page of a tree cursor. +/** Returns the page of a tree cursor. @return pointer to page */ -UNIV_INLINE -page_t* -btr_cur_get_page( -/*=============*/ - btr_cur_t* cursor);/*!< in: tree cursor */ +#define btr_cur_get_page(cursor) (cursor)->block()->page.frame + /*********************************************************//** Returns the index of a cursor. @param cursor b-tree cursor diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl index 955cf34288e..5981b1465c9 100644 --- a/storage/innobase/include/btr0cur.inl +++ b/storage/innobase/include/btr0cur.inl @@ -48,18 +48,6 @@ btr_cur_get_page_zip( return(buf_block_get_page_zip(btr_cur_get_block(cursor))); } -/*********************************************************//** -Returns the page of a tree cursor. 
-@return pointer to page */ -UNIV_INLINE -page_t* -btr_cur_get_page( -/*=============*/ - btr_cur_t* cursor) /*!< in: tree cursor */ -{ - return(page_align(page_cur_get_rec(&(cursor->page_cur)))); -} - /*********************************************************//** Positions a tree cursor at a given record. */ UNIV_INLINE diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 0996046cda0..a2c55f3edf7 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1416,6 +1416,12 @@ public: } public: + /** Look up and buffer-fix a page. + @param id page identifier + @return undo log page, buffer-fixed + @retval nullptr if the undo page was corrupted or freed */ + buf_block_t *page_fix(const page_id_t id); + /** @return whether the buffer pool contains a page @tparam allow_watch whether to allow watch_is_sentinel() @param page_id page identifier diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h index 28aa30565e4..bef8a679ea0 100644 --- a/storage/innobase/include/page0cur.h +++ b/storage/innobase/include/page0cur.h @@ -31,14 +31,6 @@ Created 10/4/1994 Heikki Tuuri #ifdef UNIV_DEBUG /*********************************************************//** -Gets pointer to the page frame where the cursor is positioned. -@return page */ -UNIV_INLINE -page_t* -page_cur_get_page( -/*==============*/ - page_cur_t* cur); /*!< in: page cursor */ -/*********************************************************//** Gets pointer to the buffer block where the cursor is positioned. 
@return page */ UNIV_INLINE @@ -60,12 +52,12 @@ page_cur_get_page_zip( UNIV_INLINE rec_t *page_cur_get_rec(const page_cur_t *cur); #else /* UNIV_DEBUG */ -# define page_cur_get_page(cur) page_align((cur)->rec) # define page_cur_get_block(cur) (cur)->block # define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block) # define page_cur_get_rec(cur) (cur)->rec #endif /* UNIV_DEBUG */ -# define is_page_cur_get_page_zip(cur) is_buf_block_get_page_zip((cur)->block) +#define page_cur_get_page(cur) page_cur_get_block(cur)->page.frame +#define is_page_cur_get_page_zip(cur) is_buf_block_get_page_zip((cur)->block) /*********************************************************//** Sets the cursor object to point before the first user record on the page. */ diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl index 1638b5749ff..884deeea551 100644 --- a/storage/innobase/include/page0cur.inl +++ b/storage/innobase/include/page0cur.inl @@ -25,18 +25,6 @@ Created 10/4/1994 Heikki Tuuri *************************************************************************/ #ifdef UNIV_DEBUG -/*********************************************************//** -Gets pointer to the page frame where the cursor is positioned. -@return page */ -UNIV_INLINE -page_t* -page_cur_get_page( -/*==============*/ - page_cur_t* cur) /*!< in: page cursor */ -{ - return page_align(page_cur_get_rec(cur)); -} - /*********************************************************//** Gets pointer to the buffer block where the cursor is positioned. @return page */ diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h index 1daf4d4abe7..baa7777e6c8 100644 --- a/storage/innobase/include/row0purge.h +++ b/storage/innobase/include/row0purge.h @@ -50,26 +50,13 @@ inserts a record that the secondary index entry would refer to. 
However, in that case, the user transaction would also re-insert the secondary index entry after purge has removed it and released the leaf page latch. -@param[in,out] node row purge node -@param[in] index secondary index -@param[in] entry secondary index entry -@param[in,out] sec_pcur secondary index cursor or NULL - if it is called for purge buffering - operation. -@param[in,out] sec_mtr mini-transaction which holds - secondary index entry or NULL if it is - called for purge buffering operation. -@param[in] is_tree true=pessimistic purge, - false=optimistic (leaf-page only) -@return true if the secondary index record can be purged */ -bool -row_purge_poss_sec( - purge_node_t* node, - dict_index_t* index, - const dtuple_t* entry, - btr_pcur_t* sec_pcur=NULL, - mtr_t* sec_mtr=NULL, - bool is_tree=false); +@param node row purge node +@param index secondary index +@param entry secondary index entry +@param mtr mini-transaction for looking up clustered index +@return whether the secondary index record can be purged */ +bool row_purge_poss_sec(purge_node_t *node, dict_index_t *index, + const dtuple_t *entry, mtr_t *mtr); /*************************************************************** Does the purge operation. diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h index 60f310e1b0f..2ddffa41af1 100644 --- a/storage/innobase/include/row0vers.h +++ b/storage/innobase/include/row0vers.h @@ -54,32 +54,47 @@ row_vers_impl_x_locked( dict_index_t* index, const rec_offs* offsets); -/** Finds out if a version of the record, where the version >= the current -purge_sys.view, should have ientry as its secondary index entry. We check -if there is any not delete marked version of the record where the trx -id >= purge view, and the secondary index entry == ientry; exactly in -this case we return TRUE. 
-@param[in] also_curr TRUE if also rec is included in the versions - to search; otherwise only versions prior - to it are searched -@param[in] rec record in the clustered index; the caller - must have a latch on the page -@param[in] mtr mtr holding the latch on rec; it will - also hold the latch on purge_view -@param[in] index secondary index -@param[in] ientry secondary index entry -@param[in] roll_ptr roll_ptr for the purge record -@param[in] trx_id transaction ID on the purging record -@return TRUE if earlier version should have */ +/** Find out whether data tuple has missing data type +for indexed virtual column. +@param tuple data tuple +@param index virtual index +@return true if tuple has missing column type */ +bool dtuple_vcol_data_missing(const dtuple_t &tuple, + const dict_index_t &index); +/** build virtual column value from current cluster index record data +@param[in,out] row the cluster index row in dtuple form +@param[in] clust_index clustered index +@param[in] index the secondary index +@param[in] heap heap used to build virtual dtuple. 
*/ bool -row_vers_old_has_index_entry( - bool also_curr, - const rec_t* rec, - mtr_t* mtr, +row_vers_build_clust_v_col( + dtuple_t* row, + dict_index_t* clust_index, dict_index_t* index, - const dtuple_t* ientry, + mem_heap_t* heap); +/** Build a dtuple contains virtual column data for current cluster index +@param[in] rec cluster index rec +@param[in] clust_index cluster index +@param[in] clust_offsets cluster rec offset +@param[in] index secondary index +@param[in] trx_id transaction ID on the purging record, + or 0 if called outside purge +@param[in] roll_ptr roll_ptr for the purge record +@param[in,out] heap heap memory +@param[in,out] v_heap heap memory to keep virtual column tuple +@param[in,out] mtr mini-transaction +@return dtuple contains virtual column data */ +dtuple_t* +row_vers_build_cur_vrow( + const rec_t* rec, + dict_index_t* clust_index, + rec_offs** clust_offsets, + dict_index_t* index, + trx_id_t trx_id, roll_ptr_t roll_ptr, - trx_id_t trx_id); + mem_heap_t* heap, + mem_heap_t* v_heap, + mtr_t* mtr); /*****************************************************************//** Constructs the version of a clustered index record which a consistent diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index a3df8f7bc5c..760c98bc677 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -438,10 +438,17 @@ public: struct view_guard { - inline view_guard(); + enum guard { END_VIEW= -1, PURGE= 0, VIEW= 1}; + guard latch; + inline view_guard(guard latch); inline ~view_guard(); + /** Fetch an undo log page. 
+ @param id page identifier + @param mtr mini-transaction + @return reference to buffer page, possibly buffer-fixed in mtr */ + inline const buf_block_t *get(const page_id_t id, mtr_t *mtr); - /** @return purge_sys.view */ + /** @return purge_sys.view or purge_sys.end_view */ inline const ReadViewBase &view() const; }; @@ -470,14 +477,39 @@ public: /** The global data structure coordinating a purge */ extern purge_sys_t purge_sys; -purge_sys_t::view_guard::view_guard() -{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); } +purge_sys_t::view_guard::view_guard(purge_sys_t::view_guard::guard latch) : + latch(latch) +{ + switch (latch) { + case VIEW: + purge_sys.latch.rd_lock(SRW_LOCK_CALL); + break; + case END_VIEW: + purge_sys.end_latch.rd_lock(); + break; + case PURGE: + /* the access is within a purge batch; purge_coordinator_task + will wait for all workers to complete before updating the views */ + break; + } +} purge_sys_t::view_guard::~view_guard() -{ purge_sys.latch.rd_unlock(); } +{ + switch (latch) { + case VIEW: + purge_sys.latch.rd_unlock(); + break; + case END_VIEW: + purge_sys.end_latch.rd_unlock(); + break; + case PURGE: + break; + } +} const ReadViewBase &purge_sys_t::view_guard::view() const -{ return purge_sys.view; } +{ return latch == END_VIEW ? 
purge_sys.end_view : purge_sys.view; } purge_sys_t::end_view_guard::end_view_guard() { purge_sys.end_latch.rd_lock(); } diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h index 8d70b61517a..3ed2ea8cdea 100644 --- a/storage/innobase/include/trx0rec.h +++ b/storage/innobase/include/trx0rec.h @@ -157,50 +157,44 @@ trx_undo_report_row_operation( /** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it is being called purge view and we would like to get the purge record even it is in the purge view (in normal case, it will return without -fetching the purge record */ +fetching the purge record) */ static constexpr ulint TRX_UNDO_PREV_IN_PURGE = 1; /** This tells trx_undo_prev_version_build() to fetch the old value in the undo log (which is the after image for an update) */ static constexpr ulint TRX_UNDO_GET_OLD_V_VALUE = 2; -/** indicate a call from row_vers_old_has_index_entry() */ +/** indicate a call from row_undo_mod_sec_is_unsafe() */ static constexpr ulint TRX_UNDO_CHECK_PURGEABILITY = 4; +/** indicate a call from row_purge_is_unsafe() */ +static constexpr ulint TRX_UNDO_CHECK_PURGE_PAGES = 8; + /** Build a previous version of a clustered index record. The caller must hold a latch on the index page of the clustered index record. -@param rec version of a clustered index record -@param index clustered index -@param offsets rec_get_offsets(rec, index) -@param heap memory heap from which the memory needed is - allocated -@param old_vers previous version or NULL if rec is the - first inserted version, or if history data - has been deleted (an error), or if the purge - could have removed the version - though it has not yet done so -@param v_heap memory heap used to create vrow - dtuple if it is not yet created. 
This heap - diffs from "heap" above in that it could be - prebuilt->old_vers_heap for selection -@param vrow virtual column info, if any -@param v_status status determine if it is going into this - function by purge thread or not. - And if we read "after image" of undo log +@param rec version of a clustered index record +@param index clustered index +@param offsets rec_get_offsets(rec, index) +@param heap memory heap from which the memory needed is allocated +@param old_vers previous version, or NULL if rec is the first inserted + version, or if history data has been deleted (an error), + or if the purge could have removed the version though + it has not yet done so +@param mtr mini-transaction +@param v_status TRX_UNDO_PREV_IN_PURGE, ... +@param v_heap memory heap used to create vrow dtuple if it is not yet + created. This heap diffs from "heap" above in that it could be + prebuilt->old_vers_heap for selection +@param vrow virtual column info, if any @return error code @retval DB_SUCCESS if previous version was successfully built, or if it was an insert or the undo record refers to the table before rebuild @retval DB_MISSING_HISTORY if the history is missing */ -dberr_t -trx_undo_prev_version_build( - const rec_t *rec, - dict_index_t *index, - rec_offs *offsets, - mem_heap_t *heap, - rec_t **old_vers, - mem_heap_t *v_heap, - dtuple_t **vrow, - ulint v_status); +dberr_t trx_undo_prev_version_build(const rec_t *rec, dict_index_t *index, + rec_offs *offsets, mem_heap_t *heap, + rec_t **old_vers, mtr_t *mtr, + ulint v_status, + mem_heap_t *v_heap, dtuple_t **vrow); /** Read from an undo log record a non-virtual column value. 
@param ptr pointer to remaining part of the undo record diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc index 97eda7dba32..46d58326edf 100644 --- a/storage/innobase/read/read0read.cc +++ b/storage/innobase/read/read0read.cc @@ -160,7 +160,7 @@ may be pointing to garbage (an undo log record discarded by purge), but it will never be dereferenced, because the purge view is older than any active transaction. -For details see: row_vers_old_has_index_entry() and row_purge_poss_sec() +For details see: row_undo_mod_sec_is_unsafe() and row_purge_poss_sec() */ diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index e1ce8161a02..6f3425aab17 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -3857,7 +3857,7 @@ UndorecApplier::get_old_rec(const dtuple_t &tuple, dict_index_t *index, if (is_same(roll_ptr)) return version; trx_undo_prev_version_build(version, index, *offsets, heap, &prev_version, - nullptr, nullptr, 0); + &mtr, 0, nullptr, nullptr); version= prev_version; } while (version); @@ -4026,7 +4026,7 @@ void UndorecApplier::log_update(const dtuple_t &tuple, copy_rec= rec_copy(mem_heap_alloc( heap, rec_offs_size(offsets)), match_rec, offsets); trx_undo_prev_version_build(match_rec, clust_index, offsets, heap, - &prev_version, nullptr, nullptr, 0); + &prev_version, &mtr, 0, nullptr, nullptr); prev_offsets= rec_get_offsets(prev_version, clust_index, prev_offsets, clust_index->n_core_fields, diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index 4533b7166b7..69da812c9f7 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -271,6 +271,433 @@ row_purge_remove_clust_if_poss( return(false); } +/** Check a virtual column value index secondary virtual index matches +that of current cluster index record, which is recreated from information +stored in undo log +@param[in] rec record in the clustered index 
+@param[in] icentry the index entry built from a cluster row +@param[in] clust_index cluster index +@param[in] clust_offsets offsets on the cluster record +@param[in] index the secondary index +@param[in] ientry the secondary index entry +@param[in] roll_ptr the rollback pointer for the purging record +@param[in] trx_id trx id for the purging record +@param[in,out] mtr mini-transaction +@param[in,out] v_row dtuple holding the virtual rows (if needed) +@return true if matches, false otherwise */ +static +bool +row_purge_vc_matches_cluster( + const rec_t* rec, + const dtuple_t* icentry, + dict_index_t* clust_index, + rec_offs* clust_offsets, + dict_index_t* index, + const dtuple_t* ientry, + roll_ptr_t roll_ptr, + trx_id_t trx_id, + mtr_t* mtr, + dtuple_t** vrow) +{ + const rec_t* version; + rec_t* prev_version; + mem_heap_t* heap2; + mem_heap_t* heap = NULL; + mem_heap_t* tuple_heap; + ulint num_v = dict_table_get_n_v_cols(index->table); + bool compare[REC_MAX_N_FIELDS]; + ulint n_fields = dtuple_get_n_fields(ientry); + ulint n_non_v_col = 0; + ulint n_cmp_v_col = 0; + const dfield_t* field1; + dfield_t* field2; + ulint i; + + /* First compare non-virtual columns (primary keys) */ + ut_ad(index->n_fields == n_fields); + ut_ad(n_fields == dtuple_get_n_fields(icentry)); + ut_ad(mtr->memo_contains_page_flagged(rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + { + const dfield_t* a = ientry->fields; + const dfield_t* b = icentry->fields; + + for (const dict_field_t *ifield = index->fields, + *const end = &index->fields[index->n_fields]; + ifield != end; ifield++, a++, b++) { + if (!ifield->col->is_virtual()) { + if (cmp_dfield_dfield(a, b)) { + return false; + } + n_non_v_col++; + } + } + } + + tuple_heap = mem_heap_create(1024); + + ut_ad(n_fields > n_non_v_col); + + *vrow = dtuple_create_with_vcol(tuple_heap, 0, num_v); + dtuple_init_v_fld(*vrow); + + for (i = 0; i < num_v; i++) { + dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype + = DATA_MISSING; + 
compare[i] = false; + } + + version = rec; + + while (n_cmp_v_col < n_fields - n_non_v_col) { + heap2 = heap; + heap = mem_heap_create(1024); + roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr( + version, clust_index, clust_offsets); + + ut_ad(cur_roll_ptr != 0); + ut_ad(roll_ptr != 0); + + trx_undo_prev_version_build( + version, clust_index, clust_offsets, + heap, &prev_version, mtr, + TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE, + nullptr, vrow); + + if (heap2) { + mem_heap_free(heap2); + } + + if (!prev_version) { + /* Versions end here */ + goto func_exit; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + ulint entry_len = dict_index_get_n_fields(index); + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col = ind_field->col; + field1 = dtuple_get_nth_field(ientry, i); + + if (!col->is_virtual()) { + continue; + } + + const dict_v_col_t* v_col + = reinterpret_cast(col); + field2 + = dtuple_get_nth_v_field(*vrow, v_col->v_pos); + + if ((dfield_get_type(field2)->mtype != DATA_MISSING) + && (!compare[v_col->v_pos])) { + + if (ind_field->prefix_len != 0 + && !dfield_is_null(field2)) { + field2->len = unsigned( + dtype_get_at_most_n_mbchars( + field2->type.prtype, + field2->type.mbminlen, + field2->type.mbmaxlen, + ind_field->prefix_len, + field2->len, + static_cast + (field2->data))); + } + + /* The index field mismatch */ + if (cmp_dfield_dfield(field2, field1)) { + mem_heap_free(tuple_heap); + mem_heap_free(heap); + return(false); + } + + compare[v_col->v_pos] = true; + n_cmp_v_col++; + } + } + + trx_id_t rec_trx_id = row_get_rec_trx_id( + prev_version, clust_index, clust_offsets); + + if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) { + break; + } + + version = prev_version; + } + +func_exit: + if (n_cmp_v_col == 0) { + *vrow = NULL; + } + + mem_heap_free(tuple_heap); + mem_heap_free(heap); + 
+ /* FIXME: In the case of n_cmp_v_col is not the same as + n_fields - n_non_v_col, callback is needed to compare the rest + columns. At the timebeing, we will need to return true */ + return (true); +} + +/** Finds out if a version of the record, where the version >= the current +purge_sys.view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry == ientry; exactly in +this case we return TRUE. +@param node purge node +@param index secondary index +@param ientry secondary index entry +@param mtr mini-transaction +@return whether ientry cannot be purged */ +static bool row_purge_is_unsafe(const purge_node_t &node, + dict_index_t *index, + const dtuple_t *ientry, mtr_t *mtr) +{ + const rec_t* rec = btr_pcur_get_rec(&node.pcur); + roll_ptr_t roll_ptr = node.roll_ptr; + trx_id_t trx_id = node.trx_id; + const rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index = node.pcur.index(); + rec_offs* clust_offsets; + mem_heap_t* heap; + dtuple_t* row; + const dtuple_t* entry; + dtuple_t* vrow = NULL; + mem_heap_t* v_heap = NULL; + dtuple_t* cur_vrow = NULL; + + ut_ad(index->table == clust_index->table); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + v_heap = mem_heap_create(100); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(clust_offsets))) { + row_ext_t* ext; + + /* The top of the stack of versions is locked by the + mtr holding a latch on the page containing the + clustered index record. The bottom of the stack is + locked by the fact that the purge_sys.view must + 'overtake' any read view of an active transaction. + Thus, it is safe to fetch the prefixes for + externally stored columns. 
*/ + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + + +#ifdef DBUG_OFF +# define dbug_v_purge false +#else /* DBUG_OFF */ + bool dbug_v_purge = false; +#endif /* DBUG_OFF */ + + DBUG_EXECUTE_IF( + "ib_purge_virtual_index_callback", + dbug_v_purge = true;); + + roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr( + rec, clust_index, clust_offsets); + + /* if the row is newly inserted, then the virtual + columns need to be computed */ + if (trx_undo_roll_ptr_is_insert(t_roll_ptr) + || dbug_v_purge) { + + if (!row_vers_build_clust_v_col( + row, clust_index, index, heap)) { + goto unsafe_to_purge; + } + + entry = row_build_index_entry( + row, ext, index, heap); + if (entry && !dtuple_coll_cmp(ientry, entry)) { + goto unsafe_to_purge; + } + } else { + /* Build index entry out of row */ + entry = row_build_index_entry(row, ext, index, heap); + /* entry could only be NULL if + the clustered index record is an uncommitted + inserted record whose BLOBs have not been + written yet. The secondary index record + can be safely removed, because it cannot + possibly refer to this incomplete + clustered index record. (Insert would + always first be completed for the + clustered index record, then proceed to + secondary indexes.) */ + + if (entry && row_purge_vc_matches_cluster( + rec, entry, + clust_index, clust_offsets, + index, ientry, roll_ptr, + trx_id, mtr, &vrow)) { + goto unsafe_to_purge; + } + } + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index + ->n_core_fields, + ULINT_UNDEFINED, &heap); + } else { + + entry = row_build_index_entry( + row, ext, index, heap); + + /* If entry == NULL, the record contains unset BLOB + pointers. This must be a freshly inserted record. If + this is called from + row_purge_remove_sec_if_poss_low(), the thread will + hold latches on the clustered index and the secondary + index. 
Because the insert works in three steps: + + (1) insert the record to clustered index + (2) store the BLOBs and update BLOB pointers + (3) insert records to secondary indexes + + the purge thread can safely ignore freshly inserted + records and delete the secondary index record. The + thread that inserted the new record will be inserting + the secondary index records. */ + + /* NOTE that we cannot do the comparison as binary + fields because the row is maybe being modified so that + the clustered index record has already been updated to + a different binary value in a char field, but the + collation identifies the old and new value anyway! */ + if (entry && !dtuple_coll_cmp(ientry, entry)) { +unsafe_to_purge: + mem_heap_free(heap); + + if (v_heap) { + mem_heap_free(v_heap); + } + return true; + } + } + } else if (dict_index_has_virtual(index)) { + /* The current cluster index record could be + deleted, but the previous version of it might not. We will + need to get the virtual column data from undo record + associated with current cluster index */ + + cur_vrow = row_vers_build_cur_vrow( + rec, clust_index, &clust_offsets, + index, trx_id, roll_ptr, heap, v_heap, mtr); + } + + version = rec; + + for (;;) { + mem_heap_t* heap2 = heap; + heap = mem_heap_create(1024); + vrow = NULL; + + trx_undo_prev_version_build(version, + clust_index, clust_offsets, + heap, &prev_version, mtr, + TRX_UNDO_CHECK_PURGE_PAGES, + nullptr, + dict_index_has_virtual(index) + ? 
&vrow : nullptr); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (!prev_version) { + /* Versions end here */ + mem_heap_free(heap); + + if (v_heap) { + mem_heap_free(v_heap); + } + + return false; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + if (vrow) { + if (dtuple_vcol_data_missing(*vrow, *index)) { + goto nochange_index; + } + /* Keep the virtual row info for the next + version, unless it is changed */ + mem_heap_empty(v_heap); + cur_vrow = dtuple_copy(vrow, v_heap); + dtuple_dup_v_fld(cur_vrow, v_heap); + } + + if (!cur_vrow) { + /* Nothing for this index has changed, + continue */ +nochange_index: + version = prev_version; + continue; + } + } + + if (!rec_get_deleted_flag(prev_version, + rec_offs_comp(clust_offsets))) { + row_ext_t* ext; + + /* The stack of versions is locked by mtr. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + ut_ad(cur_vrow); + ut_ad(row->n_v_fields == cur_vrow->n_v_fields); + dtuple_copy_v_fields(row, cur_vrow); + } + + entry = row_build_index_entry(row, ext, index, heap); + + /* If entry == NULL, the record contains unset + BLOB pointers. This must be a freshly + inserted record that we can safely ignore. + For the justification, see the comments after + the previous row_build_index_entry() call. */ + + /* NOTE that we cannot do the comparison as binary + fields because maybe the secondary index record has + already been updated to a different binary value in + a char field, but the collation identifies the old + and new value anyway! 
*/ + + if (entry && !dtuple_coll_cmp(ientry, entry)) { + goto unsafe_to_purge; + } + } + + version = prev_version; + } +} + /** Determines if it is possible to remove a secondary index entry. Removal is possible if the secondary index entry does not refer to any not delete marked version of a clustered index record where DB_TRX_ID @@ -284,66 +711,45 @@ inserts a record that the secondary index entry would refer to. However, in that case, the user transaction would also re-insert the secondary index entry after purge has removed it and released the leaf page latch. -@param[in,out] node row purge node -@param[in] index secondary index -@param[in] entry secondary index entry -@param[in,out] sec_pcur secondary index cursor or NULL - if it is called for purge buffering - operation. -@param[in,out] sec_mtr mini-transaction which holds - secondary index entry or NULL if it is - called for purge buffering operation. -@param[in] is_tree true=pessimistic purge, - false=optimistic (leaf-page only) -@return true if the secondary index record can be purged */ -bool -row_purge_poss_sec( - purge_node_t* node, - dict_index_t* index, - const dtuple_t* entry, - btr_pcur_t* sec_pcur, - mtr_t* sec_mtr, - bool is_tree) +@param node row purge node +@param index secondary index +@param entry secondary index entry +@param mtr mini-transaction for looking up clustered index +@return whether the secondary index record can be purged */ +bool row_purge_poss_sec(purge_node_t *node, dict_index_t *index, + const dtuple_t *entry, mtr_t *mtr) { - bool can_delete; - mtr_t mtr; + ut_ad(!index->is_clust()); + const auto savepoint= mtr->get_savepoint(); + bool can_delete= !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, mtr); - ut_ad(!dict_index_is_clust(index)); + if (!can_delete) + { + ut_ad(node->pcur.pos_state == BTR_PCUR_IS_POSITIONED); + can_delete= !row_purge_is_unsafe(*node, index, entry, mtr); + node->pcur.pos_state = BTR_PCUR_WAS_POSITIONED; + node->pcur.latch_mode= BTR_NO_LATCHES; + } - 
mtr_start(&mtr); - - can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr) - || !row_vers_old_has_index_entry(true, - btr_pcur_get_rec(&node->pcur), - &mtr, index, entry, - node->roll_ptr, node->trx_id); - - /* Persistent cursor is closed if reposition fails. */ - if (node->found_clust) { - btr_pcur_commit_specify_mtr(&node->pcur, &mtr); - } else { - mtr.commit(); - } - - ut_ad(mtr.has_committed()); - - return can_delete; + mtr->rollback_to_savepoint(savepoint); + return can_delete; } -/*************************************************************** -Removes a secondary index entry if possible, by modifying the -index tree. Does not try to buffer the delete. -@return TRUE if success or if not found */ -static MY_ATTRIBUTE((nonnull, warn_unused_result)) -ibool -row_purge_remove_sec_if_poss_tree( -/*==============================*/ - purge_node_t* node, /*!< in: row purge node */ - dict_index_t* index, /*!< in: index */ - const dtuple_t* entry) /*!< in: index entry */ +__attribute__((nonnull, warn_unused_result)) +/** Remove a secondary index entry if possible, by modifying the index tree. +@param node purge node +@param index secondary index +@param entry index entry +@param page_max_trx_id the PAGE_MAX_TRX_ID + when row_purge_remove_sec_if_poss_leaf() was invoked +@return whether the operation succeeded */ +static bool row_purge_remove_sec_if_poss_tree(purge_node_t *node, + dict_index_t *index, + const dtuple_t *entry, + trx_id_t page_max_trx_id) { btr_pcur_t pcur; - ibool success = TRUE; + bool success = true; dberr_t err; mtr_t mtr; @@ -389,7 +795,9 @@ row_purge_remove_sec_if_poss_tree( we should do nothing. */ found: - if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, true)) { + if (page_max_trx_id + == page_get_max_trx_id(btr_cur_get_page(&pcur.btr_cur)) + || row_purge_poss_sec(node, index, entry, &mtr)) { /* Remove the index record, which should have been marked for deletion. 
*/ @@ -428,26 +836,23 @@ found: func_exit: btr_pcur_close(&pcur); // FIXME: need this? mtr.commit(); - - return(success); + return success; } -/*************************************************************** -Removes a secondary index entry without modifying the index tree, -if possible. -@retval true if success or if not found -@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */ -static MY_ATTRIBUTE((nonnull, warn_unused_result)) -bool -row_purge_remove_sec_if_poss_leaf( -/*==============================*/ - purge_node_t* node, /*!< in: row purge node */ - dict_index_t* index, /*!< in: index */ - const dtuple_t* entry) /*!< in: index entry */ +__attribute__((nonnull, warn_unused_result)) +/** Remove a secondary index entry if possible, without modifying the tree. +@param node purge node +@param index secondary index +@param entry index entry +@return PAGE_MAX_TRX_ID for row_purge_remove_sec_if_poss_tree() +@retval 0 if success or if not found */ +static trx_id_t row_purge_remove_sec_if_poss_leaf(purge_node_t *node, + dict_index_t *index, + const dtuple_t *entry) { mtr_t mtr; btr_pcur_t pcur; - bool success = true; + trx_id_t page_max_trx_id = 0; log_free_check(); ut_ad(index->table == node->table); @@ -478,7 +883,7 @@ row_purge_remove_sec_if_poss_leaf( found: /* Before attempting to purge a record, check if it is safe to do so. */ - if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, false)) { + if (row_purge_poss_sec(node, index, entry, &mtr)) { btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); /* Only delete-marked records should be purged. */ @@ -526,8 +931,11 @@ found: } } - success = btr_cur_optimistic_delete(btr_cur, 0, &mtr) - != DB_FAIL; + if (btr_cur_optimistic_delete(btr_cur, 0, &mtr) + == DB_FAIL) { + page_max_trx_id = page_get_max_trx_id( + btr_cur_get_page(btr_cur)); + } } /* (The index entry is still needed, @@ -539,15 +947,15 @@ found: /* The deletion was buffered. 
*/ case ROW_NOT_FOUND: /* The index entry does not exist, nothing to do. */ -func_exit: - mtr.commit(); -cleanup: - btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set? - return(success); + goto func_exit; } - ut_error; - return(false); + ut_ad("invalid state" == 0); +func_exit: + mtr.commit(); +cleanup: + btr_pcur_close(&pcur); // FIXME: remove? when is btr_cur->rtr_info set? + return page_max_trx_id; } /***********************************************************//** @@ -560,38 +968,21 @@ row_purge_remove_sec_if_poss( dict_index_t* index, /*!< in: index */ const dtuple_t* entry) /*!< in: index entry */ { - ibool success; - ulint n_tries = 0; + if (UNIV_UNLIKELY(!entry)) + /* The node->row must have lacked some fields of this index. This + is possible when the undo log record was written before this index + was created. */ + return; - /* fputs("Purge: Removing secondary record\n", stderr); */ - - if (!entry) { - /* The node->row must have lacked some fields of this - index. This is possible when the undo log record was - written before this index was created. */ - return; - } - - if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) { - - return; - } -retry: - success = row_purge_remove_sec_if_poss_tree(node, index, entry); - /* The delete operation may fail if we have little - file space left: TODO: easiest to crash the database - and restart with more file space */ - - if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { - - n_tries++; - - std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); - - goto retry; - } - - ut_a(success); + if (trx_id_t page_max_trx_id= + row_purge_remove_sec_if_poss_leaf(node, index, entry)) + for (auto n_tries= BTR_CUR_RETRY_DELETE_N_TIMES; + !row_purge_remove_sec_if_poss_tree(node, index, entry, + page_max_trx_id); + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME)) + /* The delete operation may fail if we have little + file space left (if innodb_file_per_table=0?) 
*/ + ut_a(--n_tries); } /***********************************************************//** diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index dd3a50a22b3..199fdcc0fe7 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -6614,7 +6614,7 @@ rec_loop: err= trx_undo_prev_version_build(clust_rec, clust_index, clust_offsets, vers_heap, &old_vers, - nullptr, nullptr, 0); + &mtr, 0, nullptr, nullptr); if (prev_heap) mem_heap_free(prev_heap); if (err != DB_SUCCESS) diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index 52f54443911..ed7a60c8792 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -470,6 +470,144 @@ func_exit: return(err); } +/** Find out if an accessible version of a clustered index record +corresponds to a secondary index entry. +@param rec record in a latched clustered index page +@param index secondary index +@param ientry secondary index entry +@param mtr mini-transaction +@return whether an accessible non-dete-marked version of rec +corresponds to ientry */ +static bool row_undo_mod_sec_is_unsafe(const rec_t *rec, dict_index_t *index, + const dtuple_t *ientry, mtr_t *mtr) +{ + const rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index; + rec_offs* clust_offsets; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + const dtuple_t* entry; + ulint comp; + dtuple_t* vrow = NULL; + mem_heap_t* v_heap = NULL; + dtuple_t* cur_vrow = NULL; + + clust_index = dict_table_get_first_index(index->table); + + comp = page_rec_is_comp(rec); + ut_ad(!dict_table_is_comp(index->table) == !comp); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + v_heap = mem_heap_create(100); + /* The current cluster index record could be + deleted, but the previous version of it might not. 
We will + need to get the virtual column data from undo record + associated with current cluster index */ + + cur_vrow = row_vers_build_cur_vrow( + rec, clust_index, &clust_offsets, + index, 0, 0, heap, v_heap, mtr); + } + + version = rec; + + for (;;) { + heap2 = heap; + heap = mem_heap_create(1024); + vrow = NULL; + + trx_undo_prev_version_build(version, + clust_index, clust_offsets, + heap, &prev_version, + mtr, TRX_UNDO_CHECK_PURGEABILITY, + nullptr, + dict_index_has_virtual(index) + ? &vrow : nullptr); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (!prev_version) { + break; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + if (vrow) { + if (dtuple_vcol_data_missing(*vrow, *index)) { + goto nochange_index; + } + /* Keep the virtual row info for the next + version, unless it is changed */ + mem_heap_empty(v_heap); + cur_vrow = dtuple_copy(vrow, v_heap); + dtuple_dup_v_fld(cur_vrow, v_heap); + } + + if (!cur_vrow) { + /* Nothing for this index has changed, + continue */ +nochange_index: + version = prev_version; + continue; + } + } + + if (!rec_get_deleted_flag(prev_version, comp)) { + row_ext_t* ext; + + /* The stack of versions is locked by mtr. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + ut_ad(cur_vrow); + ut_ad(row->n_v_fields == cur_vrow->n_v_fields); + dtuple_copy_v_fields(row, cur_vrow); + } + + entry = row_build_index_entry(row, ext, index, heap); + + /* If entry == NULL, the record contains unset + BLOB pointers. This must be a freshly + inserted record that we can safely ignore. + For the justification, see the comments after + the previous row_build_index_entry() call. 
*/ + + /* NOTE that we cannot do the comparison as binary + fields because maybe the secondary index record has + already been updated to a different binary value in + a char field, but the collation identifies the old + and new value anyway! */ + + if (entry && !dtuple_coll_cmp(ientry, entry)) { + break; + } + } + + version = prev_version; + } + + mem_heap_free(heap); + + if (v_heap) { + mem_heap_free(v_heap); + } + + return !!prev_version; +} + /***********************************************************//** Delete marks or removes a secondary index entry if found. @return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ @@ -488,7 +626,6 @@ row_undo_mod_del_mark_or_remove_sec_low( btr_cur_t* btr_cur; dberr_t err = DB_SUCCESS; mtr_t mtr; - mtr_t mtr_vers; const bool modify_leaf = mode == BTR_MODIFY_LEAF; row_mtr_start(&mtr, index, !modify_leaf); @@ -555,17 +692,14 @@ found: which cannot be purged yet, requires its existence. If some requires, we should delete mark the record. */ - mtr_vers.start(); - - ut_a(node->pcur.restore_position(BTR_SEARCH_LEAF, &mtr_vers) == - btr_pcur_t::SAME_ALL); + ut_a(node->pcur.restore_position(BTR_SEARCH_LEAF, &mtr) == + btr_pcur_t::SAME_ALL); /* For temporary table, we can skip to check older version of clustered index entry, because there is no MVCC or purge. 
*/ if (node->table->is_temporary() - || row_vers_old_has_index_entry( - false, btr_pcur_get_rec(&node->pcur), - &mtr_vers, index, entry, 0, 0)) { + || row_undo_mod_sec_is_unsafe( + btr_pcur_get_rec(&node->pcur), index, entry, &mtr)) { btr_rec_set_deleted(btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur), &mtr); } else { @@ -599,7 +733,9 @@ found: } } - btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + ut_ad(node->pcur.pos_state == BTR_PCUR_IS_POSITIONED); + node->pcur.pos_state = BTR_PCUR_WAS_POSITIONED; + node->pcur.latch_mode = BTR_NO_LATCHES; func_exit: btr_pcur_close(&pcur); diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index a39574d2f64..03118fb25f2 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -702,7 +702,7 @@ fetch; output: fetched length of the prefix @param[in,out] heap heap where to allocate @return BLOB prefix @retval NULL if the record is incomplete (should only happen -in row_vers_vc_matches_cluster() executed concurrently with another purge) */ +in row_purge_vc_matches_cluster() executed concurrently with another purge) */ static byte* row_upd_ext_fetch( diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc index c8913ec9623..6b1fe87630e 100644 --- a/storage/innobase/row/row0vers.cc +++ b/storage/innobase/row/row0vers.cc @@ -194,8 +194,8 @@ row_vers_impl_x_locked_low( trx_undo_prev_version_build( version, clust_index, clust_offsets, - heap, &prev_version, NULL, - dict_index_has_virtual(index) ? &vrow : NULL, 0); + heap, &prev_version, mtr, 0, NULL, + dict_index_has_virtual(index) ? &vrow : NULL); ut_d(trx->mutex_lock()); const bool committed = trx_state_eq( @@ -446,7 +446,6 @@ row_vers_impl_x_locked( @param[in] clust_index clustered index @param[in] index the secondary index @param[in] heap heap used to build virtual dtuple. 
*/ -static bool row_vers_build_clust_v_col( dtuple_t* row, @@ -490,26 +489,25 @@ row_vers_build_clust_v_col( } /** Build latest virtual column data from undo log -@param[in] in_purge whether this is the purge thread @param[in] rec clustered index record @param[in] clust_index clustered index @param[in,out] clust_offsets offsets on the clustered index record @param[in] index the secondary index +@param[in] trx_id transaction ID on the purging record, + or 0 if called outside purge @param[in] roll_ptr the rollback pointer for the purging record -@param[in] trx_id trx id for the purging record @param[in,out] v_heap heap used to build vrow @param[out] v_row dtuple holding the virtual rows @param[in,out] mtr mtr holding the latch on rec */ static void row_vers_build_cur_vrow_low( - bool in_purge, const rec_t* rec, dict_index_t* clust_index, rec_offs* clust_offsets, dict_index_t* index, - roll_ptr_t roll_ptr, trx_id_t trx_id, + roll_ptr_t roll_ptr, mem_heap_t* v_heap, dtuple_t** vrow, mtr_t* mtr) @@ -539,7 +537,7 @@ row_vers_build_cur_vrow_low( /* If this is called by purge thread, set TRX_UNDO_PREV_IN_PURGE bit to search the undo log until we hit the current undo log with roll_ptr */ - const ulint status = in_purge + const ulint status = trx_id ? 
TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE : TRX_UNDO_GET_OLD_V_VALUE; @@ -551,7 +549,7 @@ row_vers_build_cur_vrow_low( trx_undo_prev_version_build( version, clust_index, clust_offsets, - heap, &prev_version, NULL, vrow, status); + heap, &prev_version, mtr, status, nullptr, vrow); if (heap2) { mem_heap_free(heap2); @@ -603,212 +601,27 @@ row_vers_build_cur_vrow_low( mem_heap_free(heap); } -/** Check a virtual column value index secondary virtual index matches -that of current cluster index record, which is recreated from information -stored in undo log -@param[in] rec record in the clustered index -@param[in] icentry the index entry built from a cluster row -@param[in] clust_index cluster index -@param[in] clust_offsets offsets on the cluster record -@param[in] index the secondary index -@param[in] ientry the secondary index entry -@param[in] roll_ptr the rollback pointer for the purging record -@param[in] trx_id trx id for the purging record -@param[in,out] v_heap heap used to build virtual dtuple -@param[in,out] v_row dtuple holding the virtual rows (if needed) -@param[in] mtr mtr holding the latch on rec -@return true if matches, false otherwise */ -static -bool -row_vers_vc_matches_cluster( - const rec_t* rec, - const dtuple_t* icentry, - dict_index_t* clust_index, - rec_offs* clust_offsets, - dict_index_t* index, - const dtuple_t* ientry, - roll_ptr_t roll_ptr, - trx_id_t trx_id, - mem_heap_t* v_heap, - dtuple_t** vrow, - mtr_t* mtr) -{ - const rec_t* version; - rec_t* prev_version; - mem_heap_t* heap2; - mem_heap_t* heap = NULL; - mem_heap_t* tuple_heap; - ulint num_v = dict_table_get_n_v_cols(index->table); - bool compare[REC_MAX_N_FIELDS]; - ulint n_fields = dtuple_get_n_fields(ientry); - ulint n_non_v_col = 0; - ulint n_cmp_v_col = 0; - const dfield_t* field1; - dfield_t* field2; - ulint i; - - /* First compare non-virtual columns (primary keys) */ - ut_ad(index->n_fields == n_fields); - ut_ad(n_fields == dtuple_get_n_fields(icentry)); - 
ut_ad(mtr->memo_contains_page_flagged(rec, - MTR_MEMO_PAGE_S_FIX - | MTR_MEMO_PAGE_X_FIX)); - - { - const dfield_t* a = ientry->fields; - const dfield_t* b = icentry->fields; - - for (const dict_field_t *ifield = index->fields, - *const end = &index->fields[index->n_fields]; - ifield != end; ifield++, a++, b++) { - if (!ifield->col->is_virtual()) { - if (cmp_dfield_dfield(a, b)) { - return false; - } - n_non_v_col++; - } - } - } - - tuple_heap = mem_heap_create(1024); - - ut_ad(n_fields > n_non_v_col); - - *vrow = dtuple_create_with_vcol(v_heap ? v_heap : tuple_heap, 0, num_v); - dtuple_init_v_fld(*vrow); - - for (i = 0; i < num_v; i++) { - dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype - = DATA_MISSING; - compare[i] = false; - } - - version = rec; - - while (n_cmp_v_col < n_fields - n_non_v_col) { - heap2 = heap; - heap = mem_heap_create(1024); - roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr( - version, clust_index, clust_offsets); - - ut_ad(cur_roll_ptr != 0); - ut_ad(roll_ptr != 0); - - trx_undo_prev_version_build( - version, clust_index, clust_offsets, - heap, &prev_version, NULL, vrow, - TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE); - - if (heap2) { - mem_heap_free(heap2); - } - - if (!prev_version) { - /* Versions end here */ - goto func_exit; - } - - clust_offsets = rec_get_offsets(prev_version, clust_index, - NULL, - clust_index->n_core_fields, - ULINT_UNDEFINED, &heap); - - ulint entry_len = dict_index_get_n_fields(index); - - for (i = 0; i < entry_len; i++) { - const dict_field_t* ind_field - = dict_index_get_nth_field(index, i); - const dict_col_t* col = ind_field->col; - field1 = dtuple_get_nth_field(ientry, i); - - if (!col->is_virtual()) { - continue; - } - - const dict_v_col_t* v_col - = reinterpret_cast(col); - field2 - = dtuple_get_nth_v_field(*vrow, v_col->v_pos); - - if ((dfield_get_type(field2)->mtype != DATA_MISSING) - && (!compare[v_col->v_pos])) { - - if (ind_field->prefix_len != 0 - && !dfield_is_null(field2)) { - 
field2->len = unsigned( - dtype_get_at_most_n_mbchars( - field2->type.prtype, - field2->type.mbminlen, - field2->type.mbmaxlen, - ind_field->prefix_len, - field2->len, - static_cast - (field2->data))); - } - - /* The index field mismatch */ - if (v_heap - || cmp_dfield_dfield(field2, field1) != 0) { - if (v_heap) { - dtuple_dup_v_fld(*vrow, v_heap); - } - - mem_heap_free(tuple_heap); - mem_heap_free(heap); - return(false); - } - - compare[v_col->v_pos] = true; - n_cmp_v_col++; - } - } - - trx_id_t rec_trx_id = row_get_rec_trx_id( - prev_version, clust_index, clust_offsets); - - if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) { - break; - } - - version = prev_version; - } - -func_exit: - if (n_cmp_v_col == 0) { - *vrow = NULL; - } - - mem_heap_free(tuple_heap); - mem_heap_free(heap); - - /* FIXME: In the case of n_cmp_v_col is not the same as - n_fields - n_non_v_col, callback is needed to compare the rest - columns. At the timebeing, we will need to return true */ - return (true); -} - /** Build a dtuple contains virtual column data for current cluster index @param[in] in_purge called by purge thread @param[in] rec cluster index rec @param[in] clust_index cluster index @param[in] clust_offsets cluster rec offset @param[in] index secondary index +@param[in] trx_id transaction ID on the purging record, + or 0 if called outside purge @param[in] roll_ptr roll_ptr for the purge record -@param[in] trx_id transaction ID on the purging record @param[in,out] heap heap memory -@param[in,out] v_heap heap memory to keep virtual colum dtuple -@param[in] mtr mtr holding the latch on rec +@param[in,out] v_heap heap memory to keep virtual column tuple +@param[in,out] mtr mini-transaction @return dtuple contains virtual column data */ -static dtuple_t* row_vers_build_cur_vrow( - bool in_purge, const rec_t* rec, dict_index_t* clust_index, rec_offs** clust_offsets, dict_index_t* index, - roll_ptr_t roll_ptr, trx_id_t trx_id, + roll_ptr_t roll_ptr, mem_heap_t* heap, mem_heap_t* 
v_heap, mtr_t* mtr) @@ -841,8 +654,8 @@ row_vers_build_cur_vrow( } else { /* Try to fetch virtual column data from undo log */ row_vers_build_cur_vrow_low( - in_purge, rec, clust_index, *clust_offsets, - index, roll_ptr, trx_id, v_heap, &cur_vrow, mtr); + rec, clust_index, *clust_offsets, + index, trx_id, roll_ptr, v_heap, &cur_vrow, mtr); } *clust_offsets = rec_get_offsets(rec, clust_index, NULL, @@ -856,292 +669,23 @@ for indexed virtual column. @param tuple data tuple @param index virtual index @return true if tuple has missing column type */ -static bool dtuple_vcol_data_missing(const dtuple_t &tuple, - dict_index_t *index) +bool dtuple_vcol_data_missing(const dtuple_t &tuple, + const dict_index_t &index) { - for (ulint i= 0; i < index->n_uniq; i++) + for (ulint i= 0; i < index.n_uniq; i++) { - dict_col_t *col= index->fields[i].col; + dict_col_t *col= index.fields[i].col; if (!col->is_virtual()) continue; dict_v_col_t *vcol= reinterpret_cast(col); - for (ulint j= 0; j < index->table->n_v_cols; j++) - { - if (vcol == &index->table->v_cols[j] - && tuple.v_fields[j].type.mtype == DATA_MISSING) + for (ulint j= 0; j < index.table->n_v_cols; j++) + if (vcol == &index.table->v_cols[j] && + tuple.v_fields[j].type.mtype == DATA_MISSING) return true; - } } return false; } -/** Finds out if a version of the record, where the version >= the current -purge_sys.view, should have ientry as its secondary index entry. We check -if there is any not delete marked version of the record where the trx -id >= purge view, and the secondary index entry == ientry; exactly in -this case we return TRUE. 
-@param[in] also_curr TRUE if also rec is included in the versions - to search; otherwise only versions prior - to it are searched -@param[in] rec record in the clustered index; the caller - must have a latch on the page -@param[in] mtr mtr holding the latch on rec; it will - also hold the latch on purge_view -@param[in] index secondary index -@param[in] ientry secondary index entry -@param[in] roll_ptr roll_ptr for the purge record -@param[in] trx_id transaction ID on the purging record -@return TRUE if earlier version should have */ -bool -row_vers_old_has_index_entry( - bool also_curr, - const rec_t* rec, - mtr_t* mtr, - dict_index_t* index, - const dtuple_t* ientry, - roll_ptr_t roll_ptr, - trx_id_t trx_id) -{ - const rec_t* version; - rec_t* prev_version; - dict_index_t* clust_index; - rec_offs* clust_offsets; - mem_heap_t* heap; - mem_heap_t* heap2; - dtuple_t* row; - const dtuple_t* entry; - ulint comp; - dtuple_t* vrow = NULL; - mem_heap_t* v_heap = NULL; - dtuple_t* cur_vrow = NULL; - - ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX - | MTR_MEMO_PAGE_S_FIX)); - clust_index = dict_table_get_first_index(index->table); - - comp = page_rec_is_comp(rec); - ut_ad(!dict_table_is_comp(index->table) == !comp); - heap = mem_heap_create(1024); - clust_offsets = rec_get_offsets(rec, clust_index, NULL, - clust_index->n_core_fields, - ULINT_UNDEFINED, &heap); - - if (dict_index_has_virtual(index)) { - v_heap = mem_heap_create(100); - } - - DBUG_EXECUTE_IF("ib_purge_virtual_index_crash", - DBUG_SUICIDE();); - - if (also_curr && !rec_get_deleted_flag(rec, comp)) { - row_ext_t* ext; - - /* The top of the stack of versions is locked by the - mtr holding a latch on the page containing the - clustered index record. The bottom of the stack is - locked by the fact that the purge_sys.view must - 'overtake' any read view of an active transaction. - Thus, it is safe to fetch the prefixes for - externally stored columns. 
*/ - row = row_build(ROW_COPY_POINTERS, clust_index, - rec, clust_offsets, - NULL, NULL, NULL, &ext, heap); - - if (dict_index_has_virtual(index)) { - - -#ifdef DBUG_OFF -# define dbug_v_purge false -#else /* DBUG_OFF */ - bool dbug_v_purge = false; -#endif /* DBUG_OFF */ - - DBUG_EXECUTE_IF( - "ib_purge_virtual_index_callback", - dbug_v_purge = true;); - - roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr( - rec, clust_index, clust_offsets); - - /* if the row is newly inserted, then the virtual - columns need to be computed */ - if (trx_undo_roll_ptr_is_insert(t_roll_ptr) - || dbug_v_purge) { - - if (!row_vers_build_clust_v_col( - row, clust_index, index, heap)) { - goto unsafe_to_purge; - } - - entry = row_build_index_entry( - row, ext, index, heap); - if (entry && !dtuple_coll_cmp(ientry, entry)) { - goto unsafe_to_purge; - } - } else { - /* Build index entry out of row */ - entry = row_build_index_entry(row, ext, index, heap); - /* entry could only be NULL if - the clustered index record is an uncommitted - inserted record whose BLOBs have not been - written yet. The secondary index record - can be safely removed, because it cannot - possibly refer to this incomplete - clustered index record. (Insert would - always first be completed for the - clustered index record, then proceed to - secondary indexes.) */ - - if (entry && row_vers_vc_matches_cluster( - rec, entry, - clust_index, clust_offsets, - index, ientry, roll_ptr, - trx_id, NULL, &vrow, mtr)) { - goto unsafe_to_purge; - } - } - clust_offsets = rec_get_offsets(rec, clust_index, NULL, - clust_index - ->n_core_fields, - ULINT_UNDEFINED, &heap); - } else { - - entry = row_build_index_entry( - row, ext, index, heap); - - /* If entry == NULL, the record contains unset BLOB - pointers. This must be a freshly inserted record. If - this is called from - row_purge_remove_sec_if_poss_low(), the thread will - hold latches on the clustered index and the secondary - index. 
Because the insert works in three steps: - - (1) insert the record to clustered index - (2) store the BLOBs and update BLOB pointers - (3) insert records to secondary indexes - - the purge thread can safely ignore freshly inserted - records and delete the secondary index record. The - thread that inserted the new record will be inserting - the secondary index records. */ - - /* NOTE that we cannot do the comparison as binary - fields because the row is maybe being modified so that - the clustered index record has already been updated to - a different binary value in a char field, but the - collation identifies the old and new value anyway! */ - if (entry && !dtuple_coll_cmp(ientry, entry)) { -unsafe_to_purge: - mem_heap_free(heap); - - if (v_heap) { - mem_heap_free(v_heap); - } - return true; - } - } - } else if (dict_index_has_virtual(index)) { - /* The current cluster index record could be - deleted, but the previous version of it might not. We will - need to get the virtual column data from undo record - associated with current cluster index */ - - cur_vrow = row_vers_build_cur_vrow( - also_curr, rec, clust_index, &clust_offsets, - index, roll_ptr, trx_id, heap, v_heap, mtr); - } - - version = rec; - - for (;;) { - heap2 = heap; - heap = mem_heap_create(1024); - vrow = NULL; - - trx_undo_prev_version_build(version, - clust_index, clust_offsets, - heap, &prev_version, nullptr, - dict_index_has_virtual(index) - ? 
&vrow : nullptr, - TRX_UNDO_CHECK_PURGEABILITY); - mem_heap_free(heap2); /* free version and clust_offsets */ - - if (!prev_version) { - /* Versions end here */ - mem_heap_free(heap); - - if (v_heap) { - mem_heap_free(v_heap); - } - - return false; - } - - clust_offsets = rec_get_offsets(prev_version, clust_index, - NULL, - clust_index->n_core_fields, - ULINT_UNDEFINED, &heap); - - if (dict_index_has_virtual(index)) { - if (vrow) { - if (dtuple_vcol_data_missing(*vrow, index)) { - goto nochange_index; - } - /* Keep the virtual row info for the next - version, unless it is changed */ - mem_heap_empty(v_heap); - cur_vrow = dtuple_copy(vrow, v_heap); - dtuple_dup_v_fld(cur_vrow, v_heap); - } - - if (!cur_vrow) { - /* Nothing for this index has changed, - continue */ -nochange_index: - version = prev_version; - continue; - } - } - - if (!rec_get_deleted_flag(prev_version, comp)) { - row_ext_t* ext; - - /* The stack of versions is locked by mtr. - Thus, it is safe to fetch the prefixes for - externally stored columns. */ - row = row_build(ROW_COPY_POINTERS, clust_index, - prev_version, clust_offsets, - NULL, NULL, NULL, &ext, heap); - - if (dict_index_has_virtual(index)) { - ut_ad(cur_vrow); - ut_ad(row->n_v_fields == cur_vrow->n_v_fields); - dtuple_copy_v_fields(row, cur_vrow); - } - - entry = row_build_index_entry(row, ext, index, heap); - - /* If entry == NULL, the record contains unset - BLOB pointers. This must be a freshly - inserted record that we can safely ignore. - For the justification, see the comments after - the previous row_build_index_entry() call. */ - - /* NOTE that we cannot do the comparison as binary - fields because maybe the secondary index record has - already been updated to a different binary value in - a char field, but the collation identifies the old - and new value anyway! 
*/ - - if (entry && !dtuple_coll_cmp(ientry, entry)) { - goto unsafe_to_purge; - } - } - - version = prev_version; - } -} - /*****************************************************************//** Constructs the version of a clustered index record which a consistent read should see. We assume that the trx id stored in rec is such that @@ -1208,7 +752,7 @@ row_vers_build_for_consistent_read( err = trx_undo_prev_version_build( version, index, *offsets, heap, - &prev_version, NULL, vrow, 0); + &prev_version, mtr, 0, NULL, vrow); if (prev_heap != NULL) { mem_heap_free(prev_heap); @@ -1370,8 +914,8 @@ committed_version_trx: heap = mem_heap_create(1024); if (trx_undo_prev_version_build(version, index, *offsets, heap, - &prev_version, in_heap, vrow, - 0) != DB_SUCCESS) { + &prev_version, mtr, 0, + in_heap, vrow) != DB_SUCCESS) { mem_heap_free(heap); heap = heap2; heap2 = NULL; diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index c7be15bad20..ad5c0f36f50 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -776,26 +776,18 @@ not_free: buf_block_t *purge_sys_t::get_page(page_id_t id) { + ut_ad(!recv_sys.recovery_on); + buf_block_t*& undo_page= pages[id]; - if (undo_page) - return undo_page; - - mtr_t mtr; - mtr.start(); - undo_page= - buf_page_get_gen(id, 0, RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, &mtr); - - if (UNIV_LIKELY(undo_page != nullptr)) + if (!undo_page) { - undo_page->fix(); - mtr.commit(); - return undo_page; + undo_page= buf_pool.page_fix(id); // batch_cleanup() will unfix() + if (!undo_page) + pages.erase(id); } - mtr.commit(); - pages.erase(id); - return nullptr; + return undo_page; } bool purge_sys_t::rseg_get_next_history_log() diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index e05c018c734..2ff6135227b 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -2038,170 +2038,128 @@ err_exit: /*============== BUILDING 
PREVIOUS VERSION OF A RECORD ===============*/ -/** Copy an undo record to heap. -@param[in] roll_ptr roll pointer to a record that exists -@param[in,out] heap memory heap where copied */ -static -trx_undo_rec_t* -trx_undo_get_undo_rec_low( - roll_ptr_t roll_ptr, - mem_heap_t* heap) +static dberr_t trx_undo_prev_version(const rec_t *rec, dict_index_t *index, + rec_offs *offsets, mem_heap_t *heap, + rec_t **old_vers, mem_heap_t *v_heap, + dtuple_t **vrow, ulint v_status, + const trx_undo_rec_t *undo_rec); + +inline const buf_block_t * +purge_sys_t::view_guard::get(const page_id_t id, mtr_t *mtr) { - ulint rseg_id; - uint32_t page_no; - uint16_t offset; - bool is_insert; - mtr_t mtr; - - trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, &offset); - ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO); - ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); - trx_rseg_t *rseg= &trx_sys.rseg_array[rseg_id]; - ut_ad(rseg->is_persistent()); - - mtr.start(); - - trx_undo_rec_t *undo_rec= nullptr; - if (buf_block_t* undo_page= - buf_page_get(page_id_t(rseg->space->id, page_no), 0, RW_S_LATCH, &mtr)) + buf_block_t *block; + ut_ad(mtr->is_active()); + if (!latch) { - buf_page_make_young_if_needed(&undo_page->page); - undo_rec= undo_page->page.frame + offset; - const size_t end= mach_read_from_2(undo_rec); - if (UNIV_UNLIKELY(end <= offset || - end >= srv_page_size - FIL_PAGE_DATA_END)) - undo_rec= nullptr; - else + decltype(purge_sys.pages)::const_iterator i= purge_sys.pages.find(id); + if (i != purge_sys.pages.end()) { - size_t len{end - offset}; - undo_rec= - static_cast(mem_heap_dup(heap, undo_rec, len)); - mach_write_to_2(undo_rec, len); + block= i->second; + ut_ad(block); + return block; } } - - mtr.commit(); - return undo_rec; -} - -/** Copy an undo record to heap, to check if a secondary index record -can be safely purged. 
-@param trx_id DB_TRX_ID corresponding to roll_ptr -@param name table name -@param roll_ptr DB_ROLL_PTR pointing to the undo log record -@param heap memory heap for allocation -@return copy of the record -@retval nullptr if the version is visible to purge_sys.view */ -static trx_undo_rec_t *trx_undo_get_rec_if_purgeable(trx_id_t trx_id, - const table_name_t &name, - roll_ptr_t roll_ptr, - mem_heap_t* heap) -{ + block= buf_pool.page_fix(id); + if (block) { - purge_sys_t::view_guard check; - if (!check.view().changes_visible(trx_id)) - return trx_undo_get_undo_rec_low(roll_ptr, heap); + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + if (latch) + /* In MVCC operations (outside purge tasks), we will refresh the + buf_pool.LRU position. In purge, we expect the page to be freed + soon, at the end of the current batch. */ + buf_page_make_young_if_needed(&block->page); } - return nullptr; -} - -/** Copy an undo record to heap. -@param trx_id DB_TRX_ID corresponding to roll_ptr -@param name table name -@param roll_ptr DB_ROLL_PTR pointing to the undo log record -@param heap memory heap for allocation -@return copy of the record -@retval nullptr if the undo log is not available */ -static trx_undo_rec_t *trx_undo_get_undo_rec(trx_id_t trx_id, - const table_name_t &name, - roll_ptr_t roll_ptr, - mem_heap_t *heap) -{ - { - purge_sys_t::end_view_guard check; - if (!check.view().changes_visible(trx_id)) - return trx_undo_get_undo_rec_low(roll_ptr, heap); - } - return nullptr; + return block; } /** Build a previous version of a clustered index record. The caller must hold a latch on the index page of the clustered index record. 
-@param rec version of a clustered index record -@param index clustered index -@param offsets rec_get_offsets(rec, index) -@param heap memory heap from which the memory needed is - allocated -@param old_vers previous version or NULL if rec is the - first inserted version, or if history data - has been deleted (an error), or if the purge - could have removed the version - though it has not yet done so -@param v_heap memory heap used to create vrow - dtuple if it is not yet created. This heap - diffs from "heap" above in that it could be - prebuilt->old_vers_heap for selection -@param v_row virtual column info, if any -@param v_status status determine if it is going into this - function by purge thread or not. - And if we read "after image" of undo log -@param undo_block undo log block which was cached during - online dml apply or nullptr +@param rec version of a clustered index record +@param index clustered index +@param offsets rec_get_offsets(rec, index) +@param heap memory heap from which the memory needed is allocated +@param old_vers previous version, or NULL if rec is the first inserted + version, or if history data has been deleted (an error), + or if the purge could have removed the version though + it has not yet done so +@param mtr mini-transaction +@param v_status TRX_UNDO_PREV_IN_PURGE, ... +@param v_heap memory heap used to create vrow dtuple if it is not yet + created. 
This heap diffs from "heap" above in that it could be + prebuilt->old_vers_heap for selection +@param vrow virtual column info, if any @return error code @retval DB_SUCCESS if previous version was successfully built, or if it was an insert or the undo record refers to the table before rebuild @retval DB_MISSING_HISTORY if the history is missing */ TRANSACTIONAL_TARGET -dberr_t -trx_undo_prev_version_build( - const rec_t *rec, - dict_index_t *index, - rec_offs *offsets, - mem_heap_t *heap, - rec_t **old_vers, - mem_heap_t *v_heap, - dtuple_t **vrow, - ulint v_status) +dberr_t trx_undo_prev_version_build(const rec_t *rec, dict_index_t *index, + rec_offs *offsets, mem_heap_t *heap, + rec_t **old_vers, mtr_t *mtr, + ulint v_status, + mem_heap_t *v_heap, dtuple_t **vrow) { - dtuple_t* entry; - trx_id_t rec_trx_id; - undo_no_t undo_no; - table_id_t table_id; - trx_id_t trx_id; - roll_ptr_t roll_ptr; - upd_t* update; - byte type; - byte info_bits; - byte cmpl_info; - bool dummy_extern; - byte* buf; + ut_ad(!index->table->is_temporary()); + ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(!index->table->is_temporary()); - ut_ad(rec_offs_validate(rec, index, offsets)); + const roll_ptr_t roll_ptr= row_get_rec_roll_ptr(rec, index, offsets); + *old_vers= nullptr; - roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); + if (trx_undo_roll_ptr_is_insert(roll_ptr)) + /* The record rec is the first inserted version */ + return DB_SUCCESS; - *old_vers = NULL; + ut_ad(roll_ptr < 1ULL << 55); + ut_ad(uint16_t(roll_ptr) >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + ut_ad(uint32_t(roll_ptr >> 16) >= FSP_FIRST_INODE_PAGE_NO); - if (trx_undo_roll_ptr_is_insert(roll_ptr)) { - /* The record rec is the first inserted version */ - return DB_SUCCESS; - } + const trx_id_t rec_trx_id= row_get_rec_trx_id(rec, index, offsets); - mariadb_increment_undo_records_read(); - rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + ut_ad(!index->table->skip_alter_undo); - 
ut_ad(!index->table->skip_alter_undo); + mariadb_increment_undo_records_read(); + const auto savepoint= mtr->get_savepoint(); + dberr_t err= DB_MISSING_HISTORY; + purge_sys_t::view_guard check{v_status == TRX_UNDO_CHECK_PURGE_PAGES + ? purge_sys_t::view_guard::PURGE + : v_status == TRX_UNDO_CHECK_PURGEABILITY + ? purge_sys_t::view_guard::VIEW + : purge_sys_t::view_guard::END_VIEW}; + if (!check.view().changes_visible(rec_trx_id)) + { + trx_undo_rec_t *undo_rec= nullptr; + static_assert(ROLL_PTR_RSEG_ID_POS == 48, ""); + static_assert(ROLL_PTR_PAGE_POS == 16, ""); + if (const buf_block_t *undo_page= + check.get(page_id_t{trx_sys.rseg_array[(roll_ptr >> 48) & 0x7f]. + space->id, + uint32_t(roll_ptr >> 16)}, mtr)) + { + static_assert(ROLL_PTR_BYTE_POS == 0, ""); + const uint16_t offset{uint16_t(roll_ptr)}; + undo_rec= undo_page->page.frame + offset; + const size_t end= mach_read_from_2(undo_rec); + if (UNIV_UNLIKELY(end > offset && + end < srv_page_size - FIL_PAGE_DATA_END)) + err= trx_undo_prev_version(rec, index, offsets, heap, + old_vers, v_heap, vrow, v_status, undo_rec); + } + } - trx_undo_rec_t* undo_rec = v_status == TRX_UNDO_CHECK_PURGEABILITY - ? 
trx_undo_get_rec_if_purgeable(rec_trx_id, index->table->name, - roll_ptr, heap) - : trx_undo_get_undo_rec(rec_trx_id, index->table->name, - roll_ptr, heap); - if (!undo_rec) { - return DB_MISSING_HISTORY; - } + mtr->rollback_to_savepoint(savepoint); + return err; +} +static dberr_t trx_undo_prev_version(const rec_t *rec, dict_index_t *index, + rec_offs *offsets, mem_heap_t *heap, + rec_t **old_vers, mem_heap_t *v_heap, + dtuple_t **vrow, ulint v_status, + const trx_undo_rec_t *undo_rec) +{ + byte type, cmpl_info; + bool dummy_extern; + undo_no_t undo_no; + table_id_t table_id; const byte *ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, &dummy_extern, &undo_no, &table_id); @@ -2213,6 +2171,10 @@ trx_undo_prev_version_build( return DB_SUCCESS; } + trx_id_t trx_id; + roll_ptr_t roll_ptr; + byte info_bits; + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, &info_bits); @@ -2240,10 +2202,12 @@ trx_undo_prev_version_build( ptr = trx_undo_rec_skip_row_ref(ptr, index); + upd_t* update; ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, roll_ptr, info_bits, heap, &update); ut_a(ptr); + byte* buf; if (row_upd_changes_field_size_or_external(index, offsets, update)) { /* We should confirm the existence of disowned external data, @@ -2269,9 +2233,10 @@ trx_undo_prev_version_build( those fields that update updates to become externally stored fields. Store the info: */ - entry = row_rec_to_index_entry(rec, index, offsets, heap); + dtuple_t* entry = row_rec_to_index_entry(rec, index, offsets, + heap); /* The page containing the clustered index record - corresponding to entry is latched in mtr. Thus the + corresponding to entry is latched. Thus the following call is safe. 
*/ if (!row_upd_index_replace_new_col_vals(entry, *index, update, heap)) { From 76f6b6d818dda85b01a4cb0a62e45a341329dc5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 26 Aug 2024 12:23:17 +0300 Subject: [PATCH 10/22] MDEV-34515: Reduce context switching in purge Before this patch, the InnoDB purge coordinator task submitted innodb_purge_threads-1 tasks even if there was not sufficient amount of work for all of them. For example, if there are undo log records only for 1 table, only 1 task can be employed, and that task had better be the purge coordinator. srv_purge_worker_task_low(): Split from purge_worker_callback(). trx_purge_attach_undo_recs(): Remove the parameter n_purge_threads, and add the parameter n_work_items, to keep track of the amount of work. trx_purge(): Launch purge worker tasks only if necessary. The work of one thread will be executed by this purge coordinator thread. que_fork_scheduler_round_robin(): Merged to trx_purge(). Thanks to Vladislav Vaintroub for supplying a prototype of this. Reviewed by: Debarun Banerjee --- storage/innobase/include/que0que.h | 11 - storage/innobase/include/srv0srv.h | 9 + storage/innobase/que/que0que.cc | 34 --- storage/innobase/srv/srv0srv.cc | 11 +- storage/innobase/trx/trx0purge.cc | 320 +++++++++++++++-------------- 5 files changed, 180 insertions(+), 205 deletions(-) diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h index c60f390a092..6485e21e7fc 100644 --- a/storage/innobase/include/que0que.h +++ b/storage/innobase/include/que0que.h @@ -209,17 +209,6 @@ que_eval_sql( const char* sql, /*!< in: SQL string */ trx_t* trx); /*!< in: trx */ -/**********************************************************************//** -Round robin scheduler. 
-@return a query thread of the graph moved to QUE_THR_RUNNING state, or -NULL; the query thread should be executed by que_run_threads by the -caller */ -que_thr_t* -que_fork_scheduler_round_robin( -/*===========================*/ - que_fork_t* fork, /*!< in: a query fork */ - que_thr_t* thr); /*!< in: current pos */ - /** Query thread states */ enum que_thr_state_t { /** in selects this means that the thread is at the end of its diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 4c3c17a1a16..59cb3bf525b 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -624,6 +624,15 @@ Complete the shutdown tasks such as background DROP TABLE, and optionally change buffer merge (on innodb_fast_shutdown=0). */ void srv_shutdown(bool ibuf_merge); +/** + Fetches and executes tasks from the purge work queue, + until this queue is empty. + This is main part of purge worker task, but also + executed in coordinator. + @note needs current_thd to be set beforehand. +*/ +void srv_purge_worker_task_low(); + } /* extern "C" */ #ifdef UNIV_DEBUG diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc index d910ee2a881..5e1e0686c97 100644 --- a/storage/innobase/que/que0que.cc +++ b/storage/innobase/que/que0que.cc @@ -166,40 +166,6 @@ que_thr_init_command( thr->state = QUE_THR_RUNNING; } -/**********************************************************************//** -Round robin scheduler. -@return a query thread of the graph moved to QUE_THR_RUNNING state, or -NULL; the query thread should be executed by que_run_threads by the -caller */ -que_thr_t* -que_fork_scheduler_round_robin( -/*===========================*/ - que_fork_t* fork, /*!< in: a query fork */ - que_thr_t* thr) /*!< in: current pos */ -{ - fork->trx->mutex_lock(); - - /* If no current, start first available. 
*/ - if (thr == NULL) { - thr = UT_LIST_GET_FIRST(fork->thrs); - } else { - thr = UT_LIST_GET_NEXT(thrs, thr); - } - - if (thr) { - - fork->state = QUE_FORK_ACTIVE; - - fork->last_sel_node = NULL; - ut_ad(thr->state == QUE_THR_COMPLETED); - que_thr_init_command(thr); - } - - fork->trx->mutex_unlock(); - - return(thr); -} - /**********************************************************************//** Starts execution of a command in a query fork. Picks a query thread which is not in the QUE_THR_RUNNING state and moves it to that state. If none diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 8921f17ef6a..1cea2d9ffe8 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1556,7 +1556,6 @@ static bool srv_purge_should_exit(size_t old_history_size) /*********************************************************************//** Fetch and execute a task from the work queue. -@param [in,out] slot purge worker thread slot @return true if a task was executed */ static bool srv_task_execute() { @@ -1696,6 +1695,13 @@ static void release_thd(THD *thd, void *ctx) set_current_thd(0); } +void srv_purge_worker_task_low() +{ + ut_ad(current_thd); + while (srv_task_execute()) + ut_ad(purge_sys.running()); +} + static void purge_worker_callback(void*) { ut_ad(!current_thd); @@ -1703,8 +1709,7 @@ static void purge_worker_callback(void*) ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); void *ctx; THD *thd= acquire_thd(&ctx); - while (srv_task_execute()) - ut_ad(purge_sys.running()); + srv_purge_worker_task_low(); release_thd(thd,ctx); } diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index ad5c0f36f50..d9550a19cc6 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -1199,123 +1199,108 @@ dict_table_t *purge_sys_t::close_and_reopen(table_id_t id, THD *thd, /** Run a purge batch. 
@param n_purge_threads number of purge threads +@param thd purge coordinator thread handle +@param n_work_items number of work items (currently tables) to process @return new purge_sys.head */ -static purge_sys_t::iterator -trx_purge_attach_undo_recs(ulint n_purge_threads, THD *thd) +static purge_sys_t::iterator trx_purge_attach_undo_recs(THD *thd, + ulint *n_work_items) { - que_thr_t* thr; - ulint i; + que_thr_t *thr; + purge_sys_t::iterator head= purge_sys.tail; - ut_a(n_purge_threads > 0); - ut_a(UT_LIST_GET_LEN(purge_sys.query->thrs) >= n_purge_threads); + /* Fetch and parse the UNDO records. The UNDO records are added + to a per purge node vector. */ + thr= nullptr; - purge_sys_t::iterator head = purge_sys.tail; + std::unordered_map + table_id_map(TRX_PURGE_TABLE_BUCKETS); + purge_sys.m_active= true; + + MDL_context *const mdl_context= + static_cast(thd_mdl_context(thd)); + ut_ad(mdl_context); + + const size_t max_pages= + std::min(buf_pool.curr_size * 3 / 4, size_t{srv_purge_batch_size}); + + while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) + { + /* Track the max {trx_id, undo_no} for truncating the + UNDO logs once we have purged the records. */ + + if (head <= purge_sys.tail) + head= purge_sys.tail; + + /* Fetch the next record, and advance the purge_sys.tail. 
*/ + trx_purge_rec_t purge_rec= purge_sys.fetch_next_rec(); + + if (!purge_rec.undo_rec) + { + if (!purge_rec.roll_ptr) + break; + ut_ad(purge_rec.roll_ptr == 1); + continue; + } + + table_id_t table_id= trx_undo_rec_get_table_id(purge_rec.undo_rec); + + purge_node_t *&table_node= table_id_map[table_id]; + if (table_node) + ut_ad(!table_node->in_progress); + if (!table_node) + { + std::pair p; + p.first= trx_purge_table_open(table_id, mdl_context, &p.second); + if (p.first == reinterpret_cast(-1)) + p.first= purge_sys.close_and_reopen(table_id, thd, &p.second); + + if (!thr || !(thr= UT_LIST_GET_NEXT(thrs, thr))) + thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); + ++*n_work_items; + table_node= static_cast(thr->child); + + ut_a(que_node_get_type(table_node) == QUE_NODE_PURGE); + ut_d(auto pair=) table_node->tables.emplace(table_id, p); + ut_ad(pair.second); + if (p.first) + goto enqueue; + } + else if (table_node->tables[table_id].first) + { + enqueue: + table_node->undo_recs.push(purge_rec); + ut_ad(!table_node->in_progress); + } + + if (purge_sys.n_pages_handled() >= max_pages) + break; + } + + purge_sys.m_active= false; #ifdef UNIV_DEBUG - i = 0; - /* Debug code to validate some pre-requisites and reset done flag. */ - for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); - thr != NULL && i < n_purge_threads; - thr = UT_LIST_GET_NEXT(thrs, thr), ++i) { + thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); + for (ulint i= 0; thr && i < *n_work_items; + i++, thr= UT_LIST_GET_NEXT(thrs, thr)) + { + purge_node_t *node= static_cast(thr->child); + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + ut_ad(!node->in_progress); + node->in_progress= true; + } - purge_node_t* node; - - /* Get the purge node. 
*/ - node = (purge_node_t*) thr->child; - - ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); - ut_ad(node->undo_recs.empty()); - ut_ad(!node->in_progress); - ut_d(node->in_progress = true); - } - - /* There should never be fewer nodes than threads, the inverse - however is allowed because we only use purge threads as needed. */ - ut_ad(i == n_purge_threads); + for (; thr; thr= UT_LIST_GET_NEXT(thrs, thr)) + { + purge_node_t *node= static_cast(thr->child); + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + ut_ad(!node->in_progress); + ut_ad(node->undo_recs.empty()); + } #endif - /* Fetch and parse the UNDO records. The UNDO records are added - to a per purge node vector. */ - thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); + ut_ad(head <= purge_sys.tail); - ut_ad(head <= purge_sys.tail); - - i = 0; - - std::unordered_map - table_id_map(TRX_PURGE_TABLE_BUCKETS); - purge_sys.m_active = true; - - MDL_context* const mdl_context - = static_cast(thd_mdl_context(thd)); - ut_ad(mdl_context); - - const size_t max_pages = std::min(buf_pool.curr_size * 3 / 4, - size_t{srv_purge_batch_size}); - - while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) { - /* Track the max {trx_id, undo_no} for truncating the - UNDO logs once we have purged the records. */ - - if (head <= purge_sys.tail) { - head = purge_sys.tail; - } - - /* Fetch the next record, and advance the purge_sys.tail. 
*/ - trx_purge_rec_t purge_rec = purge_sys.fetch_next_rec(); - - if (!purge_rec.undo_rec) { - if (!purge_rec.roll_ptr) { - break; - } - ut_ad(purge_rec.roll_ptr == 1); - continue; - } - - table_id_t table_id = trx_undo_rec_get_table_id( - purge_rec.undo_rec); - - purge_node_t*& table_node = table_id_map[table_id]; - - if (!table_node) { - std::pair p; - p.first = trx_purge_table_open(table_id, mdl_context, - &p.second); - if (p.first == reinterpret_cast(-1)) { - p.first = purge_sys.close_and_reopen( - table_id, thd, &p.second); - } - - thr = UT_LIST_GET_NEXT(thrs, thr); - - if (!(++i % n_purge_threads)) { - thr = UT_LIST_GET_FIRST( - purge_sys.query->thrs); - } - - table_node = static_cast(thr->child); - ut_a(que_node_get_type(table_node) == QUE_NODE_PURGE); - ut_d(auto i=) - table_node->tables.emplace(table_id, p); - ut_ad(i.second); - if (p.first) { - goto enqueue; - } - } else if (table_node->tables[table_id].first) { -enqueue: - table_node->undo_recs.push(purge_rec); - } - - if (purge_sys.n_pages_handled() >= max_pages) { - break; - } - } - - purge_sys.m_active = false; - - ut_ad(head <= purge_sys.tail); - - return head; + return head; } extern tpool::waitable_task purge_worker_task; @@ -1373,68 +1358,89 @@ Run a purge batch. @return number of undo log pages handled in the batch */ TRANSACTIONAL_TARGET ulint trx_purge(ulint n_tasks, ulint history_size) { - ut_ad(n_tasks > 0); + ut_ad(n_tasks > 0); - purge_sys.clone_oldest_view(); + purge_sys.clone_oldest_view(); -#ifdef UNIV_DEBUG - if (srv_purge_view_update_only_debug) { - return(0); - } -#endif /* UNIV_DEBUG */ + ut_d(if (srv_purge_view_update_only_debug) return 0); - THD* const thd = current_thd; + THD *const thd= current_thd; - /* Fetch the UNDO recs that need to be purged. */ - const purge_sys_t::iterator head - = trx_purge_attach_undo_recs(n_tasks, thd); - const size_t n_pages = purge_sys.n_pages_handled(); + /* Fetch the UNDO recs that need to be purged. 
*/ + ulint n_work= 0; + const purge_sys_t::iterator head= trx_purge_attach_undo_recs(thd, &n_work); + const size_t n_pages= purge_sys.n_pages_handled(); - { - ulint delay = n_pages ? srv_max_purge_lag : 0; - if (UNIV_UNLIKELY(delay)) { - if (delay >= history_size) { - no_throttle: - delay = 0; - } else if (const ulint max_delay = - srv_max_purge_lag_delay) { - delay = std::min(max_delay, - 10000 * history_size / delay - - 5000); - } else { - goto no_throttle; - } - } - srv_dml_needed_delay = delay; - } + { + ulint delay= n_pages ? srv_max_purge_lag : 0; + if (UNIV_UNLIKELY(delay)) + { + if (delay >= history_size) + no_throttle: + delay= 0; + else if (const ulint max_delay= srv_max_purge_lag_delay) + delay= std::min(max_delay, 10000 * history_size / delay - 5000); + else + goto no_throttle; + } + srv_dml_needed_delay= delay; + } - que_thr_t* thr = nullptr; + ut_ad(n_tasks); + que_thr_t *thr= nullptr; - /* Submit tasks to workers queue if using multi-threaded purge. */ - for (ulint i = n_tasks; --i; ) { - thr = que_fork_scheduler_round_robin(purge_sys.query, thr); - ut_a(thr); - srv_que_task_enqueue_low(thr); - srv_thread_pool->submit_task(&purge_worker_task); - } + if (n_work) + { + for (auto i= n_work; i--; ) + { + if (!thr) + thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); + else + thr= UT_LIST_GET_NEXT(thrs, thr); - thr = que_fork_scheduler_round_robin(purge_sys.query, thr); + if (!thr) + break; - que_run_threads(thr); + ut_ad(thr->state == QUE_THR_COMPLETED); + thr->state= QUE_THR_RUNNING; + thr->run_node= thr; + thr->prev_node= thr->common.parent; + purge_sys.query->state= QUE_FORK_ACTIVE; + purge_sys.query->last_sel_node= nullptr; + srv_que_task_enqueue_low(thr); + } - trx_purge_wait_for_workers_to_complete(); + /* + To reduce context switches we only submit at most n_tasks-1 worker task. 
+ (we can use less tasks, if there is not enough work) - for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); thr; - thr = UT_LIST_GET_NEXT(thrs, thr)) { - purge_node_t* node = static_cast(thr->child); - trx_purge_close_tables(node, thd); - node->tables.clear(); - } + The coordinator does worker's job, instead of waiting and sitting idle, + then waits for all others to finish. - purge_sys.batch_cleanup(head); + This also means if innodb_purge_threads=1, the coordinator does all + the work alone. + */ + const ulint workers{std::min(n_work, n_tasks) - 1}; + for (ulint i= 0; i < workers; i++) + srv_thread_pool->submit_task(&purge_worker_task); + srv_purge_worker_task_low(); - MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1); - MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages); + if (workers) + trx_purge_wait_for_workers_to_complete(); - return n_pages; + for (thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); thr && n_work--; + thr= UT_LIST_GET_NEXT(thrs, thr)) + { + purge_node_t *node= static_cast(thr->child); + trx_purge_close_tables(node, thd); + node->tables.clear(); + } + } + + purge_sys.batch_cleanup(head); + + MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1); + MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages); + + return n_pages; } From b4c2e239545f0162f8613acb0a66aee8b5dce0d4 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Sat, 3 Aug 2024 03:51:47 +0200 Subject: [PATCH 11/22] MDEV-34696: do_gco_wait() completes too early on InnoDB dict stats updates Before doing mark_start_commit(), check that there is no pending deadlock kill. If there is a pending kill, we won't commit (we will abort, roll back, and retry). Then we should not mark the commit as started, since that could potentially make the following GCO start too early, before we completed the commit after the retry. This condition could trigger in some corner cases, where InnoDB would take temporarily table/row locks that are released again immediately, not held until the transaction commits. 
This happens with dict_stats updates and possibly auto-increment locks. Such locks can be passed to thd_rpl_deadlock_check() and cause a deadlock kill to be scheduled in the background. But since the blocking locks are held only temporarily, they can be released before the background kill happens. This way, the kill can be delayed until after mark_start_commit() has been called. Thus we need to check the synchronous indication rgi->killed_for_retry, not just the asynchroneous thd->killed. Signed-off-by: Kristian Nielsen --- sql/rpl_parallel.cc | 20 ++++++++++++++++---- sql/rpl_rli.cc | 17 +++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 1cfdf96ee3b..9c4222d7817 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -1450,11 +1450,23 @@ handle_rpl_parallel_thread(void *arg) after mark_start_commit(), we have to unmark, which has at least a theoretical possibility of leaving a window where it looks like all transactions in a GCO have started committing, while in fact one - will need to rollback and retry. This is not supposed to be possible - (since there is a deadlock, at least one transaction should be - blocked from reaching commit), but this seems a fragile ensurance, - and there were historically a number of subtle bugs in this area. + will need to rollback and retry. + + Normally this will not happen, since the kill is there to resolve a + deadlock that is preventing at least one transaction from proceeding. + One case it can happen is with InnoDB dict stats update, which can + temporarily cause transactions to block each other, but locks are + released immediately, they don't linger until commit. There could be + other similar cases, there were historically a number of subtle bugs + in this area. + + But once we start the commit, we can expect that no new lock + conflicts will be introduced. 
So by handling any lingering deadlock + kill at this point just before mark_start_commit(), we should be + robust even towards spurious deadlock kills. */ + if (rgi->killed_for_retry != rpl_group_info::RETRY_KILL_NONE) + wait_for_pending_deadlock_kill(thd, rgi); if (!thd->killed) { DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index 9d4e09a362c..8284b07f7ff 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -2518,6 +2518,23 @@ rpl_group_info::unmark_start_commit() e= this->parallel_entry; mysql_mutex_lock(&e->LOCK_parallel_entry); + /* + Assert that we have not already wrongly completed this GCO and signalled + the next one to start, only to now unmark and make the signal invalid. + This is to catch problems like MDEV-34696. + + The error inject rpl_parallel_simulate_temp_err_xid is used to test this + precise situation, that we handle it gracefully if it somehow occurs in a + release build. So disable the assert in this case. + */ +#ifndef DBUG_OFF + bool allow_unmark_after_complete= false; + DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid", + allow_unmark_after_complete= true;); + DBUG_ASSERT(!gco->next_gco || + gco->next_gco->wait_count > e->count_committing_event_groups || + allow_unmark_after_complete); +#endif --e->count_committing_event_groups; mysql_mutex_unlock(&e->LOCK_parallel_entry); } From 33854d7324b982e0dea1378c9c153cd1e276ddfc Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Sat, 3 Aug 2024 12:16:32 +0200 Subject: [PATCH 12/22] Restore skiping rpl.rpl_mdev6020 under Valgrind (Revert a change done by mistake when XtraDB was removed.) 
Signed-off-by: Kristian Nielsen --- mysql-test/suite/rpl/t/rpl_mdev6020.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mysql-test/suite/rpl/t/rpl_mdev6020.test b/mysql-test/suite/rpl/t/rpl_mdev6020.test index ec3fd92f817..314059a5a49 100644 --- a/mysql-test/suite/rpl/t/rpl_mdev6020.test +++ b/mysql-test/suite/rpl/t/rpl_mdev6020.test @@ -1,3 +1,5 @@ +# Test applies a large binlog, takes long under Valgrind with little benefit. +--source include/not_valgrind.inc --source include/have_innodb.inc --source include/have_partition.inc --source include/have_binlog_format_mixed_or_row.inc From 7dc4ea5649eed222c2427798893f90664cbec647 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Fri, 16 Aug 2024 21:58:49 +0200 Subject: [PATCH 13/22] Fix sporadic test failure in rpl.rpl_create_drop_event Depending on timing, an extra event run could start just when the event scheduler is shut down and delay running until after the table has been dropped; this would cause the test to fail with a "table does not exist" error in the log. Signed-off-by: Kristian Nielsen --- mysql-test/suite/rpl/t/rpl_create_drop_event.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mysql-test/suite/rpl/t/rpl_create_drop_event.test b/mysql-test/suite/rpl/t/rpl_create_drop_event.test index 96a7e82d6f7..79bb0ffec90 100644 --- a/mysql-test/suite/rpl/t/rpl_create_drop_event.test +++ b/mysql-test/suite/rpl/t/rpl_create_drop_event.test @@ -14,6 +14,12 @@ SET GLOBAL event_scheduler=on; let $wait_condition= SELECT count(*)>0 FROM t1; --source include/wait_condition.inc SET GLOBAL event_scheduler=off; +# If the time rolls to the next whole second just at this point, a new event +# run may be scheduled. Wait for this to disappear, otherwise we see occasional +# test failures if the table gets dropped before the extra event run completes. 
+# Expect 5 connections: default, master, master1, server_1, binlog dump thread +--let $wait_condition= SELECT COUNT(*) = 5 FROM INFORMATION_SCHEMA.PROCESSLIST; +--source include/wait_condition.inc SELECT DISTINCT a FROM t1; DELETE FROM t1; From 214e6c5b3d05812e9f1de6d0a22955d18911a076 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Mon, 19 Aug 2024 20:37:34 +0200 Subject: [PATCH 14/22] Fix sporadic failure of test case rpl.rpl_old_master Remove the test for MDEV-14528. This is supposed to test that parallel replication from pre-10.0 master will update Seconds_Behind_Master. But after MDEV-12179 the SQL thread is blocked from even beginning to fetch events from the relay log due to FLUSH TABLES WITH READ LOCK, so the test case is no longer testing what it was intended to. And pre-10.0 versions are long since out of support, so it does not seem worthwhile to try to rewrite the test to work another way. The root cause of the test failure is MDEV-34778. Briefly, depending on exact timing during slave stop, the rli->sql_thread_caught_up flag may end up with a different value. If it ends up as "true", this causes Seconds_Behind_Master to be 0 during next slave start; and this caused test case timeout as the test was waiting for Seconds_Behind_Master to become non-zero.
Signed-off-by: Kristian Nielsen --- mysql-test/suite/rpl/r/rpl_old_master.result | 3 --- mysql-test/suite/rpl/t/rpl_old_master.test | 7 ------- 2 files changed, 10 deletions(-) diff --git a/mysql-test/suite/rpl/r/rpl_old_master.result b/mysql-test/suite/rpl/r/rpl_old_master.result index f985bee6832..dd3de4d327b 100644 --- a/mysql-test/suite/rpl/r/rpl_old_master.result +++ b/mysql-test/suite/rpl/r/rpl_old_master.result @@ -9,10 +9,7 @@ connection slave; SET @old_parallel= @@GLOBAL.slave_parallel_threads; SET GLOBAL slave_parallel_threads=10; CHANGE MASTER TO master_host='127.0.0.1', master_port=SERVER_MYPORT_1, master_user='root', master_log_file='master-bin.000001', master_log_pos=4; -FLUSH TABLES WITH READ LOCK; include/start_slave.inc -include/wait_for_slave_param.inc [Seconds_Behind_Master] -UNLOCK TABLES; connection master; CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB; INSERT INTO t2 VALUES (1); diff --git a/mysql-test/suite/rpl/t/rpl_old_master.test b/mysql-test/suite/rpl/t/rpl_old_master.test index 6ddc227fc14..97721d5387a 100644 --- a/mysql-test/suite/rpl/t/rpl_old_master.test +++ b/mysql-test/suite/rpl/t/rpl_old_master.test @@ -28,14 +28,7 @@ SET GLOBAL slave_parallel_threads=10; --replace_result $SERVER_MYPORT_1 SERVER_MYPORT_1 eval CHANGE MASTER TO master_host='127.0.0.1', master_port=$SERVER_MYPORT_1, master_user='root', master_log_file='master-bin.000001', master_log_pos=4; -# Block execution yet when the blocked query timestamp has been already accounted -FLUSH TABLES WITH READ LOCK; --source include/start_slave.inc ---let $slave_param = Seconds_Behind_Master ---let $slave_param_value = 1 ---let $slave_param_comparison= >= ---source include/wait_for_slave_param.inc -UNLOCK TABLES; --connection master CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB; From 25e0224814c8883dc22cf602586e8d2c76f5e359 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Mon, 19 Aug 2024 20:58:09 +0200 Subject: [PATCH 15/22] Skip mariabackup.slave_provision_nolock in 
--valgrind, it uses a lot of CPU Signed-off-by: Kristian Nielsen --- mysql-test/suite/mariabackup/slave_provision_nolock.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mysql-test/suite/mariabackup/slave_provision_nolock.test b/mysql-test/suite/mariabackup/slave_provision_nolock.test index 618f313290c..0253a6c0c2d 100644 --- a/mysql-test/suite/mariabackup/slave_provision_nolock.test +++ b/mysql-test/suite/mariabackup/slave_provision_nolock.test @@ -1,5 +1,7 @@ --source include/have_innodb.inc --source include/have_log_bin.inc +# Test does a lot of queries that take a lot of CPU under Valgrind. +--source include/not_valgrind.inc call mtr.add_suppression("Can't init tc log"); call mtr.add_suppression("Aborting"); From 8642453ce6ab1d8028ff1ac9e10f7b08418c12cb Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Mon, 19 Aug 2024 22:27:44 +0200 Subject: [PATCH 16/22] Fix sporadic failure of test case rpl.rpl_start_stop_slave The test was expecting the I/O thread to be in a specific state, but thread scheduling may cause it to not yet have reached that state. So just have a loop that waits for the expected state to occur. 
Signed-off-by: Kristian Nielsen --- mysql-test/suite/rpl/t/rpl_start_stop_slave.test | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mysql-test/suite/rpl/t/rpl_start_stop_slave.test b/mysql-test/suite/rpl/t/rpl_start_stop_slave.test index 23b25b1bf85..ce7d51ca43d 100644 --- a/mysql-test/suite/rpl/t/rpl_start_stop_slave.test +++ b/mysql-test/suite/rpl/t/rpl_start_stop_slave.test @@ -19,7 +19,17 @@ --source include/master-slave.inc connection slave; ---let $connection_id=`SELECT id FROM information_schema.processlist where state LIKE 'Waiting for master to send event'` +--let $i= 100 +while ($i > 0) { + dec $i; + --let $connection_id=`SELECT id FROM information_schema.processlist where state LIKE 'Waiting for master to send event'` + if ($connection_id) { + let $i= 0; + } + if ($i > 0) { + --sleep 0.1 + } +} if(!$connection_id) { From 36ab75a49884d89fe733014826b5adf3e07ca9cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 27 Aug 2024 07:27:24 +0300 Subject: [PATCH 17/22] MDEV-34515: Fix a bogus debug assertion purge_sys_t::stop_FTS(): Fix an incorrect debug assertion that commit d58734d7812f7fccbb1a2cd802f8ce9cd0c693ba added. The assertion would fail if there had been prior invocations of purge_sys.stop_SYS() without purge_sys.resume_SYS(). The intention of the assertion is to check that number of pending stop_FTS() stays below 65536. 
--- storage/innobase/srv/srv0srv.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 1cea2d9ffe8..c24be1a141e 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1299,7 +1299,7 @@ bool purge_sys_t::running() void purge_sys_t::stop_FTS() { ut_d(const auto paused=) m_FTS_paused.fetch_add(1); - ut_ad(paused < PAUSED_SYS); + ut_ad((paused + 1) & ~PAUSED_SYS); while (m_active.load(std::memory_order_acquire)) std::this_thread::sleep_for(std::chrono::seconds(1)); } From 58bc83e1a72b50dd009d5e6c370803b7c3578f9c Mon Sep 17 00:00:00 2001 From: Yuchen Pei Date: Tue, 27 Aug 2024 15:36:39 +1000 Subject: [PATCH 18/22] [fixup] Spider: Restored lines accidentally deleted in MDEV-32157 Also restored a change that resulted in off-by-one, as well as appending the correctly indexed key_hint. --- storage/spider/spd_db_conn.cc | 4 +++- storage/spider/spd_db_mysql.cc | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/storage/spider/spd_db_conn.cc b/storage/spider/spd_db_conn.cc index 22989bf88b9..401774ed508 100644 --- a/storage/spider/spd_db_conn.cc +++ b/storage/spider/spd_db_conn.cc @@ -1512,6 +1512,7 @@ int spider_db_append_key_hint( if (str->reserve( hint_str_len - 2 + SPIDER_SQL_INDEX_USE_LEN + SPIDER_SQL_OPEN_PAREN_LEN + SPIDER_SQL_CLOSE_PAREN_LEN)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); hint_str += 2; str->q_append(SPIDER_SQL_INDEX_USE_STR, SPIDER_SQL_INDEX_USE_LEN); str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN); @@ -1524,10 +1525,11 @@ int spider_db_append_key_hint( if (str->reserve( hint_str_len - 3 + SPIDER_SQL_INDEX_IGNORE_LEN + SPIDER_SQL_OPEN_PAREN_LEN + SPIDER_SQL_CLOSE_PAREN_LEN)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); hint_str += 3; str->q_append(SPIDER_SQL_INDEX_IGNORE_STR, SPIDER_SQL_INDEX_IGNORE_LEN); str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN); - str->q_append(hint_str, hint_str_len - 2); + 
str->q_append(hint_str, hint_str_len - 3); str->q_append(SPIDER_SQL_CLOSE_PAREN_STR, SPIDER_SQL_CLOSE_PAREN_LEN); } else if (str->reserve(hint_str_len + SPIDER_SQL_SPACE_LEN)) DBUG_RETURN(HA_ERR_OUT_OF_MEM); diff --git a/storage/spider/spd_db_mysql.cc b/storage/spider/spd_db_mysql.cc index 7ea91073cf2..04f4bfd22ee 100644 --- a/storage/spider/spd_db_mysql.cc +++ b/storage/spider/spd_db_mysql.cc @@ -7612,8 +7612,8 @@ int spider_mbase_share::convert_key_hint_str() roop_count < (int) table_share->keys; roop_count++, tmp_key_hint++) { tmp_key_hint->length(0); - if (tmp_key_hint->append(spider_share->key_hint->ptr(), - spider_share->key_hint->length(), system_charset_info)) + if (tmp_key_hint->append(spider_share->key_hint[roop_count].ptr(), + spider_share->key_hint[roop_count].length(), system_charset_info)) DBUG_RETURN(HA_ERR_OUT_OF_MEM); } } else { From 8cc822283e7cd0c1fc1c8f1633abd506d7d431d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 27 Aug 2024 08:51:45 +0300 Subject: [PATCH 19/22] MDEV-34515 fixup: innodb.innodb_defrag_concurrent fails Let us avoid EXTENDED in the CHECK TABLE after a defragmentation, because it would occasionally report an orphan delete-marked record in the index "third". That error does not seem to be reproducible when using the regular OPTIMIZE TABLE. Also, let us make the test --repeat safe by removing the defragmentation related statistics after DROP TABLE. The defragmentation feature was removed in later releases in commit 7ca89af6f8faf1f8ec6ede01a9353ac499d37711 (MDEV-30545) along with this test case. 
--- mysql-test/suite/innodb/r/innodb_defrag_concurrent.result | 3 ++- mysql-test/suite/innodb/t/innodb_defrag_concurrent.test | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result b/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result index 07c96e76213..942c9c94026 100644 --- a/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result +++ b/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result @@ -75,7 +75,7 @@ disconnect con4; optimize table t1; Table Op Msg_type Msg_text test.t1 optimize status OK -check table t1 extended; +check table t1; Table Op Msg_type Msg_text test.t1 check status OK select count(*) from t1; @@ -97,6 +97,7 @@ select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like count(stat_value) > 0 1 drop table t1; +delete from mysql.innodb_index_stats where table_name='t1'; SET GLOBAL innodb_defragment_n_pages = @n_pages; SET GLOBAL innodb_defragment_stats_accuracy = @accuracy; SET GLOBAL innodb_stats_persistent = @sp; diff --git a/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test b/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test index 1e4e14eb7c6..f5b7448032a 100644 --- a/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test +++ b/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test @@ -124,7 +124,7 @@ disconnect con3; disconnect con4; optimize table t1; -check table t1 extended; +check table t1; select count(*) from t1; select count(*) from t1 force index (second); @@ -136,6 +136,7 @@ select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag'); drop table t1; +delete from mysql.innodb_index_stats where table_name='t1'; # reset system SET GLOBAL innodb_defragment_n_pages = @n_pages; From e7bb9b7c556e91b9ac7ddedf279d5db75d5e3a03 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 27 Aug 2024 18:06:24 +0300 Subject: [PATCH 20/22] MDEV-24923 fixup: Correct a function comment --- storage/innobase/lock/lock0lock.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 3dd4b5fa6f6..0a125bf15c7 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -608,14 +608,13 @@ lock_rec_get_insert_intention( return(lock->type_mode & LOCK_INSERT_INTENTION); } -#ifdef UNIV_DEBUG -#ifdef WITH_WSREP +#if defined UNIV_DEBUG && defined WITH_WSREP /** Check if both conflicting lock transaction and other transaction requesting record lock are brute force (BF). If they are check is this BF-BF wait correct and if not report BF wait and assert. -@param[in] lock_rec other waiting record lock -@param[in] trx trx requesting conflicting record lock +@param lock other waiting lock +@param trx transaction requesting conflicting lock */ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx) { @@ -682,8 +681,7 @@ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx) /* BF-BF wait is a bug */ ut_error; } -#endif /* WITH_WSREP */ -#endif /* UNIV_DEBUG */ +#endif /*********************************************************************//** Checks if a lock request for a new lock has to wait for request lock2. From bda40ccb854cc1531ec7bd1ab779504c7e00bd87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 28 Aug 2024 07:18:03 +0300 Subject: [PATCH 21/22] MDEV-34803 innodb_lru_flush_size is no longer used In commit fa8a46eb68299776589e769844372813ebb16a99 (MDEV-33613) the parameter innodb_lru_flush_size ceased to have any effect. Let us declare the parameter as deprecated and additionally as MARIADB_REMOVED_OPTION, so that there will be a warning written to the error log in case the option is specified in the command line. 
Let us also do the same for the parameter innodb_purge_rseg_truncate_frequency that was deprecated&ignored earlier in MDEV-32050. Reviewed by: Debarun Banerjee --- .../suite/sys_vars/r/sysvars_innodb,32bit.rdiff | 2 +- .../suite/sys_vars/r/sysvars_innodb.result | 6 +++--- sql/mysqld.cc | 2 ++ storage/innobase/buf/buf0lru.cc | 13 +++---------- storage/innobase/handler/ha_innodb.cc | 16 +++++++++------- storage/innobase/include/buf0lru.h | 13 +++---------- 6 files changed, 21 insertions(+), 31 deletions(-) diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff index 71e745369e3..531bae3fbdd 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff @@ -241,7 +241,7 @@ VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED - VARIABLE_COMMENT How many pages to flush on LRU eviction + VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 1 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 893761e0ce8..0ed1051a393 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -1044,13 +1044,13 @@ SESSION_VALUE NULL DEFAULT_VALUE 32 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT How many pages to flush on LRU eviction +VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 18446744073709551615 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -COMMAND_LINE_ARGUMENT REQUIRED +COMMAND_LINE_ARGUMENT NULL VARIABLE_NAME INNODB_LRU_SCAN_DEPTH SESSION_VALUE NULL DEFAULT_VALUE 1536 @@ -1314,7 +1314,7 @@ NUMERIC_MAX_VALUE 128 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -COMMAND_LINE_ARGUMENT OPTIONAL +COMMAND_LINE_ARGUMENT NULL 
VARIABLE_NAME INNODB_PURGE_THREADS SESSION_VALUE NULL DEFAULT_VALUE 4 diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 110e2c29797..8c1def600b2 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -5262,7 +5262,9 @@ static int init_server_components() MARIADB_REMOVED_OPTION("innodb-log-compressed-pages"), MARIADB_REMOVED_OPTION("innodb-log-files-in-group"), MARIADB_REMOVED_OPTION("innodb-log-optimize-ddl"), + MARIADB_REMOVED_OPTION("innodb-lru-flush-size"), MARIADB_REMOVED_OPTION("innodb-page-cleaners"), + MARIADB_REMOVED_OPTION("innodb-purge-truncate-frequency"), MARIADB_REMOVED_OPTION("innodb-replication-delay"), MARIADB_REMOVED_OPTION("innodb-scrub-log"), MARIADB_REMOVED_OPTION("innodb-scrub-log-speed"), diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 9ffd4c5db20..49a088b425f 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -39,9 +39,6 @@ Created 11/5/1995 Heikki Tuuri #include "srv0mon.h" #include "my_cpu.h" -/** Flush this many pages in buf_LRU_get_free_block() */ -size_t innodb_lru_flush_size; - /** The number of blocks from the LRU_old pointer onward, including the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the whole LRU list length, except that the tolerance defined below @@ -369,17 +366,13 @@ block to read in a page. Note that we only ever get a block from the free list. Even when we flush a page or find a page in LRU scan we put it to free list to be used. 
* iteration 0: - * get a block from the buf_pool.free list, success:done + * get a block from the buf_pool.free list * if buf_pool.try_LRU_scan is set * scan LRU up to 100 pages to free a clean block * success:retry the free list - * flush up to innodb_lru_flush_size LRU blocks to data files - (until UT_LIST_GET_GEN(buf_pool.free) < innodb_lru_scan_depth) - * on buf_page_write_complete() the blocks will put on buf_pool.free list - * success: retry the free list + * invoke buf_pool.page_cleaner_wakeup(true) and wait its completion * subsequent iterations: same as iteration 0 except: - * scan whole LRU list - * scan LRU list even if buf_pool.try_LRU_scan is not set + * scan the entire LRU list @param have_mutex whether buf_pool.mutex is already being held @return the free control block, in state BUF_BLOCK_MEMORY */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index f569f4ba6d1..9869d0d0356 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -19251,11 +19251,6 @@ static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, "How deep to scan LRU to keep it clean", NULL, NULL, 1536, 100, ~0UL, 0); -static MYSQL_SYSVAR_SIZE_T(lru_flush_size, innodb_lru_flush_size, - PLUGIN_VAR_RQCMDARG, - "How many pages to flush on LRU eviction", - NULL, NULL, 32, 1, SIZE_T_MAX, 0); - static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors, PLUGIN_VAR_OPCMDARG, "Set to 0 (don't flush neighbors from buffer pool)," @@ -19468,14 +19463,21 @@ static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, srv_max_undo_log_size, 10 << 20, 10 << 20, 1ULL << (32 + UNIV_PAGE_SIZE_SHIFT_MAX), 0); -static ulong innodb_purge_rseg_truncate_frequency; +static ulong innodb_purge_rseg_truncate_frequency= 128; static MYSQL_SYSVAR_ULONG(purge_rseg_truncate_frequency, innodb_purge_rseg_truncate_frequency, - PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED | PLUGIN_VAR_NOCMDOPT, 
"Deprecated parameter with no effect", NULL, NULL, 128, 1, 128, 0); +static size_t innodb_lru_flush_size; + +static MYSQL_SYSVAR_SIZE_T(lru_flush_size, innodb_lru_flush_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_DEPRECATED | PLUGIN_VAR_NOCMDOPT, + "Deprecated parameter with no effect", + NULL, NULL, 32, 1, SIZE_T_MAX, 0); + static void innodb_undo_log_truncate_update(THD *thd, struct st_mysql_sys_var*, void*, const void *save) { diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index 28410276217..53a5f136fd2 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -33,9 +33,6 @@ Created 11/5/1995 Heikki Tuuri struct trx_t; struct fil_space_t; -/** Flush this many pages in buf_LRU_get_free_block() */ -extern size_t innodb_lru_flush_size; - /*####################################################################### These are low-level functions #########################################################################*/ @@ -71,17 +68,13 @@ block to read in a page. Note that we only ever get a block from the free list. Even when we flush a page or find a page in LRU scan we put it to free list to be used. 
* iteration 0: - * get a block from the buf_pool.free list, success:done + * get a block from the buf_pool.free list * if buf_pool.try_LRU_scan is set * scan LRU up to 100 pages to free a clean block * success:retry the free list - * flush up to innodb_lru_flush_size LRU blocks to data files - (until UT_LIST_GET_GEN(buf_pool.free) < innodb_lru_scan_depth) - * on buf_page_write_complete() the blocks will put on buf_pool.free list - * success: retry the free list + * invoke buf_pool.page_cleaner_wakeup(true) and wait its completion * subsequent iterations: same as iteration 0 except: - * scan whole LRU list - * scan LRU list even if buf_pool.try_LRU_scan is not set + * scan the entire LRU list @param have_mutex whether buf_pool.mutex is already being held @return the free control block, in state BUF_BLOCK_MEMORY */ From 1ff6b6f0b43b4e028f2763eab95fab7b6e99791f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 28 Aug 2024 15:44:42 +0300 Subject: [PATCH 22/22] MDEV-34802 Recovery fails to note some log corruption recv_recovery_from_checkpoint_start(): Abort startup due to log corruption if we were unable to parse the entire log between the latest log checkpoint and the corresponding FILE_CHECKPOINT record. Also, reduce some code bloat related to log output and log_sys.mutex. 
Reviewed by: Debarun Banerjee --- storage/innobase/log/log0recv.cc | 120 ++++++++++++++----------------- 1 file changed, 54 insertions(+), 66 deletions(-) diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 366cf524f41..e605a082fb5 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -3474,10 +3474,9 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex)); if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { - - ib::info() << "innodb_force_recovery=6 skips redo log apply"; - - return(DB_SUCCESS); + sql_print_information("InnoDB: innodb_force_recovery=6" + " skips redo log apply"); + return err; } mysql_mutex_lock(&log_sys.mutex); @@ -3493,13 +3492,9 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) ut_ad(RECV_SCAN_SIZE <= srv_log_buffer_size); - ut_ad(recv_sys.pages.empty()); contiguous_lsn = checkpoint_lsn; switch (log_sys.log.format) { - case 0: - mysql_mutex_unlock(&log_sys.mutex); - return DB_SUCCESS; default: if (end_lsn == 0) { break; @@ -3509,8 +3504,13 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) break; } recv_sys.found_corrupt_log = true; + err_exit: + err = DB_ERROR; + /* fall through */ + func_exit: + case 0: mysql_mutex_unlock(&log_sys.mutex); - return(DB_ERROR); + return err; } size_t sizeof_checkpoint; @@ -3527,14 +3527,15 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) ut_ad(!recv_sys.found_corrupt_fs || !srv_force_recovery); if (srv_read_only_mode && recv_needed_recovery) { - mysql_mutex_unlock(&log_sys.mutex); - return(DB_READ_ONLY); + read_only: + err = DB_READ_ONLY; + goto func_exit; } if (recv_sys.found_corrupt_log && !srv_force_recovery) { - mysql_mutex_unlock(&log_sys.mutex); - ib::warn() << "Log scan aborted at LSN " << contiguous_lsn; - return(DB_ERROR); + sql_print_warning("InnoDB: Log scan aborted at LSN " LSN_PF, + contiguous_lsn); + goto err_exit; } /* If we fail to open a tablespace while 
looking for FILE_CHECKPOINT, we @@ -3542,14 +3543,12 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) would not be able to open an encrypted tablespace and the flag could be set. */ if (recv_sys.found_corrupt_fs) { - mysql_mutex_unlock(&log_sys.mutex); - return DB_ERROR; + goto err_exit; } if (recv_sys.mlog_checkpoint_lsn == 0) { lsn_t scan_lsn = log_sys.log.scanned_lsn; if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) { - mysql_mutex_unlock(&log_sys.mutex); ib::error err; err << "Missing FILE_CHECKPOINT"; if (end_lsn) { @@ -3557,7 +3556,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) } err << " between the checkpoint " << checkpoint_lsn << " and the end " << scan_lsn << "."; - return(DB_ERROR); + goto err_exit; } log_sys.log.scanned_lsn = checkpoint_lsn; @@ -3568,8 +3567,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) if ((recv_sys.found_corrupt_log && !srv_force_recovery) || recv_sys.found_corrupt_fs) { - mysql_mutex_unlock(&log_sys.mutex); - return(DB_ERROR); + goto err_exit; } } @@ -3599,19 +3597,17 @@ completed: } if (!recv_needed_recovery) { - - ib::info() - << "The log sequence number " << flush_lsn - << " in the system tablespace does not match" - " the log sequence number " - << checkpoint_lsn << " in the " - << LOG_FILE_NAME << "!"; + sql_print_information( + "InnoDB: The log sequence number " LSN_PF + " in the system tablespace does not match" + " the log sequence number " LSN_PF + " in the ib_logfile0!", + flush_lsn, checkpoint_lsn); if (srv_read_only_mode) { - ib::error() << "innodb_read_only" - " prevents crash recovery"; - mysql_mutex_unlock(&log_sys.mutex); - return(DB_READ_ONLY); + sql_print_error("InnoDB: innodb_read_only" + " prevents crash recovery"); + goto read_only; } recv_needed_recovery = true; @@ -3632,8 +3628,7 @@ completed: rescan, missing_tablespace); if (err != DB_SUCCESS) { - mysql_mutex_unlock(&log_sys.mutex); - return(err); + goto func_exit; } /* If there is any missing tablespace and rescan is 
needed @@ -3662,8 +3657,7 @@ completed: rescan, missing_tablespace); if (err != DB_SUCCESS) { - mysql_mutex_unlock(&log_sys.mutex); - return err; + goto func_exit; } rescan = true; @@ -3686,8 +3680,7 @@ completed: if ((recv_sys.found_corrupt_log && !srv_force_recovery) || recv_sys.found_corrupt_fs) { - mysql_mutex_unlock(&log_sys.mutex); - return(DB_ERROR); + goto err_exit; } /* In case of multi-batch recovery, @@ -3699,26 +3692,26 @@ completed: ut_ad(!rescan || recv_sys.pages.empty()); } - if (log_sys.is_physical() - && (log_sys.log.scanned_lsn < checkpoint_lsn - || log_sys.log.scanned_lsn < recv_max_page_lsn)) { - - ib::error() << "We scanned the log up to " - << log_sys.log.scanned_lsn - << ". A checkpoint was at " << checkpoint_lsn << " and" - " the maximum LSN on a database page was " - << recv_max_page_lsn << ". It is possible that the" - " database is now corrupt!"; - } - - if (recv_sys.recovered_lsn < checkpoint_lsn) { - mysql_mutex_unlock(&log_sys.mutex); - - ib::error() << "Recovered only to lsn:" - << recv_sys.recovered_lsn - << " checkpoint_lsn: " << checkpoint_lsn; - - return(DB_ERROR); + if (!log_sys.is_physical()) { + } else if (recv_sys.recovered_lsn < checkpoint_lsn + || recv_sys.recovered_lsn < end_lsn) { + sql_print_error("InnoDB: The log was only scanned up to " + LSN_PF ", while the current LSN at the " + "time of the latest checkpoint " LSN_PF + " was " LSN_PF "!", + recv_sys.recovered_lsn, + checkpoint_lsn, end_lsn); + goto err_exit; + } else if (log_sys.log.scanned_lsn < checkpoint_lsn + || log_sys.log.scanned_lsn < end_lsn + || log_sys.log.scanned_lsn < recv_max_page_lsn) { + sql_print_error("InnoDB: We scanned the log up to " LSN_PF + ". A checkpoint was at " LSN_PF + " and the maximum LSN on a database page was " + LSN_PF ". 
It is possible that the" + " database is now corrupt!", + log_sys.log.scanned_lsn, checkpoint_lsn, + recv_max_page_lsn); } log_sys.next_checkpoint_lsn = checkpoint_lsn; @@ -3750,9 +3743,7 @@ completed: log_sys.next_checkpoint_no = ++checkpoint_no; - DBUG_EXECUTE_IF("before_final_redo_apply", - mysql_mutex_unlock(&log_sys.mutex); - return DB_ERROR;); + DBUG_EXECUTE_IF("before_final_redo_apply", goto err_exit;); mutex_enter(&recv_sys.mutex); recv_sys.apply_log_recs = true; @@ -3760,17 +3751,14 @@ completed: ut_d(recv_no_log_write = srv_operation == SRV_OPERATION_RESTORE || srv_operation == SRV_OPERATION_RESTORE_EXPORT); - mutex_exit(&recv_sys.mutex); - - mysql_mutex_unlock(&log_sys.mutex); - recv_lsn_checks_on = true; + mutex_exit(&recv_sys.mutex); /* The database is now ready to start almost normal processing of user transactions: transaction rollbacks and the application of the log records in the hash table can be run in background. */ - - return(DB_SUCCESS); + ut_ad(err == DB_SUCCESS); + goto func_exit; } bool recv_dblwr_t::validate_page(const page_id_t page_id,