From 0f5e5ced376f8904cb3822525b60feba9f5f9105 Mon Sep 17 00:00:00 2001
From: Brandon Nesterenko
Date: Wed, 15 Oct 2025 14:29:08 -0600
Subject: [PATCH] MDEV-37662: Binlog Corruption When tmpdir is Full

The binary log could be corrupted when committing a large transaction
(i.e. one whose data exceeds the binlog_cache_size limit and spills
into a tmp file) in binlog_format=row if the server's --tmpdir is full.

The resulting corruption is that only the GTID of the errored
transaction would be written into the binary log, without any body or
finalizing events. This would happen because the content of the
transaction was not flushed at the proper time: the transaction's
binlog cache data was not yet fully written to its temporary file when
the server started copying the cached content into the binary log
itself. While switching the tmp file from a WRITE_CACHE to a
READ_CACHE, the server would see there is still data to flush in the
cache, and first try to flush it. This is not a valid time to flush
that data to the temporary file though, as:

1. The GTID event has already been written directly to the binary log,
   so if this flushing fails, it leaves the binary log in a corrupted
   state.
2. This is done during group commit, and will slow down other
   concurrent transactions which are otherwise ready to commit.

This patch fixes these issues by ensuring all transaction data is fully
flushed to its temporary file (if used) before starting any critical
paths, i.e. in binlog_flush_cache(). Note that if the binlog cache is
solely in-memory, this flush-to-temporary-file step is skipped.

Reviewed-by: Andrei Elkin
Signed-off-by: Brandon Nesterenko
---
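Notes (not part of the commit message or diff): the ordering enforced by the
fix can be illustrated with a small standalone C++ sketch. It does not use
the server's IO_CACHE API; SpillCache, commit_to_binlog() and the file names
below are made up for illustration only. The point is that the temporary
spill file is flushed, and its error observed, before the GTID event or
anything else touches the destination log, so an ENOSPC failure leaves the
binary log untouched instead of leaving a standalone GTID behind.

#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>

// Stand-in for a binlog cache that spilled into a temporary file because the
// transaction exceeded binlog_cache_size.
struct SpillCache
{
  std::ofstream tmp;                      // temporary spill file

  // Analogue of flushing the cache's in-memory write buffer to its tmp file.
  // Returns false on a write/flush failure (e.g. the tmp dir is full).
  bool flush_to_tmp()
  {
    tmp.flush();
    return static_cast<bool>(tmp);
  }
};

// Analogue of binlog_flush_cache(): make the cached data durable in its tmp
// file first, and only then start writing to the binary log.
bool commit_to_binlog(SpillCache &cache, std::ofstream &binlog,
                      const std::string &gtid_event,
                      const std::string &cached_body)
{
  // 1) Flush the spill file up front.  On failure we bail out here, before
  //    the binary log has been modified at all.
  if (!cache.flush_to_tmp())
    return false;

  // 2) Safe to enter the critical path: GTID event, then the cached body.
  binlog << gtid_event << cached_body;
  binlog.flush();
  return static_cast<bool>(binlog);
}

int main()
{
  SpillCache cache;
  cache.tmp.open("cache_spill.tmp");
  std::ofstream binlog("binlog.000001");

  cache.tmp << std::string(8192, 'y');    // oversized transaction data
  if (!commit_to_binlog(cache, binlog, "GTID 0-1-42\n", std::string(8192, 'y')))
    std::cerr << "commit failed; binlog was not touched\n";

  std::remove("cache_spill.tmp");
  return 0;
}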
 ...pl_row_binlog_tmp_file_flush_enospc.result | 39 +++++++++
 .../rpl_row_binlog_tmp_file_flush_enospc.test | 64 ++++++++++++++
 mysys/my_write.c                              |  1 +
 sql/log.cc                                    | 85 +++++++++++++++++--
 4 files changed, 181 insertions(+), 8 deletions(-)
 create mode 100644 mysql-test/suite/rpl/r/rpl_row_binlog_tmp_file_flush_enospc.result
 create mode 100644 mysql-test/suite/rpl/t/rpl_row_binlog_tmp_file_flush_enospc.test

diff --git a/mysql-test/suite/rpl/r/rpl_row_binlog_tmp_file_flush_enospc.result b/mysql-test/suite/rpl/r/rpl_row_binlog_tmp_file_flush_enospc.result
new file mode 100644
index 0000000000000..91e4c7f8e31b7
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_row_binlog_tmp_file_flush_enospc.result
@@ -0,0 +1,39 @@
+include/master-slave.inc
+[connection master]
+connection master;
+set @old_binlog_cache_size= @@global.binlog_cache_size;
+set @@global.binlog_cache_size=4096;
+CALL mtr.add_suppression('Error writing file');
+#
+# Initialize test data
+connection master;
+create table t1 (a int, b longtext default NULL) engine=innodb;
+#
+# Create transaction with cache data larger than the binlog_cache_size
+# so it spills into a tmp file, then simulate ENOSPC while flushing
+# the tmp file.
+#
+set @@session.debug_dbug="+d,simulate_binlog_tmp_file_no_space_left_on_flush";
+insert into t1 values (2, repeat("y", 8192));
+ERROR HY000: Error writing file '.../tmp/' (Errcode: 28 "No space left on device")
+set @@session.debug_dbug="";
+#
+# Create another transaction to make sure the server/replication can
+# continue working normally after the error
+#
+insert into t1 values (3, repeat("z", 8192));
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/diff_tables.inc [master:test.t1,slave:test.t1]
+#
+# Cleanup
+connection master;
+drop table t1;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+connection master;
+set @@global.binlog_cache_size= @old_binlog_cache_size;
+include/rpl_end.inc
+# End of rpl_row_binlog_tmp_file_flush_enospc.test
diff --git a/mysql-test/suite/rpl/t/rpl_row_binlog_tmp_file_flush_enospc.test b/mysql-test/suite/rpl/t/rpl_row_binlog_tmp_file_flush_enospc.test
new file mode 100644
index 0000000000000..d9cc4fa6ef9f5
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_row_binlog_tmp_file_flush_enospc.test
@@ -0,0 +1,64 @@
+#
+# This test ensures that a large transaction (i.e. one whose data exceeds the
+# binlog_cache_size limit and spills into a tmp file) that errors while
+# flushing the tmp file (e.g. due to no space on disk) does not corrupt the
+# binlog, and that the server/replication can continue working normally after
+# the error. To simulate the error, the test uses debug_dbug to inject a
+# failure in the my_write() function, which is called when flushing the tmp
+# file.
+#
+# References:
+#   MDEV-37662: Binlog Corruption When tmpdir is Full
+#
+--source include/have_debug.inc
+--source include/have_innodb.inc
+--source include/have_binlog_format_row.inc
+--source include/master-slave.inc
+
+--connection master
+set @old_binlog_cache_size= @@global.binlog_cache_size;
+set @@global.binlog_cache_size=4096;
+CALL mtr.add_suppression('Error writing file');
+
+--echo #
+--echo # Initialize test data
+--connection master
+create table t1 (a int, b longtext default NULL) engine=innodb;
+
+--echo #
+--echo # Create transaction with cache data larger than the binlog_cache_size
+--echo # so it spills into a tmp file, then simulate ENOSPC while flushing
+--echo # the tmp file.
+--echo #
+set @@session.debug_dbug="+d,simulate_binlog_tmp_file_no_space_left_on_flush";
+--replace_regex /Error writing file '[^']+\/tmp\/[^']+'/Error writing file '...\/tmp\/'/
+--error 3
+insert into t1 values (2, repeat("y", 8192));
+set @@session.debug_dbug="";
+
+--echo #
+--echo # Create another transaction to make sure the server/replication can
+--echo # continue working normally after the error
+--echo #
+insert into t1 values (3, repeat("z", 8192));
+--source include/save_master_gtid.inc
+--connection slave
+--source include/sync_with_master_gtid.inc
+
+--let $diff_tables=master:test.t1,slave:test.t1
+--source include/diff_tables.inc
+
+--echo #
+--echo # Cleanup
+--connection master
+drop table t1;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+
+--connection master
+set @@global.binlog_cache_size= @old_binlog_cache_size;
+
+--source include/rpl_end.inc
+--echo # End of rpl_row_binlog_tmp_file_flush_enospc.test
diff --git a/mysys/my_write.c b/mysys/my_write.c
index 06914b2564108..68f568605fc8f 100644
--- a/mysys/my_write.c
+++ b/mysys/my_write.c
@@ -57,6 +57,7 @@ size_t my_write(File Filedes, const uchar *Buffer, size_t Count, myf MyFlags)
                   if (!errors)
                   {
                     errno= ENOSPC;
                     writtenbytes= (size_t) -1;
+                    MyFlags&= ~MY_WAIT_IF_FULL;
                   });
   if (writtenbytes == Count)
diff --git a/sql/log.cc b/sql/log.cc
index 8339d7c5b2395..60290d14883ee 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -1899,6 +1899,31 @@ static int binlog_close_connection(handlerton *hton, THD *thd)
   DBUG_RETURN(0);
 }
 
+/*
+  Ensures that the input IO Cache is consistent with where its data is stored,
+  i.e. that the data is entirely either stored in-memory or backed by a
+  temporary file. In practice, this is simple: if the IO Cache is actively
+  backed by a temporary file (i.e. the transaction or statement data is
+  sufficiently large to exceed its respective binlog_cache_size), then ensure
+  all data is flushed to the temporary file. Otherwise, the data is in-memory
+  by default, and we don't need to do anything.
+
+  Returns FALSE (0) on success, TRUE on error.
+*/
+inline my_bool flush_write_buffer_if_file_backed(IO_CACHE *info)
+{
+  my_bool ret= 0;
+  DBUG_ENTER("flush_write_buffer_if_file_backed");
+  DBUG_ASSERT(info);
+  DBUG_ASSERT(!info->error);
+  DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush",
+                  { DBUG_SET("+d,simulate_file_write_error"); });
+  ret= info->pos_in_file && flush_io_cache(info);
+  DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush",
+                  { DBUG_SET("-d,simulate_file_write_error"); });
+  DBUG_RETURN(ret);
+}
+
 
 /*
   This function flushes a cache upon commit/rollback.
@@ -1938,9 +1963,34 @@ binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
       (using_trx && !cache_mngr->trx_cache.empty()) ||
       thd->transaction->xid_state.is_explicit_XA())
   {
-    if (using_stmt && thd->binlog_flush_pending_rows_event(TRUE, FALSE))
+    /*
+      thd->binlog_flush_pending_rows_event() ensures that the pending row event
+      is flushed into the respective IO cache. We also need to make sure that
+      the IO cache is consistent with where its data is stored, i.e. it should
+      either be entirely in-memory or backed by a temporary file. So if
+      necessary (i.e. if the cache data exceeds its binlog_cache_size), flush
+      the IO cache to its tmp file on disk.
+
+      Technically, this reconciliation would happen automatically when writing
+      the cache data to the actual binlog file. We pre-empt it though because:
+        1) we write the GTID event separately to the binlog directly before
+           moving the cache data, and if the reconciliation fails (e.g. if the
+           directory storing the tmp file is full), the binlog would get
+           corrupted with a standalone GTID event
+        2) that happens during group commit with locks held, and other
+           ready-to-commit (concurrent) transactions could be stalled
+    */
+    if (using_stmt && (thd->binlog_flush_pending_rows_event(TRUE, FALSE) ||
+                       flush_write_buffer_if_file_backed(
+                         cache_mngr->get_binlog_cache_log(FALSE))))
       DBUG_RETURN(1);
-    if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE))
+
+    /*
+      See the statement cache comment above.
+    */
+    if (using_trx && (thd->binlog_flush_pending_rows_event(TRUE, TRUE) ||
+                      flush_write_buffer_if_file_backed(
+                        cache_mngr->get_binlog_cache_log(TRUE))))
       DBUG_RETURN(1);
 
 #ifdef WITH_WSREP
@@ -7708,8 +7758,12 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
   DBUG_ENTER("MYSQL_BIN_LOG::write_cache");
 
   mysql_mutex_assert_owner(&LOCK_log);
+  DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush",
+                  { DBUG_SET("+d,simulate_file_write_error"); });
   if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
     DBUG_RETURN(ER_ERROR_ON_WRITE);
+  DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush",
+                  { DBUG_SET("-d,simulate_file_write_error"); });
   size_t length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
   size_t val;
   size_t end_log_pos_inc= 0; // each event processed adds BINLOG_CHECKSUM_LEN 2 t
@@ -8983,12 +9037,27 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
   }
 #endif /* WITH_WSREP */
 
-  /*
-    An error in the trx_cache will truncate the cache to the last good
-    statement, it won't leave a lingering error. Assert that this holds.
-  */
-  DBUG_ASSERT(!(entry->using_trx_cache && !mngr->trx_cache.empty() &&
-                mngr->get_binlog_cache_log(TRUE)->error));
+
+#ifndef DBUG_OFF
+  if (entry->using_trx_cache)
+  {
+    IO_CACHE *cache= mngr->get_binlog_cache_log(TRUE);
+    /*
+      An error in the trx_cache will truncate the cache to the last good
+      statement, it won't leave a lingering error. Assert that this holds.
+    */
+    DBUG_ASSERT(mngr->trx_cache.empty() ||
+                !cache->error);
+
+    /*
+      If the transaction uses the IO cache temporary file (i.e. if it is
+      sufficiently large), it should be fully flushed to disk by now.
+    */
+    DBUG_ASSERT(!cache->pos_in_file ||
+                cache->write_pos == cache->write_buffer);
+  }
+#endif
+
   /*
     An error in the stmt_cache would be caught on the higher level and result
     in an incident event being written over a (possibly corrupt) cache content.